In [16]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
from sklearn.feature_selection import mutual_info_classif

# Load data with pandas

In [17]:
def load(path):
    """
        Read a CSV file into a pandas DataFrame.
        :param path: location of the CSV file to read
        :return: DataFrame holding the parsed contents of the file
    """
    dataframe = pd.read_csv(path)
    return dataframe

# Relative paths of the two CSV files read below.
TrainXpath="EmberXTrain.csv"
TrainYpath="EmberYTrain.csv"

# X and Y are the DataFrames the following cells work on
# (the box-plot cell reads Y['Label'], so Y is expected to expose that column).
X=load(TrainXpath)
Y=load(TrainYpath)

# Pre-process data with pandas

In [24]:
# Names of the CSV files that receive the per-attribute statistics of X and Y
# (passed to preElaborationData below).
file_descrizione_x = 'descrizione_X.csv'
file_descrizione_y = 'descrizione_Y.csv'

def preElaborationData(data, filename):
    """
        Compute descriptive statistics for every attribute of the dataset.
        The ``describe()`` summary of each attribute becomes one row of a table,
        which is written to a semicolon-separated CSV file.
        :param data: dataset (DataFrame) whose columns are summarised
        :param filename: name of the CSV file in which the statistics are saved
    """
    # One describe() Series per column; the Series name (the column) labels the row.
    per_column_summaries = [data[name].describe() for name in data.columns]
    pd.DataFrame(per_column_summaries).to_csv(filename, sep=";")

# Write the describe() statistics of X and of Y to their respective CSV files.
preElaborationData(X, file_descrizione_x)
preElaborationData(Y, file_descrizione_y)

In [19]:
def preBoxPlotAnalysisData(X, Y, output_directory='boxplot', label_column='Label'):
    """
    Save one box plot per attribute of X, with the values grouped by class.

    Each plot is written as ``boxplot_<column>.png`` inside ``output_directory``.
    :param X: Dataframe X; one figure is produced for each of its columns
    :param Y: Dataframe Y; must contain the column named by ``label_column``
    :param output_directory: folder that receives the PNG files (created if missing)
    :param label_column: name of the column of Y used to group the values
    """
    # exist_ok=True replaces the check-then-create pair, which could fail if the
    # folder appeared between the two calls.
    os.makedirs(output_directory, exist_ok=True)

    for column in X.columns:
        # Separate two-column frame (attribute + label) used only for this plot.
        data = pd.DataFrame(X[column])
        data[label_column] = Y[label_column]

        fig, ax = plt.subplots()
        try:
            data.boxplot(by=label_column, ax=ax)
            ax.set_title(f'Boxplot for {column}')
            # Save through the figure object rather than pyplot's "current figure".
            fig.savefig(os.path.join(output_directory, f'boxplot_{column}.png'))
        finally:
            # Always release the figure, even if plotting or saving fails, so that
            # open figures do not pile up over the loop.
            plt.close(fig)

# Generate and save a box plot for every column of X, grouped by Y['Label'].
preBoxPlotAnalysisData(X, Y)

In [20]:
def mutualInfoRank(X, Y, random_state=None):
    """
    Rank the independent variables by their mutual information with the class.
    :param X: Dataframe X holding the independent variables
    :param Y: Dataframe Y holding the class; it is flattened with np.ravel
    :param random_state: seed forwarded to mutual_info_classif; when None the
        notebook-level ``seed`` variable is used, exactly as before
    :return: list of (name, value) tuples sorted by decreasing mutual info
    """
    if random_state is None:
        # Backward-compatible default: existing calls keep relying on the global
        # ``seed``, which must therefore be defined before the first call.
        random_state = seed

    print("Computing mutual info ranking...")
    scores = mutual_info_classif(X, np.ravel(Y), discrete_features=False, random_state=random_state)
    res = dict(zip(X.columns.values, scores))
    # Highest mutual information first.
    sorted_x = sorted(res.items(), key=lambda kv: kv[1], reverse=True)
    print("Computing mutual info ranking...completed")
    return sorted_x


# seed is read by mutualInfoRank (passed as random_state), so it has to be
# assigned before the call below.
seed=42
# Seed NumPy's global random generator with the same value.
np.random.seed(seed)
rank=mutualInfoRank(X,Y)
# Prints the complete ranking as a list of (name, value) tuples.
print(rank)

Computing mutual info ranking...
Computing mutual info ranking...completed
[('directories_2355', 0.34521156106013606), ('head_627', 0.3345374707294313), ('directories_2356', 0.328537284298108), ('section_837', 0.3146524744895214), ('gen_617', 0.3120527197617291), ('section_787', 0.28093125547600417), ('head_686', 0.273320806494868), ('byte_511', 0.272648014473607), ('byte_510', 0.271785301570886), ('byte_508', 0.2709661801495409), ('byte_499', 0.2698000680727528), ('byte_509', 0.2694238385197638), ('byte_512', 0.268688847649065), ('byte_497', 0.26793786774569983), ('byte_507', 0.267833780720234), ('byte_502', 0.26720322807429286), ('byte_504', 0.2669905414791378), ('hist_140', 0.26663261288391005), ('byte_503', 0.2665299633416607), ('byte_501', 0.2660800523923599), ('byte_506', 0.26559301688176284), ('byte_500', 0.2639737206906283), ('hist_138', 0.2638698821893539), ('byte_498', 0.26319685861156117), ('byte_505', 0.2627774977257966), ('hist_175', 0.2621379570781601), ('hist_232', 0.262