In [None]:
import pandas as pd
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from config import *
from sklearn.preprocessing import StandardScaler
import numpy as np
from sklearn.model_selection import StratifiedKFold, cross_validate, RandomizedSearchCV, GridSearchCV, train_test_split
from sklearn.metrics import classification_report
from sklearn.pipeline import make_pipeline
import shap
from sklearn.feature_selection import mutual_info_regression
from sklearn.feature_selection import RFE
import ast
from matplotlib import pyplot as plt
import seaborn as sns


In [2]:
# Load the pre-computed feature tables, one CSV per corpus.
fakenewsnet = pd.read_csv('../data/wf/FakeNewsNet_wf.csv')
isot = pd.read_csv('../data/wf/FakeNewsISOT_wf.csv')
fakenewskaggle = pd.read_csv('../data/wf/FakeNewsKaggle_wf.csv')
buzfeed_political = pd.read_csv('../data/wf/FakeNewsBuzfeedPolitical_wf.csv')
celebrity = pd.read_csv('../data/wf/FakeNewsCelebrity_wf.csv')
fakenewsamt = pd.read_csv('../data/wf/FakeNewsAMT_wf.csv')
fn_randompolitical = pd.read_csv('../data/wf/FakeNewsRandomPolitical_wf.csv')
fn_politfalse = pd.read_csv('../data/wf/FakeNewsPolitFalse_wf.csv')
fn_satirical = pd.read_csv('../data/wf/FakeNewsSatirical_wf.csv')
fn_mcintire = pd.read_csv('../data/wf/FakeNewsMcintire_wf.csv')

# Display name -> DataFrame; every later cell iterates over this dict.
# NOTE(review): fn_mcintire is loaded above but not registered here, so it is
# excluded from every analysis below -- confirm whether that is intentional.
datasets = {
    'FakeNewsNet' : fakenewsnet,
    'ISOT' : isot,
    'FakeNewsKaggle' : fakenewskaggle,
    'FakeNewsAMT' : fakenewsamt,
    'FakeNewsRandomPolitical' : fn_randompolitical,
    'FakeNewsCelebrity' : celebrity,
    'FakeNewsBuzfeedPolitical' : buzfeed_political,
    'FakeNewsPolitFalse' : fn_politfalse,
    'FakeNewsSatirical' : fn_satirical,
}

In [3]:
def get_dataset_stats(dataset, name):
    """Summarise one dataset as a single-row DataFrame.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain a ``text`` column. Missing / non-string texts are ignored
        by the text statistics.
    name : str
        Label stored in the ``dataset`` column of the result.

    Returns
    -------
    pandas.DataFrame
        One row with the columns ``dataset``, ``num_rows``, ``num_cols``,
        ``avg_word_len`` (mean number of whitespace-separated tokens per text;
        the column name is kept for compatibility), ``avg_char_len`` (mean
        number of characters per text) and ``num_unique_words`` (number of
        distinct tokens across the whole dataset).
    """
    text = dataset['text']
    tokens = text.str.split()

    # Distinct tokens over the whole corpus. Summing per-document unique
    # counts (the previous behaviour) counts a word once per document it
    # appears in, and crashed on missing texts.
    num_unique_words = tokens.dropna().explode().nunique()

    return pd.DataFrame([{
        'dataset': name,
        'num_rows': dataset.shape[0],
        'num_cols': dataset.shape[1],
        'avg_word_len': tokens.str.len().mean(),
        'avg_char_len': text.str.len().mean(),
        'num_unique_words': num_unique_words,
    }])

# Build every per-dataset row first and concatenate once: growing a frame with
# pd.concat inside a loop is quadratic, and concatenating onto an empty frame
# left every row with index label 0.
_stats_frames = [
    get_dataset_stats(dataset, dataset_name)
    for dataset_name, dataset in datasets.items()
]
dataset_stats = (
    pd.concat(_stats_frames, ignore_index=True) if _stats_frames else pd.DataFrame()
)



In [None]:
# Class balance of every dataset: name on one line, label counts below it.
for dataset_name, dataset in datasets.items():
    print(dataset_name, dataset['label'].value_counts(), sep='\n')

In [None]:
# Show the per-dataset statistics table (one row per entry of `datasets`).
dataset_stats

In [6]:
# Feature-group name -> list of column names. The constants come from `config`.
# The combined groups are built once and reused so that 'AllReadability',
# 'AllLIWC' and 'All' cannot drift apart.
_all_readability_features = (
    READABILITY_GRADE_FEATURES
    + READABILITY_SENTENCEINFO_FEATURES
    + READABILITY_SENTENCEBEGINNING_FEATURES
    + READABILITY_WORDUSAGE_FEATURES
)
_all_liwc_features = (
    LIWC_LINGUISTIC_FEATURES
    + LIWC_AFFECTIVEPROCESSES_FEATURES
    + LIWC_SOCIALPROCESSES_FEATURES
    + LIWC_COGNITIVEPROCESSES_FEATURES
    + LIWC_PERCEPTUALPROCESSES_FEATURES
    + LIWC_BIOLOGICALPROCESSES_FEATURES
    + LIWC_DRIVES_FEATURES
    + LIWC_TIMEORIENTATION_FEATURES
    + LIWC_RELATIVITY_FEATURES
    + LIWC_PERSONALCONCERNS_FEATURES
    + LIWC_INFORMALLANGUAGE_FEATURES
)

feature_sets = {
    'Moral' : MORAL_FEATURES,
    'ReadabilityGrades' : READABILITY_GRADE_FEATURES,
    'ReadabilitySentenceInfo' : READABILITY_SENTENCEINFO_FEATURES,
    # NOTE(review): the key below is misspelt ("Begininng"); it is kept as-is
    # because other code may look it up by this exact string.
    'ReadabilitySentenceBegininng' : READABILITY_SENTENCEBEGINNING_FEATURES,
    'ReadabilityWordUsage' : READABILITY_WORDUSAGE_FEATURES,
    'AllReadability' : _all_readability_features,
    'Sentiment' : SENTIMENT_FEATURES,
    'Subjectivity' : ['subjectivity'],
    'Emotion' : EMOTION_FEATURES,
    'LIWCLinguistic' : LIWC_LINGUISTIC_FEATURES,
    'LIWCAffectiveProcesses' : LIWC_AFFECTIVEPROCESSES_FEATURES,
    'LIWCSocialProcesses' : LIWC_SOCIALPROCESSES_FEATURES,
    'LIWCCognitiveProcesses' : LIWC_COGNITIVEPROCESSES_FEATURES,
    'LIWCPerceptualProcesses' : LIWC_PERCEPTUALPROCESSES_FEATURES,
    'LIWCBiologicalProcesses' : LIWC_BIOLOGICALPROCESSES_FEATURES,
    'LIWCDrives' : LIWC_DRIVES_FEATURES,
    'LIWCTimeOrientation' : LIWC_TIMEORIENTATION_FEATURES,
    'LIWCRelativity' : LIWC_RELATIVITY_FEATURES,
    'LIWCPersonalConcerns' : LIWC_PERSONALCONCERNS_FEATURES,
    'LIWCInformalLanguage' : LIWC_INFORMALLANGUAGE_FEATURES,
    'AllLIWC' : _all_liwc_features,
    # Same order as before: moral, readability, sentiment, LIWC, subjectivity, emotion.
    'All' : (
        MORAL_FEATURES
        + _all_readability_features
        + SENTIMENT_FEATURES
        + _all_liwc_features
        + ['subjectivity']
        + EMOTION_FEATURES
    ),
}

In [7]:
def reduce_memory_usage(df, allow_float16=True):
    """Downcast the columns of ``df`` to smaller dtypes and report the saving.

    The frame is modified in place and also returned.

    * ``object`` / pandas string columns become ``category``.
    * Signed and unsigned NumPy integer columns are narrowed to the smallest
      integer type of the same signedness whose range contains the column's
      min and max (bounds inclusive).
    * NumPy float columns are narrowed to ``float16`` or ``float32`` when the
      value range fits. ``float16`` keeps only about three significant decimal
      digits, so this is lossy; pass ``allow_float16=False`` to stop at
      ``float32``.
    * Every other dtype (bool, category, datetime, nullable extension types,
      ...) is left untouched. Previously such columns fell into the float
      branch, which turned bool/uint columns into float16 and raised on
      category columns, so the function could not be run twice on a frame.

    Parameters
    ----------
    df : pandas.DataFrame
    allow_float16 : bool, default True
        Whether float columns may be narrowed all the way to ``float16``.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    start_memory = df.memory_usage().sum() / 1024**2
    print(f"Memory usage of dataframe is {start_memory} MB")

    signed_candidates = (np.int8, np.int16, np.int32)
    unsigned_candidates = (np.uint8, np.uint16, np.uint32)
    float_candidates = (np.float16, np.float32) if allow_float16 else (np.float32,)

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object or isinstance(col_type, pd.StringDtype):
            df[col] = df[col].astype('category')
            continue

        # Only plain NumPy int/uint/float columns have a well-defined downcast.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        if col_type.kind == 'f':
            candidates, limits_of = float_candidates, np.finfo
        elif col_type.kind == 'i':
            candidates, limits_of = signed_candidates, np.iinfo
        else:
            candidates, limits_of = unsigned_candidates, np.iinfo

        c_min = df[col].min()
        c_max = df[col].max()

        for candidate in candidates:
            # Never widen: stop once the candidate is no smaller than the current dtype.
            if np.dtype(candidate).itemsize >= col_type.itemsize:
                break
            limits = limits_of(candidate)
            # NaN min/max (empty or all-NaN column) fails both comparisons,
            # which leaves the column unchanged.
            if limits.min <= c_min and c_max <= limits.max:
                df[col] = df[col].astype(candidate)
                break

    end_memory = df.memory_usage().sum() / 1024**2
    print(f"Memory usage of dataframe after reduction {end_memory} MB")
    reduction = 100 * (start_memory - end_memory) / start_memory if start_memory else 0.0
    print(f"Reduced by {reduction} % ")
    return df

def get_collinear_features(x, threshold):
    '''
    Objective:
        Identify collinear features in a dataframe. For every pair of columns
        whose absolute correlation is at least ``threshold``, the column that
        comes LATER in ``x`` is flagged. Nothing is removed from ``x``; the
        caller decides what to do with the returned names.

    Inputs:
        x: features dataframe
        threshold: absolute correlation at or above which a feature is flagged

    Output:
        set with the names of the flagged columns (empty if none)
    '''
    corr_matrix = x.corr()
    abs_corr = corr_matrix.abs().to_numpy()

    # Only look strictly above the diagonal so each pair is considered once
    # and a column is never compared with itself.
    above_diagonal = np.triu(np.ones(abs_corr.shape, dtype=bool), k=1)

    # Column j is flagged when it correlates with any earlier column i < j.
    # NaN correlations compare False and are therefore ignored.
    flagged = ((abs_corr >= threshold) & above_diagonal).any(axis=0)

    return set(corr_matrix.columns[flagged])

def get_low_score_features(df, feature_names):
    """Return the features that carry less information about the label than noise.

    Two uniform random columns are appended to the feature matrix as a noise
    baseline, mutual information with ``df['label']`` is estimated for every
    column, and each real feature scoring below either random column is
    reported.

    Previously the random columns were added to ``df`` but not to the matrix
    passed to the estimator, so the "random" scores were in fact the scores of
    the last two real features. The caller's frame is no longer modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns in ``feature_names`` and a ``label`` column.
    feature_names : sequence of str

    Returns
    -------
    set of str
        Names from ``feature_names`` scoring below the noise baseline.

    Notes
    -----
    Re-seeds NumPy's global RNG (unchanged from the original behaviour).
    """
    np.random.seed(24091993)
    feature_names = list(feature_names)

    # Work on a copy so the noise columns never leak into the caller's frame.
    X = df[feature_names].copy()
    X['random_var_1'] = np.random.random(size=len(X))
    X['random_var_2'] = np.random.random(size=len(X))
    y = df['label']

    # NOTE(review): `label` is used as a class label elsewhere in this
    # notebook; scikit-learn provides mutual_info_classif for discrete
    # targets -- consider switching.
    ig = mutual_info_regression(X, y)
    feature_scores = dict(zip(X.columns, ig))

    # "Below either random variable" is the same as "below the larger one".
    noise_baseline = max(feature_scores['random_var_1'], feature_scores['random_var_2'])

    return {
        feature for feature in feature_names
        if feature_scores[feature] < noise_baseline
    }

In [None]:
cols_to_drop = []  # not used within this cell

# The selection is the same for every dataset, so build it once.
selected_columns = feature_sets['All'] + ['label']

for dataset_name, dataset in datasets.items():
    print(f'---Dataset {dataset_name}---')
    df = dataset[selected_columns]
    # A name listed more than once yields duplicate columns; keep the first
    # occurrence of each.
    is_first_occurrence = ~df.columns.duplicated()
    df = df.loc[:, is_first_occurrence].copy()
    # Replace the raw frame with its reduced, downcast version.
    datasets[dataset_name] = reduce_memory_usage(df)

## Feature analysis

In [9]:
# Feature names used for the rest of the analysis. Take an order-preserving,
# de-duplicated copy of the 'All' group: selecting a name twice produces two
# identical columns, which then look perfectly correlated with each other
# and would be dropped as "highly correlated". Copying also keeps later
# changes to `features` from touching feature_sets['All'].
features = list(dict.fromkeys(feature_sets['All']))

In [None]:

# Absolute correlation above which two features count as highly correlated.
correlation_threshold = 0.9

def get_highly_correlated_features(df, threshold):
    """List the columns of ``df`` that are highly correlated with an earlier column.

    A column is reported when its absolute correlation with any column to its
    left is strictly greater than ``threshold``. Names are returned in column
    order.
    """
    abs_corr = df.corr().abs()
    # Keep only the entries strictly above the diagonal; the rest become NaN.
    above_diagonal = np.triu(np.ones(abs_corr.shape), k=1).astype(bool)
    upper = abs_corr.where(above_diagonal)
    exceeds = (upper > threshold).any(axis=0)
    return exceeds[exceeds].index.tolist()

# For every dataset, collect the set of features flagged as highly correlated.
all_highly_correlated_features = []
for dataset_name, dataset in datasets.items():
    correlated_features = get_highly_correlated_features(dataset[features], correlation_threshold)
    all_highly_correlated_features += [set(correlated_features)]

# Only features flagged in EVERY dataset are removed.
common_highly_correlated_features = set.intersection(*all_highly_correlated_features)

print(
    "Features altamente correlacionadas en todos los datasets:",
    common_highly_correlated_features,
    sep="\n",
)

# Rebuild the dict with those columns dropped from each frame.
datasets = {
    key: frame.drop(columns=common_highly_correlated_features)
    for key, frame in datasets.items()
}

In [None]:
# Remove the features that were dropped from the datasets as highly
# correlated. De-duplicate at the same time (order preserved) so that every
# name maps to exactly one column below.
features = [
    feature for feature in dict.fromkeys(features)
    if feature not in common_highly_correlated_features
]

# One row of (mean over fake news - mean over legitimate news) per dataset,
# computed on standardised features so the differences are comparable.
difference_rows = []
for dataset_name, df in datasets.items():
    print('---- Dataset %s' % dataset_name)

    # Standardise into a new frame; the frames stored in `datasets` stay untouched.
    scaled = pd.DataFrame(
        StandardScaler().fit_transform(df[features]),
        columns=features,
        index=df.index,
    )

    # label == 1 is treated as fake and label == 0 as legitimate.
    mean_false = scaled[df['label'] == 1].mean()
    mean_legit = scaled[df['label'] == 0].mean()
    difference_rows.append((mean_false - mean_legit).to_numpy())

# Build the frame in one go so it has a float dtype. Filling an empty frame
# cell by cell with .at leaves object-dtype columns.
results_fa = pd.DataFrame(difference_rows, index=list(datasets.keys()), columns=features)

In [12]:
# Requires `results_fa` (standardised fake-minus-legit mean differences).

# For each dataset, take the 20 features with the largest (most positive)
# difference and record one {'Feature', 'Dataset'} entry per pick.
top_features_per_dataset_list = []
for dataset_name in results_fa.index:
    top_features = results_fa.loc[dataset_name].sort_values(ascending=False).iloc[:20].index
    top_features_per_dataset_list.extend(
        {'Feature': feature_name, 'Dataset': dataset_name}
        for feature_name in top_features
    )

top_features_per_dataset = pd.DataFrame(top_features_per_dataset_list)

# Number of datasets in whose top 20 each feature appears.
feature_counts = top_features_per_dataset['Feature'].value_counts()



In [None]:
# The ten most frequent entries of `feature_counts`.
feature_counts.head(10)

In [14]:
# Requires `results_fa` (standardised fake-minus-legit mean differences).

# For each dataset, take the 20 features with the smallest (most negative)
# difference and record one {'Feature', 'Dataset'} entry per pick.
top_negative_features_per_dataset_list = []
for dataset_name in results_fa.index:
    top_negative_features = results_fa.loc[dataset_name].sort_values(ascending=True).iloc[:20].index
    top_negative_features_per_dataset_list.extend(
        {'Feature': feature_name, 'Dataset': dataset_name}
        for feature_name in top_negative_features
    )

top_negative_features_per_dataset = pd.DataFrame(top_negative_features_per_dataset_list)

# Number of datasets in whose bottom 20 each feature appears.
negative_feature_counts = top_negative_features_per_dataset['Feature'].value_counts()


In [None]:
# Show the ten features that most often rank among a dataset's 20 most negative differences.
negative_feature_counts.head(10)

In [16]:
def get_top_20_smallest_diff_features(dataframe):
    """Map every row label of ``dataframe`` to its 20 smallest-magnitude columns.

    For each row the absolute values are sorted ascending and the labels of
    the first 20 entries (fewer if the row is shorter) are returned as a
    pandas Index.
    """
    return {
        dataset: dataframe.loc[dataset].abs().sort_values().iloc[:20].index
        for dataset in dataframe.index
    }

# 20 features with the smallest absolute difference, per dataset.
top_20_features = get_top_20_smallest_diff_features(results_fa)

# Number of datasets in whose "smallest 20" each feature appears.
# NOTE: this rebinds `feature_counts`, which earlier held the tally for the
# most positive differences.
feature_counts = pd.Series(
    np.concatenate([np.asarray(names) for names in top_20_features.values()])
).value_counts()

# Same information as a table: one column per dataset, one row per rank.
top_20_features_df = pd.DataFrame.from_dict(top_20_features, orient='index').transpose()


In [None]:
# The ten most frequent entries of `feature_counts` (run in order, this is the
# smallest-|difference| tally assigned in the previous cell).
feature_counts.head(10)

In [None]:
# Requires `results_fa` (standardised fake-minus-legit mean differences).

# Average each feature's difference over all datasets, then rank once per direction.
mean_influences = results_fa.mean(axis=0)
influence_descending = mean_influences.sort_values(ascending=False)
influence_ascending = mean_influences.sort_values(ascending=True)

# 10 features with the highest / lowest mean difference.
top_10_positive_features = influence_descending.head(10).index
top_10_negative_features = influence_ascending.head(10).index

# NOTE: despite the "top_20" names, these hold the 40 highest / 40 lowest features.
top_20_positive_features = influence_descending.head(40).index
top_20_negative_features = influence_ascending.head(40).index

# Per-dataset differences of the top-10 features, coerced to numeric for plotting.
top_positive_df = results_fa[top_10_positive_features].apply(pd.to_numeric)
top_negative_df = results_fa[top_10_negative_features].apply(pd.to_numeric)

def annotate_outliers(ax, data, dataset_names):
    """Label box-plot outliers with the row label they belong to.

    For every column of ``data`` the 1.5 * IQR rule is applied, and each value
    outside the whiskers is annotated with its index label.

    Parameters
    ----------
    ax : matplotlib Axes
        Axes holding a seaborn box plot of ``data`` (one box per column).
    data : pandas.DataFrame
        Numeric frame whose index labels are used as annotation text.
    dataset_names : any
        Unused; kept so existing calls keep working. Labels are read from
        ``data.index``.
    """
    for position, feature in enumerate(data.columns):
        # seaborn draws the boxes at x = 0, 1, 2, ..., so the column's ordinal
        # is its x coordinate. Using position + 1 put every label next to the
        # following box.
        values = data[feature].dropna()  # NaN would turn both percentiles into NaN
        if values.empty:
            continue

        q1, q3 = np.percentile(values, [25, 75])
        iqr = q3 - q1
        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr

        outliers = values[(values < lower_bound) | (values > upper_bound)]
        for dataset_name, value in outliers.items():
            ax.text(position, value, dataset_name, ha='right', fontsize=12)

# Larger default font for all figure text.
plt.rcParams.update({'font.size': 18})

# One box plot per direction: (data, colour palette, title).
boxplot_specs = (
    (top_positive_df, "Blues", 'Top 10 Features with Highest Positive Influence'),
    (top_negative_df, "Reds", 'Top 10 Features with Highest Negative Influence'),
)

for influence_df, palette_name, plot_title in boxplot_specs:
    plt.figure(figsize=(16, 10))
    plt.yticks(fontsize=16)
    plt.xticks(fontsize=16)
    ax = sns.boxplot(data=influence_df, palette=palette_name)
    plt.title(plot_title, fontsize=22)
    plt.ylabel('Normalized Influence', fontsize=18)
    plt.xticks(rotation=45, fontsize=16)
    # Tag each outlier with the dataset it comes from.
    annotate_outliers(ax, influence_df, results_fa.index)
    plt.show()


In [None]:
# The ten features with the highest mean difference across datasets.
top_10_positive_features

In [None]:
# The ten features with the lowest mean difference across datasets.
top_10_negative_features

In [21]:
# The 10 most positive features followed by the 10 most negative ones.
top_20_global_features = [*top_10_positive_features, *top_10_negative_features]

In [22]:
# Positive-ranked features first, then negative-ranked ones. Despite the name
# this holds up to 80 entries, because each `top_20_*_features` index is built
# with head(40).
top_40_global_features = [*top_20_positive_features, *top_20_negative_features]

In [None]:
# Inspect the combined list (positive-ranked features first, then negative-ranked).
top_40_global_features

In [None]:
# Inspect the 10 positive + 10 negative feature list.
top_20_global_features

## Analysis by algorithms

In [25]:
# Names of the classifiers evaluated below; get_algorithm() turns a name into
# an estimator. 'SVC' is supported by get_algorithm() but not listed here, so
# it is never evaluated.
algorithms = [
    'DecisionTree',
    'LinearSVC' ,
    'LogisticRegression',
    'RandomForest',
    'XGBoost',
    'CatBoost',
]

def get_algorithm(name):
    """Return a new, unfitted classifier for ``name``.

    Supported names: 'XGBoost', 'CatBoost', 'DecisionTree', 'SVC',
    'LinearSVC', 'RandomForest', 'LogisticRegression'.

    Raises
    ------
    ValueError
        If ``name`` is not one of the supported names. Previously ``None``
        was returned silently and the failure only surfaced later, inside
        scikit-learn.
    """
    # Factories rather than instances, so each call builds a fresh estimator
    # and only the requested one is constructed.
    factories = {
        'XGBoost': lambda: XGBClassifier(n_jobs=-1),
        'CatBoost': lambda: CatBoostClassifier(verbose=False),
        'DecisionTree': lambda: DecisionTreeClassifier(class_weight='balanced'),
        'SVC': lambda: SVC(class_weight='balanced'),
        'LinearSVC': lambda: LinearSVC(class_weight='balanced'),
        'RandomForest': lambda: RandomForestClassifier(class_weight='balanced', n_jobs=-1),
        # The very high max_iter effectively removes the iteration cap.
        'LogisticRegression': lambda: LogisticRegression(
            class_weight='balanced', n_jobs=-1, max_iter=1000000
        ),
    }
    try:
        factory = factories[name]
    except KeyError:
        raise ValueError(
            'Unknown algorithm {!r}; expected one of {}'.format(name, sorted(factories))
        ) from None
    return factory()

In [None]:
np.random.seed(240993)

# Metrics reported for every (dataset, algorithm) pair; each one yields a
# `<metric>_mean` and a `<metric>_std` column over the CV folds.
cv_metrics = [
    'accuracy',
    'precision_weighted', 'recall_weighted', 'f1_weighted',
    'precision_macro', 'recall_macro', 'f1_macro',
    'precision_micro', 'recall_micro', 'f1_micro',
]

# Fixed seed, so every algorithm sees the same folds.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=24091993)

# Collect one record per run and build the frame once at the end. Concatenating
# onto an empty frame inside the loop is quadratic and lets the empty frame
# influence the column dtypes.
result_rows = []
for dataset_name, dataset in datasets.items():
    print('---Dataset {dataset_name}---'.format(dataset_name=dataset_name))

    X = dataset[features].fillna(0)
    y = dataset['label']

    for algorithm_name in algorithms:
        print('---Algorithm {algorithms_name}---'.format(algorithms_name=algorithm_name))

        # The scaler lives inside the pipeline so it is fit on the training
        # part of each fold only. Scaling the full matrix before
        # cross-validation leaks test-fold statistics into training.
        # `fit_time` therefore includes the (cheap) scaling step.
        estimator = make_pipeline(StandardScaler(), get_algorithm(algorithm_name))
        scores = cross_validate(estimator, X, y, cv=cv, scoring=cv_metrics, n_jobs=-1)

        row = {
            'dataset': dataset_name,
            'algorithm': algorithm_name,
            'fit_time': np.mean(scores['fit_time']),
            'score_time': np.mean(scores['score_time']),
        }
        for metric in cv_metrics:
            fold_scores = scores['test_' + metric]
            row[metric + '_mean'] = np.mean(fold_scores)
            row[metric + '_std'] = np.std(fold_scores)
        result_rows.append(row)

results_df = pd.DataFrame(result_rows)

In [None]:
# Accuracy (mean and fold std) per algorithm, averaged over datasets.
results_df.groupby(['algorithm']).mean(numeric_only=True)[['accuracy_mean', 'accuracy_std']]

In [None]:
# Weighted F1 per dataset, averaged over algorithms.
results_df.groupby(['dataset']).mean(numeric_only=True)[['f1_weighted_mean']]

In [None]:
# Weighted precision / recall / F1 of LogisticRegression on each dataset.
results_df[results_df['algorithm']=='LogisticRegression'][['dataset', 'precision_weighted_mean', 'recall_weighted_mean', 'f1_weighted_mean']]

In [None]:
# Inspect the combined list again (positive-ranked features first, then negative-ranked).
top_40_global_features

In [None]:
# Everything after the first five entries.
top_40_global_features[5:]

In [None]:
# Entries 40-44. With 40 positive-ranked features stored first, these are the
# first five negative-ranked ones (assumes at least 40 features were ranked).
top_40_global_features[40:45]

## Algorithms with different number of features

In [None]:
np.random.seed(240993)

# Metrics reported for every run; each one yields a `<metric>_mean` and a
# `<metric>_std` column over the CV folds.
cv_metrics = [
    'accuracy',
    'precision_weighted', 'recall_weighted', 'f1_weighted',
    'precision_macro', 'recall_macro', 'f1_macro',
    'precision_micro', 'recall_micro', 'f1_micro',
]

# Fixed seed, so every configuration sees the same folds.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=24091993)

# Number of features taken from EACH side (positive and negative), so a run
# uses up to 2 * num_features columns.
possible_num_features = [3, 5, 8, 10, 15, 20, 25, 30, 35, 40]

# `top_40_global_features` stores the positive-ranked features first and the
# negative-ranked ones second, in two halves of equal length. Split at the
# midpoint instead of at a hard-coded offset of 40, so the tail is still
# correct when fewer than 40 features were ranked per side.
half = len(top_40_global_features) // 2
positive_ranked = top_40_global_features[:half]
negative_ranked = top_40_global_features[half:]

# One record per run; the frame is built once at the end rather than grown
# with pd.concat inside the loop.
result_rows = []
for dataset_name, dataset in datasets.items():
    print('---Dataset {dataset_name}---'.format(dataset_name=dataset_name))
    for num_features in possible_num_features:
        print('---Number of features {num_features}---'.format(num_features=num_features))

        top_num_features_head = positive_ranked[:num_features]
        top_num_features_tail = negative_ranked[:num_features]
        # De-duplicate in case the two rankings overlap (few features overall).
        top_num_features = list(dict.fromkeys(top_num_features_head + top_num_features_tail))

        # fillna(0) matches the all-features cross-validation cell.
        X = dataset[top_num_features].fillna(0)
        y = dataset['label']

        for algorithm_name in algorithms:
            print('---Algorithm {algorithms_name}---'.format(algorithms_name=algorithm_name))

            # Scale inside the pipeline so each fold's scaler is fit on its
            # training part only (no leakage from the test fold).
            estimator = make_pipeline(StandardScaler(), get_algorithm(algorithm_name))
            scores = cross_validate(estimator, X, y, cv=cv, scoring=cv_metrics, n_jobs=-1)

            row = {
                'dataset': dataset_name,
                'num_features': num_features,
                'algorithm': algorithm_name,
                'fit_time': np.mean(scores['fit_time']),
            }
            for metric in cv_metrics:
                fold_scores = scores['test_' + metric]
                row[metric + '_mean'] = np.mean(fold_scores)
                row[metric + '_std'] = np.std(fold_scores)
            result_rows.append(row)

results_df_mf = pd.DataFrame(result_rows)

In [None]:
# Fit time and weighted F1 (mean and fold std) per (num_features, algorithm), averaged over datasets.
results_df_mf.groupby(['num_features', 'algorithm'])[['fit_time', 'f1_weighted_mean', 'f1_weighted_std']].mean()

In [None]:
# Fit time and weighted F1 (mean and fold std) per (num_features, dataset), averaged over algorithms.
results_df_mf.groupby(['num_features', 'dataset'])[['fit_time', 'f1_weighted_mean', 'f1_weighted_std']].mean()

In [None]:
# All XGBoost runs with num_features == 20 (20 features from each side of the ranking).
results_df_mf[(results_df_mf['num_features']==20) & (results_df_mf['algorithm']=='XGBoost')]

In [None]:
# The same XGBoost / num_features == 20 runs, reduced to fit time and weighted F1 per dataset.
results_df_mf[(results_df_mf['num_features']==20) & (results_df_mf['algorithm']=='XGBoost')].groupby(['dataset'])[['fit_time', 'f1_weighted_mean', 'f1_weighted_std']].mean()

In [None]:
# XGBoost only: fit time and weighted F1 per (num_features, dataset).
results_df_mf[(results_df_mf['algorithm']=='XGBoost')].groupby(['num_features', 'dataset'])[['fit_time', 'f1_weighted_mean', 'f1_weighted_std']].mean()

In [None]:
# Weighted F1 per (num_features, dataset), averaged over algorithms.
grouped_results = results_df_mf.groupby(['num_features', 'dataset'])[['f1_weighted_mean']].mean().reset_index()

plt.figure(figsize=(12, 8))

# Names of the datasets to plot. This must NOT be stored in `datasets`: that
# name holds the name -> DataFrame dict that later cells iterate with .items().
plotted_datasets = grouped_results['dataset'].unique()

# One line per dataset. `num_features` counts features per side (positive and
# negative), so the x axis shows the total, 2 * num_features.
for plotted_name in plotted_datasets:
    subset = grouped_results[grouped_results['dataset'] == plotted_name]
    plt.plot(subset['num_features']*2, subset['f1_weighted_mean'], marker='o', label=plotted_name)

plt.title('F1 Weighted Mean vs Number of Features')
plt.xlabel('Number of Features')
plt.ylabel('F1 Weighted Mean')
plt.legend(title='Dataset')
plt.grid(True)

plt.show()

In [None]:
# Weighted F1 per (num_features, dataset), averaged over algorithms.
grouped_results = results_df_mf.groupby(['num_features', 'dataset'])[['f1_weighted_mean']].mean().reset_index()

# Overall average per num_features (mean of the per-dataset means).
mean_overall = grouped_results.groupby('num_features')['f1_weighted_mean'].mean().reset_index()

# Only plot configurations with at most this many features in total
# (num_features is per side, hence the factor 2).
max_total_features = 50
grouped_results = grouped_results[grouped_results['num_features'] * 2 <= max_total_features]
mean_overall = mean_overall[mean_overall['num_features'] * 2 <= max_total_features]

plt.figure(figsize=(12, 8))

# Names of the datasets to plot. This must NOT be stored in `datasets`: that
# name holds the name -> DataFrame dict that later cells iterate with .items().
plotted_datasets = grouped_results['dataset'].unique()

# One line per dataset.
for plotted_name in plotted_datasets:
    subset = grouped_results[grouped_results['dataset'] == plotted_name]
    plt.plot(subset['num_features']*2, subset['f1_weighted_mean'], marker='o', label=plotted_name)

# Overall average as a dashed black line.
plt.plot(mean_overall['num_features']*2, mean_overall['f1_weighted_mean'], marker='x', linestyle='--', color='black', label='Average')

plt.title('F1 Weighted Mean vs Number of Features', fontsize=16)
plt.xlabel('Number of Features', fontsize=14)
plt.ylabel('F1 Weighted Mean', fontsize=14)
plt.legend(title='Dataset', fontsize=12, title_fontsize=14)
plt.grid(True)

# Larger tick labels.
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)

plt.show()


In [None]:
# Average fit time per num_features, over all datasets and algorithms.
results_df_mf.groupby(['num_features'])[['fit_time']].mean()

In [None]:
# Average fit time per algorithm for the num_features == 15 runs.
results_df_mf[(results_df_mf['num_features']==15)].groupby(['algorithm'])[['fit_time']].mean()

In [None]:
# Fit time per (num_features, algorithm), averaged over datasets.
grouped_results = results_df_mf.groupby(['num_features', 'algorithm'])[['fit_time']].mean().reset_index()

# Overall average per num_features (mean of the per-algorithm means).
mean_overall = grouped_results.groupby('num_features')['fit_time'].mean().reset_index()

# Only plot configurations with at most this many features in total
# (num_features is per side, hence the factor 2).
max_total_features = 50
grouped_results = grouped_results[grouped_results['num_features'] * 2 <= max_total_features]
mean_overall = mean_overall[mean_overall['num_features'] * 2 <= max_total_features]

plt.figure(figsize=(12, 8))

# One line per algorithm. Nothing is assigned to `datasets` here: that name
# holds the name -> DataFrame dict that later cells iterate with .items().
for algorithm_name in algorithms:
    subset = grouped_results[grouped_results['algorithm'] == algorithm_name]
    plt.plot(subset['num_features']*2, subset['fit_time'], marker='o', label=algorithm_name)

# Overall average as a dashed black line.
plt.plot(mean_overall['num_features']*2, mean_overall['fit_time'], marker='x', linestyle='--', color='black', label='Average')

# Labels describe what is plotted (fit time per algorithm); they were
# previously copied unchanged from the F1 plot.
plt.title('Fit Time vs Number of Features', fontsize=16)
plt.xlabel('Number of Features', fontsize=14)
plt.ylabel('Mean Fit Time (s)', fontsize=14)
plt.legend(title='Algorithm', fontsize=12, title_fontsize=14)
plt.grid(True)

# Larger tick labels.
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)

plt.show()


## Explainability for XGBoost

In [None]:
# The 20 features (10 positive + 10 negative) used for the explanations below.
top_20_global_features

In [None]:
# Dataset name -> {'features': [...], 'shap_values': shap.Explanation}
results_shap = {}

np.random.seed(240993)

algorithm_name = 'XGBoost'

# Requires `datasets` to be the name -> DataFrame dict.
for dataset_name, dataset in datasets.items():
    print(f'---Dataset {dataset_name}---')

    X_raw = dataset[top_20_global_features]
    y = dataset['label']

    # 1) Hold-out evaluation. The scaler is fit on the training split only;
    #    fitting it on all rows before splitting leaks test-set statistics
    #    into the reported metrics.
    X_train, X_test, y_train, y_test = train_test_split(X_raw, y, test_size=0.2, random_state=2409199)
    eval_scaler = StandardScaler().fit(X_train)
    eval_model = get_algorithm(algorithm_name)
    eval_model.fit(eval_scaler.transform(X_train), y_train)
    print(classification_report(y_test, eval_model.predict(eval_scaler.transform(X_test))))

    # 2) Model to explain: refit on ALL rows (nothing is evaluated here, so
    #    scaling on the full data is fine).
    X = StandardScaler().fit_transform(X_raw)
    model = get_algorithm(algorithm_name).fit(X, y)

    # Up to 100 background samples for the masker.
    background_X = shap.maskers.Independent(X, max_samples=100)

    # Explain the predicted labels (model.predict).
    explainer = shap.Explainer(model.predict, background_X)
    shap_values = explainer(X)

    results_shap[dataset_name] = {
        'features': top_20_global_features,
        'shap_values': shap_values,
    }
    

In [None]:
# Stack the SHAP values of every explained dataset exactly once. Seeding the
# array with results_shap['ISOT'] and then looping over all datasets counted
# ISOT twice and failed if that key was missing.
sv_agg = np.concatenate(
    [entry['shap_values'].values for entry in results_shap.values()],
    axis=0,
)

# Every entry stores the same feature list, so take it from the first one.
shap_feature_names = next(iter(results_shap.values()))['features']

shap.summary_plot(sv_agg, feature_names=shap_feature_names, plot_type="bar")