In [59]:
import pandas as pd
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from config import *
from sklearn.preprocessing import StandardScaler
import numpy as np
from sklearn.model_selection import StratifiedKFold, cross_val_score, RandomizedSearchCV, GridSearchCV, train_test_split
from sklearn.metrics import classification_report
import shap
from sklearn.feature_selection import mutual_info_regression

In [54]:
fakenewsnet = pd.read_csv('../data/fakenewsnet_wf.csv')
isot = pd.read_csv('../data/fn_isot_wf.csv')
fakenewskaggle = pd.read_csv('../data/fn_kaggle_wf.csv')
buzfeed_political = pd.read_csv('../data/fn_buzfeed_wf.csv')
celebrity = pd.read_csv('../data/fn_celebrity_wf.csv')
fakenewsamt = pd.read_csv('../data/fakenewsamt_wf.csv')
fn_randompolitical = pd.read_csv('../data/fn_randompolitical_wf.csv')

datasets = {
    'FakeNewsNet' : fakenewsnet,
    'ISOT' : isot,
    'FakeNewsKaggle' : fakenewskaggle,
    'FakeNewsAMT' : fakenewsamt,
    'FakeNewsRandomPolitical' : fn_randompolitical,
    'FakeNewsCelebrity' : celebrity,
    'FakeNewsBuzfeedPolitical' : buzfeed_political,
}

In [55]:
feature_sets = {
    'Moral' : MORAL_FEATURES,
    'ReadabilityGrades' : READABILITY_GRADE_FEATURES,
    'ReadabilitySentenceInfo' : READABILITY_SENTENCEINFO_FEATURES,
    'ReadabilitySentenceBegininng' : READABILITY_SENTENCEBEGINNING_FEATURES,
    'ReadabilityWordUsage' : READABILITY_WORDUSAGE_FEATURES,
    'AllReadability' : READABILITY_GRADE_FEATURES + READABILITY_SENTENCEINFO_FEATURES + READABILITY_SENTENCEBEGINNING_FEATURES + READABILITY_WORDUSAGE_FEATURES,
    'Sentiment' : SENTIMENT_FEATURES,
    'LIWCLinguistic' : LIWC_LINGUISTIC_FEATURES,
    'LIWCAffectiveProcesses' : LIWC_AFFECTIVEPROCESSES_FEATURES,
    'LIWCSocialProcesses' : LIWC_SOCIALPROCESSES_FEATURES,
    'LIWCCognitiveProcesses' : LIWC_COGNITIVEPROCESSES_FEATURES,
    'LIWCPerceptualProcesses' : LIWC_PERCEPTUALPROCESSES_FEATURES,
    'LIWCBiologicalProcesses' : LIWC_BIOLOGICALPROCESSES_FEATURES,
    'LIWCDrives' : LIWC_DRIVES_FEATURES,
    'LIWCTimeOrientation' : LIWC_TIMEORIENTATION_FEATURES,
    'LIWCRelativity' : LIWC_RELATIVITY_FEATURES,
    'LIWCPersonalConcerns' : LIWC_PERSONALCONCERNS_FEATURES,
    'LIWCInformalLanguage' : LIWC_INFORMALLANGUAGE_FEATURES,
    'AllLIWC' : LIWC_LINGUISTIC_FEATURES + LIWC_AFFECTIVEPROCESSES_FEATURES + LIWC_SOCIALPROCESSES_FEATURES + LIWC_COGNITIVEPROCESSES_FEATURES + LIWC_PERCEPTUALPROCESSES_FEATURES + LIWC_BIOLOGICALPROCESSES_FEATURES 
        + LIWC_DRIVES_FEATURES + LIWC_TIMEORIENTATION_FEATURES + LIWC_RELATIVITY_FEATURES + LIWC_PERSONALCONCERNS_FEATURES + LIWC_INFORMALLANGUAGE_FEATURES,
    'All' : MORAL_FEATURES + READABILITY_GRADE_FEATURES + READABILITY_SENTENCEINFO_FEATURES + READABILITY_SENTENCEBEGINNING_FEATURES + READABILITY_WORDUSAGE_FEATURES + SENTIMENT_FEATURES + LIWC_LINGUISTIC_FEATURES + LIWC_AFFECTIVEPROCESSES_FEATURES + LIWC_SOCIALPROCESSES_FEATURES + LIWC_COGNITIVEPROCESSES_FEATURES + LIWC_PERCEPTUALPROCESSES_FEATURES + LIWC_BIOLOGICALPROCESSES_FEATURES 
        + LIWC_DRIVES_FEATURES + LIWC_TIMEORIENTATION_FEATURES + LIWC_RELATIVITY_FEATURES + LIWC_PERSONALCONCERNS_FEATURES + LIWC_INFORMALLANGUAGE_FEATURES

}

In [56]:
def reduce_memory_usage(df):   
    start_memory = df.memory_usage().sum() / 1024**2
    print(f"Memory usage of dataframe is {start_memory} MB")
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != 'object':
            c_min = df[col].min()
            c_max = df[col].max()
            
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
                    
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    pass
        else:
            df[col] = df[col].astype('category')
    
    end_memory = df.memory_usage().sum() / 1024**2
    print(f"Memory usage of dataframe after reduction {end_memory} MB")
    print(f"Reduced by {100 * (start_memory - end_memory) / start_memory} % ")
    return df

def remove_collinear_features(x, threshold):
    '''
    Objective:
        Remove collinear features in a dataframe with a correlation coefficient
        greater than the threshold. Removing collinear features can help a model 
        to generalize and improves the interpretability of the model.

    Inputs: 
        x: features dataframe
        threshold: features with correlations greater than this value are removed

    Output: 
        dataframe that contains only the non-highly-collinear features
    '''

    # Calculate the correlation matrix
    corr_matrix = x.corr()
    iters = range(len(corr_matrix.columns) - 1)
    drop_cols = []

    # Iterate through the correlation matrix and compare correlations
    for i in iters:
        for j in range(i+1):
            item = corr_matrix.iloc[j:(j+1), (i+1):(i+2)]
            col = item.columns
            row = item.index
            val = abs(item.values)

            # If correlation exceeds the threshold
            if val >= threshold:
                # Print the correlated features and the correlation value
                #print(col.values[0], "|", row.values[0], "|", round(val[0][0], 2))
                drop_cols.append(col.values[0])

    # Drop one of each pair of correlated columns
    drops = set(drop_cols)
    # x = x.drop(columns=drops)
    return drops

In [58]:
cols_to_drop = []
for dataset_name, dataset in datasets.items():
    print('---Dataset {dataset_name}---'.format(dataset_name=dataset_name))
    df = dataset[feature_sets['All']]
    df = df.loc[:,~df.columns.duplicated()].copy()
    datasets[dataset_name] = reduce_memory_usage(df)
    cols_to_drop.append(remove_collinear_features(datasets[dataset_name] ,0.95))

cols_to_drop = set.intersection(*cols_to_drop)
print('Removing {cols_to_drop} collinear features'.format(cols_to_drop=len(cols_to_drop)))

for dataset_name, dataset in datasets.items():
    datasets[dataset_name] = datasets[dataset_name].drop(cols_to_drop, axis=1)

---Dataset FakeNewsNet---
Memory usage of dataframe is 0.08171844482421875 MB
Memory usage of dataframe after reduction 0.08171844482421875 MB
Reduced by 0.0 % 
---Dataset ISOT---
Memory usage of dataframe is 9.591863632202148 MB
Memory usage of dataframe after reduction 9.591863632202148 MB
Reduced by 0.0 % 
---Dataset FakeNewsKaggle---
Memory usage of dataframe is 3.9293441772460938 MB
Memory usage of dataframe after reduction 3.9293441772460938 MB
Reduced by 0.0 % 
---Dataset FakeNewsAMT---
Memory usage of dataframe is 0.10540771484375 MB
Memory usage of dataframe after reduction 0.10540771484375 MB
Reduced by 0.0 % 
---Dataset FakeNewsRandomPolitical---
Memory usage of dataframe is 0.033023834228515625 MB
Memory usage of dataframe after reduction 0.033023834228515625 MB
Reduced by 0.0 % 
---Dataset FakeNewsCelebrity---
Memory usage of dataframe is 0.10979461669921875 MB
Memory usage of dataframe after reduction 0.10979461669921875 MB
Reduced by 0.0 % 
---Dataset FakeNewsBuzfeedPoli

In [3]:
algorithms = [
    'DecisionTree',
    'SVC' ,
    'LogisticRegression',
    'RandomForest',
    'XGBoost',
    'CatBoost',
]

def get_algorithm(name):
    if name == 'XGBoost' : 
        return XGBClassifier(n_jobs=-1)
    elif name == 'CatBoost' :
        return CatBoostClassifier(verbose=False)
    elif name == 'DecisionTree' :
        return DecisionTreeClassifier(class_weight='balanced')
    elif name == 'SVC' :
        return SVC(class_weight='balanced')
    elif name == 'LinearSVC':
        return LinearSVC(class_weight='balanced')
    elif name == 'RandomForest' :
        return RandomForestClassifier(class_weight='balanced', n_jobs=-1)
    elif name == 'LogisticRegression' :
        return LogisticRegression(class_weight='balanced', n_jobs=-1, max_iter=1000000)

## Analysis by algorithms

In [5]:
np.random.seed(240993)

# create dataframe for results
results_df = pd.DataFrame(columns=['dataset', 'algorithm', 'f1_weighted_mean', 'f1_weighted_std'])
for dataset_name, dataset in datasets.items():
    print('---Dataset {dataset_name}---'.format(dataset_name=dataset_name))

    X = dataset[feature_sets['All']]
    y = dataset['label']

    scaler = StandardScaler()
    X = scaler.fit_transform(X)

    for algorithm_name in algorithms:
        print('---Algorithm {algorithms_name}---'.format(algorithms_name=algorithm_name))
        cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=24091993)
        scores = cross_val_score(get_algorithm(algorithm_name), X, y, cv=cv, scoring='f1_weighted', n_jobs=-1)
        print(np.mean(scores))
        print()

        # add results to dataframe using concat method
        results_df = pd.concat([results_df, pd.DataFrame({
            'dataset' : [dataset_name],
            'algorithm' : [algorithm_name],
            'f1_weighted_mean' : np.mean(scores),
            'f1_weighted_std' : np.std(scores)
        })], ignore_index=True)

---Dataset FakeNewsNet---
---Algorithm DecisionTree---
0.6557379434687944

---Algorithm SVC---


  results_df = pd.concat([results_df, pd.DataFrame({


0.71991547475758

---Algorithm LogisticRegression---
0.7007210501388558

---Algorithm RandomForest---
0.7074419556214377

---Algorithm XGBoost---
0.7275630067839993

---Algorithm CatBoost---
0.7319970390007949

---Dataset ISOT---
---Algorithm DecisionTree---
0.9370923483977449

---Algorithm SVC---
0.9737012919016859

---Algorithm LogisticRegression---
0.9617416918716714

---Algorithm RandomForest---
0.9702461457904057

---Algorithm XGBoost---
0.9776094813986049

---Algorithm CatBoost---
0.9787764713349894

---Dataset FakeNewsKaggle---
---Algorithm DecisionTree---
0.7694419787641034

---Algorithm SVC---
0.8761908651061081

---Algorithm LogisticRegression---
0.8446943142089631

---Algorithm RandomForest---
0.853593897189511

---Algorithm XGBoost---
0.8824279758461856

---Algorithm CatBoost---
0.8820630384813628

---Dataset FakeNewsAMT---
---Algorithm DecisionTree---
0.5422862586096804

---Algorithm SVC---
0.6090561410278544

---Algorithm LogisticRegression---
0.6762662561896093

---Algor

In [6]:
results_df

Unnamed: 0,dataset,algorithm,f1_weighted_mean,f1_weighted_std
0,FakeNewsNet,DecisionTree,0.655738,0.043068
1,FakeNewsNet,SVC,0.719915,0.038878
2,FakeNewsNet,LogisticRegression,0.700721,0.04241
3,FakeNewsNet,RandomForest,0.707442,0.027259
4,FakeNewsNet,XGBoost,0.727563,0.016008
5,FakeNewsNet,CatBoost,0.731997,0.032924
6,ISOT,DecisionTree,0.937092,0.001938
7,ISOT,SVC,0.973701,0.001852
8,ISOT,LogisticRegression,0.961742,0.001283
9,ISOT,RandomForest,0.970246,0.000964


In [7]:
results_df.groupby('algorithm').mean(numeric_only=True)

Unnamed: 0_level_0,f1_weighted_mean,f1_weighted_std
algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1
CatBoost,0.809559,0.039117
DecisionTree,0.718277,0.036247
LogisticRegression,0.782243,0.039685
RandomForest,0.785668,0.037041
SVC,0.78643,0.037375
XGBoost,0.785986,0.032589


## GridSearch for SVC

In [None]:

np.random.seed(240993)


algorithm_name = 'SVC'

for datasets_name, dataset in datasets.items():
    print('---Dataset {datasets_name}---'.format(datasets_name=datasets_name))

    X = dataset[feature_sets['All']]
    y = dataset['label']
    scaler = StandardScaler()
    X = scaler.fit_transform(X)

    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=24091993)

    # create model

    model = get_algorithm(algorithm_name)

    # create parameter grid
    param_grid = {
        'C' : [0.001, 0.01, 0.1, 1, 10],
        'gamma' : [0.001, 0.01, 0.1, 1, 10, 100, 1000],
    }

    # create grid search object
    grid_search = GridSearchCV(model, param_grid=param_grid, cv=cv, scoring='f1_weighted', n_jobs=-1)

    # fit grid search object
    grid_search.fit(X, y)

    # print best parameters
    print(grid_search.best_params_)

    # print best score
    print(grid_search.best_score_)

## Explainability for SVC

In [25]:
# create dataframe for results
results_shap = pd.DataFrame(columns=['dataset', 'col_name','feature_importance_vals'])

np.random.seed(240993)

algorithm_name = 'SVC'

for datasets_name, dataset in datasets.items():
    print('---Dataset {datasets_name}---'.format(datasets_name=datasets_name))

    X = dataset[feature_sets['All']]
    y = dataset['label']
    scaler = StandardScaler()
    X = scaler.fit_transform(X)

    # create model

    model = get_algorithm(algorithm_name)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2409199)
    
    model.fit(X_train, y_train)

    # print classification report
    print(classification_report(y_test, model.predict(X_test)))

    model = get_algorithm(algorithm_name).fit(X, y)
    
    background_X = shap.maskers.Independent(X, max_samples=100)

    # create explainer
    explainer = shap.Explainer(model.predict, background_X)

    # create shap values
    shap_values = explainer(X)

    feature_names = feature_sets['All']


    vals = np.abs(shap_values.values).mean(0)

    shap_importance = pd.DataFrame(list(zip(feature_names, vals)),
                                  columns=['col_name','feature_importance_vals'])
    shap_importance['dataset'] = datasets_name
    shap_importance.sort_values(by=['feature_importance_vals'],
                               ascending=False, inplace=True)
    shap_importance.head()
    results_shap = pd.concat([results_shap, shap_importance], ignore_index=True)
    

---Dataset FakeNewsNet---
              precision    recall  f1-score   support

           0       0.84      0.63      0.72        43
           1       0.63      0.84      0.72        32

    accuracy                           0.72        75
   macro avg       0.74      0.74      0.72        75
weighted avg       0.75      0.72      0.72        75



PermutationExplainer explainer: 373it [08:34,  1.41s/it]                         
  results_shap = pd.concat([results_shap, shap_importance], ignore_index=True)


---Dataset ISOT---
              precision    recall  f1-score   support

           0       0.97      0.97      0.97      4289
           1       0.97      0.98      0.97      4457

    accuracy                           0.97      8746
   macro avg       0.97      0.97      0.97      8746
weighted avg       0.97      0.97      0.97      8746



PermutationExplainer explainer:   0%|          | 9/43729 [04:42<428:34:28, 35.29s/it]


KeyboardInterrupt: 

In [26]:
results_shap

Unnamed: 0,dataset,col_name,feature_importance_vals
0,FakeNewsNet,liwc_hear,0.062265
1,FakeNewsNet,liwc_tentat,0.041552
2,FakeNewsNet,liwc_function,0.039899
3,FakeNewsNet,liwc_percept,0.036714
4,FakeNewsNet,liwc_they,0.034919
...,...,...,...
113,FakeNewsNet,readability_subordination,0.001815
114,FakeNewsNet,readability_pronoun,0.001707
115,FakeNewsNet,readability_tobeverb,0.001344
116,FakeNewsNet,readability_paragraphs,0.000000


In [22]:
len(feature_names)

118

In [23]:
pd.DataFrame(shap_values, columns=feature_names)

ValueError: DataFrame constructor not properly called!