In [8]:
import pandas as pd
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from config import *
from sklearn.preprocessing import StandardScaler
import numpy as np
from sklearn.model_selection import StratifiedKFold, cross_validate, RandomizedSearchCV, GridSearchCV, train_test_split
from sklearn.metrics import classification_report
import shap
from sklearn.feature_selection import mutual_info_regression
from sklearn.feature_selection import RFE
import ast
from matplotlib import pyplot as plt
import seaborn as sns
from simpletransformers.classification import ClassificationModel
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score


In [9]:
# Paths of the nine "_wf" CSV files, listed in the order they are read.
_wf_csv_paths = [
    '../data/wf/FakeNewsNet_wf.csv',
    '../data/wf/FakeNewsISOT_wf.csv',
    '../data/wf/FakeNewsKaggle_wf.csv',
    '../data/wf/FakeNewsBuzfeedPolitical_wf.csv',
    '../data/wf/FakeNewsCelebrity_wf.csv',
    '../data/wf/FakeNewsAMT_wf.csv',
    '../data/wf/FakeNewsRandomPolitical_wf.csv',
    '../data/wf/FakeNewsPolitFalse_wf.csv',
    '../data/wf/FakeNewsSatirical_wf.csv',
]

# Read each file into its own DataFrame; the names below line up one-to-one
# with the entries of `_wf_csv_paths`.
(
    fakenewsnet,
    isot,
    fakenewskaggle,
    buzfeed_political,
    celebrity,
    fakenewsamt,
    fn_randompolitical,
    fn_politfalse,
    fn_satirical,
) = [pd.read_csv(csv_path) for csv_path in _wf_csv_paths]

# Display name -> DataFrame lookup that the later cells iterate over.
datasets = dict(
    FakeNewsNet=fakenewsnet,
    ISOT=isot,
    FakeNewsKaggle=fakenewskaggle,
    FakeNewsAMT=fakenewsamt,
    FakeNewsRandomPolitical=fn_randompolitical,
    FakeNewsCelebrity=celebrity,
    FakeNewsBuzfeedPolitical=buzfeed_political,
    FakeNewsPolitFalse=fn_politfalse,
    FakeNewsSatirical=fn_satirical,
)

## Analysis with BERT

In [11]:
# Seed NumPy's global random number generator for this cell.
np.random.seed(240993)

# Number of cross-validation folds used for every dataset.
n = 5

# create dataframe for results
# NOTE(review): this frame is created but never filled by the loop below, which
# only produces one F1 score per fold — confirm whether it is still needed here.
results_df = pd.DataFrame(columns=[
    'dataset', 'algorithm', 'fit_time',
    'precision_weighted_mean', 'precision_weighted_std',
    'recall_weighted_mean', 'recall_weighted_std',
    'f1_weighted_mean', 'f1_weighted_std',
    'precision_macro_mean', 'precision_macro_std',
    'recall_macro_mean', 'recall_macro_std',
    'f1_macro_mean', 'f1_macro_std',
    'precision_micro_mean', 'precision_micro_std',
    'recall_micro_mean', 'recall_micro_std',
    'f1_micro_mean', 'f1_micro_std',
])

# Fold scores for every dataset, keyed by dataset name. `results` alone is
# rebound on each outer iteration and would only retain the last dataset.
bert_f1_by_dataset = {}

for dataset_name, dataset in datasets.items():
    print('---Dataset {dataset_name}---'.format(dataset_name=dataset_name))

    # simpletransformers documents 'text' / 'labels' as the expected headers;
    # renaming avoids relying on its positional fallback for unnamed columns.
    train_data = pd.DataFrame(dataset[['text', 'label']]).rename(columns={'label': 'labels'})

    # prepare cross validation
    kf = KFold(n_splits=n, random_state=24091993, shuffle=True)

    results = []
    # Register the list before training so scores from finished folds remain
    # available even if the run is interrupted part-way through.
    bert_f1_by_dataset[dataset_name] = results

    for train_index, val_index in kf.split(train_data):
        # split the frame into this fold's training and validation rows
        train_df = train_data.iloc[train_index]
        val_df = train_data.iloc[val_index]

        # A fresh model per fold, so no fine-tuned weights carry over between folds.
        # overwrite_output_dir=True: every fold writes to the same default output
        # directory, and training refuses to start on a non-empty directory otherwise.
        model = ClassificationModel(
            'bert',
            'bert-base-uncased',
            use_cuda=False,
            args={'overwrite_output_dir': True},
        )

        # train the model
        model.train_model(train_df)

        # validate the model; the extra `f1` keyword adds an 'f1' entry to `result`
        result, model_outputs, wrong_predictions = model.eval_model(val_df, f1=f1_score)
        print(result['f1'])

        # append model score
        results.append(result['f1'])

    print("results", results)
    # The averaged values are F1 scores, so label them as such.
    print(f"Mean-F1: {sum(results) / len(results)}")

---Dataset FakeNewsNet---


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
1it [00:00,  5.09it/s]
Epochs 1/1. Running Loss:    0.7092: 100%|██████████| 38/38 [01:19<00:00,  2.10s/it]
Epoch 1 of 1: 100%|██████████| 1/1 [01:20<00:00, 80.83s/it]
0it [11:37, ?it/s]


KeyboardInterrupt: 

In [14]:
# Inspect the per-fold F1 scores collected by the cross-validation loop.
results

[]

In [2]:
# calculate mean values of each column
# NOTE(review): `.mean(numeric_only=True)` is a DataFrame-style call, but the BERT
# cell in this notebook binds `results` to a plain Python list. Presumably this
# cell was executed against a results DataFrame from another session — confirm
# where that frame is loaded so the notebook survives Restart & Run All.
mean_values = results.mean(numeric_only=True)

In [3]:
# Show the per-column means stored in `mean_values`.
mean_values

f1_weighted_mean    0.824660
f1_weighted_std     0.045549
p_weighted_mean     0.851190
p_weighted_std      0.037056
r_weighted_mean     0.828796
r_weighted_std      0.041426
dtype: float64