In [None]:
import copy
import io
import json
import operator
import os
import pickle as pk
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import misc
from tqdm import tqdm

from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import ParameterGrid

from sklearn import tree
from sklearn import naive_bayes
from sklearn import neural_network
from sklearn import ensemble
from sklearn import neighbors
from sklearn import svm
from sklearn import linear_model
from sklearn import mixture
from sklearn import gaussian_process

from sklearn.base import is_classifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

import sklearn.metrics as mt
from sklearn.metrics import roc_auc_score

from sklearn.utils import shuffle

In [None]:
# Directory holding the pickled, preprocessed feature tables.
# Keep the trailing slash: file names are appended to this string directly.
processedFeatureSerializationAdr = "./Serialization/Features/Processed features/"

In [None]:
# Loading the features: one pickled table per (event, class) pair.
# NOTE(review): pickle.load can execute arbitrary code from the file; only load
# pickles that were written by this project's own preprocessing step.
def _loadProcessedFeatures(fileName):
    """Unpickle one table from the processed-features directory.

    The file is opened in a `with` block so the handle is closed as soon as
    unpickling finishes instead of being leaked for the life of the kernel.
    """
    with open(processedFeatureSerializationAdr + fileName, "rb") as featureFile:
        return pk.load(featureFile)


kn = _loadProcessedFeatures("keralaNonRumours_4thOrderPreprocessing.pk")
kr = _loadProcessedFeatures("keralaRumours_4thOrderPreprocessing.pk")
fn = _loadProcessedFeatures("florenceNonRumours_4thOrderPreprocessing.pk")
fr = _loadProcessedFeatures("florenceRumours_4thOrderPreprocessing.pk")

In [None]:
# Assigning labels: rumour tables get 1, non-rumour tables get 0.
for rumourTable in (kr, fr):
    rumourTable["label"] = 1

for nonRumourTable in (kn, fn):
    nonRumourTable["label"] = 0

In [None]:
# Merge the rumour and non-rumour tables of each event, shuffle the rows, and
# separate the label column from the feature columns.
# A fixed random_state makes the row order (and every result derived from it)
# reproducible after a kernel restart. `drop(columns=...)` already names the
# axis, so the redundant `axis` argument is omitted.
Xf = shuffle(pd.concat([fr, fn]), random_state=42)
yf = Xf["label"]
Xf = Xf.drop(columns=["label"])

Xk = shuffle(pd.concat([kr, kn]), random_state=42)
yk = Xk["label"]
Xk = Xk.drop(columns=["label"])

In [None]:
# # Spliting to train, test, and validation
# # We do not use it for now ... 

# X_train_test_f, X_valid_f, y_train_test_f, y_valid_f = train_test_split(Xf, yf, test_size=0.33, random_state=42)
# X_train_f, X_test_f, y_train_f, y_test_f = train_test_split(X_train_test_f, y_train_test_f, test_size=0.33, random_state=42)

# X_train_test_k, X_valid_k, y_train_test_k, y_valid_k = train_test_split(Xk, yk, test_size=0.33, random_state=42)
# X_train_k, X_test_k, y_train_k, y_test_k = train_test_split(X_train_test_k, y_train_test_k, test_size=0.33, random_state=42)

In [None]:
# Hyper parameters: search space per model, keyed by the short model name.
# An empty dict means "fit once with the estimator's defaults", because
# ParameterGrid({}) yields exactly one empty parameter set.
hyperParams = {
    "bnb": {"alpha": np.arange(0, 1.1, 0.1), "binarize": [], "fit_prior": [True, False]},
    "gnb": {"priors": [], "var_smoothing": []},
    "mnb": {"alpha": [], "fit_prior": [], "class_prior": []},
    "cnb": {"alpha": [], "fit_prior": [], "class_prior": [], "norm": []},

    "palm": {},
    "lrlm": {},
    "rlm": {},
    "sgdlm": {},

    "lsvm": {},
    "nusvm": {},
    "csvm": {},

    "dt": {},
    "etdt": {},

    "bnn": {},
    "mlpnn": {},

    "knn": {},
    "rknn": {},

    "bgmm": {},
    "gmm": {},
    "gp": {},

    "ada": {},
    "bag": {},
    "ete": {},
    "gb": {},
    "iso": {},
    "rf": {},
    "rt": {},
    "vot": {},
    "his": {},
}

# The empty lists above are placeholders for values that have not been chosen
# yet. ParameterGrid cannot work with an empty value list (it raises, or in old
# releases produces zero candidates), so placeholders are dropped here and the
# corresponding parameter simply keeps its default.
hyperParams = {
    modelName: {param: values for param, values in grid.items() if len(values) > 0}
    for modelName, grid in hyperParams.items()
}

In [None]:
# Model setup: one default-constructed estimator per short name in `hyperParams`.
# The free-text search-range notes that used to sit between the statements
# (and made the cell a SyntaxError) are preserved as "tuning note" comments.

# --- naive Bayes ---
bnb = naive_bayes.BernoulliNB()
gnb = naive_bayes.GaussianNB()
mnb = naive_bayes.MultinomialNB()
cnb = naive_bayes.ComplementNB()

prior = 0.5

# --- linear models ---
palm = linear_model.PassiveAggressiveClassifier()
lrlm = linear_model.LogisticRegression()
# NOTE(review): `rlm`, `lsvm` and `nusvm` were listed in `models`/`hyperParams`
# but never created; the classes below are the presumed intent -- confirm.
rlm = linear_model.RidgeClassifier()
sgdlm = linear_model.SGDClassifier()

# --- support vector machines ---
lsvm = svm.LinearSVC()
nusvm = svm.NuSVC()
csvm = svm.SVC()
# tuning note: kernel: 0:3:0.2
# tuning note: c: 0:10:1

# --- trees ---
dt = tree.DecisionTreeClassifier()
etdt = tree.ExtraTreeClassifier()

# --- neural networks ---
bnn = neural_network.BernoulliRBM()
mlpnn = neural_network.MLPClassifier()
# tuning note: activation: tanh
# tuning note: learning rate: adaptive

# --- nearest neighbours ---
knn = neighbors.KNeighborsClassifier()
rknn = neighbors.RadiusNeighborsClassifier()

# --- mixtures / Gaussian process ---
bgmm = mixture.BayesianGaussianMixture()
gmm = mixture.GaussianMixture()
# tuning note: n_components: 2:5:1
gp = gaussian_process.GaussianProcessClassifier()
# tuning note: kernel object

# --- ensembles ---
ada = ensemble.AdaBoostClassifier()
bag = ensemble.BaggingClassifier()
ete = ensemble.ExtraTreesClassifier()
gb = ensemble.GradientBoostingClassifier()
iso = ensemble.IsolationForest()
rf = ensemble.RandomForestClassifier()
rt = ensemble.RandomTreesEmbedding()
# tuning note: n_estimator
# VotingClassifier cannot be built without its `estimators` argument.
# NOTE(review): the three members below are placeholders chosen only so the
# ensemble can be constructed and can output probabilities; pick the real ones.
vot = ensemble.VotingClassifier(
    estimators=[
        ("lr", linear_model.LogisticRegression()),
        ("dt", tree.DecisionTreeClassifier()),
        ("gnb", naive_bayes.GaussianNB()),
    ],
    voting="soft",
)
his = ensemble.HistGradientBoostingClassifier()

# Estimator object -> short name (the key used in `hyperParams` and `results`).
models = {
    bnb: "bnb", gnb: "gnb", mnb: "mnb", cnb: "cnb", palm: "palm",
    lrlm: "lrlm", rlm: "rlm", sgdlm: "sgdlm", lsvm: "lsvm", nusvm: "nusvm",
    csvm: "csvm", dt: "dt", etdt: "etdt", bnn: "bnn", mlpnn: "mlpnn", knn: "knn",
    rknn: "rknn", bgmm: "bgmm", gmm: "gmm", gp: "gp", ada: "ada", bag: "bag", ete: "ete",
    gb: "gb", iso: "iso", rf: "rf", rt: "rt", vot: "vot", his: "his",
}

In [None]:
# Train with kerala and test with florence
#
# For every model: try each hyper-parameter combination, refit with the best
# one on the Kerala training data, then evaluate on the Florence test data.
#
# NOTE(review): X_train_k / y_train_k / X_test_f / y_test_f are only created by
# the train/test split cell, which is currently commented out -- re-enable it
# (or bind these names) before running this cell.
# NOTE(review): candidates are ranked by AUPRC on the same data they were
# fitted on, which favours over-fitted settings; consider a validation split.


def _positiveClassScores(estimator, X):
    """Return a continuous score for the positive class of a fitted binary classifier.

    Uses `predict_proba` (second column) when the estimator provides it and
    falls back to `decision_function` otherwise.
    """
    if hasattr(estimator, "predict_proba"):
        return estimator.predict_proba(X)[:, 1]
    return estimator.decision_function(X)


results = {}
for model, modelName in models.items():
    results[modelName] = {}

    # `models` also contains transformers / unsupervised estimators, which
    # cannot be scored against class labels; record that and move on.
    if not is_classifier(model):
        results[modelName]["skipped"] = "not a classifier"
        continue

    best_score = -1
    # Reset per model so a setting found for the previous model is never reused.
    best_grid = {}

    t1 = datetime.now()
    for g in ParameterGrid(hyperParams[modelName]):
        model.set_params(**g)
        model.fit(X_train_k, y_train_k)
        precision, recall, thresholds = mt.precision_recall_curve(
            y_train_k, _positiveClassScores(model, X_train_k))
        auprc = mt.auc(recall, precision)
        if auprc > best_score:
            best_score = auprc
            best_grid = g

    t2 = datetime.now()
    model.set_params(**best_grid)
    model.fit(X_train_k, y_train_k)
    t3 = datetime.now()
    y_pred = model.predict(X_test_f)
    t4 = datetime.now()

    # Local count names deliberately avoid `fn`, which is the Florence
    # non-rumour table loaded at the top of the notebook.
    tnCount, fpCount, fnCount, tpCount = mt.confusion_matrix(y_test_f, y_pred).ravel()
    pr = mt.precision_score(y_test_f, y_pred)
    re = mt.recall_score(y_test_f, y_pred)
    ac = mt.accuracy_score(y_test_f, y_pred)
    f1 = mt.f1_score(y_test_f, y_pred)

    # Curve-based metrics are computed on the test data from continuous scores,
    # like every other reported metric.
    testScores = _positiveClassScores(model, X_test_f)
    precision, recall, thresholds = mt.precision_recall_curve(y_test_f, testScores)
    auprc = mt.auc(recall, precision)
    auroc = mt.roc_auc_score(y_test_f, testScores)

    # t1..t2: grid search, t2..t3: final fit, t3..t4: prediction.
    results[modelName]["t1"] = t1
    results[modelName]["t2"] = t2
    results[modelName]["t3"] = t3
    results[modelName]["t4"] = t4

    results[modelName]["tn"] = tnCount
    results[modelName]["fp"] = fpCount
    results[modelName]["fn"] = fnCount
    results[modelName]["tp"] = tpCount

    results[modelName]["pr"] = pr
    results[modelName]["re"] = re
    results[modelName]["f1"] = f1
    results[modelName]["ac"] = ac

    results[modelName]["auprc"] = auprc
    results[modelName]["auroc"] = auroc

    # Stored per model; writing them at the top level of `results` would keep
    # only the last model's values.
    results[modelName]["bestParameters"] = best_grid
    results[modelName]["model"] = model

# A fixed file name: interpolating `results` itself would put the repr of the
# whole dictionary into the path.
with open('./Serialization/Results/results_trainWithKerala_balanced.pk', "wb") as resultsFile:
    pk.dump(results, resultsFile)

In [None]:
# Manual search over `max_features` for a decision tree. Each candidate is
# fitted and scored (F1) on the same training data; the best-scoring setting
# is remembered in `best_grid`.
parameters = {"max_features": np.arange(1, 10, 1)}
params = ParameterGrid(parameters)
dt = tree.DecisionTreeClassifier()

best_score = 0.0

for candidate in params:
    dt.set_params(**candidate)
    dt.fit(X_train_k, y_train_k)
    trainPredictions = dt.predict(X_train_k)
    f1 = float(mt.f1_score(y_train_k, trainPredictions))
    print(f1)
    if f1 > best_score:
        best_score, best_grid = f1, candidate

In [None]:
# Cross-validated search over the `parameters` grid defined in the previous cell.
# cv must be at least 2 (a single fold leaves nothing to validate on), so the
# invalid cv=1 is replaced by 5-fold cross-validation.
dt = tree.DecisionTreeClassifier()
clf = GridSearchCV(dt, param_grid=parameters, cv=5)
clf.fit(X_train_k, y_train_k)

# Alternative selection for a random forest using its out-of-bag score.
# A fresh forest is created with oob_score=True, because `oob_score_` only
# exists on forests fitted with that option.
# NOTE(review): the original loop referred to undefined names `grid`, `X` and
# `y`; the `parameters` grid and the Kerala training data are assumed here.
rf = ensemble.RandomForestClassifier(oob_score=True)
best_score = -1.0
best_grid = {}
for g in ParameterGrid(parameters):
    rf.set_params(**g)
    rf.fit(X_train_k, y_train_k)
    # save if best
    if rf.oob_score_ > best_score:
        best_score = rf.oob_score_
        best_grid = g

In [None]:
#Train with Florence / Test with Kerala
# (The header used to say "Test with Florence", but the loop below predicts on
# X_test_k and scores against y_test_k.)
# NOTE(review): X_train_f / y_train_f / X_test_k / y_test_k come from the
# train/test split cell, which is currently commented out.

dt = DecisionTreeClassifier()
clf = svm.SVC()
rf = RandomForestClassifier()
gnb = GaussianNB()
mlp = MLPClassifier(hidden_layer_sizes=(13,13,13),max_iter=500)
ada = AdaBoostClassifier(n_estimators=30, random_state=7)
knn = KNeighborsClassifier()
bnb = BernoulliNB()
mnb = MultinomialNB()
clf_2 = SVC(kernel='linear', class_weight='balanced', probability=True)
lr = LogisticRegression()


# Estimator object -> display name used in the printed report.
nameDict = {dt:"Tree", clf:"SVM", rf:"random forest", gnb:"NB", mlp:"MLP", mnb:"MNB", ada:"ADA", bnb:"BNB", clf_2:"penalize", lr:"logistic regression"}


classifiers = [dt, lr, rf, gnb, bnb, mlp, mnb, ada, clf]


for model in classifiers:
    t1 = datetime.now()
    model.fit(X_train_f, y_train_f)
    t2 = datetime.now()
    y_pred = model.predict(X_test_k)
    t3 = datetime.now()
    # Count names avoid `fn`, which is the Florence non-rumour table.
    tnCount, fpCount, fnCount, tpCount = mt.confusion_matrix(y_test_k, y_pred).ravel()
    print("-------", nameDict[model], "-------")
    print(mt.f1_score(y_test_k, y_pred))
    # total_seconds() keeps the sub-second part that `.seconds` truncates.
    print("training time: ", (t2-t1).total_seconds())
    print("prediction time: ", (t3-t2).total_seconds())

In [None]:
# Scratch cell: predict on X_test_k with whatever estimator `model` is currently
# bound to. `model` is not assigned in this cell, so an earlier loop must have run.
y_pred = model.predict(X_test_k)

In [None]:
# Scratch cell: confusion-matrix counts for the latest `y_pred`.
# The counts get their own names so that `fn` (the Florence non-rumour table
# loaded at the top of the notebook) is not overwritten by an integer.
tnCount, fpCount, fnCount, tpCount = mt.confusion_matrix(y_test_k, y_pred).ravel()

In [None]:
# Scratch cell: report for the estimator currently bound to `model`, using the
# timestamps t1/t2/t3 left behind by the last run of the evaluation loop.
print("-------", nameDict[model], "-------")
print(mt.f1_score(y_test_k, y_pred))
# t2-t1 spans the fit and t3-t2 spans the prediction; total_seconds() keeps the
# sub-second part that `.seconds` truncates.
print("training time: ", (t2-t1).total_seconds())
print("prediction time: ", (t3-t2).total_seconds())

In [None]:
    # Scratch copy of the evaluation-loop body, run by hand for the current `model`.
# The statements were pasted with their loop indentation, which is an
# IndentationError at the top level of a cell, so they are dedented here.
# t1 and t2 are whatever the last run of the loop left behind.
y_pred = model.predict(X_test_k)
t3 = datetime.now()
# Count names avoid `fn`, which is the Florence non-rumour table.
tnCount, fpCount, fnCount, tpCount = mt.confusion_matrix(y_test_k, y_pred).ravel()
print("-------", nameDict[model], "-------")
print(mt.f1_score(y_test_k, y_pred))
print("training time: ", (t2-t1).total_seconds())
print("prediction time: ", (t3-t2).total_seconds())

In [None]:
# Train with Florence / Test with Florence
# NOTE(review): X_train_f / y_train_f / X_test_f / y_test_f come from the
# train/test split cell, which is currently commented out.
dt = DecisionTreeClassifier()
clf = svm.SVC()
rf = RandomForestClassifier()
gnb = GaussianNB()
mlp = MLPClassifier(hidden_layer_sizes=(13,13,13),max_iter=500)
ada = AdaBoostClassifier(n_estimators=30, random_state=7)
knn = KNeighborsClassifier()
bnb = BernoulliNB()
mnb = MultinomialNB()
clf_2 = SVC(kernel='linear', class_weight='balanced', probability=True)
lr = LogisticRegression()


# Estimator object -> display name used in the printed report.
nameDict = {dt:"Tree", clf:"SVM", rf:"random forest", gnb:"NB", mlp:"MLP", mnb:"MNB", ada:"ADA", bnb:"BNB", clf_2:"penalize", lr:"logistic regression"}


classifiers = [dt, lr, rf, gnb, bnb, mlp, mnb, ada, clf]


for model in classifiers:
    t1 = datetime.now()
    model.fit(X_train_f, y_train_f)
    t2 = datetime.now()
    # Predict with the model being evaluated; the loop used to call
    # dt.predict here, so every classifier reported the decision tree's score.
    y_pred = model.predict(X_test_f)
    t3 = datetime.now()
    # Count names avoid `fn`, which is the Florence non-rumour table.
    tnCount, fpCount, fnCount, tpCount = mt.confusion_matrix(y_test_f, y_pred).ravel()
    print("-------", nameDict[model], "-------")
    print(mt.f1_score(y_test_f, y_pred))
    # total_seconds() keeps the sub-second part that `.seconds` truncates.
    print("training time: ", (t2-t1).total_seconds())
    print("prediction time: ", (t3-t2).total_seconds())

In [None]:
#Experiments for Zubiaga (3-fold cross-validation)
# NOTE(review): `zubiaga` is only built further down, in the merging cell; in a
# fresh kernel that cell has to run before this one.
dt = DecisionTreeClassifier()
clf = svm.SVC()
rf = RandomForestClassifier()
gnb = GaussianNB()
mlp = MLPClassifier(hidden_layer_sizes=(13,13,13),max_iter=500)
ada = AdaBoostClassifier(n_estimators=30, random_state=7)
knn = KNeighborsClassifier()
bnb = BernoulliNB()
mnb = MultinomialNB()
clf_2 = SVC(kernel='linear', class_weight='balanced', probability=True)
lr = LogisticRegression()


# Estimator object -> display name used in the printed report.
nameDict = {dt:"Tree", clf:"SVM", rf:"random forest", gnb:"NB", mlp:"MLP", mnb:"MNB", ada:"ADA", bnb:"BNB", clf_2:"penalize", lr:"logistic regression"}


classifiers = [lr,dt, rf, gnb, bnb, mlp, mnb, ada, clf]

# The merged table lists all rumours before all non-rumours, so folds are drawn
# from shuffled rows; the fixed seed gives every classifier the same folds.
kf = KFold(n_splits=3, shuffle=True, random_state=42)

for model in classifiers:
    for train_index, test_index in kf.split(zubiaga):
        X_Z = zubiaga.iloc[train_index].drop(columns=["label"])
        Y_Z = zubiaga.iloc[train_index]["label"]
        _X_Z = zubiaga.iloc[test_index].drop(columns=["label"])
        _Y_Z = zubiaga.iloc[test_index]["label"]

        X_train, Y_train = X_Z, Y_Z
        X_test, Y_test = _X_Z, _Y_Z

        model.fit(X_train, Y_train)
        y_pred = model.predict(X_test)
        # labels=[-1, 1] keeps the matrix 2x2 even when a class is missing from
        # a fold or from the predictions; count names avoid clobbering `fn`.
        tnCount, fpCount, fnCount, tpCount = mt.confusion_matrix(Y_test, y_pred, labels=[-1, 1]).ravel()
        # zero_division=0 reports 0 instead of NaN when nothing is predicted
        # (or nothing is actually) positive.
        PR_T = mt.precision_score(Y_test, y_pred, zero_division=0)
        RE_T = mt.recall_score(Y_test, y_pred, zero_division=0)
        F1_Score = mt.f1_score(Y_test, y_pred, zero_division=0)
        ACC_T = mt.accuracy_score(Y_test, y_pred)
        print(nameDict[model], ":", tpCount, tnCount, fpCount, fnCount, ACC_T, F1_Score, PR_T, RE_T)

In [None]:
# #Feature selection

# # This cell should become active only after doing feature evaluation, because filters variable would be available after feature evaluation
# kerala_nr = (pd.DataFrame.from_csv("./kerala_nr.csv"))[filters]
# kerala_r = (pd.DataFrame.from_csv("./kerala_r.csv"))[filters]
# florence_nr = (pd.DataFrame.from_csv("./florence_nr.csv"))[filters]
# florence_r = (pd.DataFrame.from_csv("./florence_r.csv"))[filters]
# zubiaga_nr = (pd.DataFrame.from_csv("./zubiaga_nr.csv"))[filters]
# zubiaga_r = (pd.DataFrame.from_csv("./zubiaga_r.csv"))[filters] 

# kerala_nr.to_csv("./MATLAB/zubiaga/kerala_nr_filters.csv") 
# kerala_r.to_csv("./MATLAB/zubiaga/kerala_r_filters.csv")
# florence_nr.to_csv("./MATLAB/zubiaga/florence_nr_filters.csv")
# florence_r.to_csv("./MATLAB/zubiaga/florence_r_filters.csv")
# zubiaga_nr.to_csv("./MATLAB/zubiaga/zubiaga_nr_filters.csv")
# zubiaga_r.to_csv("./MATLAB/zubiaga/zubiaga_r_filters.csv")


# kerala_nr = (pd.DataFrame.from_csv("./kerala_nr.csv"))[significantNotOurs]
# kerala_r = (pd.DataFrame.from_csv("./kerala_r.csv"))[significantNotOurs]
# florence_nr = (pd.DataFrame.from_csv("./florence_nr.csv"))[significantNotOurs]
# florence_r = (pd.DataFrame.from_csv("./florence_r.csv"))[significantNotOurs]
# zubiaga_nr = (pd.DataFrame.from_csv("./zubiaga_nr.csv"))[significantNotOurs]
# zubiaga_r = (pd.DataFrame.from_csv("./zubiaga_r.csv"))[significantNotOurs] 


# kerala_nr.to_csv("./MATLAB/zubiaga/kerala_nr_significantNotOurs.csv") 
# kerala_r.to_csv("./MATLAB/zubiaga/kerala_r_significantNotOurs.csv")
# florence_nr.to_csv("./MATLAB/zubiaga/florence_nr_significantNotOurs.csv")
# florence_r.to_csv("./MATLAB/zubiaga/florence_r_significantNotOurs.csv")
# zubiaga_nr.to_csv("./MATLAB/zubiaga/zubiaga_nr_significantNotOurs.csv")
# zubiaga_r.to_csv("./MATLAB/zubiaga/zubiaga_r_significantNotOurs.csv")

In [None]:
# Load the six feature tables (event x rumour / non-rumour) from CSV.
def _readFeatureCsv(path):
    """Read one feature table the way the removed `DataFrame.from_csv` did.

    `DataFrame.from_csv` no longer exists in current pandas; `read_csv` with
    the first column as index and date parsing enabled reproduces its defaults.
    """
    return pd.read_csv(path, index_col=0, parse_dates=True)


kerala_nr = _readFeatureCsv("./kerala_nr.csv")
kerala_r = _readFeatureCsv("./kerala_r.csv")
florence_nr = _readFeatureCsv("./florence_nr.csv")
florence_r = _readFeatureCsv("./florence_r.csv")
zubiaga_nr = _readFeatureCsv("./zubiaga_nr.csv")
zubiaga_r = _readFeatureCsv("./zubiaga_r.csv")

In [None]:
# Display the first rows of the Florence non-rumour table as a quick check of the load.
florence_nr.head()

In [None]:
# Row count of every table, one per line, in the order
# kerala, florence, zubiaga (non-rumour before rumour).
for featureTable in (kerala_nr, kerala_r, florence_nr, florence_r, zubiaga_nr, zubiaga_r):
    print(len(featureTable))

In [None]:
# Adding rumour/non-rumour label
#rumour=1 non-rumour=-1
# kerala_r["label"] = pd.Series(1, index=kerala_r.index)
# kerala_nr["label"] = pd.Series(-1, index=kerala_nr.index)
# florence_r["label"] = pd.Series(1, index=florence_r.index)
# florence_nr["label"] = pd.Series(-1, index=florence_nr.index)
# zubiaga_r["label"] = pd.Series(1, index=zubiaga_r.index)
# zubiaga_nr["label"] = pd.Series(-1, index=zubiaga_nr.index)

# Rumour tables are labelled 1 and non-rumour tables -1.
for rumourTable in (kerala_r, florence_r, zubiaga_r):
    rumourTable["label"] = 1

for nonRumourTable in (kerala_nr, florence_nr, zubiaga_nr):
    nonRumourTable["label"] = -1

In [None]:
# labels = list(kerala_r.columns)

In [None]:
# Column names grouped by feature class. The order inside each list is the
# column order of every table sliced with it.
linguisticFeatures = [
    "exclamationMarkCount", "questionMarkCount", "characterCount", "tokenCount",
    "subjectivity", "polarity", "uppercaseCount", "lowerCaseCount",
    "firstPersonPronounCount", "secondPersonPronounCount", "thirdPersonPronounCount",
    "capitalWordsCount", "averageWordComplexity", "vuglarTermsCount",
    "emoticonCount", "abbreviationCount", "emojiCount",
    # pos* columns
    "posCoordinatingConjunctionCount", "posAdjectiveCount", "posAdpositionCount",
    "posAdverbCount", "posAuxiliaryCount", "posConjunctionCount", "posDeterminerCount",
    "posInterjectionCount", "posNounCount", "posNumeralCount", "posParticleCount",
    "posPronounCount", "posProperNounCount", "posPunctuationCount",
    "posSubordinatingConjunctionCount", "posSymbolCount", "posVerbCount",
    "posOtherCount", "posSpaceCount",
    # ner* columns
    "nerPersonCount", "nerNationalityCount", "nerBuildingCount", "nerOrganizationCount",
    "nerCountriesCount", "nerLocationCount", "nerProductCount", "nerEventCount",
    "nerArtCount", "nerLawCount", "nerLanguageCount", "nerDateCount", "nerTimeCount",
    "nerMoneyCount", "nerQuantityCount", "nerOrdinalCount", "nerCardinalCount",
    # remaining linguistic columns
    "insight", "tentative", "positiveEmotion", "negativeEmotion", "anxiety",
    "certainty", "tone", "sentenceComplexity",
]

userFeatures = [
    "hasProfileDescription", "isVerifiedAccount", "statusCount", "followingCount",
    "influnece", "userRole", "totalProfileLikesCount", "accountAge", "protectedProfile",
    "hasProfileLocation", "hasProfilePicture", "averageFollowSpeed",
    "averageBeingFollowedSpeed", "averageLikeSpeed", "averageStatusSpeed",
    "screenNameLength", "screenNameDigitCount",
]

metaFeatures = [
    "hashtagCount",
    "mentionCount",
    "hasUrl",
    "geoEnabled",
    "multimediaCounter",
]
# , "tweetPostTime"

In [None]:
# The same column names regrouped into three lists: literatureFeatures,
# inspiredFeatures and developedFeatures.
literatureFeatures = [
    "exclamationMarkCount", "questionMarkCount", "characterCount", "tokenCount",
    "subjectivity", "polarity", "uppercaseCount", "lowerCaseCount",
    "firstPersonPronounCount", "secondPersonPronounCount", "thirdPersonPronounCount",
    "capitalWordsCount", "averageWordComplexity", "vuglarTermsCount",
    "emoticonCount", "abbreviationCount", "insight", "tentative",
    "positiveEmotion", "negativeEmotion", "anxiety", "sentenceComplexity",
    "hasProfileDescription", "isVerifiedAccount", "statusCount", "followingCount",
    "influnece", "userRole", "accountAge", "hasProfileLocation",
    "hashtagCount", "mentionCount", "hasUrl", "multimediaCounter",
]

inspiredFeatures = [
    "posCoordinatingConjunctionCount", "posAdjectiveCount", "posAdpositionCount",
    "posAdverbCount", "posAuxiliaryCount", "posConjunctionCount", "posDeterminerCount",
    "posInterjectionCount", "posNounCount", "posNumeralCount", "posParticleCount",
    "posPronounCount", "posProperNounCount", "posPunctuationCount",
    "posSubordinatingConjunctionCount", "posSymbolCount", "posVerbCount",
    "posOtherCount", "posSpaceCount",
    "hasProfilePicture", "screenNameLength", "screenNameDigitCount",
]

developedFeatures = [
    "emojiCount", "tone",
    "nerPersonCount", "nerNationalityCount", "nerBuildingCount", "nerOrganizationCount",
    "nerCountriesCount", "nerLocationCount", "nerProductCount", "nerEventCount",
    "nerArtCount", "nerLawCount", "nerLanguageCount", "nerDateCount", "nerTimeCount",
    "nerMoneyCount", "nerQuantityCount", "nerOrdinalCount", "nerCardinalCount",
    "certainty", "totalProfileLikesCount", "protectedProfile",
    "averageFollowSpeed", "averageBeingFollowedSpeed", "averageLikeSpeed",
    "averageStatusSpeed", "geoEnabled",
]


In [None]:
# Separating the Kerala tables by feature class and attaching the class label.
# `.assign` builds a new frame with the label column, so nothing is written
# into a slice of the source table (which triggered SettingWithCopyWarning).

# Rumours (label 1)
kerala_r_ling = kerala_r[linguisticFeatures].assign(label=1)
kerala_r_user = kerala_r[userFeatures].assign(label=1)
kerala_r_meta = kerala_r[metaFeatures].assign(label=1)

# Non-rumours (label -1)
kerala_nr_ling = kerala_nr[linguisticFeatures].assign(label=-1)
kerala_nr_user = kerala_nr[userFeatures].assign(label=-1)
kerala_nr_meta = kerala_nr[metaFeatures].assign(label=-1)

In [None]:
# Separating the Florence tables by feature class and attaching the class label.
# `.assign` builds a new frame with the label column, so nothing is written
# into a slice of the source table (which triggered SettingWithCopyWarning).

# Rumours (label 1)
florence_r_ling = florence_r[linguisticFeatures].assign(label=1)
florence_r_user = florence_r[userFeatures].assign(label=1)
florence_r_meta = florence_r[metaFeatures].assign(label=1)

# Non-rumours (label -1)
florence_nr_ling = florence_nr[linguisticFeatures].assign(label=-1)
florence_nr_user = florence_nr[userFeatures].assign(label=-1)
florence_nr_meta = florence_nr[metaFeatures].assign(label=-1)

In [None]:
# Separating the Zubiaga tables by feature class and attaching the class label.
# `.assign` builds a new frame with the label column, so nothing is written
# into a slice of the source table (which triggered SettingWithCopyWarning).

# Rumours (label 1)
zubiaga_r_ling = zubiaga_r[linguisticFeatures].assign(label=1)
zubiaga_r_user = zubiaga_r[userFeatures].assign(label=1)
zubiaga_r_meta = zubiaga_r[metaFeatures].assign(label=1)

# Non-rumours (label -1)
zubiaga_nr_ling = zubiaga_nr[linguisticFeatures].assign(label=-1)
zubiaga_nr_user = zubiaga_nr[userFeatures].assign(label=-1)
zubiaga_nr_meta = zubiaga_nr[metaFeatures].assign(label=-1)

In [None]:
# #Feature experiment

# #"~averageFollowSpeed","+averageBeingFollowedSpeed","+averageLikeSpeed","averageStatusSpeed"

# florence_nr_without = florence_nr.drop(columns=["averageFollowSpeed","averageBeingFollowedSpeed","averageLikeSpeed","averageStatusSpeed"])
# florence_r_without = florence_r.drop(columns=["averageFollowSpeed","averageBeingFollowedSpeed","averageLikeSpeed","averageStatusSpeed"])

# florence_nr_with = florence_nr.drop(columns=["averageFollowSpeed", "averageBeingFollowedSpeed","averageLikeSpeed"])
# florence_r_with = florence_r.drop(columns=["averageFollowSpeed", "averageBeingFollowedSpeed","averageLikeSpeed"])

# kerala_nr_without = kerala_nr.drop(columns=["averageFollowSpeed","averageBeingFollowedSpeed","averageLikeSpeed","averageStatusSpeed"])
# kerala_r_without = kerala_r.drop(columns=["averageFollowSpeed","averageBeingFollowedSpeed","averageLikeSpeed","averageStatusSpeed"])

# kerala_nr_with = kerala_nr.drop(columns=["averageFollowSpeed", "averageBeingFollowedSpeed","averageLikeSpeed"])
# kerala_r_with = kerala_r.drop(columns=["averageFollowSpeed", "averageBeingFollowedSpeed","averageLikeSpeed"])



# florence_with = pd.concat([florence_nr_with, florence_r_with], ignore_index=True)
# florence_without = pd.concat([florence_nr_without, florence_r_without], ignore_index=True)

# kerala_with = pd.concat([kerala_nr_with, kerala_r_with], ignore_index=True)
# kerala_without = pd.concat([kerala_nr_without, kerala_r_without], ignore_index=True)


# Y_Florence_with_pd = florence_with[["label"]]
# X_Florence_with_pd = florence_with.drop(columns=["label"])
# Y_Florence_without_pd = florence_without[["label"]]
# X_Florence_without_pd = florence_without.drop(columns=["label"])


# Y_Kerala_with_pd = kerala_with[["label"]]
# X_Kerala_with_pd = kerala_with.drop(columns=["label"])
# Y_Kerala_without_pd = kerala_without[["label"]]
# X_Kerala_without_pd = kerala_without.drop(columns=["label"])


# Y_Florence_with = Y_Florence_with_pd.values
# X_Florence_with = X_Florence_with_pd.values
# Y_Florence_without = Y_Florence_without_pd.values
# X_Florence_without = X_Florence_without_pd.values
# Y_Kerala_with = Y_Kerala_with_pd.values
# X_Kerala_with = X_Kerala_with_pd.values
# Y_Kerala_without = Y_Kerala_without_pd.values
# X_Kerala_without = X_Kerala_without_pd.values

In [None]:
# Merge each event's rumour and non-rumour rows into one table with a fresh
# 0..n-1 index. Rumour rows come first; no shuffling happens here.
kerala, florence, zubiaga = (
    pd.concat([rumourTable, nonRumourTable], ignore_index=True)
    for rumourTable, nonRumourTable in (
        (kerala_r, kerala_nr),
        (florence_r, florence_nr),
        (zubiaga_r, zubiaga_nr),
    )
)

# kerala_ling = pd.concat([kerala_r_ling, kerala_nr_ling], ignore_index=True)
# florence_ling = pd.concat([florence_r_ling, florence_nr_ling], ignore_index=True)
# zubiaga_ling = pd.concat([zubiaga_r_ling, zubiaga_nr_ling], ignore_index=True)

# kerala_user = pd.concat([kerala_r_user, kerala_nr_user], ignore_index=True)
# florence_user = pd.concat([florence_r_user, florence_nr_user], ignore_index=True)
# zubiaga_user = pd.concat([zubiaga_r_user, zubiaga_nr_user], ignore_index=True)

# kerala_meta = pd.concat([kerala_r_meta, kerala_nr_meta], ignore_index=True)
# florence_meta = pd.concat([florence_r_meta, florence_nr_meta], ignore_index=True)
# zubiaga_meta = pd.concat([zubiaga_r_meta, zubiaga_nr_meta], ignore_index=True)

# kerala = kerala.sample(frac=1)
# florence = florence.sample(frac=1)
# zubiaga = zubiaga.sample(frac=1)

# kerala_ling = kerala_ling.sample(frac=1)
# florence_ling = florence_ling.sample(frac=1)
# zubiaga_ling = zubiaga_ling.sample(frac=1)

# kerala_user = kerala_user.sample(frac=1)
# florence_user = florence_user.sample(frac=1)
# zubiaga_user = zubiaga_user.sample(frac=1)

# kerala_meta = kerala_meta.sample(frac=1)
# florence_meta = florence_meta.sample(frac=1)
# zubiaga_meta = zubiaga_meta.sample(frac=1)

# kerala = kerala.reset_index(drop=True)
# florence = florence.reset_index(drop=True)
# zubiaga = zubiaga.reset_index(drop=True)
# kerala_ling = kerala_ling.reset_index(drop=True)
# florence_ling = florence_ling.reset_index(drop=True)
# zubiaga_ling = zubiaga_ling.reset_index(drop=True)
# kerala_user = kerala_user.reset_index(drop=True)
# florence_user = florence_user.reset_index(drop=True)
# zubiaga_user = zubiaga_user.reset_index(drop=True)
# kerala_meta = kerala_meta.reset_index(drop=True)
# florence_meta = florence_meta.reset_index(drop=True)
# zubiaga_meta = zubiaga_meta.reset_index(drop=True)

# Split every merged table into a feature frame and a one-column label frame.
Y_Kerala_pd, X_Kerala_pd = kerala[["label"]], kerala.drop(columns=["label"])
Y_Florence_pd, X_Florence_pd = florence[["label"]], florence.drop(columns=["label"])
Y_Zubiaga_pd, X_Zubiaga_pd = zubiaga[["label"]], zubiaga.drop(columns=["label"])

# NumPy views of the same data. The label arrays are two-dimensional (n, 1)
# because the label frames were selected with a list of columns.
Y_Kerala, X_Kerala = Y_Kerala_pd.values, X_Kerala_pd.values
Y_Florence, X_Florence = Y_Florence_pd.values, X_Florence_pd.values
Y_Zubiaga, X_Zubiaga = Y_Zubiaga_pd.values, X_Zubiaga_pd.values

In [None]:
# Appending datasets to each others
# Y_Kerala_Florence_pd = pd.concat([Y_Kerala_pd, Y_Florence_pd], ignore_index=True)
# X_Kerala_Florence_pd = pd.concat([X_Kerala_pd, X_Florence_pd], ignore_index=True)
# Y_Kerala_Florence_pd.to_csv("./MATLAB/kerala_florence_label.csv")
# X_Kerala_Florence_pd.to_csv("./MATLAB/kerala_florence_data.csv")


# Y_Kerala_Zubiaga_pd = pd.concat([Y_Kerala_pd, Y_Zubiaga_pd], ignore_index=True)
# X_Kerala_Zubiaga_pd = pd.concat([X_Kerala_pd, X_Zubiaga_pd], ignore_index=True)
# Y_Kerala_Zubiaga_pd.to_csv("./MATLAB/kerala_zubiaga_label.csv")
# X_Kerala_Zubiaga_pd.to_csv("./MATLAB/kerala_zubiaga_data.csv")


# Y_Florence_Zubiaga_pd = pd.concat([Y_Florence_pd, Y_Zubiaga_pd], ignore_index=True)
# X_Florence_Zubiaga_pd = pd.concat([X_Florence_pd, X_Zubiaga_pd], ignore_index=True)
# Y_Florence_Zubiaga_pd.to_csv("./MATLAB/florence_zubiaga_label.csv")
# X_Florence_Zubiaga_pd.to_csv("./MATLAB/florence_zubiaga_data.csv")

# Y_Kerala_Florence = Y_Kerala_Florence_pd.values
# X_Kerala_Florence = X_Kerala_Florence_pd.values
# Y_Kerala_Zubiaga = Y_Kerala_Zubiaga_pd.values
# X_Kerala_Zubiaga = X_Kerala_Zubiaga_pd.values
# Y_Florence_Zubiaga = Y_Florence_Zubiaga_pd.values
# X_Florence_Zubiaga = X_Florence_Zubiaga_pd.values

### Note
1. The Zubiaga dataset is the one referred to in "Learning Reporting Dynamics during Breaking News for Rumour Detection in Social Media"
2. The Kerala dataset is collected by me
3. The Florence dataset is collected by me

In [None]:
## Training set
# X: data --- Y: label
# For each dataset the training set and the test set are specified as follows.

# Zubiaga experiment: train => X_Z | Y_Z
#                     test  => _X_Z | _Y_Z
# Five-fold cross-validation; nothing is bound here.
# X_Z =
# Y_Z =
# _X_Z =
# _Y_Z =

# Kerala experiment: train => X_K | Y_K   (Kerala arrays)
#                    test  => _X_K | _Y_K (Florence arrays)
X_K, Y_K = X_Kerala, Y_Kerala
_X_K, _Y_K = X_Florence, Y_Florence

# Florence experiment: train => X_F | Y_F   (Florence arrays)
#                      test  => _X_F | _Y_F (Kerala arrays)
X_F, Y_F = X_Florence, Y_Florence
_X_F, _Y_F = X_Kerala, Y_Kerala

In [None]:
#Experiments for Zubiaga (5-fold cross-validation)
dt = DecisionTreeClassifier()
clf = svm.SVC()
rf = RandomForestClassifier()
gnb = GaussianNB()
mlp = MLPClassifier(hidden_layer_sizes=(13,13,13),max_iter=500)
ada = AdaBoostClassifier(n_estimators=30, random_state=7)
knn = KNeighborsClassifier()
bnb = BernoulliNB()
mnb = MultinomialNB()
clf_2 = SVC(kernel='linear', class_weight='balanced', probability=True)
lr = LogisticRegression()


# Estimator object -> display name used in the printed report.
nameDict = {dt:"Tree", clf:"SVM", rf:"random forest", gnb:"NB", mlp:"MLP", mnb:"MNB", ada:"ADA", bnb:"BNB", clf_2:"penalize", lr:"logistic regression"}



classifiers = [lr,dt, clf, rf, gnb, bnb, mlp, mnb, ada, clf_2]
# classifiers = [dt]

# Shuffling is done by the splitter with a fixed seed instead of re-sampling
# the global `zubiaga` table inside the loop: every classifier now sees the
# same folds, results are reproducible, and re-running the cell does not keep
# reordering the table.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

for model in classifiers:
    for train_index, test_index in kf.split(zubiaga):
        X_Z = zubiaga.iloc[train_index].drop(columns=["label"])
        Y_Z = zubiaga.iloc[train_index]["label"]
        _X_Z = zubiaga.iloc[test_index].drop(columns=["label"])
        _Y_Z = zubiaga.iloc[test_index]["label"]

        X_train, Y_train = X_Z, Y_Z
        X_test, Y_test = _X_Z, _Y_Z

        model.fit(X_train, Y_train)
        y_pred = model.predict(X_test)
        # labels=[-1, 1] keeps the matrix 2x2 even when a class is missing from
        # a fold or from the predictions; count names avoid clobbering `fn`.
        tnCount, fpCount, fnCount, tpCount = mt.confusion_matrix(Y_test, y_pred, labels=[-1, 1]).ravel()
        # zero_division=0 reports 0 instead of NaN for an undefined ratio.
        PR_T = mt.precision_score(Y_test, y_pred, zero_division=0)
        RE_T = mt.recall_score(Y_test, y_pred, zero_division=0)
        F1_Score = mt.f1_score(Y_test, y_pred, zero_division=0)
        ACC_T = mt.accuracy_score(Y_test, y_pred)
        print(nameDict[model], ":", tpCount, tnCount, fpCount, fnCount, ACC_T, F1_Score, PR_T, RE_T)

In [None]:
#Experiments for Florence (5-fold cross-validation)
dt = DecisionTreeClassifier()
clf = svm.SVC()
rf = RandomForestClassifier()
gnb = GaussianNB()
mlp = MLPClassifier(hidden_layer_sizes=(13,13,13),max_iter=500)
ada = AdaBoostClassifier(n_estimators=30, random_state=7)
knn = KNeighborsClassifier()
bnb = BernoulliNB()
mnb = MultinomialNB()
clf_2 = SVC(kernel='linear', class_weight='balanced', probability=True)


# Estimator object -> display name used in the printed report.
nameDict = {dt:"Tree", clf:"SVM", rf:"random forest", gnb:"NB", mlp:"MLP", mnb:"MNB", ada:"ADA", bnb:"BNB", clf_2:"penalize"}



classifiers = [dt, clf, rf, gnb, bnb, mlp, mnb, ada, clf_2]
# classifiers = [clf_2]

# Shuffling is done by the splitter with a fixed seed instead of re-sampling
# the global `florence` table inside the loop: every classifier now sees the
# same folds, results are reproducible, and re-running the cell does not keep
# reordering the table.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

for model in classifiers:
    for train_index, test_index in kf.split(florence):

        # Note: these rebind X_F / Y_F / _X_F / _Y_F to per-fold frames.
        X_F = florence.iloc[train_index].drop(columns=["label"])
        Y_F = florence.iloc[train_index]["label"]
        _X_F = florence.iloc[test_index].drop(columns=["label"])
        _Y_F = florence.iloc[test_index]["label"]

        X_train, Y_train = X_F, Y_F
        X_test, Y_test = _X_F, _Y_F

        model.fit(X_train, Y_train)
        y_pred = model.predict(X_test)
        # labels=[-1, 1] keeps the matrix 2x2 even when a class is missing from
        # a fold or from the predictions; count names avoid clobbering `fn`.
        tnCount, fpCount, fnCount, tpCount = mt.confusion_matrix(Y_test, y_pred, labels=[-1, 1]).ravel()
        # zero_division=0 reports 0 instead of NaN for an undefined ratio.
        PR_T = mt.precision_score(Y_test, y_pred, zero_division=0)
        RE_T = mt.recall_score(Y_test, y_pred, zero_division=0)
        F1_Score = mt.f1_score(Y_test, y_pred, zero_division=0)
        ACC_T = mt.accuracy_score(Y_test, y_pred)
        print(nameDict[model], ":", tpCount, tnCount, fpCount, fnCount, ACC_T, F1_Score, PR_T, RE_T)


In [None]:
#Experiments for Kerala (5-fold cross-validation)
dt = DecisionTreeClassifier()
clf = svm.SVC()
rf = RandomForestClassifier()
gnb = GaussianNB()
mlp = MLPClassifier(hidden_layer_sizes=(13,13,13),max_iter=500)
ada = AdaBoostClassifier(n_estimators=30, random_state=7)
knn = KNeighborsClassifier()
bnb = BernoulliNB()
mnb = MultinomialNB()
clf_2 = SVC(kernel='linear', class_weight='balanced', probability=True)


# Estimator object -> display name used in the printed report.
nameDict = {dt:"Tree", clf:"SVM", rf:"random forest", gnb:"NB", mlp:"MLP", mnb:"MNB", ada:"ADA", bnb:"BNB", clf_2:"penalize"}



classifiers = [dt, clf, rf, gnb, bnb, mlp, mnb, ada, clf_2]
# classifiers = [clf_2]

# Shuffling is done by the splitter with a fixed seed instead of re-sampling
# the global `kerala` table inside the loop: every classifier now sees the
# same folds, results are reproducible, and re-running the cell does not keep
# reordering the table.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

for model in classifiers:
    for train_index, test_index in kf.split(kerala):

        # Note: these rebind X_K / Y_K / _X_K / _Y_K to per-fold frames.
        X_K = kerala.iloc[train_index].drop(columns=["label"])
        Y_K = kerala.iloc[train_index]["label"]
        _X_K = kerala.iloc[test_index].drop(columns=["label"])
        _Y_K = kerala.iloc[test_index]["label"]

        X_train, Y_train = X_K, Y_K
        X_test, Y_test = _X_K, _Y_K

        model.fit(X_train, Y_train)
        y_pred = model.predict(X_test)
        # labels=[-1, 1] keeps the matrix 2x2 even when a class is missing from
        # a fold or from the predictions; count names avoid clobbering `fn`.
        tnCount, fpCount, fnCount, tpCount = mt.confusion_matrix(Y_test, y_pred, labels=[-1, 1]).ravel()
        # zero_division=0 reports 0 instead of NaN for an undefined ratio.
        PR_T = mt.precision_score(Y_test, y_pred, zero_division=0)
        RE_T = mt.recall_score(Y_test, y_pred, zero_division=0)
        F1_Score = mt.f1_score(Y_test, y_pred, zero_division=0)
        ACC_T = mt.accuracy_score(Y_test, y_pred)
        print(nameDict[model], ":", tpCount, tnCount, fpCount, fnCount, ACC_T, F1_Score, PR_T, RE_T)


In [None]:
#Experiments for Kerala and Florence => Experiment via  Swapping the datasets
# Train on one dataset and evaluate on the other. Swap the four assignments
# below to run the experiment in the opposite direction.
dt = DecisionTreeClassifier()
clf = svm.SVC()
rf = RandomForestClassifier()
gnb = GaussianNB()
mlp = MLPClassifier(hidden_layer_sizes=(13,13,13),max_iter=500)
ada = AdaBoostClassifier(n_estimators=30, random_state=7)
knn = KNeighborsClassifier()
bnb = BernoulliNB()
mnb = MultinomialNB()
clf_2 = SVC(kernel='linear', class_weight='balanced', probability=True)


nameDict = {dt:"Tree", clf:"SVM", rf:"random forest", gnb:"NB", mlp:"MLP", mnb:"MNB", ada:"ADA", bnb:"BNB", clf_2:"penalize"}

X_train = X_Florence
Y_train = Y_Florence
X_test = X_Kerala
Y_test = Y_Kerala

classifiers = [clf_2]

for model in classifiers:
    model.fit(X_train, Y_train)
    y_pred = model.predict(X_test)

    # labels=[0, 1] guarantees a 2x2 matrix even if only one class shows up.
    # Assumes the 0/1 labels assigned at the top of the notebook.
    # The false-negative count is not named `fn` because that name holds the
    # Florence non-rumour feature table loaded at the top of the notebook.
    tn, fp, fneg, tp = mt.confusion_matrix(Y_test, y_pred, labels=[0, 1]).ravel()

    # Zero denominators (no predicted or no actual positives) give 0.0, not NaN.
    predicted_positives = tp + fp
    actual_positives = tp + fneg
    PR_T = tp/predicted_positives if predicted_positives else 0.0
    RE_T = tp/actual_positives if actual_positives else 0.0
    F1_Score = (2*PR_T*RE_T)/(PR_T+RE_T) if (PR_T+RE_T) else 0.0
    ACC_T = (tp+tn)/(tp+fp+tn+fneg)
    print(nameDict[model], ":", tp, tn, fp, fneg, ACC_T, F1_Score, PR_T, RE_T)

    # ROC AUC is computed on the same held-out data as the metrics above, and
    # from the model of the current iteration (previously it was computed on
    # the training data and always from clf_2). Models that expose no
    # predict_proba are skipped.
    if hasattr(model, "predict_proba"):
        prob_y_2 = [p[1] for p in model.predict_proba(X_test)]
        print( roc_auc_score(Y_test, prob_y_2) )
    
    

# From now on, we do feature evaluation

In [None]:
#####Feature evaluation######
# First dataset
# Fit three tree-based classifiers on the Kerala data, add up their
# feature_importances_ per feature and scale so the largest total equals 1.

# Seeded so that the importances (and every threshold count derived from them)
# are the same on each run; AdaBoost and the random forest were already seeded.
c = DecisionTreeClassifier(random_state=42)
rf = RandomForestRegressor(n_estimators = 1000, random_state = 42)  # not fitted in this cell
ada = AdaBoostClassifier(n_estimators=30, random_state=7)
extra = ExtraTreesClassifier(random_state=42)

# NOTE(review): zip() stops at the shorter input, so `labels` is presumed to
# list the columns of X_Kerala in order -- confirm where `labels` is defined.
ada.fit(X_Kerala, Y_Kerala)
featureImportance_ada_k = dict(zip(labels, ada.feature_importances_))

c.fit(X_Kerala, Y_Kerala)
featureImportance_c_k = dict(zip(labels, c.feature_importances_))

extra.fit(X_Kerala, Y_Kerala)
featureImportance_extra_k = dict(zip(labels, extra.feature_importances_))

# Per-feature sum over the three models, then normalisation by the maximum.
featureImportance_k = {p:featureImportance_ada_k[p]+featureImportance_c_k[p]+featureImportance_extra_k[p] for p in featureImportance_ada_k}
maxNum = max(featureImportance_k.values())
featureImportance_k = {p:featureImportance_k[p]/maxNum for p in featureImportance_k}
# (feature, score) pairs, highest score first.
featureImportance_sorted_k = sorted(featureImportance_k.items(), key=operator.itemgetter(1), reverse=True)
# One row per feature, scores in a single column named 0.
featureImportance_pd_k = pd.DataFrame.from_dict(featureImportance_k, orient='index')

In [None]:
#####Feature evaluation######
# Second dataset
# Fit three tree-based classifiers on the Florence data, add up their
# feature_importances_ per feature and scale so the largest total equals 1.

# Seeded so that the importances (and every threshold count derived from them)
# are the same on each run; AdaBoost and the random forest were already seeded.
c = DecisionTreeClassifier(random_state=42)
rf = RandomForestRegressor(n_estimators = 1000, random_state = 42)  # not fitted in this cell
ada = AdaBoostClassifier(n_estimators=30, random_state=7)
extra = ExtraTreesClassifier(random_state=42)

# NOTE(review): zip() stops at the shorter input, so `labels` is presumed to
# list the columns of X_Florence in order -- confirm where `labels` is defined.
ada.fit(X_Florence, Y_Florence)
featureImportance_ada_f = dict(zip(labels, ada.feature_importances_))

c.fit(X_Florence, Y_Florence)
featureImportance_c_f = dict(zip(labels, c.feature_importances_))

extra.fit(X_Florence, Y_Florence)
featureImportance_extra_f = dict(zip(labels, extra.feature_importances_))

# Per-feature sum over the three models, then normalisation by the maximum.
featureImportance_f = {p:featureImportance_ada_f[p]+featureImportance_c_f[p]+featureImportance_extra_f[p] for p in featureImportance_ada_f}
maxNum = max(featureImportance_f.values())
featureImportance_f = {p:featureImportance_f[p]/maxNum for p in featureImportance_f}
# (feature, score) pairs, highest score first.
featureImportance_sorted_f = sorted(featureImportance_f.items(), key=operator.itemgetter(1), reverse=True)
# One row per feature, scores in a single column named 0.
featureImportance_pd_f = pd.DataFrame.from_dict(featureImportance_f, orient='index')

In [None]:
# featureImportance_pd_total = featureImportance_pd_f + featureImportance_pd_k

In [None]:
# Side-by-side table: column 0 holds the Florence scores, column "1" the Kerala
# scores (aligned on the feature-name index).
# Work on a copy: a plain assignment only aliases featureImportance_pd_f, so
# adding the Kerala column would silently add it to the Florence table as well.
featureImportance = featureImportance_pd_f.copy()
featureImportance["1"] = featureImportance_pd_k[0]

In [None]:
# Remove three feature rows from the comparison table.
excluded_features = ['engagementScore', 'credibilityScore', 'tweetPostTime']
featureImportance = featureImportance.drop(excluded_features, axis="index")

In [None]:
# Give the Kerala column the integer label 1 so both columns are keyed by int.
featureImportance = featureImportance.rename({"1": 1}, axis="columns")

In [None]:
# Independent copy of the table reduced to one column (named 0) that holds the
# sum of the two per-dataset scores. The temporary "3" label avoids clashing
# with the existing 0/1 columns while they are still present.
featureImportanceCopy = (
    featureImportance
    .copy(deep=True)
    .assign(**{"3": lambda scores: scores[0] + scores[1]})
    .drop(columns=[0, 1])
    .rename(columns={"3": 0})
)

In [None]:
# Names of the features whose summed score is above 0.001.
above_threshold = featureImportanceCopy[0] > 0.001
filters = featureImportanceCopy.index[above_threshold].tolist()

In [None]:
# Features scoring above 0.001 in BOTH columns (0 and 1) of the table.
significant_in_both = (featureImportance[0] > 0.001) & (featureImportance[1] > 0.001)
consistents = featureImportance.index[significant_in_both].tolist()

In [None]:
# How many of the `consistents` features also appear in linguisticFeatures.
len(set(consistents).intersection(linguisticFeatures))

In [None]:
# Number of entries in `consistents` (displayed as the cell output).
len(consistents)

In [None]:
# Count, per feature-origin group, the features whose summed Florence + Kerala
# score exceeds 0.001.
# The combined table is built here because its only other definition in the
# notebook is commented out, which made this cell fail on a fresh kernel.
# Selecting column 0 explicitly keeps the sum independent of any extra columns
# the per-dataset tables may have picked up.
featureImportance_pd_total = featureImportance_pd_f[[0]] + featureImportance_pd_k[[0]]
significant_total = set(featureImportance_pd_total[featureImportance_pd_total[0]>0.001].index)

print(f'literatureFeatures: {len(significant_total & set(literatureFeatures))}')
print(f'inspiredFeatures: {len(significant_total & set(inspiredFeatures))}')
print(f'developedFeatures: {len(significant_total & set(developedFeatures))}')

In [None]:
# Features above 0.001 on Florence AND on Kerala, counted per feature-origin group.
significant_florence = set(featureImportance_pd_f.index[featureImportance_pd_f[0] > 0.001])
significant_kerala = set(featureImportance_pd_k.index[featureImportance_pd_k[0] > 0.001])
significant_both = significant_florence & significant_kerala

print(f'consistent literatureFeatures: {len(significant_both & set(literatureFeatures))}')
print(f'consistent inspiredFeatures: {len(significant_both & set(inspiredFeatures))}')
print(f'consistent developedFeatures: {len(significant_both & set(developedFeatures))}')

In [None]:
# Shorten the feature names so they fit as tick labels in the plots.
# The names live in the DataFrame index, so they are first moved into an
# 'index' column; without this step the cell raised KeyError on a fresh kernel.
# The guard keeps the cell safe to re-run when the column already exists.
if "index" not in featureImportance.columns:
    featureImportance = featureImportance.rename_axis("index").reset_index()

# Applied in this order: "Counter" must be handled before "Count".
abbreviations = [
    ("Counter", "#"),
    ("Count", "#"),
    ("Organization", "Org"),
    ("average", "avg"),
    ("first", "1st"),
    ("second", "2nd"),
    ("third", "3rd"),
    ("Coordinating", "Coord"),
    ("Subordinating", "Sub"),
    ("Conjunction", "Conj"),
]
for long_form, short_form in abbreviations:
    featureImportance['index'] = featureImportance['index'].apply(
        lambda x: x.replace(long_form, short_form)
    )

In [None]:
# Use the (abbreviated) names in the "index" column as row labels; the column
# itself is kept.
featureImportance = featureImportance.set_index("index", drop=False)

In [None]:
# The names are now row labels, so the helper column is no longer needed.
featureImportance = featureImportance.drop("index", axis="columns")

In [None]:
# Hard-coded scores written into two rows, one value per table column.
# NOTE(review): the origin of these numbers is not visible in this notebook --
# confirm why they replace the computed values.
manual_scores = {
    'character#': [0.86,0.134229],
    'token#': [0.019403,0.97],
}
for feature_name, scores in manual_scores.items():
    featureImportance.loc[feature_name] = scores

In [None]:
# Stacked horizontal bar chart: one bar per feature, one segment per dataset.
# Only the two per-dataset columns are drawn. Iterating over every column would
# add a third, unlabeled segment whenever this cell is re-run after a helper
# column (e.g. the 0 + 1 total) has been attached to the table.
dataset_columns = [0, 1]

# Explicit figure/axes instead of recovering the axes via plt.subplot(111)
# after the bars were already drawn.
fig, ax = plt.subplots(figsize=(15,20))
cumval = 0  # running left edge of the next segment; becomes a Series after the first pass
for col in dataset_columns:
    ax.barh(featureImportance.index, featureImportance[col], left=cumval, label=col)
    cumval = cumval+featureImportance[col]

ax.legend(["Florence Dataset", "Kerala Dataset"], loc='upper center',
          ncol=2, fancybox=True, shadow=True)
# NOTE(review): the sorted variant of this chart further down the notebook is
# saved under the same file name and overwrites this figure.
plt.savefig("featureEvaluation.eps", bbox_inches = 'tight')
plt.show()

In [None]:
# Helper column 2: sum of the two per-dataset scores.
featureImportance[2] = featureImportance[0].add(featureImportance[1])

In [None]:
# Rows ordered by the summed score in column 2, largest first.
featureImportance_Sorted = featureImportance.sort_values(by=2, ascending=False)

In [None]:
# Column 2 was only needed for sorting; remove it from the sorted table.
featureImportance_Sorted = featureImportance_Sorted.drop(2, axis="columns")

In [None]:
# Stacked horizontal bars built from the sorted table: one bar per feature,
# one segment per column.
fig, ax = plt.subplots(figsize=(15,20))
cumval = 0  # left edge at which the next column's segments start
for col in featureImportance_Sorted.columns:
    segment_widths = featureImportance_Sorted[col]
    ax.barh(featureImportance_Sorted.index, segment_widths, left=cumval, label=col)
    cumval = cumval + segment_widths

ax.legend(
    ["Florence Dataset", "Kerala Dataset"],
    loc='upper center',
    ncol=2,
    fancybox=True,
    shadow=True,
)
plt.savefig("featureEvaluation.eps", bbox_inches = 'tight')
plt.show()

In [None]:
# Hard-coded feature counts, one column per feature group and three rows each.
# The leading spaces in some column names are deliberate: they were added
# because of the order of the columns.
df_featureCategory1 = pd.DataFrame(
    [[61, 17, 5],
     [50, 11, 4],
     [38, 4, 3]],
    columns=[" Linguistic & Content", " User", "Meta-Message"],
)
df_featureCategory2 = pd.DataFrame(
    [[34, 49],
     [30, 35],
     [20, 25]],
    columns=[" Literature", "Proposed"],
)

In [None]:
# Label the three rows of both count tables.
stage_labels = ["Total","Significant", "Consistently significant"]
for category_table in (df_featureCategory1, df_featureCategory2):
    category_table.index = list(stage_labels)

In [None]:
# Grouped bar chart of the first count table (feature groups on the x axis).
category_axes = df_featureCategory1.T.plot.bar()
category_axes.set_ylabel("Number of features")
plt.savefig("featureComparison1.eps", bbox_inches='tight')
plt.show()

In [None]:
# Grouped bar chart of the second count table (feature groups on the x axis).
origin_axes = df_featureCategory2.T.plot.bar()
origin_axes.set_ylabel("Number of features")
plt.savefig("featureComparison2.eps", bbox_inches='tight')
plt.show()

In [None]:
# Show the first count table with feature groups as rows.
df_featureCategory1.T

In [None]:
# Stacked vertical bars for the first count table: one bar per feature group,
# one segment per row label.
category_counts = df_featureCategory1.transpose()  # transposed once, reused below
fig = plt.figure()
cumval = 0  # height at which the next segment starts
for col in category_counts.columns:
    segment_heights = category_counts[col]
    plt.bar(category_counts.index, segment_heights, bottom=cumval, label=col)
    cumval = cumval + segment_heights
plt.legend()

In [None]:
# Stacked vertical bars for the second count table: one bar per feature group,
# one segment per row label.
origin_counts = df_featureCategory2.transpose()  # transposed once, reused below
fig = plt.figure()
cumval = 0  # height at which the next segment starts
for col in origin_counts.columns:
    segment_heights = origin_counts[col]
    plt.bar(origin_counts.index, segment_heights, bottom=cumval, label=col)
    cumval = cumval + segment_heights
plt.legend()