# Learning to Detect Fake Content on Twitter

## Contents

* Introduction
* Data
* Feature Engineering
* Classification
* Results
* Tweet Verification Assistant

In [1]:
# import the necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import json
import seaborn as sns
pd.options.display.max_columns = 100

## Data
### Load and Explore the Data

In [2]:
# load the data
data  = pd.read_csv('data/tweets_info.csv')
data['id'] = data['id'].astype(str)
data.shape

(17857, 4)

In [3]:
data.head()

Unnamed: 0,id,text,event,label
0,324597532548276224,Don't need feds to solve the #bostonbombing wh...,Boston,fake
1,325145334739267584,PIC: Comparison of #Boston suspect Sunil Tripa...,Boston,fake
2,325152091423248385,I'm not completely convinced that it's this Su...,Boston,fake
3,324554646976868352,Brutal lo que se puede conseguir en colaboraci...,Boston,fake
4,324315545572896768,4chan and the bombing. just throwing it out th...,Boston,fake


In [4]:
# load the tweet data
with open('data/tweetsFeatsVMU2016.txt') as f:
    lines = f.readlines()

lines = [json.loads(line) for line in lines]
tweet_data  = pd.DataFrame(lines)
tweet_data.drop(columns=['_id', 'annotation', 
                         'alexaCountryRank', 'alexaDeltaRank', 'alexaPopularity', 'alexaReachRank',
                         'urlHarmonic', 'urlIndegree', 'wotSafe', 'wotTrust'], 
                inplace=True)
tweet_data['id'] = tweet_data['id'].astype(str)
print('Data has {} samples with {} attributes'.format(*tweet_data.shape))

Data has 17857 samples with 25 attributes


In [5]:
tweet_data.tail()

Unnamed: 0,containsExclamationMark,containsFirstOrderPron,containsHappyEmo,containsQuestionMark,containsSadEmo,containsSecondOrderPron,containsThirdOrderPron,hasColon,hasExternalLink,hasPlease,id,itemLength,numExclamationMark,numHashtags,numMentions,numNegSentiWords,numNouns,numPosSentiWords,numQuestionMark,numSlangs,numURLs,numUppercaseChars,numWords,readability,retweetCount
17852,False,False,False,False,False,False,False,True,True,False,712226185707712512,129,0,1,0,0.0,5.0,0.0,0,1.0,2,14,13,41.852,0
17853,False,False,False,False,False,False,False,True,True,False,712226175721066496,119,0,1,0,0.0,5.0,0.0,0,1.0,2,6,12,61.325,63
17854,False,False,False,False,False,False,False,True,False,False,712222764514484224,135,0,0,0,0.0,9.0,0.0,0,2.0,1,3,19,61.89,0
17855,False,False,False,False,False,False,False,True,False,False,712210186438631424,115,0,0,0,0.0,5.0,0.0,0,0.0,1,9,13,37.455,0
17856,False,False,False,False,False,False,False,True,False,False,712210186350493696,115,0,0,0,0.0,5.0,0.0,0,0.0,1,9,13,37.455,0


In [6]:
# load the user data
with open('data/userFeatsVMU2016.txt') as f:
    lines = f.readlines()

lines = [json.loads(line) for line in lines]
user_data = pd.DataFrame(lines)
user_data.drop(columns=['_id', 'annotation',
                        'alexaCountryRank', 'alexaDeltaRank', 'alexaPopularity', 'alexaReachRank',
                        'harmonic', 'indegree', 'wotTrustUser'], 
               inplace=True)
user_data['id'] = user_data['id'].astype(str)

print('User data has {} samples with {} attributes'.format(*user_data.shape))


User data has 17857 samples with 18 attributes


In [7]:
user_data.head()

Unnamed: 0,FolFrieRatio,accountAge,hasBio,hasExistingLocation,hasHeaderImg,hasLocation,hasProfileImg,hasURL,id,isVerified,numFavorites,numFollowers,numFriends,numMediaContent,numTweets,timesListed,tweetRatio,username
0,5.833922,1234982831,True,False,False,True,True,False,263046056240115712,False,5243,1651,283,1069,43811,64,32.47286,iAnnieM
1,43.68981,1284854676,True,True,False,True,True,True,262995061304852481,False,45,95637,2189,10400,54293,737,70.346275,CarlosVerareal
2,0.862576,1271108498,True,False,False,True,True,False,262979898002534400,False,274,1701,1972,6455,34414,13,36.97036,LucasPalape
3,0.88015,1295804773,True,False,False,True,True,False,262996108400271360,False,6,235,267,0,17837,0,27.651606,Haaaaarryyy
4,0.937824,1297372694,True,False,False,True,True,False,263018881839411200,False,2867,181,193,1223,25754,0,41.076424,princess__natt


### Check the missing values

In [8]:
features=tweet_data.columns.values
for feature in features:
    print(feature,'- Missing:', 
          sum(tweet_data[feature].isnull()),
          '- Unique:', len(tweet_data[feature].unique()))

containsExclamationMark - Missing: 0 - Unique: 2
containsFirstOrderPron - Missing: 2710 - Unique: 3
containsHappyEmo - Missing: 0 - Unique: 2
containsQuestionMark - Missing: 0 - Unique: 2
containsSadEmo - Missing: 0 - Unique: 2
containsSecondOrderPron - Missing: 2710 - Unique: 3
containsThirdOrderPron - Missing: 2710 - Unique: 3
hasColon - Missing: 0 - Unique: 2
hasExternalLink - Missing: 0 - Unique: 2
hasPlease - Missing: 0 - Unique: 2
id - Missing: 0 - Unique: 17857
itemLength - Missing: 0 - Unique: 128
numExclamationMark - Missing: 0 - Unique: 15
numHashtags - Missing: 0 - Unique: 17
numMentions - Missing: 0 - Unique: 8
numNegSentiWords - Missing: 2710 - Unique: 8
numNouns - Missing: 2603 - Unique: 26
numPosSentiWords - Missing: 2710 - Unique: 8
numQuestionMark - Missing: 0 - Unique: 8
numSlangs - Missing: 2820 - Unique: 11
numURLs - Missing: 0 - Unique: 5
numUppercaseChars - Missing: 0 - Unique: 78
numWords - Missing: 0 - Unique: 34
readability - Missing: 4151 - Unique: 645
retweet

In [9]:
features=user_data.columns.values
for feature in features:
    print(feature,'- Missing:', 
          sum(user_data[feature].isnull()),
          '- Unique:', len(user_data[feature].unique()))

FolFrieRatio - Missing: 0 - Unique: 14427
accountAge - Missing: 0 - Unique: 16838
hasBio - Missing: 0 - Unique: 2
hasExistingLocation - Missing: 0 - Unique: 2
hasHeaderImg - Missing: 0 - Unique: 2
hasLocation - Missing: 0 - Unique: 2
hasProfileImg - Missing: 0 - Unique: 2
hasURL - Missing: 0 - Unique: 2
id - Missing: 0 - Unique: 17857
isVerified - Missing: 0 - Unique: 2
numFavorites - Missing: 0 - Unique: 2943
numFollowers - Missing: 0 - Unique: 3654
numFriends - Missing: 0 - Unique: 2669
numMediaContent - Missing: 0 - Unique: 3313
numTweets - Missing: 0 - Unique: 11004
timesListed - Missing: 0 - Unique: 725
tweetRatio - Missing: 0 - Unique: 17840
username - Missing: 0 - Unique: 16843


#### Number of positive/negative words

Quick check to see what is the nature of the tweets that present no positive sentiment words.

In [10]:
tweet_data.loc[tweet_data['numPosSentiWords'].isna()].head()

Unnamed: 0,containsExclamationMark,containsFirstOrderPron,containsHappyEmo,containsQuestionMark,containsSadEmo,containsSecondOrderPron,containsThirdOrderPron,hasColon,hasExternalLink,hasPlease,id,itemLength,numExclamationMark,numHashtags,numMentions,numNegSentiWords,numNouns,numPosSentiWords,numQuestionMark,numSlangs,numURLs,numUppercaseChars,numWords,readability,retweetCount
7,False,,False,False,False,,,False,False,False,263111677485142017,69,0,5,0,,4.0,,0,,1,0,5,,0
8,False,,False,False,False,,,False,False,False,262977091983785985,36,0,2,0,,1.0,,0,,1,0,2,,0
10,False,,False,False,False,,,False,False,False,263129115207536640,45,0,3,0,,,,0,,1,0,3,,0
11,False,,False,False,False,,,False,False,False,263091320871063552,66,0,5,0,,1.0,,0,,1,0,5,,0
24,False,,False,False,False,,,False,False,False,263047501433688064,53,0,1,1,,,,0,,1,1,6,,1


Most of them are tweets with lots of hashtags, which were not taken into account while looking for sentiment words. Let's fill the nan values with 0.

In [11]:
tweet_data['numPosSentiWords'].fillna(0, inplace=True)
tweet_data['numNegSentiWords'].fillna(0, inplace=True)
tweet_data['numNouns'].fillna(0, inplace=True)
tweet_data['numSlangs'].fillna(0, inplace=True)
tweet_data['readability'].fillna(0, inplace=True)

### Feature Engineering

#### Merge the features

In [12]:
# add prefix to tweet data
tweet_data = tweet_data.add_prefix('tweet_')
# add prefix to user data
user_data = user_data.add_prefix('user_')
# merge the tweet and user features
features_raw = pd.merge(tweet_data, user_data, left_on='tweet_id', right_on='user_id')
# merge with the info
features_raw = pd.merge(data,features_raw, left_on='id', right_on='tweet_id')

In [13]:
features_raw.head()

Unnamed: 0,id,text,event,label,tweet_containsExclamationMark,tweet_containsFirstOrderPron,tweet_containsHappyEmo,tweet_containsQuestionMark,tweet_containsSadEmo,tweet_containsSecondOrderPron,tweet_containsThirdOrderPron,tweet_hasColon,tweet_hasExternalLink,tweet_hasPlease,tweet_id,tweet_itemLength,tweet_numExclamationMark,tweet_numHashtags,tweet_numMentions,tweet_numNegSentiWords,tweet_numNouns,tweet_numPosSentiWords,tweet_numQuestionMark,tweet_numSlangs,tweet_numURLs,tweet_numUppercaseChars,tweet_numWords,tweet_readability,tweet_retweetCount,user_FolFrieRatio,user_accountAge,user_hasBio,user_hasExistingLocation,user_hasHeaderImg,user_hasLocation,user_hasProfileImg,user_hasURL,user_id,user_isVerified,user_numFavorites,user_numFollowers,user_numFriends,user_numMediaContent,user_numTweets,user_timesListed,user_tweetRatio,user_username
0,324597532548276224,Don't need feds to solve the #bostonbombing wh...,Boston,fake,True,True,False,False,False,False,False,False,True,False,324597532548276224,88,2,2,0,0.0,2.0,0.0,0,0.0,1,1,13,112.085,0,0.438754,1350475461,True,True,False,True,True,True,324597532548276224,False,168,634,1445,0,903,1,4.953727,SantaCruzShred
1,325145334739267584,PIC: Comparison of #Boston suspect Sunil Tripa...,Boston,fake,False,False,False,False,False,True,True,True,False,False,325145334739267584,135,0,1,0,1.0,7.0,0.0,0,1.0,1,17,18,30.333,0,0.479646,1235638869,True,True,False,True,True,False,325145334739267584,False,879,271,565,133,11972,11,7.913144,Oscar_Wang
2,325152091423248385,I'm not completely convinced that it's this Su...,Boston,fake,False,False,False,False,False,False,False,False,False,False,325152091423248385,88,0,0,0,0.0,2.0,0.0,0,1.0,1,3,13,46.605,0,1.126736,1222220242,True,True,False,True,True,True,325152091423248385,False,22,649,576,762,8313,28,4.983059,jamwil
3,324554646976868352,Brutal lo que se puede conseguir en colaboraci...,Boston,fake,False,False,False,False,False,False,False,False,True,False,324554646976868352,137,0,2,0,1.0,0.0,0.0,0,1.0,1,1,19,0.0,0,0.543956,1306061808,True,False,False,True,True,True,324554646976868352,False,69,297,546,545,2049,4,2.943054,rubenson80
4,324315545572896768,4chan and the bombing. just throwing it out th...,Boston,fake,False,False,False,False,False,False,True,True,False,False,324315545572896768,96,0,0,0,0.0,1.0,0.0,0,1.0,2,0,9,94.3,0,0.983607,1282240608,True,False,False,False,True,False,324315545572896768,False,1,60,61,66,816,0,0.840142,Slimlenny


In [14]:
features_raw.shape

(17857, 47)

In [15]:
# keep the target and binarise
targets = features_raw['label'].map({'fake': 1, 'real': 0})
# keep the event they belong to
events = features_raw['event']
# keep their text
texts = features_raw['text']
# keep the ids
ids = features_raw['id']
# drop columns
features_raw.drop(columns=['tweet_id', 'user_id', 'user_username', 'label', 'event', 'text', 'id'], inplace=True)

In [16]:
features_raw.head()

Unnamed: 0,tweet_containsExclamationMark,tweet_containsFirstOrderPron,tweet_containsHappyEmo,tweet_containsQuestionMark,tweet_containsSadEmo,tweet_containsSecondOrderPron,tweet_containsThirdOrderPron,tweet_hasColon,tweet_hasExternalLink,tweet_hasPlease,tweet_itemLength,tweet_numExclamationMark,tweet_numHashtags,tweet_numMentions,tweet_numNegSentiWords,tweet_numNouns,tweet_numPosSentiWords,tweet_numQuestionMark,tweet_numSlangs,tweet_numURLs,tweet_numUppercaseChars,tweet_numWords,tweet_readability,tweet_retweetCount,user_FolFrieRatio,user_accountAge,user_hasBio,user_hasExistingLocation,user_hasHeaderImg,user_hasLocation,user_hasProfileImg,user_hasURL,user_isVerified,user_numFavorites,user_numFollowers,user_numFriends,user_numMediaContent,user_numTweets,user_timesListed,user_tweetRatio
0,True,True,False,False,False,False,False,False,True,False,88,2,2,0,0.0,2.0,0.0,0,0.0,1,1,13,112.085,0,0.438754,1350475461,True,True,False,True,True,True,False,168,634,1445,0,903,1,4.953727
1,False,False,False,False,False,True,True,True,False,False,135,0,1,0,1.0,7.0,0.0,0,1.0,1,17,18,30.333,0,0.479646,1235638869,True,True,False,True,True,False,False,879,271,565,133,11972,11,7.913144
2,False,False,False,False,False,False,False,False,False,False,88,0,0,0,0.0,2.0,0.0,0,1.0,1,3,13,46.605,0,1.126736,1222220242,True,True,False,True,True,True,False,22,649,576,762,8313,28,4.983059
3,False,False,False,False,False,False,False,False,True,False,137,0,2,0,1.0,0.0,0.0,0,1.0,1,1,19,0.0,0,0.543956,1306061808,True,False,False,True,True,True,False,69,297,546,545,2049,4,2.943054
4,False,False,False,False,False,False,True,True,False,False,96,0,0,0,0.0,1.0,0.0,0,1.0,2,0,9,94.3,0,0.983607,1282240608,True,False,False,False,True,False,False,1,60,61,66,816,0,0.840142


#### One-hot encoding
Some of our features are categorical, thus we need to apply one-hot encoding to them

In [17]:
cat_cols = []
num_cols = []
for col in features_raw.columns:
    if features_raw[col].dtype==bool or features_raw[col].dtype==object:
        cat_cols.append(col)
    else:
        num_cols.append(col)

In [18]:
features_raw = pd.get_dummies(features_raw, columns=cat_cols)
features_raw.head()

Unnamed: 0,tweet_itemLength,tweet_numExclamationMark,tweet_numHashtags,tweet_numMentions,tweet_numNegSentiWords,tweet_numNouns,tweet_numPosSentiWords,tweet_numQuestionMark,tweet_numSlangs,tweet_numURLs,tweet_numUppercaseChars,tweet_numWords,tweet_readability,tweet_retweetCount,user_FolFrieRatio,user_accountAge,user_numFavorites,user_numFollowers,user_numFriends,user_numMediaContent,user_numTweets,user_timesListed,user_tweetRatio,tweet_containsExclamationMark_False,tweet_containsExclamationMark_True,tweet_containsFirstOrderPron_False,tweet_containsFirstOrderPron_True,tweet_containsHappyEmo_False,tweet_containsHappyEmo_True,tweet_containsQuestionMark_False,tweet_containsQuestionMark_True,tweet_containsSadEmo_False,tweet_containsSadEmo_True,tweet_containsSecondOrderPron_False,tweet_containsSecondOrderPron_True,tweet_containsThirdOrderPron_False,tweet_containsThirdOrderPron_True,tweet_hasColon_False,tweet_hasColon_True,tweet_hasExternalLink_False,tweet_hasExternalLink_True,tweet_hasPlease_False,tweet_hasPlease_True,user_hasBio_False,user_hasBio_True,user_hasExistingLocation_False,user_hasExistingLocation_True,user_hasHeaderImg_False,user_hasHeaderImg_True,user_hasLocation_False,user_hasLocation_True,user_hasProfileImg_False,user_hasProfileImg_True,user_hasURL_False,user_hasURL_True,user_isVerified_False,user_isVerified_True
0,88,2,2,0,0.0,2.0,0.0,0,0.0,1,1,13,112.085,0,0.438754,1350475461,168,634,1445,0,903,1,4.953727,0,1,0,1,1,0,1,0,1,0,1,0,1,0,1,0,0,1,1,0,0,1,0,1,1,0,0,1,0,1,0,1,1,0
1,135,0,1,0,1.0,7.0,0.0,0,1.0,1,17,18,30.333,0,0.479646,1235638869,879,271,565,133,11972,11,7.913144,1,0,1,0,1,0,1,0,1,0,0,1,0,1,0,1,1,0,1,0,0,1,0,1,1,0,0,1,0,1,1,0,1,0
2,88,0,0,0,0.0,2.0,0.0,0,1.0,1,3,13,46.605,0,1.126736,1222220242,22,649,576,762,8313,28,4.983059,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,0,1,0,1,1,0,0,1,0,1,0,1,1,0
3,137,0,2,0,1.0,0.0,0.0,0,1.0,1,1,19,0.0,0,0.543956,1306061808,69,297,546,545,2049,4,2.943054,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,0,1,1,0,0,1,1,0,1,0,0,1,0,1,0,1,1,0
4,96,0,0,0,0.0,1.0,0.0,0,1.0,2,0,9,94.3,0,0.983607,1282240608,1,60,61,66,816,0,0.840142,1,0,1,0,1,0,1,0,1,0,1,0,0,1,0,1,1,0,1,0,0,1,1,0,1,0,1,0,0,1,1,0,1,0


### Looking at outliers

Notes/Ideas
* predict num of nouns for missing data by looking into the ratio nouns per number of words on the rest of the tweets
* linear regression for the credibility metrics

## Classification

In [19]:
targets.value_counts()

1    10634
0     7223
Name: label, dtype: int64

In [20]:
# baseline
accuracy = sum(targets==1) / len(targets)
recall = 1
precision = sum(targets==1) / len(targets) # tp + fp

f_score = 2* (precision*recall) / (precision+recall)

print('Our baseline: Accuracy {:.4f}, F-score {:.4f}'.format(accuracy, f_score))

Our baseline: Accuracy 0.5955, F-score 0.7465


### Splitting into training and testing data

In [59]:
from sklearn.model_selection import train_test_split

Xtrain, Xtest, ytrain, ytest = train_test_split(features_raw, targets, test_size=0.2, random_state=40)
print('Train Shape:', Xtrain.shape)
print('Test Shape:', Xtest.shape)

Train Shape: (14285, 57)
Test Shape: (3572, 57)


### Scaling

In [60]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(Xtrain)

Xtrain = scaler.transform(Xtrain)
Xtest = scaler.transform(Xtest)

  return self.partial_fit(X, y)
  """
  


### Implementation: Cross validation 

In [61]:
from sklearn.model_selection import cross_val_score

def do_cv(predictor, X, y, cv):
    """
    Executes cross validation and display scores
    """
    print('\n### -- ### -- ' + str(type(predictor)).split('.')[-1][:-2] + ' -- ### -- ###')
    cv_score = cross_val_score(predictor, X, y, scoring='f1', cv=cv)
    print ('Mean F1 score after a 10-fold cross validation: ', cv_score.mean())
    print ('F1 score of each fold: ', cv_score)

In [62]:
# Helper function to help evaluating the model
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score, precision_score, recall_score

def display_scores(predictor, X, y):
    """
    Calculates metrics and display it
    """
    print('\n### -- ### -- ' + str(type(predictor)).split('.')[-1][:-2] + ' -- ### -- ###')
    # Getting the predicted values
    ypred = predictor.predict(X)
    ypred_score = predictor.predict_proba(X)
    
    # calculating metrics
    accuracy = accuracy_score(y, ypred)
    roc = roc_auc_score(y, pd.DataFrame(ypred_score)[1])
    confusion = confusion_matrix(y, ypred)
    fscore = f1_score(y, ypred)
    precision = precision_score(y, ypred)
    recall = recall_score(y, ypred)
    
    print('Confusion Matrix: ', confusion)
    print('Accuracy: ', accuracy)
    print('AUC: ', roc)
    print('recall:', recall)
    print('precision:', precision)
    print('F1-score:', fscore)

### Choosing a classifier

In [63]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
import warnings

warnings.filterwarnings("ignore", category=FutureWarning) 

# Initialize the models
clf_A = RandomForestClassifier(random_state=0)
clf_B = LogisticRegression(random_state=0, solver='lbfgs')
clf_C = GradientBoostingClassifier(random_state=0)
clf_D = AdaBoostClassifier(random_state=0)

for learner in [clf_A, clf_B, clf_C, clf_D]:
    
    learner.fit(Xtrain, ytrain)
    do_cv(learner, Xtrain, ytrain, 10)
    display_scores(learner, Xtrain, ytrain)
    display_scores(learner, Xtest, ytest)


### -- ### -- RandomForestClassifier -- ### -- ###
Mean F1 score after a 10-fold cross validation:  0.7665021848335719
F1 score of each fold:  [0.7696862  0.76522768 0.77711561 0.75866189 0.75029586 0.77159763
 0.76995305 0.75059952 0.76002394 0.79186047]

### -- ### -- RandomForestClassifier -- ### -- ###
Confusion Matrix:  [[5710   36]
 [  61 8478]]
Accuracy:  0.9932096604830242
AUC:  0.9995912572795641
recall: 0.9928563063590584
precision: 0.9957716701902748
F1-score: 0.9943118512871636

### -- ### -- RandomForestClassifier -- ### -- ###
Confusion Matrix:  [[ 987  490]
 [ 485 1610]]
Accuracy:  0.727043673012318
AUC:  0.7965838965974699
recall: 0.7684964200477327
precision: 0.7666666666666667
F1-score: 0.767580452920143

### -- ### -- LogisticRegression -- ### -- ###
Mean F1 score after a 10-fold cross validation:  0.740719517667918
F1 score of each fold:  [0.74581759 0.73990307 0.74853177 0.71780822 0.74030172 0.73101952
 0.75635593 0.75041322 0.72       0.75704413]

### -- ### -- 

### Evaluating only with Tweet features

In [46]:
from sklearn.model_selection import train_test_split

tweet_feats_cols = [col for col in features_raw.columns if col.startswith('tweet_')]
Xtrain_tweet, Xtest_tweet, ytrain_tweet, ytest_tweet = train_test_split(
    features_raw[tweet_feats_cols], 
    targets, 
    test_size=0.2, 
    random_state=40)
print('Train Shape:', Xtrain_tweet.shape)
print('Test Shape:', Xtest_tweet.shape)

scaler = StandardScaler()
scaler.fit(Xtrain_tweet)

Xtrain_tweet = scaler.transform(Xtrain_tweet)
Xtest_tweet = scaler.transform(Xtest_tweet)

Train Shape: (14285, 34)
Test Shape: (3572, 34)


  return self.partial_fit(X, y)
  from ipykernel import kernelapp as app
  app.launch_new_instance()


In [55]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier

# Initialize the models
clf_A = RandomForestClassifier(random_state=0)
clf_B = LogisticRegression(random_state=0, solver='lbfgs')
clf_C = GradientBoostingClassifier(random_state=0)
clf_D = AdaBoostClassifier(random_state=0)

for learner in [clf_A, clf_B, clf_C, clf_D]:
    
    learner.fit(Xtrain_tweet, ytrain)
    do_cv(learner, Xtrain_tweet, ytrain_tweet, 10)
    display_scores(learner, Xtrain_tweet, ytrain_tweet)
    display_scores(learner, Xtest_tweet, ytest_tweet)


### -- ### -- RandomForestClassifier -- ### -- ###
Mean F1 score after a 10-fold cross validation:  0.7933527638637266
F1 score of each fold:  [0.81562321 0.79768786 0.79812756 0.78752207 0.78777393 0.79460094
 0.79050926 0.77380952 0.77889745 0.80897583]

### -- ### -- RandomForestClassifier -- ### -- ###
Confusion Matrix:  [[5637  109]
 [ 101 8438]]
Accuracy:  0.9852992649632482
AUC:  0.9981166142268065
recall: 0.9881719170863099
precision: 0.9872469872469872
F1-score: 0.9877092356315113

### -- ### -- RandomForestClassifier -- ### -- ###
Confusion Matrix:  [[1022  455]
 [ 430 1665]]
Accuracy:  0.7522396416573348
AUC:  0.8311212659344638
recall: 0.7947494033412887
precision: 0.785377358490566
F1-score: 0.790035587188612

### -- ### -- LogisticRegression -- ### -- ###
Mean F1 score after a 10-fold cross validation:  0.7393053484391232
F1 score of each fold:  [0.73791622 0.74316354 0.7597471  0.72447859 0.73594001 0.73546985
 0.74538745 0.73730202 0.72522764 0.74842105]

### -- ### --

### Evaluating only with User features

In [56]:
from sklearn.model_selection import train_test_split

tweet_feats_cols = [col for col in features_raw.columns if col.startswith('user_')]
Xtrain_user, Xtest_user, ytrain_user, ytest_user = train_test_split(
    features_raw[tweet_feats_cols], 
    targets, 
    test_size=0.2, 
    random_state=40)
print('Train Shape:', Xtrain_user.shape)
print('Test Shape:', Xtest_user.shape)

scaler = StandardScaler()
scaler.fit(Xtrain_user)

Xtrain = scaler.transform(Xtrain_user)
Xtest = scaler.transform(Xtest_user)

Train Shape: (14285, 23)
Test Shape: (3572, 23)


  return self.partial_fit(X, y)
  from ipykernel import kernelapp as app
  app.launch_new_instance()


In [58]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier

# Initialize the models
clf_A = RandomForestClassifier(random_state=0)
clf_B = LogisticRegression(random_state=0, solver='lbfgs')
clf_C = GradientBoostingClassifier(random_state=0)
clf_D = AdaBoostClassifier(random_state=0)

for learner in [clf_A, clf_B, clf_C, clf_D]:
    
    learner.fit(Xtrain_user, ytrain_user)
    do_cv(learner, Xtrain_user, ytrain_user, 10)
    display_scores(learner, Xtrain_user, ytrain_user)
    display_scores(learner, Xtest_user, ytest_user)


### -- ### -- RandomForestClassifier -- ### -- ###
Mean F1 score after a 10-fold cross validation:  0.679111613735812
F1 score of each fold:  [0.68195534 0.67713787 0.65862485 0.65602837 0.69454123 0.6829559
 0.68449198 0.67820896 0.67564403 0.70152761]

### -- ### -- RandomForestClassifier -- ### -- ###
Confusion Matrix:  [[5663   83]
 [ 133 8406]]
Accuracy:  0.9848792439621981
AUC:  0.9987552963823936
recall: 0.9844244056681111
precision: 0.9902226410649075
F1-score: 0.9873150105708245

### -- ### -- RandomForestClassifier -- ### -- ###
Confusion Matrix:  [[ 811  666]
 [ 687 1408]]
Accuracy:  0.6212206047032475
AUC:  0.6469619285690047
recall: 0.6720763723150358
precision: 0.6788813886210222
F1-score: 0.6754617414248022

### -- ### -- LogisticRegression -- ### -- ###
Mean F1 score after a 10-fold cross validation:  0.7482474873243588
F1 score of each fold:  [0.74813841 0.74813841 0.74813841 0.74813841 0.74813841 0.74813841
 0.74846626 0.74846626 0.74846626 0.74824561]

### -- ### --

### Agreement-based retraining technique

In [91]:
import numpy as np
rf_tweet = RandomForestClassifier(random_state=0)
rf_user = RandomForestClassifier(random_state=0)

rf_tweet.fit(Xtrain_tweet, ytrain_tweet)
preds_tweet = rf_tweet.predict(Xtest_tweet)

rf_user.fit(Xtrain_user, ytrain_user)
preds_user = rf_user.predict(Xtest_user)
np.where(preds_tweet==preds_user)
# get data where predictions agree
Xagreed = Xtest_tweet[np.where(preds_tweet==preds_user)]
yagreed = ytest_tweet[np.where(preds_tweet==preds_user)[0]]
yagreed
# rf_new = RandomForestClassifier(random_state=0)
# rf_new.fit(Xagreed, yagreed)

# # # get data where predictions don't agree
# Xdisagreed = Xtest_tweet[np.where(preds_tweet!=preds_user)]
# rf_new.predict(Xdisagreed)

0       NaN
1       NaN
2       NaN
3       NaN
4       NaN
5       NaN
6       NaN
7       NaN
9       NaN
12      NaN
16      NaN
17      NaN
18      NaN
19      NaN
21      NaN
24      NaN
25      1.0
27      NaN
28      NaN
29      1.0
30      NaN
32      1.0
35      NaN
36      NaN
39      NaN
42      NaN
46      NaN
49      NaN
50      NaN
51      NaN
       ... 
3513    NaN
3515    1.0
3516    NaN
3518    1.0
3519    NaN
3521    NaN
3522    NaN
3525    NaN
3526    NaN
3527    1.0
3535    NaN
3537    NaN
3538    1.0
3539    NaN
3540    NaN
3542    NaN
3543    NaN
3544    NaN
3545    NaN
3546    NaN
3550    NaN
3551    1.0
3552    NaN
3554    NaN
3557    NaN
3562    NaN
3564    NaN
3568    NaN
3570    NaN
3571    NaN
Name: label, Length: 2128, dtype: float64