In [1]:
import pandas as pd
import pandas as pd
import numpy as np
import scipy as sp

from pprint import pprint
from time import time
from textblob import TextBlob, Word

import logging
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')

from sklearn.model_selection import train_test_split as tts
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB         # Naive Bayes
from sklearn import metrics

from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\599701\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
tweet_data = pd.read_csv("./datasets/combined_data.csv", sep=";", 
                         dtype={'tweet_id':str, 'author_id':str, 'publish_date':str, 
                                'content':str, 'link_url':str, 'account_category':str, 
                                'author':str, 'account_type':str})

In [3]:
tweet_data = pd.get_dummies(tweet_data, columns=['account_category'], drop_first=False)

In [4]:
df_Troll = tweet_data[tweet_data.account_category_Troll == 1]
df_Pol = tweet_data[tweet_data.account_category_Politician == 1]
df_News = tweet_data[tweet_data.account_category_US_News == 1]

In [5]:
print(df_Troll.shape)
print(df_Pol.shape)
print(df_News.shape)

(41069, 12)
(41066, 12)
(41069, 12)


In [6]:
# Some different testing sets
df_Trolls_News = pd.concat([df_Troll, df_News])
df_Trolls_Pol = pd.concat([df_Troll, df_Pol])

We'll start by comparing trolls and news outlets

In [45]:
X = df_Trolls_News['content']
y = df_Trolls_News['account_category_Troll']
X_train, X_test, y_train, y_test = tts(X, y, random_state=42)

Define functions to stem or lemmatze text in a tweet. Also create the list of stop words to use with them.

In [8]:
# Define a function that accepts text and returns a list of stems.
stemmer = SnowballStemmer('english')
def split_into_stems(text):
    text = str(text).lower()
    words = TextBlob(text).words
    return [stemmer.stem(word) for word in words]

In [9]:
stemmed_stops = [stemmer.stem(Word(x)) for x in stopwords.words('english')]

In [10]:
# Define a function that accepts text and returns a list of lemmas.
def split_into_lemmas(text):
    text = str(text).lower()
    words = TextBlob(text).words
    return [word.lemmatize() for word in words]

In [11]:
lemmed_stops = [Word(x).lemmatize() for x in stopwords.words('english')]

Now I need to define the pipline and gridsearch parameters.

In [12]:
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('clf', MultinomialNB())])

In [19]:
parameters = {
    'vect__analyzer': (split_into_lemmas, split_into_stems),
    'vect__max_df': (0.75, 1.0),
    'vect__min_df': (1, 2, 3),
    #'vect__max_features': (None, 5000, 10000, 50000),
    'vect__ngram_range': ((1, 1), (1, 2))  # unigrams or bigrams
}

In [20]:
grid_search = GridSearchCV(estimator=pipeline,
                           param_grid=parameters,
                           cv=3,
                           verbose=10)

grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 24 candidates, totalling 72 fits
[CV] vect__analyzer=<function split_into_lemmas at 0x000002077A15F7B8>, vect__max_df=0.75, vect__min_df=1, vect__ngram_range=(1, 1) 
[CV]  vect__analyzer=<function split_into_lemmas at 0x000002077A15F7B8>, vect__max_df=0.75, vect__min_df=1, vect__ngram_range=(1, 1), score=0.8319941563184806, total=  36.4s
[CV] vect__analyzer=<function split_into_lemmas at 0x000002077A15F7B8>, vect__max_df=0.75, vect__min_df=1, vect__ngram_range=(1, 1) 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  1.0min remaining:    0.0s


[CV]  vect__analyzer=<function split_into_lemmas at 0x000002077A15F7B8>, vect__max_df=0.75, vect__min_df=1, vect__ngram_range=(1, 1), score=0.8322294730690561, total=  37.9s
[CV] vect__analyzer=<function split_into_lemmas at 0x000002077A15F7B8>, vect__max_df=0.75, vect__min_df=1, vect__ngram_range=(1, 1) 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  2.1min remaining:    0.0s


[CV]  vect__analyzer=<function split_into_lemmas at 0x000002077A15F7B8>, vect__max_df=0.75, vect__min_df=1, vect__ngram_range=(1, 1), score=0.8259472095061848, total=  37.1s
[CV] vect__analyzer=<function split_into_lemmas at 0x000002077A15F7B8>, vect__max_df=0.75, vect__min_df=1, vect__ngram_range=(1, 2) 


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  3.1min remaining:    0.0s


[CV]  vect__analyzer=<function split_into_lemmas at 0x000002077A15F7B8>, vect__max_df=0.75, vect__min_df=1, vect__ngram_range=(1, 2), score=0.8319941563184806, total=  37.2s
[CV] vect__analyzer=<function split_into_lemmas at 0x000002077A15F7B8>, vect__max_df=0.75, vect__min_df=1, vect__ngram_range=(1, 2) 


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:  4.1min remaining:    0.0s


[CV]  vect__analyzer=<function split_into_lemmas at 0x000002077A15F7B8>, vect__max_df=0.75, vect__min_df=1, vect__ngram_range=(1, 2), score=0.8322294730690561, total=  37.6s
[CV] vect__analyzer=<function split_into_lemmas at 0x000002077A15F7B8>, vect__max_df=0.75, vect__min_df=1, vect__ngram_range=(1, 2) 


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  5.2min remaining:    0.0s


[CV]  vect__analyzer=<function split_into_lemmas at 0x000002077A15F7B8>, vect__max_df=0.75, vect__min_df=1, vect__ngram_range=(1, 2), score=0.8259472095061848, total=  58.1s
[CV] vect__analyzer=<function split_into_lemmas at 0x000002077A15F7B8>, vect__max_df=0.75, vect__min_df=2, vect__ngram_range=(1, 1) 


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:  6.6min remaining:    0.0s


[CV]  vect__analyzer=<function split_into_lemmas at 0x000002077A15F7B8>, vect__max_df=0.75, vect__min_df=2, vect__ngram_range=(1, 1), score=0.8287314341368396, total=  42.6s
[CV] vect__analyzer=<function split_into_lemmas at 0x000002077A15F7B8>, vect__max_df=0.75, vect__min_df=2, vect__ngram_range=(1, 1) 


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:  7.8min remaining:    0.0s


[CV]  vect__analyzer=<function split_into_lemmas at 0x000002077A15F7B8>, vect__max_df=0.75, vect__min_df=2, vect__ngram_range=(1, 1), score=0.827943897925392, total=  39.9s
[CV] vect__analyzer=<function split_into_lemmas at 0x000002077A15F7B8>, vect__max_df=0.75, vect__min_df=2, vect__ngram_range=(1, 1) 


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:  8.9min remaining:    0.0s


[CV]  vect__analyzer=<function split_into_lemmas at 0x000002077A15F7B8>, vect__max_df=0.75, vect__min_df=2, vect__ngram_range=(1, 1), score=0.821759033797604, total=  44.0s
[CV] vect__analyzer=<function split_into_lemmas at 0x000002077A15F7B8>, vect__max_df=0.75, vect__min_df=2, vect__ngram_range=(1, 2) 


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed: 10.1min remaining:    0.0s


[CV]  vect__analyzer=<function split_into_lemmas at 0x000002077A15F7B8>, vect__max_df=0.75, vect__min_df=2, vect__ngram_range=(1, 2), score=0.8287314341368396, total=  46.3s
[CV] vect__analyzer=<function split_into_lemmas at 0x000002077A15F7B8>, vect__max_df=0.75, vect__min_df=2, vect__ngram_range=(1, 2) 
[CV]  vect__analyzer=<function split_into_lemmas at 0x000002077A15F7B8>, vect__max_df=0.75, vect__min_df=2, vect__ngram_range=(1, 2), score=0.827943897925392, total=  44.3s
[CV] vect__analyzer=<function split_into_lemmas at 0x000002077A15F7B8>, vect__max_df=0.75, vect__min_df=2, vect__ngram_range=(1, 2) 
[CV]  vect__analyzer=<function split_into_lemmas at 0x000002077A15F7B8>, vect__max_df=0.75, vect__min_df=2, vect__ngram_range=(1, 2), score=0.821759033797604, total=  36.6s
[CV] vect__analyzer=<function split_into_lemmas at 0x000002077A15F7B8>, vect__max_df=0.75, vect__min_df=3, vect__ngram_range=(1, 1) 
[CV]  vect__analyzer=<function split_into_lemmas at 0x000002077A15F7B8>, vect__ma

[CV]  vect__analyzer=<function split_into_stems at 0x000002076DC30AE8>, vect__max_df=0.75, vect__min_df=1, vect__ngram_range=(1, 1), score=0.8378378378378378, total=  54.4s
[CV] vect__analyzer=<function split_into_stems at 0x000002076DC30AE8>, vect__max_df=0.75, vect__min_df=1, vect__ngram_range=(1, 1) 
[CV]  vect__analyzer=<function split_into_stems at 0x000002076DC30AE8>, vect__max_df=0.75, vect__min_df=1, vect__ngram_range=(1, 1), score=0.8385117366319276, total=  56.1s
[CV] vect__analyzer=<function split_into_stems at 0x000002076DC30AE8>, vect__max_df=0.75, vect__min_df=1, vect__ngram_range=(1, 1) 
[CV]  vect__analyzer=<function split_into_stems at 0x000002076DC30AE8>, vect__max_df=0.75, vect__min_df=1, vect__ngram_range=(1, 1), score=0.8311580792831401, total=  47.7s
[CV] vect__analyzer=<function split_into_stems at 0x000002076DC30AE8>, vect__max_df=0.75, vect__min_df=1, vect__ngram_range=(1, 2) 
[CV]  vect__analyzer=<function split_into_stems at 0x000002076DC30AE8>, vect__max_df=

[CV]  vect__analyzer=<function split_into_stems at 0x000002076DC30AE8>, vect__max_df=1.0, vect__min_df=2, vect__ngram_range=(1, 2), score=0.8354516678841003, total=  56.9s
[CV] vect__analyzer=<function split_into_stems at 0x000002076DC30AE8>, vect__max_df=1.0, vect__min_df=2, vect__ngram_range=(1, 2) 
[CV]  vect__analyzer=<function split_into_stems at 0x000002076DC30AE8>, vect__max_df=1.0, vect__min_df=2, vect__ngram_range=(1, 2), score=0.8342261614882633, total=  50.1s
[CV] vect__analyzer=<function split_into_stems at 0x000002076DC30AE8>, vect__max_df=1.0, vect__min_df=2, vect__ngram_range=(1, 2) 
[CV]  vect__analyzer=<function split_into_stems at 0x000002076DC30AE8>, vect__max_df=1.0, vect__min_df=2, vect__ngram_range=(1, 2), score=0.8288204928411416, total=  43.1s
[CV] vect__analyzer=<function split_into_stems at 0x000002076DC30AE8>, vect__max_df=1.0, vect__min_df=3, vect__ngram_range=(1, 1) 
[CV]  vect__analyzer=<function split_into_stems at 0x000002076DC30AE8>, vect__max_df=1.0, v

[Parallel(n_jobs=1)]: Done  72 out of  72 | elapsed: 89.9min finished


GridSearchCV(cv=3, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'vect__analyzer': (<function split_into_lemmas at 0x000002077A15F7B8>, <function split_into_stems at 0x000002076DC30AE8>), 'vect__max_df': (0.75, 1.0), 'vect__min_df': (1, 2, 3), 'vect__ngram_range': ((1, 1), (1, 2))},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=10)

Print out the results of my gridsearch

In [22]:
print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))
print(metrics.classification_report(y_test, grid_search.predict(X_test)))

Best score: 0.836
Best parameters set:
	vect__analyzer: <function split_into_stems at 0x000002076DC30AE8>
	vect__max_df: 1.0
	vect__min_df: 1
	vect__ngram_range: (1, 1)
             precision    recall  f1-score   support

          0       0.81      0.90      0.85     10258
          1       0.89      0.78      0.83     10277

avg / total       0.85      0.84      0.84     20535



Add in the stop words and see that our results aren't much different

In [46]:
vect = CountVectorizer(analyzer=split_into_stems, max_df=1.0, min_df=1, stop_words=stemmed_stops, ngram_range=(1,1))
X_train_dtm = vect.fit_transform(X_train)
X_test_dtm = vect.transform(X_test)
nb = MultinomialNB()
nb.fit(X_train_dtm, y_train)
y_pred_class = nb.predict(X_test_dtm)
print(metrics.classification_report(y_test, y_pred_class))

             precision    recall  f1-score   support

          0       0.81      0.90      0.85     10258
          1       0.89      0.78      0.83     10277

avg / total       0.85      0.84      0.84     20535



Now I want some functions to further explore my resulting model. First I define a function that lets me grab tweets and print them. Then I'm going to look at what tweets look like that were falsely marked as trolls or not

In [31]:
def grab_tweet(tweet):
    print("Author:", tweet['author'])
    print("Probability troll:", tweet['proba_troll'])
    print("Tweet text:", tweet['content'])
    print()

In [32]:
df_dtm = vect.transform(df_Trolls_News['content'])

In [33]:
proba = nb.predict_proba(df_dtm)
df_Trolls_News['proba_troll'] = proba[:,1]

In [34]:
print("Most Troll Like")
df_Trolls_News.loc[df_Trolls_News.account_category_Troll==0].sort_values(by='proba_troll', ascending=False).head().apply(grab_tweet, axis=1)
print("Least Troll Like")
df_Trolls_News.loc[df_Trolls_News.account_category_Troll==0].sort_values(by='proba_troll', ascending=True).head().apply(grab_tweet, axis=1)

Most Troll Like
Author: nytimes
Probability troll: 0.9999999745969038
Tweet text: RT @jennydeluxe: "Being black in the age of wokeness" is one of my fave episodes to date. Listen &amp; LMK what you think &gt;&gt;&gt;&gt; https://t.co/r4T…

Author: USATODAY
Probability troll: 0.9999999433271982
Tweet text: "All I really want to do is tell you that I'm feeling great. I'm glad I spent that evening in the hospital, and it did me a lot of good." -Stan Lee https://t.co/JZg09bqS1g

Author: FoxNews
Probability troll: 0.9999997658795213
Tweet text: .@POTUS on Democrats: "I don't think they want to solve the DACA problem. I think they wanna talk about it. I think they wanna obstruct." https://t.co/zBPxHDzk6E

Author: FoxNews
Probability troll: 0.9999995789219475
Tweet text: Huckabee: "The greatest single characteristic of people on the far left is they have zero sense of humor. I mean, these are the most bitter, angry, really disappointing and disgusting people because they're so sad with life. 

147799    None
135104    None
136099    None
147265    None
132540    None
dtype: object

In [47]:
print("Most News Like")
df_Trolls_News.loc[df_Trolls_News.account_category_Troll==1].sort_values(by='proba_troll', ascending=True).head().apply(grab_tweet, axis=1)
print("Least News Like")
df_Trolls_News.loc[df_Trolls_News.account_category_Troll==1].sort_values(by='proba_troll', ascending=False).head().apply(grab_tweet, axis=1)

Most News Like
Author: KANIJJACKSON
Probability troll: 3.7776428745550067e-08
Tweet text: Confirmed: Michael Cohen received hundreds of thousands of dollars  from a Russian oligarch Viktor Vekselberg.  The money was paid to a First Republic Bank account Cohen created  for Essential Consultants. This is the same bank  account Cohen used to pay Stormy Daniels $130,000

Author: IMAPHARRELFAKE
Probability troll: 2.4569481184123893e-07
Tweet text: Former Cuban President Fidel Castro dies at age 90, his brother, President Raul Castro announces. https://t.co/gHGSyRFlBi

Author: COVFEFENATIONUS
Probability troll: 7.531496579616669e-07
Tweet text: Randall Saito was arrested in Stockton, CA this morning, per  San Joaquin Co Sheriff’s FB page  SJCO credits “a tip from an alert taxi cab driver”  Saito escaped from Hawaii State Hospital on Oahu,caught a taxi to the Honolulu airport, chartered a flight to Maui, then flew to CA

Author: FIGHTTORESIST
Probability troll: 2.249889063318448e-06
Tweet tex

38591    None
14485    None
19947    None
27469    None
35863    None
dtype: object

In [36]:
df_Trolls_News.loc[df_Trolls_News.account_category_Troll==0].groupby(by='author').mean().proba_troll.sort_values()

author
Reuters           0.017009
AP                0.018675
chicagotribune    0.042039
ABC               0.046095
WSJ               0.054319
politico          0.060109
NPR               0.076523
CNN               0.097069
USATODAY          0.118057
nytimes           0.128135
washingtonpost    0.130681
Forbes            0.138451
FoxNews           0.171251
nypost            0.185465
HuffPost          0.222791
Name: proba_troll, dtype: float64