In [1]:
import pandas as pd
import pandas as pd
import numpy as np
import scipy as sp

from pprint import pprint
from time import time
from textblob import TextBlob, Word

import logging
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')

from sklearn.model_selection import train_test_split as tts
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB         # Naive Bayes
from sklearn import metrics

from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\599701\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Load the semicolon-separated dataset, reading each listed column as a string.
string_columns = ['tweet_id', 'author_id', 'publish_date', 'content',
                  'link_url', 'account_category', 'author', 'account_type']
tweet_data = pd.read_csv(
    "./datasets/combined_data.csv",
    sep=";",
    dtype=dict.fromkeys(string_columns, str),
)

In [3]:
# One-hot encode the account category; every level gets its own indicator
# column (drop_first=False) and the source column is replaced by them.
# NOTE: this overwrites tweet_data, so re-running the cell without reloading
# the CSV fails because 'account_category' no longer exists.
tweet_data = pd.get_dummies(
    tweet_data,
    columns=['account_category'],
    drop_first=False,
)

In [4]:
# One frame per account category, selected through its indicator column.
df_Troll = tweet_data.loc[tweet_data['account_category_Troll'] == 1]
df_Pol = tweet_data.loc[tweet_data['account_category_Politician'] == 1]
df_News = tweet_data.loc[tweet_data['account_category_US_News'] == 1]

In [5]:
# (rows, columns) of each per-category frame, in Troll / Politician / News order.
for category_frame in (df_Troll, df_Pol, df_News):
    print(category_frame.shape)

(41069, 12)
(41066, 12)
(41069, 12)


In [6]:
# Two binary datasets: trolls stacked with news outlets, and trolls stacked
# with politicians.
df_Trolls_News = pd.concat(objs=[df_Troll, df_News])
df_Trolls_Pol = pd.concat(objs=[df_Troll, df_Pol])

We'll start by comparing trolls and news outlets.

In [7]:
# Features are the 'content' column; the label is the Troll indicator column.
X, y = df_Trolls_News['content'], df_Trolls_News['account_category_Troll']
# No test_size is given, so train_test_split's default proportions apply;
# the seed is fixed for a repeatable split.
X_train, X_test, y_train, y_test = tts(X, y, random_state=42)

In [8]:
# Module-level English Snowball stemmer used by split_into_stems.
stemmer = SnowballStemmer('english')


def split_into_stems(text):
    """Return the Snowball stem of every word TextBlob finds in ``text``.

    ``text`` is coerced with ``str()`` and lower-cased before it is tokenised,
    so non-string inputs are accepted.
    """
    lowered = str(text).lower()
    stems = []
    for token in TextBlob(lowered).words:
        stems.append(stemmer.stem(token))
    return stems

In [9]:
# Stem every NLTK English stop word (wrapped in a TextBlob Word first).
stemmed_stops = list(map(stemmer.stem, map(Word, stopwords.words('english'))))

In [10]:
# Analyzer helper: text in, list of lemmas out.
def split_into_lemmas(text):
    """Return the lemma of every word TextBlob finds in ``text``.

    ``text`` is coerced with ``str()`` and lower-cased before it is tokenised,
    so non-string inputs are accepted.
    """
    lowered = str(text).lower()
    lemmas = []
    for token in TextBlob(lowered).words:
        lemmas.append(token.lemmatize())
    return lemmas

In [11]:
# Lemmatise every NLTK English stop word via TextBlob's Word.lemmatize().
lemmed_stops = [stop_word.lemmatize() for stop_word in map(Word, stopwords.words('english'))]

In [12]:
# Two-step pipeline: token counts ('vect') feeding a multinomial Naive Bayes
# classifier ('clf'). The step names are what the grid keys refer to.
vectorizer_step = ('vect', CountVectorizer())
classifier_step = ('clf', MultinomialNB())
pipeline = Pipeline([vectorizer_step, classifier_step])

In [14]:
# Hyper-parameter grid for the pipeline's 'vect' (CountVectorizer) step.
#
# CountVectorizer accepts only 'word', 'char', 'char_wb' or a callable as
# `analyzer`. The former `None` entry is rejected when those candidates are
# fitted, so the built-in word analyzer is now requested explicitly.
#
# NOTE: `ngram_range` is ignored whenever `analyzer` is a callable, so for the
# two custom analyzers the (1, 1) and (1, 2) candidates produce identical
# scores; only the 'word' analyzer actually varies with it.
parameters = {
    'vect__analyzer': (split_into_lemmas, split_into_stems, 'word'),
    'vect__max_df': (0.5, 0.75, 1.0),
    'vect__min_df': (1, 2, 3),
    #'vect__max_features': (None, 5000, 10000, 50000),
    'vect__ngram_range': ((1, 1), (1, 2))  # unigrams or bigrams
}

In [None]:
# Exhaustive 3-fold cross-validated search over `parameters`.
# verbose=10 makes scikit-learn log every individual fit.
grid_search = GridSearchCV(pipeline, parameters, cv=3, verbose=10)

grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 54 candidates, totalling 162 fits
[CV] vect__analyzer=<function split_into_lemmas at 0x000002077A15F7B8>, vect__max_df=0.5, vect__min_df=1, vect__ngram_range=(1, 1) 
[CV]  vect__analyzer=<function split_into_lemmas at 0x000002077A15F7B8>, vect__max_df=0.5, vect__min_df=1, vect__ngram_range=(1, 1), score=0.8319941563184806, total=  38.2s
[CV] vect__analyzer=<function split_into_lemmas at 0x000002077A15F7B8>, vect__max_df=0.5, vect__min_df=1, vect__ngram_range=(1, 1) 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  1.1min remaining:    0.0s


[CV]  vect__analyzer=<function split_into_lemmas at 0x000002077A15F7B8>, vect__max_df=0.5, vect__min_df=1, vect__ngram_range=(1, 1), score=0.8322294730690561, total=  54.6s
[CV] vect__analyzer=<function split_into_lemmas at 0x000002077A15F7B8>, vect__max_df=0.5, vect__min_df=1, vect__ngram_range=(1, 1) 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  2.5min remaining:    0.0s


[CV]  vect__analyzer=<function split_into_lemmas at 0x000002077A15F7B8>, vect__max_df=0.5, vect__min_df=1, vect__ngram_range=(1, 1), score=0.8259472095061848, total=  49.3s
[CV] vect__analyzer=<function split_into_lemmas at 0x000002077A15F7B8>, vect__max_df=0.5, vect__min_df=1, vect__ngram_range=(1, 2) 


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  3.8min remaining:    0.0s


[CV]  vect__analyzer=<function split_into_lemmas at 0x000002077A15F7B8>, vect__max_df=0.5, vect__min_df=1, vect__ngram_range=(1, 2), score=0.8319941563184806, total=  45.9s
[CV] vect__analyzer=<function split_into_lemmas at 0x000002077A15F7B8>, vect__max_df=0.5, vect__min_df=1, vect__ngram_range=(1, 2) 


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:  5.0min remaining:    0.0s


[CV]  vect__analyzer=<function split_into_lemmas at 0x000002077A15F7B8>, vect__max_df=0.5, vect__min_df=1, vect__ngram_range=(1, 2), score=0.8322294730690561, total=  39.6s
[CV] vect__analyzer=<function split_into_lemmas at 0x000002077A15F7B8>, vect__max_df=0.5, vect__min_df=1, vect__ngram_range=(1, 2) 


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  6.1min remaining:    0.0s


[CV]  vect__analyzer=<function split_into_lemmas at 0x000002077A15F7B8>, vect__max_df=0.5, vect__min_df=1, vect__ngram_range=(1, 2), score=0.8259472095061848, total=  42.5s
[CV] vect__analyzer=<function split_into_lemmas at 0x000002077A15F7B8>, vect__max_df=0.5, vect__min_df=2, vect__ngram_range=(1, 1) 


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:  7.3min remaining:    0.0s


[CV]  vect__analyzer=<function split_into_lemmas at 0x000002077A15F7B8>, vect__max_df=0.5, vect__min_df=2, vect__ngram_range=(1, 1), score=0.8287314341368396, total=  39.6s
[CV] vect__analyzer=<function split_into_lemmas at 0x000002077A15F7B8>, vect__max_df=0.5, vect__min_df=2, vect__ngram_range=(1, 1) 


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:  8.4min remaining:    0.0s


[CV]  vect__analyzer=<function split_into_lemmas at 0x000002077A15F7B8>, vect__max_df=0.5, vect__min_df=2, vect__ngram_range=(1, 1), score=0.827943897925392, total=  41.7s
[CV] vect__analyzer=<function split_into_lemmas at 0x000002077A15F7B8>, vect__max_df=0.5, vect__min_df=2, vect__ngram_range=(1, 1) 


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:  9.5min remaining:    0.0s


[CV]  vect__analyzer=<function split_into_lemmas at 0x000002077A15F7B8>, vect__max_df=0.5, vect__min_df=2, vect__ngram_range=(1, 1), score=0.821759033797604, total=  36.4s
[CV] vect__analyzer=<function split_into_lemmas at 0x000002077A15F7B8>, vect__max_df=0.5, vect__min_df=2, vect__ngram_range=(1, 2) 


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed: 10.5min remaining:    0.0s


[CV]  vect__analyzer=<function split_into_lemmas at 0x000002077A15F7B8>, vect__max_df=0.5, vect__min_df=2, vect__ngram_range=(1, 2), score=0.8287314341368396, total=  35.6s
[CV] vect__analyzer=<function split_into_lemmas at 0x000002077A15F7B8>, vect__max_df=0.5, vect__min_df=2, vect__ngram_range=(1, 2) 
[CV]  vect__analyzer=<function split_into_lemmas at 0x000002077A15F7B8>, vect__max_df=0.5, vect__min_df=2, vect__ngram_range=(1, 2), score=0.827943897925392, total=  42.2s
[CV] vect__analyzer=<function split_into_lemmas at 0x000002077A15F7B8>, vect__max_df=0.5, vect__min_df=2, vect__ngram_range=(1, 2) 
[CV]  vect__analyzer=<function split_into_lemmas at 0x000002077A15F7B8>, vect__max_df=0.5, vect__min_df=2, vect__ngram_range=(1, 2), score=0.821759033797604, total=  37.6s
[CV] vect__analyzer=<function split_into_lemmas at 0x000002077A15F7B8>, vect__max_df=0.5, vect__min_df=3, vect__ngram_range=(1, 1) 
[CV]  vect__analyzer=<function split_into_lemmas at 0x000002077A15F7B8>, vect__max_df=0

[CV]  vect__analyzer=<function split_into_lemmas at 0x000002077A15F7B8>, vect__max_df=1.0, vect__min_df=1, vect__ngram_range=(1, 1), score=0.8309715120525931, total=  40.5s
[CV] vect__analyzer=<function split_into_lemmas at 0x000002077A15F7B8>, vect__max_df=1.0, vect__min_df=1, vect__ngram_range=(1, 1) 
[CV]  vect__analyzer=<function split_into_lemmas at 0x000002077A15F7B8>, vect__max_df=1.0, vect__min_df=1, vect__ngram_range=(1, 1), score=0.8334956657251388, total=  45.0s
[CV] vect__analyzer=<function split_into_lemmas at 0x000002077A15F7B8>, vect__max_df=1.0, vect__min_df=1, vect__ngram_range=(1, 1) 
[CV]  vect__analyzer=<function split_into_lemmas at 0x000002077A15F7B8>, vect__max_df=1.0, vect__min_df=1, vect__ngram_range=(1, 1), score=0.8255576117658517, total=  40.3s
[CV] vect__analyzer=<function split_into_lemmas at 0x000002077A15F7B8>, vect__max_df=1.0, vect__min_df=1, vect__ngram_range=(1, 2) 


In [None]:
# Report the best cross-validated score and, for each searched parameter,
# the value held by the refitted best estimator.
print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters):
    param_value = best_parameters[param_name]
    print("\t%s: %r" % (param_name, param_value))

In [None]:
def print_top10(vectorizer, clf, class_labels):
    """Print the ten highest-scoring features for each class.

    Parameters
    ----------
    vectorizer : fitted vectorizer exposing ``get_feature_names_out()`` (or the
        older ``get_feature_names()``).
    clf : fitted classifier. ``feature_log_prob_`` is used when present (naive
        Bayes models); otherwise ``coef_`` is used.
    class_labels : sequence of labels; the i-th label is printed with the i-th
        row of the score matrix, so it must follow the classifier's class order.

    Each line lists the features in ascending score order, i.e. the strongest
    feature is printed last.
    """
    # get_feature_names() was replaced by get_feature_names_out() in newer
    # scikit-learn releases; support both.
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()

    # For a two-class naive Bayes model `coef_` held a single row, so indexing it
    # with the second class label raised IndexError. `feature_log_prob_` always
    # has one row per class.
    scores = getattr(clf, "feature_log_prob_", None)
    if scores is None:
        scores = clf.coef_
    scores = np.atleast_2d(scores)

    for i, class_label in enumerate(class_labels):
        top10 = np.argsort(scores[i])[-10:]
        print("%s: %s" % (class_label,
              " ".join(feature_names[j] for j in top10)))

In [None]:
def grab_tweet(tweet):
    """Print the author, predicted troll probability and text of one tweet.

    ``tweet`` is any mapping-like row (e.g. a DataFrame row) with the keys
    'author', 'proba_troll' and 'content'. A blank line is printed afterwards
    to separate consecutive tweets. Returns None.
    """
    labelled_fields = (
        ("Author:", 'author'),
        ("Probability troll:", 'proba_troll'),
        ("Tweet text:", 'content'),
    )
    for label, key in labelled_fields:
        print(label, tweet[key])
    print()

In [None]:
# `vect` and `nb` are not otherwise defined for the troll-vs-news model at this
# point in the notebook, so on a fresh kernel this step failed with NameError.
# Take both fitted steps from the best pipeline found by the grid search
# (GridSearchCV refits it on the full training split by default).
best_pipeline = grid_search.best_estimator_
vect = best_pipeline.named_steps['vect']
nb = best_pipeline.named_steps['clf']

# Vectorise every troll/news tweet (train and test rows alike).
df_dtm = vect.transform(df_Trolls_News['content'])

# Column 1 of predict_proba is the probability of the second entry of
# nb.classes_, i.e. account_category_Troll == 1.
proba = nb.predict_proba(df_dtm)
df_Trolls_News['proba_troll'] = proba[:,1]

In [None]:
# Non-troll rows of the troll/news frame, i.e. the news-outlet tweets.
news_tweets = df_Trolls_News.loc[df_Trolls_News.account_category_Troll==0]

print("Most Troll Like")
news_tweets.sort_values(by='proba_troll', ascending=False).head().apply(grab_tweet, axis=1)
print("Least Troll Like")
news_tweets.sort_values(by='proba_troll', ascending=True).head().apply(grab_tweet, axis=1)

In [None]:
# grab_tweet is already defined in an earlier cell; the byte-identical
# redefinition that used to live here was removed so the helper has a single
# definition.
# Show the five news-outlet tweets with the lowest predicted troll probability.
df_Trolls_News.loc[df_Trolls_News.account_category_Troll==0].sort_values(by='proba_troll', ascending=True).head().apply(grab_tweet, axis=1)

In [None]:
# Mean predicted troll probability per news author, lowest first.
# The column is selected before aggregating: averaging the whole frame would
# also try to average the string columns, which raises TypeError on
# pandas >= 2.0 and is wasted work on older versions.
df_Trolls_News.loc[df_Trolls_News.account_category_Troll==0].groupby(by='author')['proba_troll'].mean().sort_values()

In [None]:
# Same setup as before, now for trolls vs. politicians: 'content' is the
# feature and the Troll indicator column is the label.
X, y = df_Trolls_Pol['content'], df_Trolls_Pol['account_category_Troll']
X_train, X_test, y_train, y_test = tts(X, y, random_state=42)

In [None]:
# CountVectorizer ignores `stop_words` whenever `analyzer` is a callable, so
# passing lemmed_stops to the constructor had no effect. Drop the lemmatised
# stop words inside the analyzer instead.
lemmed_stop_set = set(lemmed_stops)


def split_into_lemmas_without_stops(text):
    """Lemmatise ``text`` with split_into_lemmas and drop lemmatised stop words."""
    return [lemma for lemma in split_into_lemmas(text) if lemma not in lemmed_stop_set]


# min_df / max_df still apply with a callable analyzer: keep terms that occur in
# at least 2 documents and in at most 75% of them.
vect = CountVectorizer(analyzer=split_into_lemmas_without_stops, min_df=2, max_df=.75)
X_train_dtm = vect.fit_transform(X_train)
X_test_dtm = vect.transform(X_test)
nb = MultinomialNB()
nb.fit(X_train_dtm, y_train)
y_pred_class = nb.predict(X_test_dtm)

In [None]:
# Per-class precision / recall / F1 on the held-out test split.
report = metrics.classification_report(y_test, y_pred_class)
print(report)

In [None]:
# Vectorise every troll/politician tweet (train and test rows alike) with the
# already-fitted vectorizer.
df_dtm = vect.transform(df_Trolls_Pol.loc[:, 'content'])

In [None]:
# The second predict_proba column is the probability of the second entry of
# nb.classes_, i.e. account_category_Troll == 1.
troll_column = 1
proba = nb.predict_proba(df_dtm)
df_Trolls_Pol['proba_troll'] = proba[:, troll_column]

In [None]:
# The five non-troll (politician) tweets with the highest predicted troll probability.
politician_tweets = df_Trolls_Pol.loc[df_Trolls_Pol.account_category_Troll==0]
politician_tweets.sort_values(by='proba_troll', ascending=False).head().apply(grab_tweet, axis=1)

In [None]:
# Mean predicted troll probability per politician author, lowest first.
# The column is selected before aggregating: averaging the whole frame would
# also try to average the string columns, which raises TypeError on
# pandas >= 2.0 and is wasted work on older versions.
df_Trolls_Pol.loc[df_Trolls_Pol.account_category_Troll==0].groupby(by='author')['proba_troll'].mean().sort_values()