In [1]:
import pandas as pd
import pandas as pd
import numpy as np
import scipy as sp
import seaborn as sns
import matplotlib.pyplot as plt

from pprint import pprint
from time import time
from textblob import TextBlob, Word

import logging
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')

from sklearn.model_selection import train_test_split as tts
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB         # Naive Bayes
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.utils import resample

from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

%matplotlib inline

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\599701\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Load the semicolon-separated combined tweet export; every listed column is parsed as str.
tweet_data = pd.read_csv(
    "./datasets/combined_data.csv",
    sep=";",
    dtype=dict.fromkeys(
        ['tweet_id', 'author_id', 'publish_date', 'content',
         'link_url', 'account_category', 'author', 'account_type'],
        str,
    ),
)

In [3]:
# One-hot encode the two label columns; every level keeps its own indicator column.
# NOTE: this rebinds `tweet_data`, so re-running the cell on the already-encoded frame fails
# because the source columns no longer exist.
tweet_data = pd.get_dummies(
    tweet_data,
    columns=['account_type', 'account_category'],
    drop_first=False,
)

In [4]:
# One frame per account category, selected through the indicator columns built by get_dummies.
df_Troll = tweet_data.loc[tweet_data['account_category_Troll'] == 1]
df_Pol = tweet_data.loc[tweet_data['account_category_Politician'] == 1]
df_News = tweet_data.loc[tweet_data['account_category_US_News'] == 1]

In [5]:
# Row/column counts of the three per-category frames, one per line.
print(df_Troll.shape, df_Pol.shape, df_News.shape, sep="\n")

(41069, 18)
(41066, 18)
(41069, 18)


In [6]:
# Pairwise datasets: trolls stacked with news outlets, and trolls stacked with politicians.
df_Trolls_News = pd.concat((df_Troll, df_News), axis=0)
df_Trolls_Pol = pd.concat((df_Troll, df_Pol), axis=0)

We'll start by comparing trolls and news outlets

In [7]:
# Features are the raw tweet text; the target is the troll indicator column.
X, y = df_Trolls_News['content'], df_Trolls_News['account_category_Troll']
# Default split proportions with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = tts(X, y, random_state=42)

In [8]:
def tokenize_test(vect, clf=None):
    """Fit a vectorizer and a classifier on the training split and report test metrics.

    Reads the notebook-level ``X_train``, ``X_test``, ``y_train`` and ``y_test``
    (whatever the most recent split cell assigned). Prints the number of features
    the vectorizer learned, then a classification report (precision, recall, f1,
    support per class) for the test split.

    Parameters
    ----------
    vect : object with ``fit_transform`` / ``transform`` (e.g. ``CountVectorizer``)
        Fitted in place on ``X_train``.
    clf : object with ``fit`` / ``predict``, optional
        Classifier to evaluate. Defaults to a fresh ``MultinomialNB()``, so existing
        ``tokenize_test(vect)`` calls behave exactly as before. Passing another
        estimator avoids copy-pasting this function once per model.

    Returns
    -------
    None
    """
    if clf is None:
        clf = MultinomialNB()
    X_train_dtm = vect.fit_transform(X_train)
    # Printed as a tuple on purpose: keeps the text identical to the outputs already recorded.
    print(('Features: ', X_train_dtm.shape[1]))
    # Only transform the test split, so the vocabulary comes from the training data alone.
    X_test_dtm = vect.transform(X_test)
    clf.fit(X_train_dtm, y_train)
    y_pred_class = clf.predict(X_test_dtm)
    print(metrics.classification_report(y_test, y_pred_class))

In [9]:
# First try: unigrams + bigrams, English stop words dropped, terms must appear in >= 2 tweets.
vect = CountVectorizer(
    ngram_range=(1, 2),
    stop_words="english",
    min_df=2,
)
tokenize_test(vect)

('Features: ', 88822)
             precision    recall  f1-score   support

          0       0.83      0.88      0.85     10258
          1       0.87      0.82      0.84     10277

avg / total       0.85      0.85      0.85     20535



In [10]:
# Shared English Snowball stemmer used by split_into_stems.
stemmer = SnowballStemmer('english')


def split_into_stems(text):
    """Lower-case ``text``, split it into words with TextBlob, and return the list of stems."""
    lowered = str(text).lower()
    stems = []
    for token in TextBlob(lowered).words:
        stems.append(stemmer.stem(token))
    return stems

In [11]:
# Use split_into_stems as the feature extraction function (Warning: SLOW!).
# A callable `analyzer` replaces CountVectorizer's own tokenizing, stop-word and n-gram steps,
# so `ngram_range` and `stop_words` have no effect here (recent scikit-learn versions warn about
# it). They are left out so the call states what actually happens: unigram stems only.
# `min_df` / `max_df` still prune the vocabulary by document frequency.
vect = CountVectorizer(analyzer=split_into_stems, decode_error='replace', min_df=2, max_df=.75)
tokenize_test(vect)

('Features: ', 20761)
             precision    recall  f1-score   support

          0       0.82      0.87      0.84     10258
          1       0.86      0.80      0.83     10277

avg / total       0.84      0.84      0.84     20535



In [12]:
def split_into_lemmas(text):
    """Lower-case ``text``, split it into words with TextBlob, and return the list of lemmas."""
    lowered = str(text).lower()
    lemmas = []
    for token in TextBlob(lowered).words:
        lemmas.append(token.lemmatize())
    return lemmas

In [13]:
# Use split_into_lemmas as the feature extraction function (Warning: SLOW!).
# A callable `analyzer` bypasses CountVectorizer's stop-word filtering, so `stop_words` has no
# effect here (recent scikit-learn versions warn about it). It is left out so the call states
# what actually happens. `min_df` still drops terms seen in fewer than 2 tweets.
vect = CountVectorizer(analyzer=split_into_lemmas, decode_error='replace', min_df=2)
tokenize_test(vect)

('Features: ', 25090)
             precision    recall  f1-score   support

          0       0.81      0.87      0.84     10258
          1       0.86      0.79      0.83     10277

avg / total       0.84      0.83      0.83     20535



In [36]:
# Re-fit the current vectorizer and a Naive Bayes model at notebook scope so that
# `vect`, `nb` and the test predictions stay available to the cells that follow.
X_train_dtm = vect.fit_transform(X_train)
X_test_dtm = vect.transform(X_test)
nb = MultinomialNB().fit(X_train_dtm, y_train)
y_pred_class = nb.predict(X_test_dtm)

In [17]:
# Per-class precision / recall / f1 for the notebook-scope predictions.
print(metrics.classification_report(y_true=y_test, y_pred=y_pred_class))

             precision    recall  f1-score   support

          0       0.81      0.87      0.84     10258
          1       0.86      0.79      0.83     10277

avg / total       0.84      0.83      0.83     20535



In [30]:
# neg_class_prob_sorted = nb.feature_log_prob_[0, :].argsort()
# pos_class_prob_sorted = nb.feature_log_prob_[1, :].argsort()

# print(np.take(vect.get_feature_names(), neg_class_prob_sorted[:10]))
# print(np.take(vect.get_feature_names(), pos_class_prob_sorted[:10]))

In [31]:
# df_Trolls_News.loc[df_Trolls_News['content'].str.contains('coe'), ['content', 'account_category_Troll']]

In [38]:
# Vectorize every troll/news tweet (training and test rows alike) with the already-fitted vectorizer.
df_dtm = vect.transform(df_Trolls_News.loc[:, 'content'])

ValueError: Wrong number of items passed 2, placement implies 1

In [53]:
# Class probabilities for every row of df_dtm.
proba = nb.predict_proba(df_dtm)
# Store the second probability column as the troll score.
# NOTE(review): assumes nb.classes_ is ordered [0, 1], making column 1 the troll class — confirm.
df_Trolls_News['proba_troll'] = proba.T[1]

In [61]:
# Non-troll tweets ranked by how strongly the model believes they are troll tweets.
(
    df_Trolls_News
    .loc[df_Trolls_News['account_category_Troll'] == 0, ['author', 'content', 'proba_troll']]
    .sort_values(by='proba_troll', ascending=False)
)

Unnamed: 0,author,content,proba_troll
169463,FoxNews,"Huckabee: ""The greatest single characteristic ...",1.000000e+00
155992,FoxNews,".@POTUS on Democrats: ""I don't think they want...",1.000000e+00
167543,nytimes,"RT @jennydeluxe: ""Being black in the age of wo...",1.000000e+00
148629,FoxNews,".@POTUS to U.S. servicemembers: ""We support yo...",9.999999e-01
157242,USATODAY,"""All I really want to do is tell you that I'm ...",9.999999e-01
148739,CNN,"Imagine ""you're an NBA coach...and you're gonn...",9.999995e-01
143938,FoxNews,".@TGowdySC: ""Jim Comey said, 'I don't do sneak...",9.999994e-01
143921,FoxNews,".@TGowdySC: ""Jim Comey said, 'I don't do sneak...",9.999994e-01
170873,FoxNews,".@POTUS: ""May God bless you. May God bless our...",9.999992e-01
148784,CNN,"""Sometimes I don't know what this world has co...",9.999992e-01


In [None]:
# Unigrams + bigrams with English stop words removed and no document-frequency cutoff.
vect = CountVectorizer(stop_words="english", ngram_range=(1, 2))
tokenize_test(vect)

In [None]:
# Vectorizer -> Naive Bayes pipeline; the step names are the prefixes used in grid-search keys.
pipeline = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('clf', MultinomialNB()),
])

In [None]:
# Search space for the 'vect' pipeline step ("<step>__<param>" addresses a step's parameter).
parameters = dict(
    vect__max_df=(0.5, 0.75, 1.0),
    vect__min_df=(1, 2, 3),
    # vect__max_features=(None, 5000, 10000, 50000),
    vect__ngram_range=((1, 1), (1, 2)),  # unigrams only, or unigrams + bigrams
)

In [None]:
# 3-fold cross-validated search over the vectorizer settings, with progress messages.
grid_search = GridSearchCV(pipeline, parameters, cv=3, verbose=1)

# Plain Python lists are passed instead of the pandas Series.
grid_search.fit(X_train.values.tolist(), y_train.values.tolist())

In [None]:
# Report the best cross-validated score and the winning value of each searched parameter.
print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

In [None]:
# NOTE(review): `tweet_sample` is not defined in any cell visible in this notebook, so this
# cell raises NameError on a fresh kernel — confirm where it is meant to come from
# (presumably a sample of `tweet_data`).
# Rebinds X / y and the train/test split to predict the troll indicator on that frame.
X = tweet_sample['content']
y = tweet_sample['account_category_Troll']
X_train, X_test, y_train, y_test = tts(X, y, random_state=42)

In [None]:
# Unigram + bigram counts, English stop words removed, keeping terms that appear in at
# least 2 documents but in no more than 75% of them.
vect = CountVectorizer(
    ngram_range=(1, 2),
    stop_words="english",
    min_df=2,
    max_df=.75,
)
tokenize_test(vect)

In [None]:
# NOTE(review): `tweet_sample` is not defined in any cell visible in this notebook, so this
# cell raises NameError on a fresh kernel — confirm where it is meant to come from.
# Rebinds X / y and the train/test split; the target is now the `account_type_Right` indicator.
X = tweet_sample['content']
y = tweet_sample['account_type_Right']
X_train, X_test, y_train, y_test = tts(X, y, random_state=42)

In [None]:
# Same vectorizer settings as before, evaluated against the most recently assigned split.
vect = CountVectorizer(
    ngram_range=(1, 2),
    stop_words="english",
    min_df=2,
    max_df=.75,
)
tokenize_test(vect)

In [None]:
from sklearn.linear_model import LogisticRegression
def tokenize_test_log(vect):
    """Fit ``vect`` and a logistic regression on the training split, then report test metrics.

    Uses the notebook-level ``X_train``, ``X_test``, ``y_train`` and ``y_test``. Prints the
    number of features the vectorizer learned, followed by a classification report for
    the test split. Returns nothing.
    """
    train_matrix = vect.fit_transform(X_train)
    n_features = train_matrix.shape[1]
    print(('Features: ', n_features))
    test_matrix = vect.transform(X_test)
    model = LogisticRegression().fit(train_matrix, y_train)
    predictions = model.predict(test_matrix)
    print(metrics.classification_report(y_test, predictions))

In [None]:
# Same bag-of-n-grams features, this time scored with the logistic-regression helper.
vect = CountVectorizer(
    ngram_range=(1, 2),
    stop_words="english",
    min_df=2,
    max_df=.75,
)
tokenize_test_log(vect)

In [None]:
from sklearn.svm import SVC

def tokenize_test_svc(vect):
    """Fit ``vect`` and a default ``SVC`` on the training split, then report test metrics.

    Uses the notebook-level ``X_train``, ``X_test``, ``y_train`` and ``y_test``. Prints the
    number of features the vectorizer learned, followed by a classification report for
    the test split. Returns nothing.
    """
    train_matrix = vect.fit_transform(X_train)
    n_features = train_matrix.shape[1]
    print(('Features: ', n_features))
    test_matrix = vect.transform(X_test)
    # NOTE(review): a default SVC is a kernel SVM; fitting may be very slow on a split of
    # this size — confirm that is acceptable before running.
    model = SVC().fit(train_matrix, y_train)
    predictions = model.predict(test_matrix)
    print(metrics.classification_report(y_test, predictions))

In [None]:
# Same bag-of-n-grams features, this time scored with the SVC helper.
vect = CountVectorizer(
    ngram_range=(1, 2),
    stop_words="english",
    min_df=2,
    max_df=.75,
)
tokenize_test_svc(vect)