In [83]:
import pandas as pd
import pandas as pd
import numpy as np
import scipy as sp
import math

from textblob import TextBlob, Word

from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')

from sklearn.model_selection import train_test_split as tts
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB         # Naive Bayes
from sklearn import metrics

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\599701\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
tweet_data = pd.read_csv("./datasets/combined_data.csv", sep=";", 
                         dtype={'tweet_id':str, 'author_id':str, 'publish_date':str, 
                                'content':str, 'link_url':str, 'account_category':str, 
                                'author':str, 'account_type':str})

In [3]:
tweet_data = pd.get_dummies(tweet_data, columns=['account_category'], drop_first=False)

In [4]:
df_Troll = tweet_data[tweet_data.account_category_Troll == 1]
df_Pol = tweet_data[tweet_data.account_category_Politician == 1]
df_News = tweet_data[tweet_data.account_category_US_News == 1]
# Some different testing sets
df_Trolls_News = pd.concat([df_Troll, df_News])

In [55]:
# Define a function that accepts text and returns a list of stems.
stemmer = SnowballStemmer('english')
stemmed_stops = [stemmer.stem(Word(x)) for x in stopwords.words('english')] + ["'s ", "`", '"',"“", "’", "http", "https", "n't"]
stemmed_stops = [item for item in stemmed_stops if item not in ['i', 'me', 'my', 'myself', 'we', 'our', 'our', 'ourselv', 'you', "you'r", "you'v", "you'll", "you'd", 'your', 'your', 'yourself', 'yourselv', 'he', 'him', 'his', 'himself', 'she', 'she', 'her', 'her', 'herself']]

def split_into_stems(text):
    text = str(text).lower()
    words = TextBlob(text).words
    return list(set([stemmer.stem(word) for word in words]) - set(stemmed_stops))

In [6]:
X = df_Trolls_News['content']
y = df_Trolls_News['account_category_Troll']
X_train, X_test, y_train, y_test = tts(X, y, random_state=42)

In [56]:
vect = CountVectorizer(analyzer=split_into_stems, max_df=1.0, min_df=1, ngram_range=(1,1))
X_train_dtm = vect.fit_transform(X_train)
X_test_dtm = vect.transform(X_test)
nb = MultinomialNB()
nb.fit(X_train_dtm, y_train)
y_pred_class = nb.predict(X_test_dtm)
print(metrics.classification_report(y_test, y_pred_class))

             precision    recall  f1-score   support

          0       0.80      0.89      0.84     10258
          1       0.88      0.78      0.83     10277

avg / total       0.84      0.84      0.84     20535



In [8]:
def grab_tweet(tweet):
    print("Author:", tweet['author'])
    print("Probability troll:", tweet['proba_troll'])
    print("Tweet text:", tweet['content'])
    print()

In [9]:
df_dtm = vect.transform(df_Trolls_News['content'])

In [10]:
proba = nb.predict_proba(df_dtm)
df_Trolls_News['proba_troll'] = proba[:,1]

In [11]:
print("Most Troll Like")
df_Trolls_News.loc[df_Trolls_News.account_category_Troll==0].sort_values(by='proba_troll', ascending=False).head().apply(grab_tweet, axis=1)
print("Least Troll Like")
df_Trolls_News.loc[df_Trolls_News.account_category_Troll==0].sort_values(by='proba_troll', ascending=True).head().apply(grab_tweet, axis=1)

Most Troll Like
Author: nytimes
Probability troll: 0.9999999790185827
Tweet text: RT @jennydeluxe: "Being black in the age of wokeness" is one of my fave episodes to date. Listen &amp; LMK what you think &gt;&gt;&gt;&gt; https://t.co/r4T…

Author: USATODAY
Probability troll: 0.999999959013878
Tweet text: "All I really want to do is tell you that I'm feeling great. I'm glad I spent that evening in the hospital, and it did me a lot of good." -Stan Lee https://t.co/JZg09bqS1g

Author: FoxNews
Probability troll: 0.9999998102654113
Tweet text: .@POTUS on Democrats: "I don't think they want to solve the DACA problem. I think they wanna talk about it. I think they wanna obstruct." https://t.co/zBPxHDzk6E

Author: FoxNews
Probability troll: 0.9999997752128391
Tweet text: Huckabee: "The greatest single characteristic of people on the far left is they have zero sense of humor. I mean, these are the most bitter, angry, really disappointing and disgusting people because they're so sad with life. I

147799    None
135104    None
136099    None
147265    None
132540    None
dtype: object

In [12]:
print("Most News Like")
df_Trolls_News.loc[df_Trolls_News.account_category_Troll==1].sort_values(by='proba_troll', ascending=True).head().apply(grab_tweet, axis=1)
print("Least News Like")
df_Trolls_News.loc[df_Trolls_News.account_category_Troll==1].sort_values(by='proba_troll', ascending=False).head().apply(grab_tweet, axis=1)

Most News Like
Author: KANIJJACKSON
Probability troll: 8.54233597680746e-08
Tweet text: Confirmed: Michael Cohen received hundreds of thousands of dollars  from a Russian oligarch Viktor Vekselberg.  The money was paid to a First Republic Bank account Cohen created  for Essential Consultants. This is the same bank  account Cohen used to pay Stormy Daniels $130,000

Author: IMAPHARRELFAKE
Probability troll: 2.3689616812652584e-07
Tweet text: Former Cuban President Fidel Castro dies at age 90, his brother, President Raul Castro announces. https://t.co/gHGSyRFlBi

Author: WORLDNEWSPOLI
Probability troll: 1.5878339961590657e-06
Tweet text: Bao Bao the giant panda leaves Washington zoo for new home in China https://t.co/e6WNrKcrMI https://t.co/QnCVIUb3nQ

Author: COVFEFENATIONUS
Probability troll: 1.9084533260972855e-06
Tweet text: Randall Saito was arrested in Stockton, CA this morning, per  San Joaquin Co Sheriff’s FB page  SJCO credits “a tip from an alert taxi cab driver”  Saito escaped

10951    None
27469    None
19144    None
13334    None
2595     None
dtype: object

In [13]:
df_Trolls_News.loc[df_Trolls_News.account_category_Troll==0].groupby(by='author').mean().proba_troll.sort_values()

author
Reuters           0.013266
AP                0.018762
chicagotribune    0.039258
ABC               0.041498
politico          0.050556
WSJ               0.052525
NPR               0.079233
CNN               0.087843
USATODAY          0.114722
Forbes            0.116943
nytimes           0.122085
washingtonpost    0.126263
nypost            0.165859
FoxNews           0.168716
HuffPost          0.220815
Name: proba_troll, dtype: float64

In [115]:
def show_most_informative_features(vectorizer, clf, n=20):
    feature_names = vectorizer.get_feature_names()
    lin_exp = np.vectorize(lambda x: math.exp(x))
    coefs_exp_0 = lin_exp(clf.feature_log_prob_[0])
    coefs_exp_1 = lin_exp(clf.feature_log_prob_[1])

    denom = np.add(coefs_exp_0, coefs_exp_1)
    weighted_coefs_0 = np.divide(coefs_exp_0, denom)
    weighted_coefs_1 = np.divide(coefs_exp_1, denom)

    coefs_with_fns_0 = sorted(zip(weighted_coefs_0, feature_names))
    coefs_with_fns_1 = sorted(zip(weighted_coefs_1, feature_names))

    top = zip(coefs_with_fns_0[:-(n + 1):-1], coefs_with_fns_1[:-(n + 1):-1])
    bot = zip(coefs_with_fns_0[:n], coefs_with_fns_1[:n])
    
    print("Top")
    print("\tNews\t\t\t\tTrolls")
    for (coef_1, fn_1), (coef_2, fn_2) in top:
        print("\t%.4f\t%-15s\t\t%.4f\t%-15s" % (coef_1, fn_1, coef_2, fn_2))
        
    print()
    print("Bottom")
    print("\tNews\t\t\t\tTrolls")
    for (coef_1, fn_1), (coef_2, fn_2) in top:
        print("\t%.4f\t%-15s\t\t%.4f\t%-15s" % (coef_1, fn_1, coef_2, fn_2))

In [116]:
show_most_informative_features(vect, nb)

Top
	News				Trolls
	0.9967	🔓              		0.9979	�              
	0.9964	reuterstv      		0.9977	tcot           
	0.9950	politicomag    		0.9976	pjnet          
	0.9920	apcentralregion		0.9964	��             
	0.9919	tariff         		0.9961	maga           
	0.9905	walkout        		0.9949	nowplay        
	0.9901	apwestregion   		0.9941	ccot           
	0.9885	getti          		0.9936	fuck           
	0.9883	foxnews2016    		0.9934	wakeupamerica  
	0.9875	apeastregion   		0.9931	▶              
	0.9863	ap_odd         		0.9931	topl           
	0.9853	t.co…          		0.9931	blm            
	0.9849	playbookplus   		0.9930	blacklivesmatt 
	0.9849	ap_polit       		0.9927	���            
	0.9845	breakingview   		0.9925	2a             
	0.9843	nationalwalkoutday		0.9921	gopdeb         
	0.9826	pyeongchang2018		0.9914	lol            
	0.9810	t.co/…         		0.9914	antifa         
	0.9807	apsouthregion  		0.9913	islamkil       
	0.9793	nassar         		0.9907	blacktwitt     

Bottom
	News				

In [112]:
a = []
a + ['a','b']

['a', 'b']