In [2]:
import pandas as pd
import numpy as np
import sklearn
import gensim 
import logging
import os

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# load the data
data = pd.read_csv('../data/classified_tweets/tweetData.csv', index_col=None, encoding='ISO-8859-1')
print(data.info())

# clean up columns
data.columns = data.columns.str.strip() 
print(data.columns)

# preprocess things
other_document = []
troll_document = []
for i in range(len(data.content)):
    tokenized = gensim.utils.simple_preprocess(data.content[i])
    if i < 172207:
        other_document.append(tokenized)
    else:
        troll_document.append(tokenized)
print(other_document)
print(troll_document)



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 362458 entries, 0 to 362457
Data columns (total 3 columns):
content    362458 non-null object
Normal     362458 non-null int64
Bot        362458 non-null int64
dtypes: int64(2), object(1)
memory usage: 8.3+ MB
None
Index(['content', 'Normal', 'Bot'], dtype='object')


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [4]:
# create a model for each tweet type
from copy import deepcopy

model0 = gensim.models.Word2Vec(min_count=1, workers=10, hs=1, negative=0)
model0.build_vocab(other_document)

model1 = gensim.models.Word2Vec(min_count=1, workers=10, hs=1, negative=0)
model1.build_vocab(troll_document)

# now train each model
model0.train(other_document, total_examples=len(other_document), epochs=3)
model1.train(troll_document, total_examples=len(troll_document), epochs=3)

2018-11-20 07:40:10,591 : INFO : collecting all words and their counts
2018-11-20 07:40:10,591 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2018-11-20 07:40:10,747 : INFO : PROGRESS: at sentence #10000, processed 144159 words, keeping 29469 word types
2018-11-20 07:40:10,935 : INFO : PROGRESS: at sentence #20000, processed 280984 words, keeping 53366 word types
2018-11-20 07:40:11,091 : INFO : PROGRESS: at sentence #30000, processed 418884 words, keeping 74614 word types
2018-11-20 07:40:11,247 : INFO : PROGRESS: at sentence #40000, processed 559432 words, keeping 94459 word types
2018-11-20 07:40:11,434 : INFO : PROGRESS: at sentence #50000, processed 705415 words, keeping 113280 word types
2018-11-20 07:40:11,605 : INFO : PROGRESS: at sentence #60000, processed 855453 words, keeping 130722 word types
2018-11-20 07:40:11,746 : INFO : PROGRESS: at sentence #70000, processed 1015560 words, keeping 146506 word types
2018-11-20 07:40:11,902 : INFO : PROGRESS:

2018-11-20 07:41:47,084 : INFO : EPOCH - 1 : training on 2633586 raw words (2020354 effective words) took 7.5s, 270441 effective words/s
2018-11-20 07:41:48,129 : INFO : EPOCH 2 - PROGRESS: at 12.00% examples, 227539 words/s, in_qsize 20, out_qsize 0
2018-11-20 07:41:49,175 : INFO : EPOCH 2 - PROGRESS: at 22.82% examples, 212988 words/s, in_qsize 19, out_qsize 0
2018-11-20 07:41:50,235 : INFO : EPOCH 2 - PROGRESS: at 37.53% examples, 234873 words/s, in_qsize 18, out_qsize 1
2018-11-20 07:41:51,281 : INFO : EPOCH 2 - PROGRESS: at 51.58% examples, 245502 words/s, in_qsize 17, out_qsize 3
2018-11-20 07:41:52,295 : INFO : EPOCH 2 - PROGRESS: at 66.67% examples, 257791 words/s, in_qsize 19, out_qsize 1
2018-11-20 07:41:53,309 : INFO : EPOCH 2 - PROGRESS: at 81.47% examples, 263865 words/s, in_qsize 19, out_qsize 0
2018-11-20 07:41:54,293 : INFO : worker thread finished; awaiting finish of 9 more threads
2018-11-20 07:41:54,340 : INFO : EPOCH 2 - PROGRESS: at 97.15% examples, 271740 words/s,

2018-11-20 07:42:21,856 : INFO : EPOCH 3 - PROGRESS: at 31.97% examples, 258105 words/s, in_qsize 20, out_qsize 1
2018-11-20 07:42:22,870 : INFO : EPOCH 3 - PROGRESS: at 42.70% examples, 265733 words/s, in_qsize 19, out_qsize 0
2018-11-20 07:42:23,868 : INFO : EPOCH 3 - PROGRESS: at 52.66% examples, 268435 words/s, in_qsize 20, out_qsize 0
2018-11-20 07:42:24,929 : INFO : EPOCH 3 - PROGRESS: at 64.36% examples, 267599 words/s, in_qsize 19, out_qsize 0
2018-11-20 07:42:25,990 : INFO : EPOCH 3 - PROGRESS: at 76.77% examples, 266336 words/s, in_qsize 20, out_qsize 1
2018-11-20 07:42:27,066 : INFO : EPOCH 3 - PROGRESS: at 88.75% examples, 268004 words/s, in_qsize 19, out_qsize 1
2018-11-20 07:42:27,690 : INFO : worker thread finished; awaiting finish of 9 more threads
2018-11-20 07:42:27,706 : INFO : worker thread finished; awaiting finish of 8 more threads
2018-11-20 07:42:27,706 : INFO : worker thread finished; awaiting finish of 7 more threads
2018-11-20 07:42:27,768 : INFO : worker thr

(7411148, 9079620)

In [7]:
models = [model0, model1]
print(models[0]) # other
print(models[1]) # troll
vocab0_size = len(models[0].wv.vocab)
vocab1_size = len(models[0].wv.vocab)
vocab_dim = 100

Word2Vec(vocab=291325, size=100, alpha=0.025)
Word2Vec(vocab=274634, size=100, alpha=0.025)


In [8]:
# based on sample code from gensim:
# https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/deepir.ipynb
docs = [other_document, troll_document]
listall = [s for d in docs for s in d]
# the log likelihood of each sentence in this category under each w2v representation
llhd = np.array([ m.score(listall, len(listall)) for m in models ])
# now exponentiate to get likelihoods, 
lhd = np.exp(llhd - llhd.max(axis=0)) # subtract row max to avoid numeric overload
# normalize across models to get sentence-category probabilities
prob = pd.DataFrame((lhd/lhd.sum(axis=0)).transpose())
# and finally average the sentence probabilities to get the category's probability
prob["doc"] = [i for i,d in enumerate(docs) for s in d]
prob = prob.groupby("doc").mean()
print(prob)

2018-11-20 07:48:03,248 : INFO : scoring sentences with 10 workers on 291325 vocabulary and 100 features, using sg=0 hs=1 sample=0.001 and negative=0
2018-11-20 07:48:04,246 : INFO : PROGRESS: at 2780000.00% sentences, 27781 sentences/s
2018-11-20 07:48:05,245 : INFO : PROGRESS: at 5400000.00% sentences, 26921 sentences/s
2018-11-20 07:48:06,251 : INFO : PROGRESS: at 8060000.00% sentences, 26769 sentences/s
2018-11-20 07:48:07,251 : INFO : PROGRESS: at 10940000.00% sentences, 27272 sentences/s
2018-11-20 07:48:08,265 : INFO : PROGRESS: at 13860000.00% sentences, 27638 sentences/s
2018-11-20 07:48:09,263 : INFO : PROGRESS: at 16730000.00% sentences, 27805 sentences/s
2018-11-20 07:48:10,261 : INFO : PROGRESS: at 19750000.00% sentences, 28136 sentences/s
2018-11-20 07:48:11,275 : INFO : PROGRESS: at 22450000.00% sentences, 27948 sentences/s
2018-11-20 07:48:12,289 : INFO : PROGRESS: at 25330000.00% sentences, 28030 sentences/s
2018-11-20 07:48:13,288 : INFO : PROGRESS: at 28060000.00% se

            0         1
doc                    
0    0.467345  0.532655
1    0.562522  0.437478


In [17]:
vec = lhd.clip(min=0) # remove any negative values
print(vec)
print(vec.shape)

[[1.0000000e+00 2.7998937e-03 4.6160036e-08 ... 1.6678620e-09
  1.0000000e+00 1.9203770e-03]
 [1.1070258e-42 1.0000000e+00 1.0000000e+00 ... 1.0000000e+00
  1.5398059e-14 1.0000000e+00]]
(2, 362458)


In [23]:
# split the data into 60% train and 40% test
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(vec.transpose(), data.Normal, test_size=0.4)
print("X_train: " + str(X_train.shape))
print("Y_train: " + str(Y_train.shape))
print("X_test: " + str(X_test.shape))
print("X_test: " + str(Y_test.shape))

X_train: (217474, 2)
Y_train: (217474,)
X_test: (144984, 2)
X_test: (144984,)


In [24]:
# not sure if this is needed?
from sklearn.feature_extraction.text import TfidfTransformer
tf_transformer = TfidfTransformer(use_idf=False).fit(X_train)
X_train_tf = tf_transformer.transform(X_train)
print(X_train_tf.shape)

(217474, 2)


In [25]:
#from sklearn.preprocessing import StandardScaler
#X = StandardScaler().fit_transform(embedding_matrix)

from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(X_train, Y_train)
print(clf.score(X_test, Y_test))

0.5512263422170722


In [26]:
# borrowed this technique from Daniel
from sklearn.metrics import confusion_matrix

Y_hat = clf.predict(X_test)
confusion = confusion_matrix(Y_test, Y_hat)
print(confusion)
true_neg = float(confusion[0][0])
true_pos = float(confusion[1][1])
false_neg = float(confusion[1][0])
false_pos = float(confusion[0][1])
total = true_neg+true_pos+false_neg+false_pos

print("Precision % = " + str(true_pos*100/(true_pos+false_pos)))
print("Recall % = " + str(true_pos*100/(false_neg+true_pos)))
print("False Positive % = " + str(false_pos*100/total))
print("False Negative % = " + str(false_neg*100/total))

[[43933 32267]
 [32798 35986]]
Precision % = 52.724422369712684
Recall % = 52.31739939520819
False Positive % = 22.255559234122387
False Negative % = 22.62180654417039


In [27]:
# now do the logistic regression
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression().fit(X_train, Y_train)
print(lr.score(X_test, Y_test))

Y_hat = lr.predict(X_test)
confusion = confusion_matrix(Y_test, Y_hat)
print(confusion)

true_neg = float(confusion[0][0])
true_pos = float(confusion[1][1])
false_neg = float(confusion[1][0])
false_pos = float(confusion[0][1])
total = true_neg+true_pos+false_neg+false_pos

print("Precision % = " + str(true_pos*100/(true_pos+false_pos)))
print("Recall % = " + str(true_pos*100/(false_neg+true_pos)))
print("False Positive % = " + str(false_pos*100/total))
print("False Negative % = " + str(false_neg*100/total))



0.5592341223859184
[[46467 29733]
 [34171 34613]]
Precision % = 53.79199950268859
Recall % = 50.3212956501512
False Positive % = 20.507780168846217
False Negative % = 23.56880759256194


In [28]:
# now do the knn
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=3).fit(X_train, Y_train)
print(knn.score(X_test, Y_test))

Y_hat = knn.predict(X_test)
confusion = confusion_matrix(Y_test, Y_hat)
print(confusion)

true_neg = float(confusion[0][0])
true_pos = float(confusion[1][1])
false_neg = float(confusion[1][0])
false_pos = float(confusion[0][1])
total = true_neg+true_pos+false_neg+false_pos

print("Precision % = " + str(true_pos*100/(true_pos+false_pos)))
print("Recall % = " + str(true_pos*100/(false_neg+true_pos)))
print("False Positive % = " + str(false_pos*100/total))
print("False Negative % = " + str(false_neg*100/total))

0.6045149809634166
[[48666 27534]
 [29805 38979]]
Precision % = 58.603581254792296
Recall % = 56.668702023726446
False Positive % = 18.99106108260222
False Negative % = 20.557440821056115


In [29]:
# now do the svm
from sklearn import svm

s = svm.SVC(kernel='linear', gamma='auto').fit(X_train, Y_train)
print(s.score(X_test, Y_test))

Y_hat = s.predict(X_test)
confusion = confusion_matrix(Y_test, Y_hat)
print(confusion)

true_neg = float(confusion[0][0])
true_pos = float(confusion[1][1])
false_neg = float(confusion[1][0])
false_pos = float(confusion[0][1])
total = true_neg+true_pos+false_neg+false_pos

print("Precision % = " + str(true_pos*100/(true_pos+false_pos)))
print("Recall % = " + str(true_pos*100/(false_neg+true_pos)))
print("False Positive % = " + str(false_pos*100/total))
print("False Negative % = " + str(false_neg*100/total))

0.551453953539701
[[43997 32203]
 [32829 35955]]
Precision % = 52.7524281815781
Recall % = 52.27233077459874
False Positive % = 22.211416432158032
False Negative % = 22.643188213871877
