In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
import re
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
import nltk
from nltk.stem.snowball import SnowballStemmer
from sklearn.metrics import f1_score
from nltk.corpus import stopwords
import pickle

In [2]:
users = pd.read_csv('data/full/train.csv')

In [3]:
users = users[['user_id','target']]

In [None]:
tweets = pd.concat([pd.read_csv('data/porn/tweets.csv', sep='\t')[['user_id','full_text']], \
          pd.read_csv('data/propaganda/tweets.csv', sep='\t')[['user_id','full_text']], \
          pd.read_csv('data/spam/tweets.csv', sep='\t')[['user_id','full_text']], \
          pd.read_csv('data/fake_followers/tweets.csv', sep='\t')[['user_id','full_text']]])

  interactivity=interactivity, compiler=compiler, result=result)


In [None]:
tweets = tweets.drop_duplicates()

In [6]:
tweets = pd.merge(tweets, users, on='user_id')

In [7]:
tweets.drop(['user_id'], axis=1, inplace=True)

In [8]:
len(tweets[tweets.target==4])

261233

In [30]:
tweets.iloc[2].full_text

'https://t.co/KiTk9FMJwj'

In [26]:
tweets[tweets.target == 1].full_text

196811    RT @funder: Retweetfest: Tweet this link out i...
196812    RT @RWPUSA: So much for party labels. With Dem...
196813    RT @funder: BREAKING: Trump just said he’s ref...
196814    RT @robreiner: When an American President atta...
196815    RT @SteveSchmidtSES: TRUMP disgraced the Presi...
196816    RT @funder: BREAKING: Republicans are trying t...
196817    RT @GovHowardDean: The Trump administration no...
196818    RT @SenFeinstein: BREAKING: I’ve introduced ou...
196819    RT @ProudResister: Donald Trump is attacking t...
196820    RT @RepSwalwell: Why, @realDonaldTrump? Looks ...
196821    RT @SteveSchmidtSES: The American people will ...
196822    RT @SteveSchmidtSES: A clear line has been dra...
196823    RT @NotDexVonFrisch: From Privilege to Progres...
196824    RT @funder: Retweet if you accept Joy Reid’s a...
196825    RT @BwanaSokwe: @thehill @realDonaldTrump 's a...
196826    RT @politvidchannel: BREAKING: Nine state atto...
196827    RT @chrislhayes: The last time

In [10]:
def remove_rt(x):
    if 'RT @' in x:
        try:
            return x[x.find(':')+2:]
        except:
            return x
    else:
        return x

In [11]:
#def translate(x):
#    try:
#        return translator.translate(x).text
#    except:
#        return x

In [9]:
stop_words = stopwords.words('english')

def remove_stop(x):
    return [word for word in x.split() if word not in stop_words]

In [10]:
tweets['full_text'] = tweets['full_text'].apply(lambda x: remove_rt(x))

In [11]:
tweets['full_text'] = tweets['full_text'].apply(lambda x: re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', x))

In [12]:
tweets['full_text'] = tweets['full_text'].apply(lambda x: re.sub(r'[^\w\s]','',x))

In [13]:
tweets['full_text'] = tweets['full_text'].apply(lambda x: x.lower())

In [14]:
tweets['full_text'] = tweets['full_text'].apply(lambda x: remove_stop(x))

In [15]:
tweets['full_text'] = tweets['full_text'].astype(str)

In [16]:
tweets = tweets[tweets.full_text!='[]']

In [17]:
tweets

Unnamed: 0,user_id,full_text,target
1,1113472201,"['1d', 'stans', 'acting', 'like', '1d', 'super...",0
5,1113472201,"['hi', 'lets', 'talk', 'look', 'bio']",0
6,1113472201,"['usernames', 'girls', 'live', 'webcam', 'teen...",0
7,1113472201,"['got', 'tattoo', 'last', 'weekend', 'really',...",0
8,29795845,"['worst', 'year', 'entire', 'life', 'officiall...",0
10,29795845,"['destined', 'meet', 'meeting', 'sure', 'matte...",0
13,29795845,"['hi', 'today', 'great', 'day', 'look', 'bio',...",0
14,29795845,"['whos', 'chopper', 'know', 'bangladesh', 'men...",0
15,29795845,"['president', 'thomas', 'monson', '16th', 'pro...",0
16,29795845,"['mr', 'barbaro', 'nytimes', 'ne', 'foolish', ...",0


In [18]:
stemmer = SnowballStemmer("english", ignore_stopwords=True)

class StemmedCountVectorizer(CountVectorizer):
    def build_analyzer(self):
        analyzer = super(CountVectorizer, self).build_analyzer()
        return lambda doc:(stemmer.stem(w) for w in analyzer(doc))


stem_vectorizer = StemmedCountVectorizer(stemmer)

pipeline = Pipeline([('vect', stem_vectorizer), 
                     ('tfidf', TfidfTransformer()), 
                     ('mnb', MultinomialNB(fit_prior=False))])
X_train, X_test, y_train, y_test = train_test_split(tweets.full_text, tweets.target, test_size=0.33, random_state=42)

clf = pipeline.fit(X_train, y_train)

y_pred = clf.predict(X_test)

print('accuracy: ' + str(np.mean(y_pred == y_test)))
print('partial f1: ' + str(f1_score(y_test, y_pred, average=None)))
print('total f1: ' + str(f1_score(y_test, y_pred, average='macro')))

KeyboardInterrupt: 

In [20]:
clf.predict_proba(pd.Series("Try this awesome app"))

array([[0.0646053 , 0.07329557, 0.69675381, 0.01342943, 0.15191589]])

In [23]:
clf = pipeline.fit(tweets.full_text, tweets.target)

In [26]:
y_pred = clf.predict(tweets.full_text)

print('accuracy: ' + str(np.mean(y_pred == tweets.target)))
print('partial f1: ' + str(f1_score(tweets.target, y_pred, average=None)))
print('total f1: ' + str(f1_score(tweets.target, y_pred, average='macro')))

accuracy: 0.7668795662672099
partial f1: [0.63853231 0.83060419 0.8263119  0.57269671 0.6802656 ]
total f1: 0.7096821426189857


In [27]:
pickle.dump( clf, open( "../scripts/nb.model", "wb" ) )