In [203]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
import re
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
import nltk
from nltk.stem.snowball import SnowballStemmer
from sklearn.metrics import f1_score
from nltk.corpus import stopwords
import pickle

In [204]:
users = pd.read_csv('data/full/train.csv')

In [205]:
users = users[['user_id','target']]

In [206]:
tweets = pd.concat([pd.read_csv('data/porn/tweets.csv', sep='\t')[['user_id','full_text']], \
          pd.read_csv('data/propaganda/tweets.csv', sep='\t')[['user_id','full_text']], \
          pd.read_csv('data/spam/tweets.csv', sep='\t')[['user_id','full_text']], \
          pd.read_csv('data/fake_followers/tweets.csv', sep='\t')[['user_id','full_text']]])

  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)


In [207]:
tweets = tweets.drop_duplicates()

In [208]:
tweets =  tweets.join(users.set_index('user_id'), on='user_id')

In [209]:
tweets = tweets[tweets.target!=4]

In [210]:
tweets.target.value_counts()

2.0    453719
1.0    280300
0.0    196712
3.0     41316
Name: target, dtype: int64

In [211]:
tweets.drop(['user_id'], axis=1, inplace=True)

In [212]:
tweets = tweets[~np.isnan(tweets.target)]

In [213]:
tweets.target = tweets.target.astype(int)

In [214]:
tweets

Unnamed: 0,full_text,target
0,https://t.co/esART8smVX,0
1,RT @zaynwinterfell: 1d stans acting like 1d is...,0
2,https://t.co/KiTk9FMJwj,0
3,https://t.co/KGjAK50Px0,0
4,https://t.co/oDlvrgZMfe,0
5,"Hi, how are you? Let's talk) Look at my bio!❤️...",0
6,RT @GirlzKiki: #usernames #girls #live #webcam...,0
7,RT @bartz_ryan: So I got a tattoo last weekend...,0
8,RT @kylewesterfer: The worst year of my entire...,0
9,https://t.co/JXT7P9LZ0q,0


In [215]:
def remove_rt(x):
    if 'RT @' in x:
        try:
            return x[x.find(':')+2:]
        except:
            return x
    else:
        return x

In [216]:
#def translate(x):
#    try:
#        return translator.translate(x).text
#    except:
#        return x

In [217]:
stop_words = stopwords.words('english')

def remove_stop(x):
    return [word for word in x.split() if word not in stop_words]

In [218]:
tweets['full_text'] = tweets['full_text'].apply(lambda x: remove_rt(x))

In [219]:
tweets['full_text'] = tweets['full_text'].apply(lambda x: re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', x))

In [220]:
tweets['full_text'] = tweets['full_text'].apply(lambda x: re.sub(r'[^\w\s]','',x))

In [221]:
tweets['full_text'] = tweets['full_text'].apply(lambda x: x.lower())

In [222]:
tweets['full_text'] = tweets['full_text'].apply(lambda x: remove_stop(x))

In [223]:
tweets['full_text'] = tweets['full_text'].astype(str)

In [224]:
tweets = tweets[tweets.full_text!='[]']

In [225]:
tweets.target.value_counts()

2    448362
1    274624
0    172060
3     39578
Name: target, dtype: int64

In [226]:
stemmer = SnowballStemmer("english", ignore_stopwords=True)

class StemmedCountVectorizer(CountVectorizer):
    def build_analyzer(self):
        analyzer = super(CountVectorizer, self).build_analyzer()
        return lambda doc:(stemmer.stem(w) for w in analyzer(doc))


stem_vectorizer = StemmedCountVectorizer(stemmer)

pipeline = Pipeline([('vect', stem_vectorizer), 
                     ('tfidf', TfidfTransformer()), 
                     ('mnb', MultinomialNB(fit_prior=False))])
X_train, X_test, y_train, y_test = train_test_split(tweets.full_text, tweets.target, test_size=0.33, random_state=42)

clf = pipeline.fit(X_train, y_train)

y_pred = clf.predict(X_test)

print('accuracy: ' + str(np.mean(y_pred == y_test)))
print('partial f1: ' + str(f1_score(y_test, y_pred, average=None)))
print('total f1: ' + str(f1_score(y_test, y_pred, average='macro')))

accuracy: 0.8002211227328435
partial f1: [0.58261769 0.85033284 0.85908688 0.51225276]
total f1: 0.7010725431740751


In [227]:
clf.predict_proba(pd.Series("try this awesome app"))

array([[0.08452161, 0.09596882, 0.80108685, 0.01842271]])

In [228]:
clf = pipeline.fit(tweets.full_text, tweets.target)

In [229]:
y_pred = clf.predict(tweets.full_text)

print('accuracy: ' + str(np.mean(y_pred == tweets.target)))
print('partial f1: ' + str(f1_score(tweets.target, y_pred, average=None)))
print('total f1: ' + str(f1_score(tweets.target, y_pred, average='macro')))

accuracy: 0.8356344369500462
partial f1: [0.69233249 0.87028824 0.88211509 0.59120657]
total f1: 0.7589855958658608


In [230]:
pickle.dump( clf, open( "../scripts/nb.model", "wb" ) )