## Preprocess the tweets

In [1]:
import pandas as pd
import preprocessor as p

# Load the raw tweet export (path is relative to the notebook's working directory)
# and show its (rows, columns) shape as the cell output.
tweets_csv = '../data/tweets_original.csv'
df = pd.read_csv(tweets_csv)
df.shape

(69519, 3)

In [2]:
# Run `p.clean` over every raw tweet and store the result in a new column.
# In the preview below, @mentions and links no longer appear in `clean_text`,
# and some tweets are reduced to an empty string.
cleaned = df['raw_text'].apply(p.clean)
df['clean_text'] = cleaned
df.head()

Unnamed: 0,id_str,screen_name,raw_text,clean_text
0,1193152895337689088,YXSzzzz,@gww067 Haha there is no way Ryza is not cute 😂,Haha there is no way Ryza is not cute
1,1193152895337689088,YXSzzzz,@somberness :DDD,
2,1193152895337689088,YXSzzzz,@IthzelDoe Love. 🤣,Love. 🤣
3,1193152895337689088,YXSzzzz,😮😮😮 https://t.co/l4jcdkz67Z,
4,1193152895337689088,YXSzzzz,@Kyonko802 😮,


In [3]:
import numpy as np

# Turn empty / whitespace-only string cells (in any column) into NaN, then drop
# every row that contains at least one missing value.
blank_cell = r'^\s*$'
df = df.replace(blank_cell, np.nan, regex=True).dropna()
df.shape

(64888, 4)

## Vectorize the tweets

In [66]:
from polyglot.text import Text
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import KFold

# Model input is the cleaned tweet text; the target is the `screen_name` column.
X = df['clean_text']
y = df['screen_name']

def ploy_tokenizer(raw_text):
    """Tokenize ``raw_text`` with polyglot's ``Text`` and return its ``words``."""
    parsed = Text(raw_text)
    return parsed.words

# Shared feature-extraction steps: token counts (using the polyglot tokenizer)
# followed by tf-idf re-weighting.
tf_transformer = TfidfTransformer()
count_vectorizer = CountVectorizer(tokenizer=ploy_tokenizer)

# Shuffled 10-fold splitter with a fixed seed.
# NOTE(review): `kf` is not referenced by any cell visible in this notebook — confirm whether it is still needed.
kf = KFold(random_state=1, shuffle=True, n_splits=10)

## Classification

In [77]:
import logging

from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn import metrics

# Raise the root logger's threshold so only ERROR-and-above records are emitted.
root_logger = logging.getLogger()
root_logger.setLevel(logging.ERROR)

# Candidate classifiers. `tokenize_test_pipeline` looks these up by variable
# name, so the names `nb`, `lr` and `svm` must stay as they are.
nb = MultinomialNB()
lr = LogisticRegression(multi_class='auto', solver='liblinear')
svm = LinearSVC()

# Metrics passed to the grid search as its `scoring` argument.
scoring = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']

def tokenize_test_pipeline(model):
    """Grid-search the vectorizer settings of a count -> tf-idf -> classifier pipeline.

    The pipeline is fitted on the module-level ``X`` / ``y`` with 10-fold
    cross-validation over every combination of n-gram range ((1, 1) .. (1, 5))
    and analyzer ('word' / 'char'), i.e. 10 candidates. All metrics listed in
    the module-level ``scoring`` are computed; ``f1_macro`` selects (and refits)
    the best candidate.

    Parameters
    ----------
    model : str
        Name of a module-level variable holding the classifier to use
        (``'nb'``, ``'lr'`` or ``'svm'`` in this notebook). The printed
        heading is chosen from the name's prefix; names with an unknown
        prefix get an empty heading.

    Returns
    -------
    None
        Results are printed: the algorithm name, the full ``cv_results_``
        dict, the best parameters and the best mean cross-validated
        ``f1_macro`` score.

    Raises
    ------
    KeyError
        If ``model`` is not the name of a module-level variable.
    """
    pipe = Pipeline([
        ('count', count_vectorizer),
        ('tfidf', tf_transformer),
        # The classifier instance is resolved by its global variable name.
        ('clf', globals()[model]),
    ])

    params = {
        'count__ngram_range': [(1, 1), (1, 2), (1, 3), (1, 4), (1, 5)],
        # NOTE(review): per the scikit-learn docs a custom `tokenizer` only
        # applies when analyzer == 'word'; the 'char' runs bypass it — confirm
        # this is intended.
        'count__analyzer': ['word', 'char'],
    }

    # Human-readable heading, selected by the prefix of the variable name.
    algorithm_names = (
        ('lr', 'Logistic Regression'),
        ('nb', 'Multinomial Naive Bayes'),
        ('svm', 'Linear SVM'),
    )
    algorithm = next(
        (label for prefix, label in algorithm_names if model.startswith(prefix)),
        '',
    )

    gd_clf = GridSearchCV(pipe, params, cv=10, scoring=scoring, n_jobs=-1, refit='f1_macro')
    gd_clf.fit(X, y)

    print(algorithm)
    print('------------------')
    print('Scores: ')
    print(gd_clf.cv_results_)
    print('Best params:')
    print(gd_clf.best_params_)
    print('Best score:')
    # Fitted attributes carry a trailing underscore; `best_score` does not
    # exist and raised AttributeError after the whole search had finished.
    print(gd_clf.best_score_)

### Naive Bayes

In [75]:
# Grid-search the pipeline with the `nb` (MultinomialNB) classifier.
tokenize_test_pipeline('nb')

Multinomial Naive Bayes
------------------
Scores: 
{'mean_fit_time': array([ 22.82143555,  54.0353518 ,  89.37386882, 116.17271426,
       166.75812354,   5.2865149 ,  13.01191733,  28.52952027,
        49.60075006,  84.00029309]), 'std_fit_time': array([ 0.51038264,  1.56401284,  5.77294014,  4.50866494, 12.90599934,
        1.15951596,  0.38834506,  1.42791471,  2.55377926, 20.5300274 ]), 'mean_score_time': array([10.34100204, 21.44172318, 33.2644012 , 49.1438055 , 76.52168016,
        2.64268632,  5.2288317 , 12.6390388 , 18.72427282, 57.66140523]), 'std_score_time': array([ 0.56706378,  1.85646073,  4.64300491,  9.24574801, 17.5837646 ,
        0.64748692,  0.39634994,  2.37079997,  3.46028468, 28.57153615]), 'param_count__analyzer': masked_array(data=['word', 'word', 'word', 'word', 'word', 'char', 'char',
                   'char', 'char', 'char'],
             mask=[False, False, False, False, False, False, False, False,
                   False, False],
       fill_value='?',


### Logistic Regression

In [42]:
# Grid-search the pipeline with the `lr` (LogisticRegression) classifier.
# NOTE(review): the output recorded below ("Features: ... / Average Scores:")
# does not match what the current tokenize_test_pipeline prints — it appears to
# come from an earlier version of the function; re-run this cell to refresh it.
tokenize_test_pipeline('lr')

Features:  74495
Average Scores: 
fit_time : 32.170420
score_time : 0.231972
test_accuracy : 0.706063
test_precision_macro : 0.697994
test_recall_macro : 0.491269
test_f1_macro : 0.526272
Logistic Regression


### SVM

In [32]:
# Grid-search the pipeline with the `svm` (LinearSVC) classifier.
# NOTE(review): the output recorded below ("Features: ... / Average Scores:")
# does not match what the current tokenize_test_pipeline prints — it appears to
# come from an earlier version of the function; re-run this cell to refresh it.
tokenize_test_pipeline('svm')

Features:  74495
Average Scores: 
fit_time : 115.475916
score_time : 0.237551
test_accuracy : 0.731291
test_precision_macro : 0.651458
test_recall_macro : 0.602806
test_f1_macro : 0.619864
Linear SVM
