## Preprocess the tweets

In [1]:
import pandas as pd

df = pd.read_csv('../data/tweets_original.csv')
df.shape

(69519, 3)

In [2]:
from opencc import OpenCC
import preprocessor as p

cc = OpenCC('t2s')

def clean_text(raw_text):
    detweeted_text = p.clean(raw_text)
    simplied_text = cc.convert(detweeted_text)
    return simplied_text


df['clean_text'] = df['raw_text'].apply(clean_text)
df.head()

Unnamed: 0,id_str,screen_name,raw_text,clean_text
0,1193152895337689088,YXSzzzz,@gww067 Haha there is no way Ryza is not cute 😂,Haha there is no way Ryza is not cute
1,1193152895337689088,YXSzzzz,@somberness :DDD,
2,1193152895337689088,YXSzzzz,@IthzelDoe Love. 🤣,Love. 🤣
3,1193152895337689088,YXSzzzz,😮😮😮 https://t.co/l4jcdkz67Z,
4,1193152895337689088,YXSzzzz,@Kyonko802 😮,


In [3]:
import numpy as np

df = df.replace(r'^\s*$', np.nan, regex=True)
df = df.dropna()
df.shape

(64888, 4)

## Vectorize the tweets

In [4]:
from polyglot.text import Text
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import KFold

X = df.clean_text
y = df.screen_name

def ploy_tokenizer(raw_text):
    return Text(raw_text).words

count_vectorizer = CountVectorizer(tokenizer=ploy_tokenizer)
tf_transformer = TfidfTransformer()

kf = KFold(n_splits=10, shuffle=True, random_state=1)

## Classification

In [5]:
import logging

from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn import metrics

logging.getLogger().setLevel(logging.ERROR)

nb = MultinomialNB()
lr = LogisticRegression(solver='liblinear', multi_class='auto')
svm = LinearSVC()

scoring = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']

def tokenize_test_pipeline(model):
    pipe = Pipeline([
        ('count', count_vectorizer),
        ('tfidf', tf_transformer),
        ('clf', globals()[model]),
    ])

    params = {
        'count__ngram_range': [(1, 1), (1, 2), (1, 3), (1, 4), (1, 5)],
        'count__analyzer': ['word', 'char'],
    }

    algorithm = ''
    if model.startswith('lr'):
        algorithm = 'Logistic Regression'
    elif model.startswith('nb'):
        algorithm = 'Multinomial Naive Bayes'
    elif model.startswith('svm'):
        algorithm = 'Linear SVM'

    gd_clf = GridSearchCV(pipe, params, cv=10, scoring=scoring, n_jobs=-1, refit='f1_macro')
    gd_clf.fit(X, y)

    print(algorithm)
    print('------------------')
    print('Scores: ')
    print(gd_clf.cv_results_)
    print('Best params:')
    print(gd_clf.best_params_)
    print('Best score:')
    print(gd_clf.best_score_)

### Naive Bayes

In [75]:
tokenize_test_pipeline(model='nb')

Multinomial Naive Bayes
------------------
Scores: 
{'mean_fit_time': array([ 22.82143555,  54.0353518 ,  89.37386882, 116.17271426,
       166.75812354,   5.2865149 ,  13.01191733,  28.52952027,
        49.60075006,  84.00029309]), 'std_fit_time': array([ 0.51038264,  1.56401284,  5.77294014,  4.50866494, 12.90599934,
        1.15951596,  0.38834506,  1.42791471,  2.55377926, 20.5300274 ]), 'mean_score_time': array([10.34100204, 21.44172318, 33.2644012 , 49.1438055 , 76.52168016,
        2.64268632,  5.2288317 , 12.6390388 , 18.72427282, 57.66140523]), 'std_score_time': array([ 0.56706378,  1.85646073,  4.64300491,  9.24574801, 17.5837646 ,
        0.64748692,  0.39634994,  2.37079997,  3.46028468, 28.57153615]), 'param_count__analyzer': masked_array(data=['word', 'word', 'word', 'word', 'word', 'char', 'char',
                   'char', 'char', 'char'],
             mask=[False, False, False, False, False, False, False, False,
                   False, False],
       fill_value='?',


### Logistic Regression

In [6]:
tokenize_test_pipeline(model='lr')

Logistic Regression
------------------
Scores: 
{'mean_fit_time': array([ 55.95206192, 165.05135224, 308.45796001, 490.74182959,
       704.78875749,  66.92583542, 191.42893808, 360.63658466,
       680.21014888, 954.86220932]), 'std_fit_time': array([  5.47236038,   3.15659445,  19.47056584,  28.42158136,
        49.85140343,   4.30012517,  16.19323012,  13.34457513,
        24.63225834, 223.7896533 ]), 'mean_score_time': array([10.52984135, 19.17915235, 31.2358391 , 46.58561313, 65.17891693,
        2.07769601,  5.42040668,  9.6248826 , 16.75403554, 18.83134894]), 'std_score_time': array([ 0.25791863,  1.43874218,  2.07569077,  5.780974  , 19.62611423,
        0.14130892,  0.44013282,  0.62595665,  0.98055509,  8.11656266]), 'param_count__analyzer': masked_array(data=['word', 'word', 'word', 'word', 'word', 'char', 'char',
                   'char', 'char', 'char'],
             mask=[False, False, False, False, False, False, False, False,
                   False, False],
       fil

### SVM

In [7]:
tokenize_test_pipeline(model='svm')

Linear SVM
------------------
Scores: 
{'mean_fit_time': array([ 39.13557873,  90.95561879, 148.21184311, 209.62468457,
       284.55015097,  30.32936647,  57.36534059,  89.63355989,
       166.13039422, 240.52759602]), 'std_fit_time': array([ 1.25531657,  3.02542467,  3.67334365,  5.4513434 ,  7.9995442 ,
        1.06682304,  3.51573656,  2.93676364,  3.5661605 , 57.58947559]), 'mean_score_time': array([10.10502214, 18.46141534, 27.71622579, 37.91307306, 52.32033958,
        1.86744795,  5.18878911,  8.97869923, 14.20682335, 16.77347555]), 'std_score_time': array([0.61044843, 0.77308262, 1.25853845, 3.6616263 , 7.48556931,
       0.16546649, 0.72565496, 1.12672797, 2.02688301, 6.8698538 ]), 'param_count__analyzer': masked_array(data=['word', 'word', 'word', 'word', 'word', 'char', 'char',
                   'char', 'char', 'char'],
             mask=[False, False, False, False, False, False, False, False,
                   False, False],
       fill_value='?',
            dtype=objec