In [121]:
import pandas as pd

df = pd.read_csv('../data/tweets.csv')
df.shape

(69519, 3)

## Vectorize the tweets

In [126]:
from polyglot.text import Text
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import KFold

X = df.monolingual_text
y = df.screen_name

def ploy_tokenizer(raw_text):
    return Text(raw_text).words

count_vectorizer = CountVectorizer(tokenizer=ploy_tokenizer)
tf_transformer = TfidfTransformer()

kf = KFold(n_splits=10, shuffle=True, random_state=1)

## Classification

In [127]:
import logging

from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn import metrics

logging.getLogger().setLevel(logging.ERROR)

nb = MultinomialNB()
lr = LogisticRegression(solver='liblinear', multi_class='auto', class_weight='balanced', verbose=1)
svm = LinearSVC(class_weight='balanced', verbose=1)

scoring = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']

def tokenize_test_pipeline(model, dataset_X=X, dataset_y=y):
    pipe = Pipeline([
        ('count', count_vectorizer),
        ('tfidf', tf_transformer),
        ('clf', globals()[model]),
    ])

    params = {
        'count__ngram_range': [(1, 1), (1, 2), (1, 3), (1, 4), (1, 5)],
        'count__analyzer': ['word', 'char'],
    }

    algorithm = ''
    if model.startswith('lr'):
        algorithm = 'Logistic Regression'
    elif model.startswith('nb'):
        algorithm = 'Multinomial Naive Bayes'
    elif model.startswith('svm'):
        algorithm = 'Linear SVM'

    gd_clf = GridSearchCV(pipe, params, cv=10, scoring=scoring, n_jobs=-1, refit='accuracy', verbose=1)
    gd_clf.fit(dataset_X, dataset_y)

    print(algorithm)
    print('------------------')
    print('Scores: ')
    print(gd_clf.cv_results_)
    print('Best params:')
    print(gd_clf.best_params_)
    print('Best score:')
    print(gd_clf.best_score_)

### Logistic Regression

In [23]:
tokenize_test_pipeline(model='lr')

Fitting 10 folds for each of 10 candidates, totalling 100 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed: 29.0min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed: 122.4min finished
[LibLinear]Logistic Regression
------------------
Scores: 
{'mean_fit_time': array([  90.15525188,  232.93297663,  451.50999672,  719.66827676,
        980.32116294,   96.98633301,  255.17148619,  468.58294835,
        875.15281441, 1195.69569204]), 'std_fit_time': array([  5.9519921 ,   9.79278282,  20.80733611,  23.56356231,
        54.25495594,   6.44482546,  18.45416698,  12.39980251,
        35.0238599 , 271.53529192]), 'mean_score_time': array([17.1941143 , 26.21392817, 40.84982026, 55.71520891, 67.03659639,
        2.46889   ,  5.79440479,  9.99990385, 16.24484694, 16.7575417 ]), 'std_score_time': array([0.36333204, 1.88849302, 1.79602701, 3.17261649, 9.08721112,
       0.20355378, 0.23245006, 0.45316449, 0.848

### SVM

In [128]:
tokenize_test_pipeline(model='svm')

Fitting 10 folds for each of 10 candidates, totalling 100 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed: 16.5min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed: 54.6min finished
[LibLinear]Linear SVM
------------------
Scores: 
{'mean_fit_time': array([ 58.71042354, 140.47947326, 238.94598553, 347.43826673,
       474.99477327, 106.56099887, 103.59562347, 148.85106387,
       273.57449019, 415.7658452 ]), 'std_fit_time': array([ 3.14350209,  4.91390437,  5.22551652, 10.00073211, 17.8378984 ,
        5.10242396,  3.92357788,  2.77836701,  3.90592974, 69.02634004]), 'mean_score_time': array([11.91829133, 22.1799896 , 32.97235961, 44.99313819, 59.34663827,
        2.2784024 ,  5.8568759 ,  9.63819273, 15.62063251, 17.16798544]), 'std_score_time': array([0.93631485, 1.09209493, 1.7998025 , 1.37603647, 2.2468502 ,
       0.23829812, 0.50866111, 1.10216584, 1.13305516, 5.76355094]), 'param_count__an