In [1]:
import pandas as pd
import numpy as np

In [2]:
train_set = pd.read_pickle('train_set.pkl')
strat_train_set = pd.read_pickle('strat_train_set.pkl')
strat_test_set = pd.read_pickle('strat_test_set.pkl')
strat_valid_set = pd.read_pickle('strat_valid_set.pkl')

### [Test] Sample to class ratios 1:1:1

In [34]:
min_tag_count = strat_train_set['tags_id'].value_counts().min()

strat_train_set['tags_id'].value_counts(), min_tag_count

(2    1274
 0     557
 1     505
 Name: tags_id, dtype: int64, 505)

In [35]:
strat_train_set['tags_id'].value_counts()[2]

1274

In [36]:
np.random.seed(10)

def balance_tag_counts(tag, df):
    remove_n = df['tags_id'].value_counts()[tag] - min_tag_count
    drop_indices = np.random.choice(df[df['tags_id'] == tag].index, remove_n, replace=False)
    df = df.drop(drop_indices)
    return df

In [37]:
print(strat_train_set.shape)
strat_train_set = balance_tag_counts(2, strat_train_set)
print(strat_train_set.shape)
strat_train_set = balance_tag_counts(1, strat_train_set)
print(strat_train_set.shape)
strat_train_set = balance_tag_counts(0, strat_train_set)
strat_train_set.shape

(2336, 6)
(1567, 6)
(1567, 6)


(1515, 6)

In [91]:
strat_train_set['tags_id'].value_counts()

2    1274
0     557
1     505
Name: tags_id, dtype: int64

### Vectorize input

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [4]:
def identity_tokenizer(text):
    return text

tfidf = TfidfVectorizer(tokenizer=identity_tokenizer, lowercase=False, min_df=0.05, max_df=0.8) 
tfidf_char = TfidfVectorizer(lowercase=False, min_df=0.05, max_df=0.8, analyzer='char', ngram_range=(2,4))
#tfidf = tfidf.fit(train_set['segmented'].values)

# Pipeline and Select a Model

In [65]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

### Multinominal Naive Bayes Classifier

In [75]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score

In [80]:
mnb = MultinomialNB()

In [81]:
mnb_pipe = Pipeline(
    steps=[('tfidf', tfidf_char), 
           ('mnb', mnb)]
)

In [70]:
mnb_pipe.fit(strat_train_set['cleaned'].values, strat_train_set['tags_id'].values)\
.score(strat_valid_set['cleaned'].values, strat_valid_set['tags_id'].values)

0.9871630295250321

### Supper Vector Machines Classifier

In [50]:
from sklearn.svm import SVC

In [82]:
svc = SVC(gamma='auto')

In [83]:
svc_pipe = Pipeline(
    steps=[('tfidf', tfidf_char), 
           ('svc', svc)]
)

In [74]:
svc_pipe.fit(strat_train_set['cleaned'].values, strat_train_set['tags_id'].values)\
.score(strat_valid_set['cleaned'].values, strat_valid_set['tags_id'].values)

0.5442875481386393

### K-Nearest Neighbors Classifier

In [54]:
from sklearn.neighbors import KNeighborsClassifier 

In [84]:
neigh = KNeighborsClassifier(n_neighbors=3)

In [86]:
neigh_pipe = Pipeline(
    steps=[('tfidf', tfidf_char), 
           ('neigh', neigh)]
)

In [61]:
neigh_pipe.fit(strat_train_set['cleaned'].values, strat_train_set['tags_id'].values)\
.score(strat_valid_set['cleaned'].values, strat_valid_set['tags_id'].values)

0.9717586649550706

# GridSearch

GridSearch on MNB and Neigh

In [90]:
mnb_parameters = {
    'tfidf__max_df': (0.75, 1.0),
    #'tfidf__max_features': (None, 1000, 1500, 2000, 5000),
    'tfidf__ngram_range': ((1, 4), (2, 4)),  # unigrams or bigrams
    'tfidf__use_idf': (True, False),
    #'tfidf__norm': ('l1', 'l2'),
}

mnb_grid = GridSearchCV(mnb_pipe, mnb_parameters, cv=5,
                               n_jobs=-1, verbose=1)

In [91]:
neigh_parameters = {
    'tfidf__max_df': (0.75, 1.0),
    #'tfidf__max_features': (None, 1000, 1500, 2000, 5000),
    'tfidf__ngram_range': ((1, 4), (2, 4)),  # unigrams or bigrams
    'tfidf__use_idf': (True, False),
    #'tfidf__norm': ('l1', 'l2'),
    'neigh__weights': ('uniform', 'distance'),
}

neigh_grid = GridSearchCV(neigh_pipe, neigh_parameters, cv=5,
                               n_jobs=-1, verbose=1)

In [93]:
mnb_grid.fit(strat_train_set['cleaned'].values, strat_train_set['tags_id'].values)

Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:  8.6min finished


GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='char', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=False, max_df=0.8, max_features=None, min_df=0.05,
        ngram_range=(2, 4), norm='l2', preprocessor=None, smooth_idf=T...rue,
        vocabulary=None)), ('mnb', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))]),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'tfidf__max_df': (0.75, 1.0), 'tfidf__ngram_range': ((1, 4), (2, 4)), 'tfidf__use_idf': (True, False)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=1)

In [100]:
mnb_grid.cv_results_



{'mean_fit_time': array([41.77883558, 33.18874226, 31.15381718, 30.56107492, 33.253197  ,
        38.71106129, 42.61868896, 41.9627943 ]),
 'std_fit_time': array([4.11338215, 0.50902765, 0.51844353, 0.33286636, 1.15856888,
        1.01704529, 8.65308843, 3.64538768]),
 'mean_score_time': array([3.32255659, 2.90699229, 2.30756764, 2.30824175, 3.67066927,
        3.58707809, 2.5140902 , 2.3443418 ]),
 'std_score_time': array([0.18463748, 0.07783609, 0.07774013, 0.12413056, 1.51964687,
        0.84627309, 0.18021793, 0.10587431]),
 'param_tfidf__max_df': masked_array(data=[0.75, 0.75, 0.75, 0.75, 1.0, 1.0, 1.0, 1.0],
              mask=[False, False, False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_tfidf__ngram_range': masked_array(data=[(1, 4), (1, 4), (2, 4), (2, 4), (1, 4), (1, 4), (2, 4),
                    (2, 4)],
              mask=[False, False, False, False, False, False, False, False],
        fill_value='?',
             dt

In [95]:
mnb_grid.best_score_

0.9931506849315068

In [96]:
mnb_grid.best_params_

{'tfidf__max_df': 1.0, 'tfidf__ngram_range': (2, 4), 'tfidf__use_idf': False}

In [101]:
neigh_grid.fit(strat_train_set['cleaned'].values, strat_train_set['tags_id'].values)

Fitting 5 folds for each of 16 candidates, totalling 80 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


KeyboardInterrupt: 