## DS9 Character Classifier

In [62]:
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_union, make_pipeline, Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC

In [2]:
from nltk import text
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

In [3]:
import pandas as pd
import string

In [4]:
def classification_scorer(pipeline):
    pipeline.fit(X_train, y_train)
    preds = pipeline.predict(X_test)
    print('train score:', pipeline.score(X_train, y_train))
    print('accuracy score:', accuracy_score(y_test, preds))
    #print(confusion_matrix(y_test, preds))

In [5]:
df = pd.read_csv('./merged_df.csv')

In [6]:
df.head()

Unnamed: 0,character,text,ep_title_formatted,airdate,ep_title_y,number,rating,season,index
0,LOCUTUS,Resistance is futile.,emissary,3 Jan. 1993,Emissary,1,7.5,1,1
1,LOCUTUS,You will disarm your weapons and escort us to ...,emissary,3 Jan. 1993,Emissary,1,7.5,1,1
2,LOCUTUS,"If you attempt to intervene, we will destroy you.",emissary,3 Jan. 1993,Emissary,1,7.5,1,1
3,LOCUTUS,It is malevolent.,emissary,3 Jan. 1993,Emissary,1,7.5,1,1
4,LOCUTUS,Destroy it now.,emissary,3 Jan. 1993,Emissary,1,7.5,1,1


In [7]:
common_chars = df['character'].value_counts()[:10].index
common_chars

Index(['SISKO', 'KIRA', 'BASHIR', 'QUARK', 'O'BRIEN', 'ODO', 'DAX', 'WORF',
       'GARAK', 'DUKAT'],
      dtype='object')

In [8]:
common_chars_df = df.loc[df['character'].isin(common_chars)]

In [9]:
count_array = [len(word_tokenize(line)) > 5 for line in common_chars_df['text']]
longer_than_5_df = common_chars_df[count_array]

count_array = [len(word_tokenize(line)) > 8 for line in common_chars_df['text']]
longer_than_8_df = common_chars_df[count_array]

count_array = [len(word_tokenize(line)) > 10 for line in common_chars_df['text']]
longer_than_10_df = common_chars_df[count_array]

count_array = [len(word_tokenize(line)) > 15 for line in common_chars_df['text']]
longer_than_15_df = common_chars_df[count_array]

count_array = [len(word_tokenize(line)) > 20 for line in common_chars_df['text']]
longer_than_20_df = common_chars_df[count_array]

In [10]:
list_of_dfs = [longer_than_5_df, longer_than_8_df, longer_than_10_df, longer_than_15_df, longer_than_20_df]

In [11]:
for lists in list_of_dfs:
    print(lists.shape[0])

47421
28985
20351
8091
3060


In [17]:
def baseline_accuracy(df):
    return df['character'].value_counts().values[0]/df.shape[0]

In [18]:
for lists in list_of_dfs:
    print(baseline_accuracy(lists))

0.200860378313
0.205037088149
0.205051348828
0.205413422321
0.199673202614


In [19]:
le = LabelEncoder()
le.fit(longer_than_5_df['character'])
list(le.classes_)

['BASHIR',
 'DAX',
 'DUKAT',
 'GARAK',
 'KIRA',
 "O'BRIEN",
 'ODO',
 'QUARK',
 'SISKO',
 'WORF']

In [27]:
cv = CountVectorizer()
cv.fit(df['text'])
to_dense = cv.transform(df['text']).todense()
to_dense_df = pd.DataFrame(to_dense, columns=cv.get_feature_names())
to_dense_df.sum().sort_values(ascending=False)[:50]

you      31232
the      30496
to       27166
it       14904
of       12247
that     12071
and       9907
we        8881
is        7783
in        7490
have      6972
me        6854
what      6801
be        6342
for       6295
this      5724
re        5538
not       5535
he        5435
on        5387
your      5014
do        4918
but       4745
my        4729
are       4593
they      4592
can       4533
don       4522
was       4513
all       4180
with      4177
ll        3946
know      3938
if        3809
no        3771
there     3650
ve        3486
about     3448
one       3219
here      3075
just      3054
so        2993
going     2906
as        2777
get       2662
him       2598
right     2532
like      2530
will      2506
at        2497
dtype: int64

In [21]:
stop_words_list = stopwords.words('english')

In [23]:
stop_words_list.extend(['im', 'go', 'dont', 'know', 'get', 'one', 'want', 'well', 'your',
       'think', 'like', 'us', 'would', 'take', 'that', 'see', 'could',
       'right', 'way', 'make', 'ill', 'say', 'ive', 'tell', 'back',
       'let', 'come', 'thing', 'cant', 'tri', 'two',
       'someth', 'there', 'find', 'talk', 'got', 'didn\t', 'sure', 'he', 'id', 'work'])

In [25]:
stop_words_list.extend(['go', 'your', 'theyr', 'day', 'much', 'use', 'still', 'mean', 'thought', 'oh', 'anyth'])

In [30]:
len(stop_words_list)

205

In [None]:
infrequent_words = to_dense_df.sum()[to_dense_df.sum() < 5].index

In [31]:
stop_words_list.extend(infrequent_words)

In [32]:
len(stop_words_list)

12373

In [33]:
def cleaner(text):
    stemmer = PorterStemmer()
    stop = stop_words_list 
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = text.translate(str.maketrans('', '', string.digits))
    text = text.lower().strip()
    final_text = []
    for w in text.split():
        if w not in stop:
            final_text.append(stemmer.stem(w.strip()))
    return ' '.join(final_text)

## Fitting models

In [34]:
# toy example to check for bugs
X = longer_than_5_df['text']
y = le.transform(longer_than_5_df['character'])
X_train, X_test, y_train, y_test = train_test_split(X, y)
rfc_pipe = make_pipeline(
    CountVectorizer(stop_words='english'),
    RandomForestClassifier()
)

classification_scorer(rfc_pipe)

train score: 0.893434556446
accuracy score: 0.237938596491


In [35]:
X = longer_than_5_df['text']
y = le.transform(longer_than_5_df['character'])
X_train, X_test, y_train, y_test = train_test_split(X, y)
rfc_pipe = make_pipeline(
    CountVectorizer(preprocessor=cleaner),
    RandomForestClassifier()
)

classification_scorer(rfc_pipe)

train score: 0.8684943062
accuracy score: 0.23895074224


In [36]:
le.inverse_transform(rfc_pipe.predict(['diagnosis', 'doctor', 'my dear', 'old man', 'testing a longer string']))

array(['SISKO', 'SISKO', 'KIRA', 'SISKO', 'SISKO'], dtype=object)

In [37]:
print(classification_report(y_test, rfc_pipe.predict(X_test)))

             precision    recall  f1-score   support

          0       0.25      0.24      0.24      1451
          1       0.20      0.15      0.18      1008
          2       0.16      0.11      0.13       443
          3       0.12      0.08      0.09       532
          4       0.19      0.20      0.19      1522
          5       0.25      0.23      0.24      1359
          6       0.19      0.16      0.17      1313
          7       0.28      0.27      0.28      1386
          8       0.28      0.41      0.33      2337
          9       0.22      0.17      0.19       505

avg / total       0.23      0.24      0.23     11856



In [38]:
for l in list_of_dfs:
    X = l['text']
    y = le.transform(l['character'])
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    print(classification_scorer(rfc_pipe))

train score: 0.870743708702
accuracy score: 0.237769905533
None
train score: 0.949949397369
accuracy score: 0.244928936111
None
train score: 0.974972154884
accuracy score: 0.242727987421
None
train score: 0.987640079103
accuracy score: 0.253583786456
None
train score: 0.988235294118
accuracy score: 0.245751633987
None


In [40]:
tfid_pipe = make_pipeline(
    TfidfVectorizer(preprocessor=cleaner),
    RandomForestClassifier()
)

for l in list_of_dfs:
    X = l['text']
    y = le.transform(l['character'])
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    print(classification_scorer(tfid_pipe))

train score: 0.872205820329
accuracy score: 0.249240890688
None
train score: 0.956573741835
accuracy score: 0.248240651304
None
train score: 0.97837908668
accuracy score: 0.253144654088
None
train score: 0.987804878049
accuracy score: 0.244686109738
None
train score: 0.98779956427
accuracy score: 0.239215686275
None


In [39]:
tfid_pipe = make_pipeline(
    TfidfVectorizer(stop_words='english'),
    RandomForestClassifier()
)

for l in list_of_dfs:
    X = l['text']
    y = le.transform(l['character'])
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    print(classification_scorer(tfid_pipe))

train score: 0.89748348095
accuracy score: 0.248313090418
None
train score: 0.971892538412
accuracy score: 0.25307023596
None
train score: 0.985258468191
accuracy score: 0.240959119497
None
train score: 0.990771259064
accuracy score: 0.237765694513
None
train score: 0.98779956427
accuracy score: 0.239215686275
None


In [43]:
X_train, X_test, y_train, y_test = train_test_split(longer_than_15_df['text'], longer_than_15_df['character'])

lr_pipe = make_pipeline(
    CountVectorizer(preprocessor=cleaner),
    LogisticRegressionCV()
)

classification_scorer(lr_pipe)

train score: 0.481872116018
accuracy score: 0.283242708848


In [44]:
X_train, X_test, y_train, y_test = train_test_split(longer_than_15_df['text'], le.transform(longer_than_15_df['character']))

lr_pipe = make_pipeline(
    CountVectorizer(preprocessor=cleaner),
    LogisticRegressionCV()
)

classification_scorer(lr_pipe)

train score: 0.561140408701
accuracy score: 0.29115175482


In [45]:
X_train, X_test, y_train, y_test = train_test_split(longer_than_15_df['text'], le.transform(longer_than_15_df['character']))

rfc_pipe = make_pipeline(
    CountVectorizer(preprocessor=cleaner),
    RandomForestClassifier()
)

classification_scorer(rfc_pipe)

train score: 0.989452867502
accuracy score: 0.252100840336


In [48]:
rfc_pipe.get_params().keys()

dict_keys(['memory', 'steps', 'countvectorizer', 'randomforestclassifier', 'countvectorizer__analyzer', 'countvectorizer__binary', 'countvectorizer__decode_error', 'countvectorizer__dtype', 'countvectorizer__encoding', 'countvectorizer__input', 'countvectorizer__lowercase', 'countvectorizer__max_df', 'countvectorizer__max_features', 'countvectorizer__min_df', 'countvectorizer__ngram_range', 'countvectorizer__preprocessor', 'countvectorizer__stop_words', 'countvectorizer__strip_accents', 'countvectorizer__token_pattern', 'countvectorizer__tokenizer', 'countvectorizer__vocabulary', 'randomforestclassifier__bootstrap', 'randomforestclassifier__class_weight', 'randomforestclassifier__criterion', 'randomforestclassifier__max_depth', 'randomforestclassifier__max_features', 'randomforestclassifier__max_leaf_nodes', 'randomforestclassifier__min_impurity_decrease', 'randomforestclassifier__min_impurity_split', 'randomforestclassifier__min_samples_leaf', 'randomforestclassifier__min_samples_spli

In [None]:
RandomForestClassifier()

In [52]:
params_grid = {
    'randomforestclassifier__n_estimators': [5, 10, 15],
    'randomforestclassifier__max_depth': [None, 5, 10],
    'randomforestclassifier__min_samples_split': [2, 5],
    'randomforestclassifier__min_samples_leaf': [1, 3]
}

In [53]:
gs = GridSearchCV(rfc_pipe, params_grid, verbose=2)

In [54]:
gs.fit(X_train, y_train)

Fitting 3 folds for each of 36 candidates, totalling 108 fits
[CV] randomforestclassifier__max_depth=None, randomforestclassifier__min_samples_leaf=1, randomforestclassifier__min_samples_split=2, randomforestclassifier__n_estimators=5 
[CV]  randomforestclassifier__max_depth=None, randomforestclassifier__min_samples_leaf=1, randomforestclassifier__min_samples_split=2, randomforestclassifier__n_estimators=5, total=  11.8s
[CV] randomforestclassifier__max_depth=None, randomforestclassifier__min_samples_leaf=1, randomforestclassifier__min_samples_split=2, randomforestclassifier__n_estimators=5 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   19.5s remaining:    0.0s


[CV]  randomforestclassifier__max_depth=None, randomforestclassifier__min_samples_leaf=1, randomforestclassifier__min_samples_split=2, randomforestclassifier__n_estimators=5, total=  11.7s
[CV] randomforestclassifier__max_depth=None, randomforestclassifier__min_samples_leaf=1, randomforestclassifier__min_samples_split=2, randomforestclassifier__n_estimators=5 
[CV]  randomforestclassifier__max_depth=None, randomforestclassifier__min_samples_leaf=1, randomforestclassifier__min_samples_split=2, randomforestclassifier__n_estimators=5, total=  11.7s
[CV] randomforestclassifier__max_depth=None, randomforestclassifier__min_samples_leaf=1, randomforestclassifier__min_samples_split=2, randomforestclassifier__n_estimators=10 
[CV]  randomforestclassifier__max_depth=None, randomforestclassifier__min_samples_leaf=1, randomforestclassifier__min_samples_split=2, randomforestclassifier__n_estimators=10, total=  11.9s
[CV] randomforestclassifier__max_depth=None, randomforestclassifier__min_samples_le

[CV]  randomforestclassifier__max_depth=None, randomforestclassifier__min_samples_leaf=3, randomforestclassifier__min_samples_split=2, randomforestclassifier__n_estimators=15, total=  11.6s
[CV] randomforestclassifier__max_depth=None, randomforestclassifier__min_samples_leaf=3, randomforestclassifier__min_samples_split=2, randomforestclassifier__n_estimators=15 
[CV]  randomforestclassifier__max_depth=None, randomforestclassifier__min_samples_leaf=3, randomforestclassifier__min_samples_split=2, randomforestclassifier__n_estimators=15, total=  11.6s
[CV] randomforestclassifier__max_depth=None, randomforestclassifier__min_samples_leaf=3, randomforestclassifier__min_samples_split=2, randomforestclassifier__n_estimators=15 
[CV]  randomforestclassifier__max_depth=None, randomforestclassifier__min_samples_leaf=3, randomforestclassifier__min_samples_split=2, randomforestclassifier__n_estimators=15, total=  11.6s
[CV] randomforestclassifier__max_depth=None, randomforestclassifier__min_samples

[CV]  randomforestclassifier__max_depth=5, randomforestclassifier__min_samples_leaf=1, randomforestclassifier__min_samples_split=5, randomforestclassifier__n_estimators=5, total=  11.5s
[CV] randomforestclassifier__max_depth=5, randomforestclassifier__min_samples_leaf=1, randomforestclassifier__min_samples_split=5, randomforestclassifier__n_estimators=10 
[CV]  randomforestclassifier__max_depth=5, randomforestclassifier__min_samples_leaf=1, randomforestclassifier__min_samples_split=5, randomforestclassifier__n_estimators=10, total=  11.5s
[CV] randomforestclassifier__max_depth=5, randomforestclassifier__min_samples_leaf=1, randomforestclassifier__min_samples_split=5, randomforestclassifier__n_estimators=10 
[CV]  randomforestclassifier__max_depth=5, randomforestclassifier__min_samples_leaf=1, randomforestclassifier__min_samples_split=5, randomforestclassifier__n_estimators=10, total=  11.5s
[CV] randomforestclassifier__max_depth=5, randomforestclassifier__min_samples_leaf=1, randomfore

[CV]  randomforestclassifier__max_depth=5, randomforestclassifier__min_samples_leaf=3, randomforestclassifier__min_samples_split=5, randomforestclassifier__n_estimators=15, total=  11.5s
[CV] randomforestclassifier__max_depth=5, randomforestclassifier__min_samples_leaf=3, randomforestclassifier__min_samples_split=5, randomforestclassifier__n_estimators=15 
[CV]  randomforestclassifier__max_depth=5, randomforestclassifier__min_samples_leaf=3, randomforestclassifier__min_samples_split=5, randomforestclassifier__n_estimators=15, total=  11.5s
[CV] randomforestclassifier__max_depth=10, randomforestclassifier__min_samples_leaf=1, randomforestclassifier__min_samples_split=2, randomforestclassifier__n_estimators=5 
[CV]  randomforestclassifier__max_depth=10, randomforestclassifier__min_samples_leaf=1, randomforestclassifier__min_samples_split=2, randomforestclassifier__n_estimators=5, total=  11.5s
[CV] randomforestclassifier__max_depth=10, randomforestclassifier__min_samples_leaf=1, randomfo

[CV]  randomforestclassifier__max_depth=10, randomforestclassifier__min_samples_leaf=3, randomforestclassifier__min_samples_split=2, randomforestclassifier__n_estimators=10, total=  11.5s
[CV] randomforestclassifier__max_depth=10, randomforestclassifier__min_samples_leaf=3, randomforestclassifier__min_samples_split=2, randomforestclassifier__n_estimators=10 
[CV]  randomforestclassifier__max_depth=10, randomforestclassifier__min_samples_leaf=3, randomforestclassifier__min_samples_split=2, randomforestclassifier__n_estimators=10, total=  11.5s
[CV] randomforestclassifier__max_depth=10, randomforestclassifier__min_samples_leaf=3, randomforestclassifier__min_samples_split=2, randomforestclassifier__n_estimators=10 
[CV]  randomforestclassifier__max_depth=10, randomforestclassifier__min_samples_leaf=3, randomforestclassifier__min_samples_split=2, randomforestclassifier__n_estimators=10, total=  11.5s
[CV] randomforestclassifier__max_depth=10, randomforestclassifier__min_samples_leaf=3, ran

[Parallel(n_jobs=1)]: Done 108 out of 108 | elapsed: 34.6min finished


GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('countvectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1),
        preprocessor=<function cleaner...n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'randomforestclassifier__n_estimators': [5, 10, 15], 'randomforestclassifier__max_depth': [None, 5, 10], 'randomforestclassifier__min_samples_split': [2, 5], 'randomforestclassifier__min_samples_leaf': [1, 3]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=2)

In [55]:
gs.best_params_

{'randomforestclassifier__max_depth': None,
 'randomforestclassifier__min_samples_leaf': 3,
 'randomforestclassifier__min_samples_split': 5,
 'randomforestclassifier__n_estimators': 15}

In [56]:
gs.best_score_

0.27653263019116675

In [60]:
X_train, X_test, y_train, y_test = train_test_split(longer_than_15_df['text'], le.transform(longer_than_15_df['character']))

lr_pipe = make_pipeline(
    TfidfVectorizer(preprocessor=cleaner),
    LogisticRegressionCV()
)

classification_scorer(lr_pipe)

train score: 0.532300593276
accuracy score: 0.29115175482


In [70]:
X_train, X_test, y_train, y_test = train_test_split(longer_than_15_df['text'], le.transform(longer_than_15_df['character']))

lr_pipe = make_pipeline(
    CountVectorizer(preprocessor=cleaner),
    LogisticRegressionCV()
)

In [61]:
#lr_pipe.get_params()

In [74]:
params_grid = {
    'countvectorizer__max_df': [0.8, 1.0],
    'logisticregressioncv__fit_intercept': [True, False]
}

In [75]:
gs = GridSearchCV(lr_pipe, params_grid, verbose=2)

In [76]:
gs.fit(X_train, y_train)

Fitting 3 folds for each of 4 candidates, totalling 12 fits
[CV] countvectorizer__max_df=0.8, logisticregressioncv__fit_intercept=True 
[CV]  countvectorizer__max_df=0.8, logisticregressioncv__fit_intercept=True, total=  24.0s
[CV] countvectorizer__max_df=0.8, logisticregressioncv__fit_intercept=True 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   31.7s remaining:    0.0s


[CV]  countvectorizer__max_df=0.8, logisticregressioncv__fit_intercept=True, total=  24.2s
[CV] countvectorizer__max_df=0.8, logisticregressioncv__fit_intercept=True 
[CV]  countvectorizer__max_df=0.8, logisticregressioncv__fit_intercept=True, total=  24.1s
[CV] countvectorizer__max_df=0.8, logisticregressioncv__fit_intercept=False 
[CV]  countvectorizer__max_df=0.8, logisticregressioncv__fit_intercept=False, total=  21.0s
[CV] countvectorizer__max_df=0.8, logisticregressioncv__fit_intercept=False 
[CV]  countvectorizer__max_df=0.8, logisticregressioncv__fit_intercept=False, total=  20.9s
[CV] countvectorizer__max_df=0.8, logisticregressioncv__fit_intercept=False 
[CV]  countvectorizer__max_df=0.8, logisticregressioncv__fit_intercept=False, total=  21.2s
[CV] countvectorizer__max_df=1.0, logisticregressioncv__fit_intercept=True 
[CV]  countvectorizer__max_df=1.0, logisticregressioncv__fit_intercept=True, total=  24.0s
[CV] countvectorizer__max_df=1.0, logisticregressioncv__fit_intercep

[Parallel(n_jobs=1)]: Done  12 out of  12 | elapsed:  6.0min finished


GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('countvectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1),
        preprocessor=<function cleaner...2', random_state=None,
           refit=True, scoring=None, solver='lbfgs', tol=0.0001, verbose=0))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'countvectorizer__max_df': [0.8, 1.0], 'logisticregressioncv__fit_intercept': [True, False]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=2)

In [77]:
gs.best_score_

0.24785761371127224

In [None]:
 'countvectorizer__max_df': 1.0,
 'countvectorizer__max_features': None,
 'countvectorizer__min_df': 1,
 'countvectorizer__ngram_range': (1, 1),
'logisticregressioncv__fit_intercept': True,
    'logisticregressioncv__penalty': 'l2',

In [66]:
# X_train, X_test, y_train, y_test = train_test_split(longer_than_15_df['text'], le.transform(longer_than_15_df['character']))

# gnb_pipe = make_pipeline(
#     CountVectorizer(preprocessor=cleaner),
#     GaussianNB()
# )

# classification_scorer(gnb_pipe)

In [65]:
mnb_pipe = make_pipeline(
    CountVectorizer(preprocessor=cleaner),
    MultinomialNB()
)

classification_scorer(mnb_pipe)

train score: 0.634146341463
accuracy score: 0.302521008403


In [None]:
mnb_pipe = make_pipeline(
    CountVectorizer(preprocessor=cleaner),
    MultinomialNB()
)

classification_scorer(mnb_pipe)