## DS9 Character Classifier

In [1]:
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_union, make_pipeline, Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC

In [2]:
from nltk import text
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

In [3]:
import pandas as pd
import string

In [4]:
def classification_scorer(pipeline):
    """Fit ``pipeline`` on the training split and print its train/test scores.

    The data is not passed in: the function reads the notebook-level names
    ``X_train``, ``y_train``, ``X_test`` and ``y_test``, which must exist
    before it is called.

    Parameters
    ----------
    pipeline : object exposing ``fit``, ``predict`` and ``score``
        Fitted in place as a side effect.

    Returns
    -------
    None
        Results are printed, not returned.
    """
    pipeline.fit(X_train, y_train)
    test_predictions = pipeline.predict(X_test)

    # Score on the data the model was fitted on (whatever pipeline.score reports).
    train_score = pipeline.score(X_train, y_train)
    print('train score:', train_score)

    # Accuracy on the held-out split.
    test_accuracy = accuracy_score(y_test, test_predictions)
    print('accuracy score:', test_accuracy)

In [5]:
# Load the merged dialogue table; the path is relative to the notebook's working directory.
df = pd.read_csv('./merged_df.csv')

In [6]:
# Preview the first rows to check the columns that were loaded.
df.head()

Unnamed: 0,character,text,ep_title_formatted,airdate,ep_title_y,number,rating,season,index
0,LOCUTUS,Resistance is futile.,emissary,3 Jan. 1993,Emissary,1,7.5,1,1
1,LOCUTUS,You will disarm your weapons and escort us to ...,emissary,3 Jan. 1993,Emissary,1,7.5,1,1
2,LOCUTUS,"If you attempt to intervene, we will destroy you.",emissary,3 Jan. 1993,Emissary,1,7.5,1,1
3,LOCUTUS,It is malevolent.,emissary,3 Jan. 1993,Emissary,1,7.5,1,1
4,LOCUTUS,Destroy it now.,emissary,3 Jan. 1993,Emissary,1,7.5,1,1


In [7]:
# value_counts() sorts by frequency (descending), so the first ten entries
# are the ten characters with the most rows in the table.
common_chars = df['character'].value_counts().head(10).index
common_chars

Index(['SISKO', 'KIRA', 'BASHIR', 'QUARK', 'O'BRIEN', 'ODO', 'DAX', 'WORF',
       'GARAK', 'DUKAT'],
      dtype='object')

In [8]:
# Keep only the rows whose character is one of the ten most frequent.
is_common_char = df['character'].isin(common_chars)
common_chars_df = df[is_common_char]

In [9]:
# Tokenize every line exactly once and reuse the per-line token counts for
# each threshold. The previous version re-ran word_tokenize over the whole
# corpus once per threshold (five passes) to produce the same subsets.
token_counts = common_chars_df['text'].map(lambda line: len(word_tokenize(line)))

# Each subset keeps the rows with strictly more than N tokens. The mask
# shares common_chars_df's index, so the selected rows match the old
# positional boolean lists.
longer_than_5_df = common_chars_df[token_counts > 5]
longer_than_8_df = common_chars_df[token_counts > 8]
longer_than_10_df = common_chars_df[token_counts > 10]
longer_than_15_df = common_chars_df[token_counts > 15]
longer_than_20_df = common_chars_df[token_counts > 20]

In [10]:
# Length-filtered subsets, ordered from the loosest to the strictest threshold.
list_of_dfs = [
    longer_than_5_df,
    longer_than_8_df,
    longer_than_10_df,
    longer_than_15_df,
    longer_than_20_df,
]

In [11]:
# How many rows survive each minimum-length filter.
for frame in list_of_dfs:
    print(len(frame))

47421
28985
20351
8091
3060


In [17]:
def baseline_accuracy(df):
    """Return the share of rows that belong to the most frequent character.

    This is the accuracy obtained by always predicting the majority class.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``'character'`` column.

    Returns
    -------
    float
        Count of the most common ``'character'`` value divided by the
        total number of rows in ``df``.
    """
    # value_counts() is sorted in descending order, so position 0 is the majority class.
    class_counts = df['character'].value_counts()
    majority_count = class_counts.iloc[0]
    return majority_count / len(df)

In [18]:
# Majority-class baseline for each length-filtered subset.
for frame in list_of_dfs:
    majority_share = baseline_accuracy(frame)
    print(majority_share)

0.200860378313
0.205037088149
0.205051348828
0.205413422321
0.199673202614


In [19]:
# Learn the character -> integer label mapping from the >5-token subset
# (fit() returns the encoder itself), then show the classes it found.
le = LabelEncoder().fit(longer_than_5_df['character'])
le.classes_.tolist()

['BASHIR',
 'DAX',
 'DUKAT',
 'GARAK',
 'KIRA',
 "O'BRIEN",
 'ODO',
 'QUARK',
 'SISKO',
 'WORF']

In [27]:
# Bag-of-words counts over every line in the full table (one column per token).
cv = CountVectorizer()
cv.fit(df['text'])
to_dense = cv.transform(df['text']).todense()

# CountVectorizer.get_feature_names() was removed in scikit-learn 1.2 in
# favour of get_feature_names_out(); use the new accessor when it exists and
# fall back to the old one so the cell also runs on older installs.
if hasattr(cv, 'get_feature_names_out'):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()

to_dense_df = pd.DataFrame(to_dense, columns=feature_names)

# The 50 most frequent tokens across the corpus.
to_dense_df.sum().sort_values(ascending=False)[:50]

you      31232
the      30496
to       27166
it       14904
of       12247
that     12071
and       9907
we        8881
is        7783
in        7490
have      6972
me        6854
what      6801
be        6342
for       6295
this      5724
re        5538
not       5535
he        5435
on        5387
your      5014
do        4918
but       4745
my        4729
are       4593
they      4592
can       4533
don       4522
was       4513
all       4180
with      4177
ll        3946
know      3938
if        3809
no        3771
there     3650
ve        3486
about     3448
one       3219
here      3075
just      3054
so        2993
going     2906
as        2777
get       2662
him       2598
right     2532
like      2530
will      2506
at        2497
dtype: int64

In [21]:
# Start from NLTK's English stop-word list; later cells extend this list in place.
stop_words_list = stopwords.words('english')

In [23]:
# Corpus-specific stop words. Entries are written without apostrophes because
# cleaner() strips punctuation before it looks words up.
# Fixed: the entry 'didn\t' contained a tab escape (d-i-d-n-TAB) and could
# never match a token; it is now 'didnt'.
stop_words_list.extend(['im', 'go', 'dont', 'know', 'get', 'one', 'want', 'well', 'your',
       'think', 'like', 'us', 'would', 'take', 'that', 'see', 'could',
       'right', 'way', 'make', 'ill', 'say', 'ive', 'tell', 'back',
       'let', 'come', 'thing', 'cant', 'tri', 'two',
       'someth', 'there', 'find', 'talk', 'got', 'didnt', 'sure', 'he', 'id', 'work'])

In [25]:
# Second batch of corpus-specific stop words, appended in place.
stop_words_list += [
    'go', 'your', 'theyr', 'day', 'much', 'use',
    'still', 'mean', 'thought', 'oh', 'anyth',
]

In [None]:
def cleaner(text):
    """Normalize one line of dialogue into a string of stemmed, filtered tokens.

    Steps: remove ASCII punctuation and digits, lower-case, split on
    whitespace, drop stop words and Porter-stem what is left.

    A token is dropped when either the token itself or its stem appears in
    the notebook-level ``stop_words_list``. Both forms are checked because
    that list mixes surface words (e.g. 'dont') with stemmed entries
    (e.g. 'tri', 'someth'), which can only ever match a stemmed token.

    Parameters
    ----------
    text : str
        Raw line of dialogue.

    Returns
    -------
    str
        The surviving stems joined by single spaces ('' if nothing survives).
    """
    stemmer = PorterStemmer()
    # Snapshot the stop list as a set for O(1) membership tests. (This line
    # was previously an unfinished assignment and did not parse.)
    stop = set(stop_words_list)
    # One translation table removes punctuation and digits in a single pass.
    strip_table = str.maketrans('', '', string.punctuation + string.digits)

    final_text = []
    # split() with no arguments already discards leading/trailing whitespace.
    for w in text.translate(strip_table).lower().split():
        if w in stop:
            continue
        stemmed = stemmer.stem(w)
        if stemmed not in stop:
            final_text.append(stemmed)
    return ' '.join(final_text)

In [28]:
# Tokens whose total count over the corpus is below 5.
infrequent_words = to_dense_df.sum().loc[lambda totals: totals < 5].index

In [None]:
infr