In [211]:
import spacy

In [270]:
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_union, make_pipeline, Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC

In [92]:
from nltk import text
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

In [220]:
import pandas as pd
import string

In [107]:
def classification_scorer(pipeline):
    pipeline.fit(X_train, y_train)
    preds = pipeline.predict(X_test)
    print('train score:', pipeline.score(X_train, y_train))
    print('accuracy score:', accuracy_score(y_test, preds))
    #print(confusion_matrix(y_test, preds))

In [4]:
df = pd.read_csv('./merged_df.csv')

In [5]:
df.head()

Unnamed: 0,character,text,ep_title_formatted,airdate,ep_title_y,number,rating,season,index
0,LOCUTUS,Resistance is futile.,emissary,3 Jan. 1993,Emissary,1,7.5,1,1
1,LOCUTUS,You will disarm your weapons and escort us to ...,emissary,3 Jan. 1993,Emissary,1,7.5,1,1
2,LOCUTUS,"If you attempt to intervene, we will destroy you.",emissary,3 Jan. 1993,Emissary,1,7.5,1,1
3,LOCUTUS,It is malevolent.,emissary,3 Jan. 1993,Emissary,1,7.5,1,1
4,LOCUTUS,Destroy it now.,emissary,3 Jan. 1993,Emissary,1,7.5,1,1


In [114]:
common_chars = df['character'].value_counts()[:10].index
common_chars

Index(['SISKO', 'KIRA', 'BASHIR', 'QUARK', 'O'BRIEN', 'ODO', 'DAX', 'WORF',
       'GARAK', 'DUKAT'],
      dtype='object')

In [115]:
#[sent for sent in df.loc[df['character'] == 'BASHIR']['text'] if len(word_tokenize(sent)) > 5]

In [116]:
# for each in df.loc[df['character'] == 'BASHIR']['text']:
#     if len(word_tokenize(each)) > 5:
#         print(each)

In [117]:
common_chars_df = df.loc[df['character'].isin(common_chars)]

In [118]:
count_array = [len(word_tokenize(line)) > 5 for line in common_chars_df['text']]
longer_than_5_df = common_chars_df[count_array]

count_array = [len(word_tokenize(line)) > 8 for line in common_chars_df['text']]
longer_than_8_df = common_chars_df[count_array]

count_array = [len(word_tokenize(line)) > 10 for line in common_chars_df['text']]
longer_than_10_df = common_chars_df[count_array]

count_array = [len(word_tokenize(line)) > 15 for line in common_chars_df['text']]
longer_than_15_df = common_chars_df[count_array]

count_array = [len(word_tokenize(line)) > 20 for line in common_chars_df['text']]
longer_than_20_df = common_chars_df[count_array]

In [146]:
list_of_dfs = [longer_than_5_df, longer_than_8_df, longer_than_10_df, longer_than_15_df, longer_than_20_df]

In [197]:
longer_than_8_df['character'].unique()

array(['SISKO', "O'BRIEN", 'KIRA', 'ODO', 'QUARK', 'BASHIR', 'DAX',
       'DUKAT', 'GARAK', 'WORF'], dtype=object)

In [147]:
for lists in list_of_dfs:
    print(lists.shape[0])

47421
28985
20351
8091
3060


In [157]:
def baseline_accuracy(df):
    return df['character'].value_counts().values[0]/df.shape[0]

In [158]:
for lists in list_of_dfs:
    print(baseline_accuracy(lists))

0.200860378313
0.205037088149
0.205051348828
0.205413422321
0.199673202614


In [159]:
le = LabelEncoder()
le.fit(longer_than_5_df['character'])
list(le.classes_)

['BASHIR',
 'DAX',
 'DUKAT',
 'GARAK',
 'KIRA',
 "O'BRIEN",
 'ODO',
 'QUARK',
 'SISKO',
 'WORF']

## Modeling

First, confirm a very simple example works:

In [160]:
X = longer_than_5_df['text']
y = le.transform(longer_than_5_df['character'])
X_train, X_test, y_train, y_test = train_test_split(X, y)
rfc_pipe = make_pipeline(
    CountVectorizer(stop_words='english'),
    RandomForestClassifier()
)

classification_scorer(rfc_pipe)

train score: 0.891072683818
accuracy score: 0.230347503374


In [144]:
rfc_pipe.predict(['test', 'test'])

array([7, 7])

In [164]:
le.inverse_transform(rfc_pipe.predict(['diagnosis', 'doctor', 'my dear', 'old man', 'the new generation of annoying quote-RTs is people quote-RTing people that are actually making the point that a bad thing is bad but the quote-RTer misinterpreted it but doesn\'t bother to delete because the quote RT is doing numbers']))

array(['BASHIR', 'SISKO', 'GARAK', 'SISKO', 'WORF'], dtype=object)

In [165]:
le.inverse_transform(rfc_pipe.predict(['There is a drag show called Dragula that does not use the Rob Zombie song as its theme.']))

array(['QUARK'], dtype=object)

In [170]:
def temp_predictor(a_string):
    print(le.inverse_transform(rfc_pipe.predict([a_string])))

In [176]:
# Classifiers to test: RandomForestClassifier, AdaBoostClassifier

In [195]:
X_train, X_test, y_train, y_test = train_test_split(longer_than_5_df['text'], longer_than_5_df['character'])

rfc_pipe = make_pipeline(
    CountVectorizer(stop_words='english'),
    RandomForestClassifier()
)

classification_scorer(rfc_pipe)

train score: 0.891410094194
accuracy score: 0.241649797571


In [266]:
print(classification_report(y_test, rfc_pipe.predict(X_test)))

             precision    recall  f1-score   support

          0       0.21      0.54      0.31       667
          1       0.57      0.12      0.20       445
          2       0.56      0.20      0.29       236
          3       0.52      0.21      0.30       258
          4       0.33      0.17      0.23       600
          5       0.34      0.21      0.26       521
          6       0.32      0.24      0.27       571
          7       0.46      0.23      0.31       542
          8       0.30      0.51      0.38      1057
          9       0.68      0.15      0.24       191

avg / total       0.38      0.31      0.29      5088



In [269]:
le.inverse_transform([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

array(['BASHIR', 'DAX', 'DUKAT', 'GARAK', 'KIRA', "O'BRIEN", 'ODO',
       'QUARK', 'SISKO', 'WORF'], dtype=object)

In [None]:
X = longer_than_5_df['text']
y = le.transform(longer_than_5_df['character'])
X_train, X_test, y_train, y_test = train_test_split(X, y)
rfc_pipe = make_pipeline(
    CountVectorizer(stop_words='english'),
    RandomForestClassifier()
)

classification_scorer(rfc_pipe)

In [199]:
for l in list_of_dfs:
    X = l['text']
    y = le.transform(l['character'])
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    print(classification_scorer(rfc_pipe))

train score: 0.891691269507
accuracy score: 0.240553306343
None
train score: 0.965544208299
accuracy score: 0.239547398924
None
train score: 0.98073773177
accuracy score: 0.244496855346
None
train score: 0.987145682268
accuracy score: 0.254078101829
None
train score: 0.988235294118
accuracy score: 0.258823529412
None


In [208]:
tfid_pipe = make_pipeline(
    TfidfVectorizer(stop_words='english'),
    RandomForestClassifier()
)

for l in list_of_dfs:
    X = l['text']
    y = le.transform(l['character'])
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    print(classification_scorer(tfid_pipe))

train score: 0.896190074511
accuracy score: 0.248060053981
None
train score: 0.969776428374
accuracy score: 0.251828342763
None
train score: 0.98381707397
accuracy score: 0.254127358491
None
train score: 0.987969676994
accuracy score: 0.256055363322
None
train score: 0.986928104575
accuracy score: 0.232679738562
None


In [209]:
svc_pipe = make_pipeline(
    TfidfVectorizer(stop_words='english'),
    SVC()
)

for l in list_of_dfs:
    X = l['text']
    y = le.transform(l['character'])
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    print(classification_scorer(svc_pipe))

train score: 0.201237171376
accuracy score: 0.199730094467
None
train score: 0.20701076456
accuracy score: 0.199116875949
None
train score: 0.206905588679
accuracy score: 0.199488993711
None
train score: 0.207976268952
accuracy score: 0.197726149283
None
train score: 0.200435729847
accuracy score: 0.197385620915
None


In [210]:
svc_pipe = make_pipeline(
    CountVectorizer(stop_words='english'),
    SVC()
)

for l in list_of_dfs:
    X = l['text']
    y = le.transform(l['character'])
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    print(classification_scorer(svc_pipe))

train score: 0.200196822719
accuracy score: 0.202850877193
None
train score: 0.203744594719
accuracy score: 0.208914033393
None
train score: 0.204612461508
accuracy score: 0.206367924528
None
train score: 0.200889914305
accuracy score: 0.218981710331
None
train score: 0.202614379085
accuracy score: 0.190849673203
None


In [None]:
cv = CountVectorizer(stop_words=airline_stopwords)
X = cv.fit_transform(X_train)
feature_names = cv.get_feature_names()

In [202]:
cv = CountVectorizer(stop_words='english')
X = cv.fit_transform(X_train)
feature_names = cv.get_feature_names()

def LDA_batch(X, n):
    lda = LatentDirichletAllocation(n_topics=n)
    lda.fit(X)
    results = pd.DataFrame(lda.components_, columns=feature_names)
    print('LDA for {} topics:'.format(n))
    for topic in range(n):
        print('Topic', topic)
        word_list = results.T[topic].sort_values(ascending=False).index
        print(' '.join(word_list[0:25]), '\n')

In [204]:
LDA_batch(X, 10)



LDA for 10 topics:
Topic 0
thought reason space people looking deep power need like starfleet break line state children damage romulan blood dna systems problem learn eyes office late change 

Topic 1
couldn waiting told night ferengi nagus create met drive words field went pattern century plans base analysis water seeing man sleep carrying second ahead zek 

Topic 2
think ship sure don make cardassian man long war federation klingon way cardassia like years doesn defiant time dominion ll dukat know ago computer mind 

Topic 3
station need kind able soon help people like talk way entire crew boy thirty knew sit programme didn stand begin understand say morning allowed believe 

Topic 4
want know right don dominion just going security matter ve federation used klingons won officer thing chief end trying wanted romulans tell weeks ll captain 

Topic 5
new look isn start life ll case far away order brain know mean don control home help wants appreciate throw planet concerned easy peace he

In [218]:
def cleaner(text):
    stemmer = PorterStemmer()
    stop = stopwords.words('english')
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = text.translate(str.maketrans('', '', string.digits))
    text = text.lower().strip()
    final_text = []
    for w in text.split():
        if w not in stop:
            final_text.append(stemmer.stem(w.strip()))
    return ' '.join(final_text)

In [221]:
for each in longer_than_20_df.loc[longer_than_10_df['character'] == 'BASHIR'][:10]['text']:
    print(cleaner(each))

knife thrust directli left right thorac vertebra perfor lower ventricl heart
well apart us bodi discov dna present weve identifi ibudan
im sure analys fragment detect trace complex organ structur
reconstruct dna sequenc might give us idea
that look like there genet drift cant quit put finger
mean shoe know dax intim think id find hard resist
know peopl say remain ds nine eye ear fellow cardassian
ran full neurosynapt comparison dax obrien found tempor lobe patient
exampl look hold tricord process occur brain connect stimulu word tricord
base level viru found patient blood believ absorb ingest


train score: 0.937157317588
accuracy score: 0.244348852901
None
train score: 0.983807157972
accuracy score: 0.246446805575
None
train score: 0.989713686693
accuracy score: 0.252358490566
None
train score: 0.992089650626
accuracy score: 0.241720217499
None
train score: 0.985620915033
accuracy score: 0.23660130719
None


In [226]:
knc_pipe = make_pipeline(
    TfidfVectorizer(preprocessor=cleaner),
    KNeighborsClassifier()
)

for l in list_of_dfs:
    X = l['text']
    y = le.transform(l['character'])
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    print(classification_scorer(knc_pipe))

train score: 0.42612118656
accuracy score: 0.184379217274
None
train score: 0.484313184286
accuracy score: 0.155926590313
None
train score: 0.549629823757
accuracy score: 0.198702830189
None
train score: 0.479400131839
accuracy score: 0.225407810183
None
train score: 0.488888888889
accuracy score: 0.21045751634
None


In [227]:
cv_pipe = make_pipeline(
    CountVectorizer(preprocessor=cleaner),
    RandomForestClassifier()
)

for l in list_of_dfs:
    X = l['text']
    y = le.transform(l['character'])
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    print(classification_scorer(cv_pipe))

train score: 0.93237733727
accuracy score: 0.23616734143
None
train score: 0.981875057503
accuracy score: 0.23251000414
None
train score: 0.987420559523
accuracy score: 0.246855345912
None
train score: 0.990771259064
accuracy score: 0.236282748393
None
train score: 0.984749455338
accuracy score: 0.245751633987
None


In [254]:
cv_pipe = make_pipeline(
    CountVectorizer(preprocessor=cleaner),
    TruncatedSVD(),
    RandomForestClassifier()
)

for l in list_of_dfs:
    X = l['text']
    y = le.transform(l['character'])
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    print(classification_scorer(cv_pipe))

train score: 0.43840854773
accuracy score: 0.136386639676
None
train score: 0.516284846812
accuracy score: 0.13108872637
None
train score: 0.513201860709
accuracy score: 0.130110062893
None
train score: 0.598549769281
accuracy score: 0.128521997034
None
train score: 0.976034858388
accuracy score: 0.126797385621
None


In [None]:
tfid_pipe = make_pipeline(
    TfidfVectorizer(preprocessor=cleaner),
    RandomForestClassifier()
)

for l in list_of_dfs:
    X = l['text']
    y = le.transform(l['character'])
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    print(classification_scorer(tfid_pipe))

In [245]:
pipe = Pipeline([
    ('vec', TfidfVectorizer()),
    ('rfc', RandomForestClassifier())
])

grid = {
    'vec__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'vec__max_df': [0.6, 0.8, 1],
    'rfc__n_estimators': [20, 25, 30],
    'rfc__min_samples_leaf': [1, 2, 3]   
}

pipe_gs = GridSearchCV(pipe, param_grid=grid)

In [246]:
pipe_gs.fit(X_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(steps=[('vec', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
   ...imators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False))]),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'vec__ngram_range': [(1, 1), (1, 2), (1, 3)], 'vec__max_df': [0.6, 0.8, 1], 'rfc__n_estimators': [20, 25, 30], 'rfc__min_samples_leaf': [1, 2, 3]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [247]:
pipe_gs.best_params_

{'rfc__min_samples_leaf': 3,
 'rfc__n_estimators': 20,
 'vec__max_df': 0.6,
 'vec__ngram_range': (1, 1)}

In [248]:
pipe_gs.best_score_

0.25098039215686274

In [249]:
pipe = Pipeline([
    ('vec', TfidfVectorizer()),
    ('rfc', RandomForestClassifier())
])

grid = {
    'vec__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'vec__max_df': [0.4, 0.6, 0.8],
    'rfc__n_estimators': [15, 20, 25],
    'rfc__min_samples_leaf': [3, 4, 5]   
}

pipe_gs = GridSearchCV(pipe, param_grid=grid)
pipe_gs.fit(X_train, y_train)
pipe_gs.best_params_

{'rfc__min_samples_leaf': 3,
 'rfc__n_estimators': 25,
 'vec__max_df': 0.6,
 'vec__ngram_range': (1, 1)}

In [250]:
pipe_gs.best_score_

0.24705882352941178

In [251]:
pipe = Pipeline([
    ('vec', TfidfVectorizer()),
    ('rfc', RandomForestClassifier())
])

grid = {
    'vec__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'vec__max_df': [0.5, 0.6, 0.7],
    'rfc__n_estimators': [20, 25, 30],
    'rfc__min_samples_leaf': [3, 4, 5]   
}

pipe_gs = GridSearchCV(pipe, param_grid=grid)
pipe_gs.fit(X_train, y_train)
pipe_gs.best_params_

{'rfc__min_samples_leaf': 3,
 'rfc__n_estimators': 20,
 'vec__max_df': 0.7,
 'vec__ngram_range': (1, 1)}

In [252]:
pipe_gs.best_score_

0.24575163398692809

In [256]:
pipe = Pipeline([
    ('vec', TfidfVectorizer()),
    ('rfc', RandomForestClassifier())
])

grid = {
    'vec__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'vec__max_df': [0.5, 0.6, 0.7],
    'rfc__n_estimators': [20, 25, 30],
    'rfc__min_samples_leaf': [3, 4, 5]   
}
pipe_gs = GridSearchCV(pipe, param_grid=grid)

In [258]:
for l in list_of_dfs:
    X = l['text']
    y = le.transform(l['character'])
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    
    pipe_gs.fit(X_train, y_train)
    print('best params:', pipe_gs.best_params_)
    print('best score:', pipe_gs.best_score_)

KeyboardInterrupt: 

In [260]:
l = longer_than_20_df

X = l['text']
y = le.transform(l['character'])
X_train, X_test, y_train, y_test = train_test_split(X, y)
    
pipe_gs.fit(X_train, y_train)
print('best params:', pipe_gs.best_params_)
print('best score:', pipe_gs.best_score_)

best params: {'rfc__min_samples_leaf': 3, 'rfc__n_estimators': 30, 'vec__max_df': 0.7, 'vec__ngram_range': (1, 1)}
best score: 0.247058823529


In [261]:
l = longer_than_15_df

X = l['text']
y = le.transform(l['character'])
X_train, X_test, y_train, y_test = train_test_split(X, y)
    
pipe_gs.fit(X_train, y_train)
print('best params:', pipe_gs.best_params_)
print('best score:', pipe_gs.best_score_)

best params: {'rfc__min_samples_leaf': 3, 'rfc__n_estimators': 30, 'vec__max_df': 0.5, 'vec__ngram_range': (1, 1)}
best score: 0.262689518787


In [262]:
l = longer_than_10_df

X = l['text']
y = le.transform(l['character'])
X_train, X_test, y_train, y_test = train_test_split(X, y)
    
pipe_gs.fit(X_train, y_train)
print('best params:', pipe_gs.best_params_)
print('best score:', pipe_gs.best_score_)

KeyboardInterrupt: 

In [271]:
mlp = MLPClassifier()

In [276]:
mlp_pipe = make_pipeline(TfidfVectorizer(preprocessor=cleaner),
           MLPClassifier()
)

l = longer_than_20_df

classification_scorer(mlp_pipe)



train score: 0.999017231213
accuracy score: 0.253341194969


In [None]:
cv_pipe = make_pipeline(
    CountVectorizer(preprocessor=cleaner),
    RandomForestClassifier()
)

for l in list_of_dfs:
    X = l['text']
    y = le.transform(l['character'])
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    print(classification_scorer(cv_pipe))