In [48]:
# sklearn imports
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import make_union, make_pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC

In [60]:
from keras.models import Sequential
from keras.layers import Dense, Input, Dropout
from keras.utils import np_utils

Using TensorFlow backend.
  return f(*args, **kwds)
Couldn't import dot_parser, loading of dot files will not be possible.


In [39]:
# nltk imports
from nltk import text
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

In [40]:
import pandas as pd
import string

In [41]:
def classification_scorer(pipeline):
    """Fit ``pipeline`` on the current training split and print its scores.

    The split is not passed in: the function reads the module-level names
    ``X_train``, ``y_train``, ``X_test`` and ``y_test``, so they must be
    assigned before each call.

    Parameters
    ----------
    pipeline : estimator
        Object exposing ``fit``, ``predict`` and ``score``; it is fitted
        in place.

    Returns
    -------
    None
        The train score and the test accuracy are printed, not returned.
    """
    pipeline.fit(X_train, y_train)
    preds = pipeline.predict(X_test)
    train_score = pipeline.score(X_train, y_train)
    test_accuracy = accuracy_score(y_test, preds)
    print('train score:', train_score)
    print('accuracy score:', test_accuracy)
    # Optional per-class breakdown, disabled to keep the output short:
    # print(confusion_matrix(y_test, preds))

In [55]:
def cleaner(text):
    """Normalize one document for bag-of-words vectorization.

    Removes ASCII punctuation and digits, lower-cases the text, drops
    English stop words and Porter-stems whatever remains.

    Parameters
    ----------
    text : str
        Raw text of a single document.

    Returns
    -------
    str
        The surviving stemmed tokens joined by single spaces (an empty
        string when no token survives).
    """
    stemmer = PorterStemmer()
    # One table removes punctuation and digits in a single pass.
    strip_table = str.maketrans('', '', string.punctuation + string.digits)
    # The stop words go through the same table so that entries containing
    # punctuation (contractions such as "don't", where the NLTK list has
    # them) still match tokens whose punctuation was already removed.
    # A set keeps each membership test O(1) instead of scanning a list.
    stop = {word.translate(strip_table) for word in stopwords.words('english')}
    # split() without arguments already discards leading/trailing whitespace.
    tokens = text.translate(strip_table).lower().split()
    return ' '.join(stemmer.stem(token) for token in tokens if token not in stop)

In [10]:
# Load merged_tos.csv (path is relative to the working directory) into `tos`
# and preview the first five rows.
tos = pd.read_csv('./merged_tos.csv')
tos.head()

Unnamed: 0,character,ep_title_x,text,ep_title_formatted,airdate,ep_title_y,number,rating,season,index
0,SPOCK,The Cage,Check the circuit.,thecage,27 Nov. 1988,The Cage,0,7.7,1,1
1,SPOCK,The Cage,It can't be the screen then.,thecage,27 Nov. 1988,The Cage,0,7.7,1,1
2,SPOCK,The Cage,"Definitely something out there, Captain, heade...",thecage,27 Nov. 1988,The Cage,0,7.7,1,1
3,SPOCK,The Cage,Their call letters check with a survey expedit...,thecage,27 Nov. 1988,The Cage,0,7.7,1,1
4,SPOCK,The Cage,SS Columbia.,thecage,27 Nov. 1988,The Cage,0,7.7,1,1


In [11]:
# Load merged_tng.csv (path is relative to the working directory) into `tng`
# and preview the first five rows.
tng = pd.read_csv('./merged_tng.csv')
tng.head()

Unnamed: 0,character,text,ep_title_formatted,airdate,ep_title_y,number,rating,season,index
0,PICARD,"You will agree, Data, that Starfleet's orders ...",encounteratfarpoint,26 Sep. 1987,Encounter at Farpoint,1,6.9,1,1
1,PICARD,As simple as that.,encounteratfarpoint,26 Sep. 1987,Encounter at Farpoint,1,6.9,1,1
2,PICARD,"It's hardly simple, Data, to negotiate a frien...",encounteratfarpoint,26 Sep. 1987,Encounter at Farpoint,1,6.9,1,1
3,PICARD,"Data, how can you be programmed as a virtual e...",encounteratfarpoint,26 Sep. 1987,Encounter at Farpoint,1,6.9,1,1
4,PICARD,"It means to spy, to sneak.",encounteratfarpoint,26 Sep. 1987,Encounter at Farpoint,1,6.9,1,1


In [12]:
# Load merged_df.csv (path is relative to the working directory) into `ds9`
# and preview the first five rows.
# NOTE(review): the file name does not follow the merged_<series>.csv pattern
# of the other two loads -- confirm this is the intended DS9 file.
ds9 = pd.read_csv('./merged_df.csv')
ds9.head()

Unnamed: 0,character,text,ep_title_formatted,airdate,ep_title_y,number,rating,season,index
0,LOCUTUS,Resistance is futile.,emissary,3 Jan. 1993,Emissary,1,7.5,1,1
1,LOCUTUS,You will disarm your weapons and escort us to ...,emissary,3 Jan. 1993,Emissary,1,7.5,1,1
2,LOCUTUS,"If you attempt to intervene, we will destroy you.",emissary,3 Jan. 1993,Emissary,1,7.5,1,1
3,LOCUTUS,It is malevolent.,emissary,3 Jan. 1993,Emissary,1,7.5,1,1
4,LOCUTUS,Destroy it now.,emissary,3 Jan. 1993,Emissary,1,7.5,1,1


In [15]:
# Keep only the rows spoken by each series' captain, restricted to the
# speaker and text columns. Rows and columns are selected in one .loc call
# rather than by chaining a second [] lookup onto the row selection.
kirk = tos.loc[tos['character'] == 'KIRK', ['character', 'text']]
picard = tng.loc[tng['character'] == 'PICARD', ['character', 'text']]
sisko = ds9.loc[ds9['character'] == 'SISKO', ['character', 'text']]

In [24]:
# Stack the three captains' lines into one frame. pd.concat replaces the
# chained DataFrame.append calls (append is deprecated and removed in
# pandas 2.0). Original row labels are kept, so labels may repeat across
# the three source frames.
cap = pd.concat([kirk, picard, sisko])

In [27]:
# Tokenize every line once and reuse the counts for all thresholds, instead
# of re-tokenizing the whole corpus for each cut-off.
token_counts = [len(word_tokenize(line)) for line in cap['text']]

# Each cap_N keeps only the lines with strictly more than N tokens.
# Plain boolean lists are used as masks, so selection is positional.
count_array = [n_tokens > 5 for n_tokens in token_counts]
cap_5 = cap[count_array]

count_array = [n_tokens > 8 for n_tokens in token_counts]
cap_8 = cap[count_array]

count_array = [n_tokens > 10 for n_tokens in token_counts]
cap_10 = cap[count_array]

count_array = [n_tokens > 15 for n_tokens in token_counts]
cap_15 = cap[count_array]

count_array = [n_tokens > 20 for n_tokens in token_counts]
cap_20 = cap[count_array]

In [28]:
# Length-filtered subsets, ordered from the loosest (> 5 tokens) to the
# strictest (> 20 tokens) cut-off.
all_caps = [cap_5, cap_8, cap_10, cap_15, cap_20]

In [29]:
# Number of lines per captain in the combined frame.
cap['character'].value_counts()

PICARD    20883
KIRK      17001
SISKO     14023
Name: character, dtype: int64

In [33]:
# Baseline accuracy: the share of lines spoken by the most frequent captain,
# i.e. the score obtained by always predicting the majority class.
character_counts = cap['character'].value_counts()
character_counts.max() / len(cap)

0.40231567996609319

In [35]:
# Class balance and majority-class baseline for every length-filtered subset.
for each in all_caps:
    counts = each['character'].value_counts()
    print(counts)
    # value_counts() sorts in descending order, so the first entry is the
    # majority class.
    print('baseline:', counts.iloc[0] / len(each), '\n')

PICARD    13696
KIRK      10219
SISKO      9525
Name: character, dtype: int64
baseline: 0.40956937799 

PICARD    8611
SISKO     5943
KIRK      5419
Name: character, dtype: int64
baseline: 0.431132028238 

PICARD    5955
SISKO     4173
KIRK      3522
Name: character, dtype: int64
baseline: 0.436263736264 

PICARD    2287
SISKO     1662
KIRK      1252
Name: character, dtype: int64
baseline: 0.439723130167 

PICARD    854
SISKO     611
KIRK      434
Name: character, dtype: int64
baseline: 0.449710373881 



In [37]:
# Encode captain names as integer class labels. The encoder is fitted on the
# full, unfiltered frame so every length-filtered subset shares one mapping.
le = LabelEncoder().fit(cap['character'])
list(le.classes_)

['KIRK', 'PICARD', 'SISKO']

In [45]:
# Raw token counts (sklearn's built-in English stop-word list) feeding a
# random forest. The fixed random_state makes the forest reproducible.
rfc_pipe = make_pipeline(
    CountVectorizer(stop_words='english'),
    RandomForestClassifier(random_state=42)
)

# Evaluate on every length-filtered subset. The split is bound to the
# module-level names X_train/X_test/y_train/y_test because
# classification_scorer reads them as globals.
for subset in all_caps:
    X = subset['text']
    y = le.transform(subset['character'])
    # A fixed seed gives the same split on every run.
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
    # classification_scorer prints its own report and returns None, so it is
    # called directly rather than wrapped in print().
    classification_scorer(rfc_pipe)

train score: 0.942145135566
accuracy score: 0.582057416268
None
train score: 0.977101275118
accuracy score: 0.576091309571
None
train score: 0.982025984175
accuracy score: 0.588631702315
None
train score: 0.981282051282
accuracy score: 0.60568793236
None
train score: 0.977528089888
accuracy score: 0.574736842105
None


In [52]:
# Raw token counts over text normalized by `cleaner`, feeding a random
# forest. The fixed random_state makes the forest reproducible.
rfc_pipe = make_pipeline(
    CountVectorizer(preprocessor=cleaner),
    RandomForestClassifier(random_state=42)
)

# Evaluate on every length-filtered subset. The split is bound to the
# module-level names X_train/X_test/y_train/y_test because
# classification_scorer reads them as globals.
for subset in all_caps:
    X = subset['text']
    y = le.transform(subset['character'])
    # A fixed seed gives the same split on every run.
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
    # classification_scorer prints its own report and returns None, so it is
    # called directly rather than wrapped in print().
    classification_scorer(rfc_pipe)

train score: 0.963157894737
accuracy score: 0.562200956938
None
train score: 0.985179250951
accuracy score: 0.561674008811
None
train score: 0.984663475628
accuracy score: 0.574274831527
None
train score: 0.984615384615
accuracy score: 0.576479631053
None
train score: 0.983848314607
accuracy score: 0.597894736842
None


In [None]:
# TODO: try boosting models (AdaBoost, gradient boosting)

In [46]:
# Raw token counts (sklearn's built-in English stop-word list) feeding
# AdaBoost. The fixed random_state makes the ensemble reproducible.
ada_pipe = make_pipeline(
    CountVectorizer(stop_words='english'),
    AdaBoostClassifier(random_state=42)
)

# Evaluate on every length-filtered subset. The split is bound to the
# module-level names X_train/X_test/y_train/y_test because
# classification_scorer reads them as globals.
for subset in all_caps:
    X = subset['text']
    y = le.transform(subset['character'])
    # A fixed seed gives the same split on every run.
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
    # classification_scorer prints its own report and returns None, so it is
    # called directly rather than wrapped in print().
    classification_scorer(ada_pipe)

train score: 0.523803827751
accuracy score: 0.522966507177
None
train score: 0.553107684091
accuracy score: 0.557268722467
None
train score: 0.574680082055
accuracy score: 0.564605918547
None
train score: 0.598717948718
accuracy score: 0.574173712529
None
train score: 0.65308988764
accuracy score: 0.545263157895
None


In [53]:
# Raw token counts over text normalized by `cleaner`, feeding AdaBoost.
# The fixed random_state makes the ensemble reproducible.
ada_pipe = make_pipeline(
    CountVectorizer(preprocessor=cleaner),
    AdaBoostClassifier(random_state=42)
)

# Evaluate on every length-filtered subset. The split is bound to the
# module-level names X_train/X_test/y_train/y_test because
# classification_scorer reads them as globals.
for subset in all_caps:
    X = subset['text']
    y = le.transform(subset['character'])
    # A fixed seed gives the same split on every run.
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
    # classification_scorer prints its own report and returns None, so it is
    # called directly rather than wrapped in print().
    classification_scorer(ada_pipe)

train score: 0.523923444976
accuracy score: 0.523803827751
None
train score: 0.558782295213
accuracy score: 0.550060072087
None
train score: 0.570186578099
accuracy score: 0.564312921184
None
train score: 0.604102564103
accuracy score: 0.571099154497
None
train score: 0.653792134831
accuracy score: 0.547368421053
None


In [49]:
# Raw token counts (sklearn's built-in English stop-word list) feeding a
# bagging ensemble. The fixed random_state makes the ensemble reproducible.
bag_pipe = make_pipeline(
    CountVectorizer(stop_words='english'),
    BaggingClassifier(random_state=42)
)

# Evaluate on every length-filtered subset. The split is bound to the
# module-level names X_train/X_test/y_train/y_test because
# classification_scorer reads them as globals.
for subset in all_caps:
    X = subset['text']
    y = le.transform(subset['character'])
    # A fixed seed gives the same split on every run.
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
    # classification_scorer prints its own report and returns None, so it is
    # called directly rather than wrapped in print().
    classification_scorer(bag_pipe)

train score: 0.937878787879
accuracy score: 0.569019138756
None
train score: 0.970291741772
accuracy score: 0.561273528234
None
train score: 0.97460193416
accuracy score: 0.571051860533
None
train score: 0.977179487179
accuracy score: 0.5618754804
None
train score: 0.962078651685
accuracy score: 0.528421052632
None


In [54]:
# Raw token counts over text normalized by `cleaner`, feeding a bagging
# ensemble. The fixed random_state makes the ensemble reproducible.
bag_pipe = make_pipeline(
    CountVectorizer(preprocessor=cleaner),
    BaggingClassifier(random_state=42)
)

# Evaluate on every length-filtered subset. The split is bound to the
# module-level names X_train/X_test/y_train/y_test because
# classification_scorer reads them as globals.
for subset in all_caps:
    X = subset['text']
    y = le.transform(subset['character'])
    # A fixed seed gives the same split on every run.
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
    # classification_scorer prints its own report and returns None, so it is
    # called directly rather than wrapped in print().
    classification_scorer(bag_pipe)

train score: 0.959210526316
accuracy score: 0.56028708134
None
train score: 0.976967754857
accuracy score: 0.57148578294
None
train score: 0.974894988766
accuracy score: 0.584236741869
None
train score: 0.972820512821
accuracy score: 0.568024596464
None
train score: 0.974719101124
accuracy score: 0.595789473684
None


In [56]:
# TF-IDF weights over text normalized by `cleaner`, feeding a random forest.
# The fixed random_state makes the forest reproducible.
rfc_pipe = make_pipeline(
    TfidfVectorizer(preprocessor=cleaner),
    RandomForestClassifier(random_state=42)
)

# Evaluate on every length-filtered subset. The split is bound to the
# module-level names X_train/X_test/y_train/y_test because
# classification_scorer reads them as globals.
for subset in all_caps:
    X = subset['text']
    y = le.transform(subset['character'])
    # A fixed seed gives the same split on every run.
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
    # classification_scorer prints its own report and returns None, so it is
    # called directly rather than wrapped in print().
    classification_scorer(rfc_pipe)

train score: 0.965350877193
accuracy score: 0.569377990431
None
train score: 0.984244609119
accuracy score: 0.580296355627
None
train score: 0.985347269708
accuracy score: 0.574860826253
None
train score: 0.984358974359
accuracy score: 0.578016910069
None
train score: 0.982443820225
accuracy score: 0.555789473684
None


In [58]:
# TF-IDF weights over text normalized by `cleaner`, feeding AdaBoost.
# The fixed random_state makes the ensemble reproducible.
ada_pipe = make_pipeline(
    TfidfVectorizer(preprocessor=cleaner),
    AdaBoostClassifier(random_state=42)
)

# Evaluate on every length-filtered subset. The split is bound to the
# module-level names X_train/X_test/y_train/y_test because
# classification_scorer reads them as globals.
for subset in all_caps:
    X = subset['text']
    y = le.transform(subset['character'])
    # A fixed seed gives the same split on every run.
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
    # classification_scorer prints its own report and returns None, so it is
    # called directly rather than wrapped in print().
    classification_scorer(ada_pipe)

train score: 0.524362041467
accuracy score: 0.522488038278
None
train score: 0.556846251419
accuracy score: 0.554865839007
None
train score: 0.572824069552
accuracy score: 0.56782888954
None
train score: 0.601794871795
accuracy score: 0.594926979247
None
train score: 0.663623595506
accuracy score: 0.555789473684
None


In [59]:
# TF-IDF weights over text normalized by `cleaner`, feeding a bagging
# ensemble. The fixed random_state makes the ensemble reproducible.
bag_pipe = make_pipeline(
    TfidfVectorizer(preprocessor=cleaner),
    BaggingClassifier(random_state=42)
)

# Evaluate on every length-filtered subset. The split is bound to the
# module-level names X_train/X_test/y_train/y_test because
# classification_scorer reads them as globals.
for subset in all_caps:
    X = subset['text']
    y = le.transform(subset['character'])
    # A fixed seed gives the same split on every run.
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
    # classification_scorer prints its own report and returns None, so it is
    # called directly rather than wrapped in print().
    classification_scorer(bag_pipe)

train score: 0.958094098884
accuracy score: 0.57523923445
None
train score: 0.979237599306
accuracy score: 0.579295154185
None
train score: 0.977727849956
accuracy score: 0.566363902725
None
train score: 0.978717948718
accuracy score: 0.564950038432
None
train score: 0.980337078652
accuracy score: 0.591578947368
None
