In [211]:
import spacy

In [1]:
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_union, make_pipeline, Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC

In [2]:
from nltk import text
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

In [3]:
import pandas as pd
import string

In [4]:
def classification_scorer(pipeline):
    pipeline.fit(X_train, y_train)
    preds = pipeline.predict(X_test)
    print('train score:', pipeline.score(X_train, y_train))
    print('accuracy score:', accuracy_score(y_test, preds))
    #print(confusion_matrix(y_test, preds))

In [5]:
df = pd.read_csv('./merged_df.csv')

In [6]:
df.head()

Unnamed: 0,character,text,ep_title_formatted,airdate,ep_title_y,number,rating,season,index
0,LOCUTUS,Resistance is futile.,emissary,3 Jan. 1993,Emissary,1,7.5,1,1
1,LOCUTUS,You will disarm your weapons and escort us to ...,emissary,3 Jan. 1993,Emissary,1,7.5,1,1
2,LOCUTUS,"If you attempt to intervene, we will destroy you.",emissary,3 Jan. 1993,Emissary,1,7.5,1,1
3,LOCUTUS,It is malevolent.,emissary,3 Jan. 1993,Emissary,1,7.5,1,1
4,LOCUTUS,Destroy it now.,emissary,3 Jan. 1993,Emissary,1,7.5,1,1


In [7]:
common_chars = df['character'].value_counts()[:10].index
common_chars

Index(['SISKO', 'KIRA', 'BASHIR', 'QUARK', 'O'BRIEN', 'ODO', 'DAX', 'WORF',
       'GARAK', 'DUKAT'],
      dtype='object')

In [115]:
#[sent for sent in df.loc[df['character'] == 'BASHIR']['text'] if len(word_tokenize(sent)) > 5]

In [116]:
# for each in df.loc[df['character'] == 'BASHIR']['text']:
#     if len(word_tokenize(each)) > 5:
#         print(each)

In [8]:
common_chars_df = df.loc[df['character'].isin(common_chars)]

In [9]:
count_array = [len(word_tokenize(line)) > 5 for line in common_chars_df['text']]
longer_than_5_df = common_chars_df[count_array]

count_array = [len(word_tokenize(line)) > 8 for line in common_chars_df['text']]
longer_than_8_df = common_chars_df[count_array]

count_array = [len(word_tokenize(line)) > 10 for line in common_chars_df['text']]
longer_than_10_df = common_chars_df[count_array]

count_array = [len(word_tokenize(line)) > 15 for line in common_chars_df['text']]
longer_than_15_df = common_chars_df[count_array]

count_array = [len(word_tokenize(line)) > 20 for line in common_chars_df['text']]
longer_than_20_df = common_chars_df[count_array]

In [10]:
list_of_dfs = [longer_than_5_df, longer_than_8_df, longer_than_10_df, longer_than_15_df, longer_than_20_df]

In [11]:
longer_than_8_df['character'].unique()

array(['SISKO', "O'BRIEN", 'KIRA', 'ODO', 'QUARK', 'BASHIR', 'DAX',
       'DUKAT', 'GARAK', 'WORF'], dtype=object)

In [12]:
for lists in list_of_dfs:
    print(lists.shape[0])

47421
28985
20351
8091
3060


In [13]:
def baseline_accuracy(df):
    return df['character'].value_counts().values[0]/df.shape[0]

In [158]:
for lists in list_of_dfs:
    print(baseline_accuracy(lists))

0.200860378313
0.205037088149
0.205051348828
0.205413422321
0.199673202614


In [14]:
le = LabelEncoder()
le.fit(longer_than_5_df['character'])
list(le.classes_)

['BASHIR',
 'DAX',
 'DUKAT',
 'GARAK',
 'KIRA',
 "O'BRIEN",
 'ODO',
 'QUARK',
 'SISKO',
 'WORF']

## Modeling

First, confirm a very simple example works:

In [15]:
X = longer_than_5_df['text']
y = le.transform(longer_than_5_df['character'])
X_train, X_test, y_train, y_test = train_test_split(X, y)
rfc_pipe = make_pipeline(
    CountVectorizer(stop_words='english'),
    RandomForestClassifier()
)

classification_scorer(rfc_pipe)

train score: 0.8917756221
accuracy score: 0.235745614035


In [16]:
rfc_pipe.predict(['test', 'test'])

array([7, 7])

In [17]:
le.inverse_transform(rfc_pipe.predict(['diagnosis', 'doctor', 'my dear', 'old man', 'the new generation of annoying quote-RTs is people quote-RTing people that are actually making the point that a bad thing is bad but the quote-RTer misinterpreted it but doesn\'t bother to delete because the quote RT is doing numbers']))

array(['BASHIR', 'SISKO', 'GARAK', 'SISKO', 'KIRA'], dtype=object)

In [18]:
le.inverse_transform(rfc_pipe.predict(['There is a drag show called Dragula that does not use the Rob Zombie song as its theme.']))

array(['QUARK'], dtype=object)

In [19]:
def temp_predictor(a_string):
    print(le.inverse_transform(rfc_pipe.predict([a_string])))

In [176]:
# Classifiers to test: RandomForestClassifier, AdaBoostClassifier

In [20]:
X_train, X_test, y_train, y_test = train_test_split(longer_than_5_df['text'], longer_than_5_df['character'])

rfc_pipe = make_pipeline(
    CountVectorizer(stop_words='english'),
    RandomForestClassifier()
)

classification_scorer(rfc_pipe)

train score: 0.892478560382
accuracy score: 0.239035087719


In [21]:
print(classification_report(y_test, rfc_pipe.predict(X_test)))

             precision    recall  f1-score   support

     BASHIR       0.26      0.24      0.25      1473
        DAX       0.19      0.14      0.16      1043
      DUKAT       0.18      0.11      0.14       467
      GARAK       0.14      0.09      0.11       497
       KIRA       0.18      0.21      0.20      1492
    O'BRIEN       0.26      0.25      0.25      1318
        ODO       0.19      0.18      0.19      1245
      QUARK       0.27      0.26      0.27      1397
      SISKO       0.29      0.38      0.33      2404
       WORF       0.23      0.17      0.20       520

avg / total       0.23      0.24      0.23     11856



In [22]:
le.inverse_transform([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

array(['BASHIR', 'DAX', 'DUKAT', 'GARAK', 'KIRA', "O'BRIEN", 'ODO',
       'QUARK', 'SISKO', 'WORF'], dtype=object)

In [23]:
X = longer_than_5_df['text']
y = le.transform(longer_than_5_df['character'])
X_train, X_test, y_train, y_test = train_test_split(X, y)
rfc_pipe = make_pipeline(
    CountVectorizer(stop_words='english'),
    RandomForestClassifier()
)

classification_scorer(rfc_pipe)

train score: 0.89242232532
accuracy score: 0.238866396761


In [25]:
for l in list_of_dfs:
    X = l['text']
    y = le.transform(l['character'])
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    print(classification_scorer(rfc_pipe))

train score: 0.893996907072
accuracy score: 0.238444669366
None
train score: 0.965912227436
accuracy score: 0.234993790534
None
train score: 0.983686038131
accuracy score: 0.240762578616
None
train score: 0.987640079103
accuracy score: 0.249134948097
None
train score: 0.988235294118
accuracy score: 0.247058823529
None


In [26]:
tfid_pipe = make_pipeline(
    TfidfVectorizer(stop_words='english'),
    RandomForestClassifier()
)

for l in list_of_dfs:
    X = l['text']
    y = le.transform(l['character'])
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    print(classification_scorer(tfid_pipe))

train score: 0.897371010825
accuracy score: 0.243758434548
None
train score: 0.969730425982
accuracy score: 0.252518283428
None
train score: 0.983489484374
accuracy score: 0.254913522013
None
train score: 0.990606460119
accuracy score: 0.247157686604
None
train score: 0.986492374728
accuracy score: 0.23137254902
None


In [27]:
svc_pipe = make_pipeline(
    TfidfVectorizer(stop_words='english'),
    SVC()
)

for l in list_of_dfs:
    X = l['text']
    y = le.transform(l['character'])
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    print(classification_scorer(svc_pipe))

train score: 0.20241810769
accuracy score: 0.196187584345
None
train score: 0.205400680835
accuracy score: 0.203946460604
None
train score: 0.203433138964
accuracy score: 0.209905660377
None
train score: 0.208635464733
accuracy score: 0.19574888779
None
train score: 0.203050108932
accuracy score: 0.18954248366
None


In [28]:
svc_pipe = make_pipeline(
    CountVectorizer(stop_words='english'),
    SVC()
)

for l in list_of_dfs:
    X = l['text']
    y = le.transform(l['character'])
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    print(classification_scorer(svc_pipe))

train score: 0.200618585688
accuracy score: 0.201585695007
None
train score: 0.20484865213
accuracy score: 0.205602318201
None
train score: 0.207626285789
accuracy score: 0.197327044025
None
train score: 0.205504284773
accuracy score: 0.205140879881
None
train score: 0.199564270153
accuracy score: 0.2
None


In [30]:
cv = CountVectorizer(stop_words='english')
X = cv.fit_transform(X_train)
feature_names = cv.get_feature_names()

In [39]:
cv = CountVectorizer(preprocessor=cleaner)
X = cv.fit_transform(X_train)

def LDA_batch(X, n):
    
    feature_names = cv.get_feature_names()
    lda = LatentDirichletAllocation(n_topics=n)
    lda.fit(X)
    results = pd.DataFrame(lda.components_, columns=feature_names)
    print('LDA for {} topics:'.format(n))
    for topic in range(n):
        print('Topic', topic)
        word_list = results.T[topic].sort_values(ascending=False).index
        print(' '.join(word_list[0:25]), '\n')

In [40]:
LDA_batch(X, 10)



LDA for 10 topics:
Topic 0
someth id happen quadrant that bajoran man believ sisko gamma hold go captain prophet your talk first quark back actual noth two want get reason 

Topic 1
went founder report cargo arm design gone easi hit never known bay wall water sourc breen woman homeworld year train energi surpris date festiv armi 

Topic 2
year cant three two four obrien secur see room night hundr everi last ive might ago one sit get home small defiant spent surviv side 

Topic 3
know time day go im first come life one say new make ask would much bajor cardassia dax us tri cardassian help major feder twenti 

Topic 4
go want dont know think would im like thing get station look back tell ship starfleet us peopl take command offic kind feder well around 

Topic 5
ill next would your youll well bring anyon thought wont order found stand inform might tell could even take peopl realli mean differ sure heart 

Topic 6
littl right kill your put that may bajoran moment destroy interest point we

In [41]:
LDA_batch(X, 20)



LDA for 20 topics:
Topic 0
way time might youv ive there want thing long got tell never know either feel oh last us made person good use first stay work 

Topic 1
dax best week seven gul curzon probe probabl two promenad dukat guess lieuten doctor cannot record access dock sir univers file produc bajoran three opposit 

Topic 2
keep matter leav would wasnt accord chanc us cardassia bajor peac close aliv major two least treati children hard jake get encount trip kira vedek 

Topic 3
cardassian time bajoran day next five bring well hope begin two level prophet togeth minut weve bodi respons twenti high someth point dominion need empir 

Topic 4
station want know go that see make one right let ill your starfleet war back feder tri day put happen find everi peopl secur everyth 

Topic 5
may believ feder mean tell cant could know dominion seem man help order even she come take see work heart protect attack cardassia there second 

Topic 6
one peopl life klingon us know look thing entir he h

In [33]:
def cleaner(text):
    stemmer = PorterStemmer()
    stop = stopwords.words('english')
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = text.translate(str.maketrans('', '', string.digits))
    text = text.lower().strip()
    final_text = []
    for w in text.split():
        if w not in stop:
            final_text.append(stemmer.stem(w.strip()))
    return ' '.join(final_text)

In [42]:
for each in longer_than_20_df.loc[longer_than_10_df['character'] == 'BASHIR'][:10]['text']:
    print(cleaner(each))

knife thrust directli left right thorac vertebra perfor lower ventricl heart
well apart us bodi discov dna present weve identifi ibudan
im sure analys fragment detect trace complex organ structur
reconstruct dna sequenc might give us idea
that look like there genet drift cant quit put finger
mean shoe know dax intim think id find hard resist
know peopl say remain ds nine eye ear fellow cardassian
ran full neurosynapt comparison dax obrien found tempor lobe patient
exampl look hold tricord process occur brain connect stimulu word tricord
base level viru found patient blood believ absorb ingest


In [43]:
knc_pipe = make_pipeline(
    TfidfVectorizer(preprocessor=cleaner),
    KNeighborsClassifier()
)

for l in list_of_dfs:
    X = l['text']
    y = le.transform(l['character'])
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    print(classification_scorer(knc_pipe))

train score: 0.425165190496
accuracy score: 0.176703778677
None
train score: 0.435274634281
accuracy score: 0.168897474817
None
train score: 0.555460918561
accuracy score: 0.197720125786
None
train score: 0.480388925511
accuracy score: 0.232328225408
None
train score: 0.485403050109
accuracy score: 0.250980392157
None


In [44]:
cv_pipe = make_pipeline(
    CountVectorizer(preprocessor=cleaner),
    RandomForestClassifier()
)

for l in list_of_dfs:
    X = l['text']
    y = le.transform(l['character'])
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    print(classification_scorer(cv_pipe))

train score: 0.932855335302
accuracy score: 0.232371794872
None
train score: 0.980817002484
accuracy score: 0.250310473299
None
train score: 0.987158487846
accuracy score: 0.244300314465
None
train score: 0.9912656559
accuracy score: 0.24962926347
None
train score: 0.99128540305
accuracy score: 0.213071895425
None


In [45]:
cv_pipe = make_pipeline(
    CountVectorizer(preprocessor=cleaner),
    TruncatedSVD(),
    RandomForestClassifier()
)

for l in list_of_dfs:
    X = l['text']
    y = le.transform(l['character'])
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    print(classification_scorer(cv_pipe))

train score: 0.370729649937
accuracy score: 0.134024966262
None
train score: 0.445671174901
accuracy score: 0.130260797571
None
train score: 0.574461115115
accuracy score: 0.128341194969
None
train score: 0.622445616348
accuracy score: 0.133465150766
None
train score: 0.971677559913
accuracy score: 0.154248366013
None


In [51]:
tfid_pipe = make_pipeline(
    TfidfVectorizer(preprocessor=cleaner),
    RandomForestClassifier()
)

for l in list_of_dfs:
    X = l['text']
    y = le.transform(l['character'])
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    print(classification_scorer(tfid_pipe))

train score: 0.937157317588
accuracy score: 0.243421052632
None
train score: 0.984037169933
accuracy score: 0.238167517593
None
train score: 0.990106794208
accuracy score: 0.254520440252
None
train score: 0.990936058009
accuracy score: 0.241720217499
None
train score: 0.990849673203
accuracy score: 0.202614379085
None


In [47]:
pipe = Pipeline([
    ('vec', TfidfVectorizer()),
    ('rfc', RandomForestClassifier())
])

grid = {
    'vec__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'vec__max_df': [0.6, 0.8, 1],
    'rfc__n_estimators': [20, 25, 30],
    'rfc__min_samples_leaf': [1, 2, 3]   
}

pipe_gs = GridSearchCV(pipe, param_grid=grid)

In [48]:
pipe_gs.fit(X_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('vec', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
   ...n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'vec__ngram_range': [(1, 1), (1, 2), (1, 3)], 'vec__max_df': [0.6, 0.8, 1], 'rfc__n_estimators': [20, 25, 30], 'rfc__min_samples_leaf': [1, 2, 3]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [49]:
pipe_gs.best_params_

{'rfc__min_samples_leaf': 2,
 'rfc__n_estimators': 25,
 'vec__max_df': 0.8,
 'vec__ngram_range': (1, 1)}

In [50]:
pipe_gs.best_score_

0.23921568627450981

In [52]:
pipe = Pipeline([
    ('vec', TfidfVectorizer()),
    ('rfc', RandomForestClassifier())
])

grid = {
    'vec__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'vec__max_df': [0.4, 0.6, 0.8],
    'rfc__n_estimators': [15, 20, 25],
    'rfc__min_samples_leaf': [3, 4, 5]   
}

pipe_gs = GridSearchCV(pipe, param_grid=grid)
pipe_gs.fit(X_train, y_train)
pipe_gs.best_params_

{'rfc__min_samples_leaf': 3,
 'rfc__n_estimators': 25,
 'vec__max_df': 0.4,
 'vec__ngram_range': (1, 1)}

In [53]:
pipe_gs.best_score_

0.24183006535947713

In [54]:
pipe = Pipeline([
    ('vec', TfidfVectorizer()),
    ('rfc', RandomForestClassifier())
])

grid = {
    'vec__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'vec__max_df': [0.5, 0.6, 0.7],
    'rfc__n_estimators': [20, 25, 30],
    'rfc__min_samples_leaf': [3, 4, 5]   
}

pipe_gs = GridSearchCV(pipe, param_grid=grid)
pipe_gs.fit(X_train, y_train)
pipe_gs.best_params_

{'rfc__min_samples_leaf': 3,
 'rfc__n_estimators': 30,
 'vec__max_df': 0.6,
 'vec__ngram_range': (1, 1)}

In [55]:
pipe_gs.best_score_

0.24749455337690632

In [56]:
pipe = Pipeline([
    ('vec', TfidfVectorizer()),
    ('rfc', RandomForestClassifier())
])

grid = {
    'vec__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'vec__max_df': [0.5, 0.6, 0.7],
    'rfc__n_estimators': [20, 25, 30],
    'rfc__min_samples_leaf': [3, 4, 5]   
}
pipe_gs = GridSearchCV(pipe, param_grid=grid)

In [57]:
for l in list_of_dfs:
    X = l['text']
    y = le.transform(l['character'])
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    
    pipe_gs.fit(X_train, y_train)
    print('best params:', pipe_gs.best_params_)
    print('best score:', pipe_gs.best_score_)

best params: {'rfc__min_samples_leaf': 3, 'rfc__n_estimators': 30, 'vec__max_df': 0.5, 'vec__ngram_range': (1, 2)}
best score: 0.271418529453
best params: {'rfc__min_samples_leaf': 3, 'rfc__n_estimators': 30, 'vec__max_df': 0.5, 'vec__ngram_range': (1, 1)}
best score: 0.269252001104
best params: {'rfc__min_samples_leaf': 3, 'rfc__n_estimators': 30, 'vec__max_df': 0.7, 'vec__ngram_range': (1, 1)}
best score: 0.265347572561
best params: {'rfc__min_samples_leaf': 3, 'rfc__n_estimators': 30, 'vec__max_df': 0.6, 'vec__ngram_range': (1, 1)}
best score: 0.257251153593
best params: {'rfc__min_samples_leaf': 3, 'rfc__n_estimators': 25, 'vec__max_df': 0.6, 'vec__ngram_range': (1, 1)}
best score: 0.251416122004


CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1),
        preprocessor=<function cleaner at 0x7f1ebccad6a8>, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

## Experimenting with stop words

In [67]:
cv = CountVectorizer(preprocessor=cleaner)
cv.fit(df['text'])
to_dense = cv.transform(df['text']).todense()
to_dense_df = pd.DataFrame(to_dense, columns=cv.get_feature_names())
to_dense_df.sum().sort_values(ascending=False)[:50]

In [73]:
new_stop_words = ['im', 'go', 'dont', 'know', 'one', 'get', 'your', 'want', 'well',
       'like', 'think', 'right', 'that', 'time', 'us', 'see', 'would', 'come',
       'take', 'look', 'could', 'ill', 'make', 'way', 'let', 'say', 'tell',
       'good', 'back', 'ive', 'need', 'thing', 'cant', 'someth',
       'never', 'two', 'tri', 'he', 'ye', 'oh', 'sure', 'talk', 'got', 'find', 'didnt']

In [78]:
stop_words_for_cleaner = stopwords.words('english').append(new_stop_words)

In [81]:
def cleaner(text):
    stemmer = PorterStemmer()
    stop = stop_words_for_cleaner
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = text.translate(str.maketrans('', '', string.digits))
    text = text.lower().strip()
    final_text = []
    for w in text.split():
        if w not in stop:
            final_text.append(stemmer.stem(w.strip()))
    return ' '.join(final_text)

In [86]:
def cleaner(text):
    stemmer = PorterStemmer()
    stop = stopwords.words('english')
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = text.translate(str.maketrans('', '', string.digits))
    text = text.lower().strip()
    final_text = []
    for w in text.split():
        if w not in stop:
            final_text.append(stemmer.stem(w.strip()))
    return ' '.join(final_text)

In [90]:
type(stopwords.words('english'))

list

In [92]:
english_stop_words = stopwords.words('english')

In [95]:
english_stop_words.extend(new_stop_words)

In [87]:
cleaner('testing')

'test'

In [109]:
def cleaner(text):
    stemmer = PorterStemmer()
    stop = english_stop_words
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = text.translate(str.maketrans('', '', string.digits))
    text = text.lower().strip()
    final_text = []
    for w in text.split():
        if w not in stop:
            final_text.append(stemmer.stem(w.strip()))
    return ' '.join(final_text)

In [98]:
cv = CountVectorizer(preprocessor=cleaner)
cv.fit(df['text'])

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1),
        preprocessor=<function cleaner at 0x7f1e830e7268>, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [99]:
to_dense = cv.transform(df['text']).todense()
to_dense_df = pd.DataFrame(to_dense, columns=cv.get_feature_names())


your          3112
go            2905
that          2430
command       1347
someth        1346
ship          1275
tri           1271
he            1241
ye            1214
station       1154
cardassian    1140
peopl         1139
work          1076
there         1075
mean          1060
doctor        1049
theyr         1033
day           1003
help           998
captain        984
much           967
sir            957
realli         933
first          931
id             928
give           927
still          916
mayb           912
three          905
odo            904
better         902
even           899
use            890
quark          884
major          871
littl          862
noth           855
chief          847
klingon        847
thank          830
thought        823
anyth          820
long           818
believ         811
ask            801
year           800
happen         800
feel           795
sisko          789
must           785
dtype: int64

In [100]:
to_dense_df.sum().sort_values(ascending=False).index[:50]

Index(['your', 'go', 'that', 'command', 'someth', 'ship', 'tri', 'he', 'ye',
       'station', 'cardassian', 'peopl', 'work', 'there', 'mean', 'doctor',
       'theyr', 'day', 'help', 'captain', 'much', 'sir', 'realli', 'first',
       'id', 'give', 'still', 'mayb', 'three', 'odo', 'better', 'even', 'use',
       'quark', 'major', 'littl', 'noth', 'chief', 'klingon', 'thank',
       'thought', 'anyth', 'long', 'believ', 'ask', 'year', 'happen', 'feel',
       'sisko', 'must'],
      dtype='object')

In [102]:
english_stop_words.extend(['your', 'go', 'that', 'someth', 'he', 'ye', 'there', 'peopl', 'theyr', 'much', 'realli', 'id', 'give', 'still', 'mayb', 'three', 'better', 'even', 'use'])

In [103]:
cv = CountVectorizer(preprocessor=cleaner)
cv.fit(df['text'])

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1),
        preprocessor=<function cleaner at 0x7f1e830e7268>, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [105]:
to_dense = cv.transform(df['text']).todense()
to_dense_df = pd.DataFrame(to_dense, columns=cv.get_feature_names())


In [108]:
sum_of_word_counts = to_dense_df.sum()

In [111]:
type(sum_of_word_counts)

pandas.core.series.Series

In [110]:
to_dense_df.sum().sort_values(ascending=False)[:15]

your          3112
go            2905
that          2430
command       1347
someth        1346
ship          1275
tri           1271
he            1241
ye            1214
station       1154
cardassian    1140
peopl         1139
work          1076
there         1075
mean          1060
dtype: int64

In [None]:
english_stop_words

In [None]:

pd.DataFrame(cv.transform(df['text']).todense(), 
             columns=cv.get_feature_names()).sum().sort_values(ascending=False)[:15].plot(kind='bar');

In [None]:
l = longer_than_20_df

X = l['text']
y = le.transform(l['character'])
X_train, X_test, y_train, y_test = train_test_split(X, y)
    
pipe_gs.fit(X_train, y_train)
print('best params:', pipe_gs.best_params_)
print('best score:', pipe_gs.best_score_)

In [None]:
l = longer_than_15_df

X = l['text']
y = le.transform(l['character'])
X_train, X_test, y_train, y_test = train_test_split(X, y)
    
pipe_gs.fit(X_train, y_train)
print('best params:', pipe_gs.best_params_)
print('best score:', pipe_gs.best_score_)

In [None]:
l = longer_than_10_df

X = l['text']
y = le.transform(l['character'])
X_train, X_test, y_train, y_test = train_test_split(X, y)
    
pipe_gs.fit(X_train, y_train)
print('best params:', pipe_gs.best_params_)
print('best score:', pipe_gs.best_score_)

In [None]:
mlp = MLPClassifier()

In [None]:
mlp_pipe = make_pipeline(TfidfVectorizer(preprocessor=cleaner),
           MLPClassifier()
)

l = longer_than_20_df

classification_scorer(mlp_pipe)

In [None]:
cv_pipe = make_pipeline(
    CountVectorizer(preprocessor=cleaner),
    RandomForestClassifier()
)

for l in list_of_dfs:
    X = l['text']
    y = le.transform(l['character'])
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    print(classification_scorer(cv_pipe))

In [58]:
print('test')

test
