In [286]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.decomposition import LatentDirichletAllocation
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_union, make_pipeline

from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

import spacy
from textblob import TextBlob

In [287]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegressionCV
from sklearn.pipeline import make_pipeline

In [288]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import GridSearchCV

In [289]:
%matplotlib inline

In [328]:
# Load the IMDb episode table and rename its `ep_name` column to `ep_title`.
imdb_df = pd.read_csv('./imdb_ds9_data.csv').rename(columns={'ep_name': 'ep_title'})

In [329]:
# Load the transcript table (columns shown below: character, ep_title, text).
df = pd.read_csv(filepath_or_buffer='./dataframe_all_ds9.csv')

In [330]:
# Preview the first rows of the transcript frame.
df.head()

Unnamed: 0,character,ep_title,text
0,LOCUTUS,Emissary,Resistance is futile.
1,LOCUTUS,Emissary,You will disarm your weapons and escort us to ...
2,LOCUTUS,Emissary,"If you attempt to intervene, we will destroy you."
3,LOCUTUS,Emissary,It is malevolent.
4,LOCUTUS,Emissary,Destroy it now.


In [331]:
# Summary statistics (count / unique / top / freq) for the transcript columns.
df.describe()

Unnamed: 0,character,ep_title,text
count,114487,114487,114487
unique,661,173,93606
top,SISKO,The Way of the Warrior,No.
freq,14023,1344,537


In [332]:
# Preview the first rows of the IMDb episode table.
imdb_df.head()

Unnamed: 0,airdate,ep_title,number,rating,season,index
0,3 Jan. 1993,Emissary,1,7.5,1,1
1,10 Jan. 1993,Past Prologue,2,7.0,1,2
2,17 Jan. 1993,A Man Alone,3,6.9,1,3
3,24 Jan. 1993,Babel,4,6.9,1,4
4,31 Jan. 1993,Captive Pursuit,5,7.7,1,5


In [333]:
# Preview the last rows of the IMDb episode table.
imdb_df.tail()

Unnamed: 0,airdate,ep_title,number,rating,season,index
168,5 May 1999,When It Rains...,21,8.1,7,169
169,12 May 1999,Tacking Into the Wind,22,8.7,7,170
170,19 May 1999,Extreme Measures,23,7.5,7,171
171,26 May 1999,The Dogs of War,24,8.2,7,172
172,2 Jun. 1999,What You Leave Behind,25,8.8,7,173


In [334]:
# Number of rows in the IMDb episode table.
len(imdb_df)

173

In [335]:
# Number the episodes 1..N in row order. N comes from the frame itself rather
# than a hard-coded bound, so the assignment still fits if the row count changes.
imdb_df['index'] = list(range(1, len(imdb_df) + 1))

In [336]:
# Re-display the first rows after (re)assigning the `index` column.
imdb_df.head()

Unnamed: 0,airdate,ep_title,number,rating,season,index
0,3 Jan. 1993,Emissary,1,7.5,1,1
1,10 Jan. 1993,Past Prologue,2,7.0,1,2
2,17 Jan. 1993,A Man Alone,3,6.9,1,3
3,24 Jan. 1993,Babel,4,6.9,1,4
4,31 Jan. 1993,Captive Pursuit,5,7.7,1,5


In [337]:
# Re-display the last rows after (re)assigning the `index` column.
imdb_df.tail()

Unnamed: 0,airdate,ep_title,number,rating,season,index
168,5 May 1999,When It Rains...,21,8.1,7,169
169,12 May 1999,Tacking Into the Wind,22,8.7,7,170
170,19 May 1999,Extreme Measures,23,7.5,7,171
171,26 May 1999,The Dogs of War,24,8.2,7,172
172,2 Jun. 1999,What You Leave Behind,25,8.8,7,173


In [338]:
# Build a join key from each title: keep only alphanumeric characters, then
# lowercase the result (e.g. "Past Prologue" -> "pastprologue").
df['ep_title_formatted'] = [
    ''.join(filter(str.isalnum, title)).lower()
    for title in df['ep_title']
]

In [339]:
# Check that the new `ep_title_formatted` column looks as expected.
df.head()

Unnamed: 0,character,ep_title,text,ep_title_formatted
0,LOCUTUS,Emissary,Resistance is futile.,emissary
1,LOCUTUS,Emissary,You will disarm your weapons and escort us to ...,emissary
2,LOCUTUS,Emissary,"If you attempt to intervene, we will destroy you.",emissary
3,LOCUTUS,Emissary,It is malevolent.,emissary
4,LOCUTUS,Emissary,Destroy it now.,emissary


In [340]:
# Build the join key for the IMDb titles: strip every non-alphanumeric
# character from the title, then lowercase what is left.
imdb_df['ep_title_formatted'] = list(
    map(
        lambda title: ''.join(filter(str.isalnum, title)).lower(),
        imdb_df['ep_title'],
    )
)

In [341]:
# Check that the new `ep_title_formatted` column looks as expected.
imdb_df.head()

Unnamed: 0,airdate,ep_title,number,rating,season,index,ep_title_formatted
0,3 Jan. 1993,Emissary,1,7.5,1,1,emissary
1,10 Jan. 1993,Past Prologue,2,7.0,1,2,pastprologue
2,17 Jan. 1993,A Man Alone,3,6.9,1,3,amanalone
3,24 Jan. 1993,Babel,4,6.9,1,4,babel
4,31 Jan. 1993,Captive Pursuit,5,7.7,1,5,captivepursuit


In [307]:
# NOTE(review): `temp_df` is not referenced by any cell shown here - confirm
# whether it is still needed or is a leftover that can be deleted.
temp_df = []

In [309]:
# Distinct normalised titles from each source, used to compare the two.
imdb_set, df_set = (
    set(frame['ep_title_formatted']) for frame in (imdb_df, df)
)

In [310]:
# Print titles that appear in the IMDb table but not in the transcripts.
for ep in imdb_set:
    if ep in df_set:
        continue
    print(ep)

favorthebold
tildeathdouspart
sonsofmogh
honoramongthieves
thesiegeofar558


In [311]:
# Print titles that appear in the transcripts but not in the IMDb table.
for ep in df_set:
    if ep in imdb_set:
        continue
    print(ep)

honouramongthieves
tilldeathusdopart
siegeofar558
thesonsofmogh
favourthebold


In [313]:
# Five episode titles are spelled differently in the transcripts than on IMDb.
# Map each transcript spelling onto the IMDb one so the join keys line up.
# The replacement is restricted to `ep_title_formatted`, so a cell in another
# column (for example dialogue text) that happens to equal one of these keys
# is left untouched, and the frame is scanned once instead of five times.
title_fixes = {
    'favourthebold': 'favorthebold',
    'tilldeathusdopart': 'tildeathdouspart',
    'siegeofar558': 'thesiegeofar558',
    'thesonsofmogh': 'sonsofmogh',
    'honouramongthieves': 'honoramongthieves',
}
df['ep_title_formatted'] = df['ep_title_formatted'].replace(title_fixes)

In [314]:
# Rebuild both title sets so the checks that follow see the current values.
df_set = {title for title in df['ep_title_formatted']}
imdb_set = {title for title in imdb_df['ep_title_formatted']}

In [315]:
# Re-check: transcript titles still missing from the IMDb table (none expected).
for ep in df_set:
    if ep in imdb_set:
        continue
    print(ep)

In [316]:
# Re-check: IMDb titles still missing from the transcripts (none expected).
for ep in imdb_set:
    if ep in df_set:
        continue
    print(ep)

In [317]:
# Titles found in exactly one of the two frames; an empty set means the
# join keys agree on both sides.
set(df['ep_title_formatted']).symmetric_difference(imdb_df['ep_title_formatted'])

set()

In [318]:
# Attach the IMDb episode metadata to every transcript line.
# `validate='many_to_one'` makes pandas raise MergeError if a normalised title
# occurs more than once on the IMDb side, instead of silently duplicating
# transcript rows.
merged_df = pd.merge(
    df,
    imdb_df,
    on='ep_title_formatted',
    how='outer',
    validate='many_to_one',
)

In [319]:
# Preview the merged frame; both sides' `ep_title` columns appear with
# `_x` / `_y` suffixes.
merged_df.head()

Unnamed: 0,character,ep_title_x,text,ep_title_formatted,airdate,ep_title_y,number,rating,season,index
0,LOCUTUS,Emissary,Resistance is futile.,emissary,3 Jan. 1993,Emissary,1,7.5,1,1
1,LOCUTUS,Emissary,You will disarm your weapons and escort us to ...,emissary,3 Jan. 1993,Emissary,1,7.5,1,1
2,LOCUTUS,Emissary,"If you attempt to intervene, we will destroy you.",emissary,3 Jan. 1993,Emissary,1,7.5,1,1
3,LOCUTUS,Emissary,It is malevolent.,emissary,3 Jan. 1993,Emissary,1,7.5,1,1
4,LOCUTUS,Emissary,Destroy it now.,emissary,3 Jan. 1993,Emissary,1,7.5,1,1


In [320]:
# Drop the transcript-side title column; `ep_title_y` (from IMDb) is kept.
merged_df.drop(columns=['ep_title_x'], inplace=True)

In [321]:
# Persist the merged frame without writing the row index.
merged_df.to_csv(path_or_buf='merged_df.csv', index=False)

In [187]:
# NOTE(review): `common_chars_df` is not defined in the cells shown above, and
# this cell's execution count precedes the earlier cells' - confirm it is
# created before this point on a fresh kernel.
# Documents are the spoken lines; labels are the speaking characters.
X, y = common_chars_df['text'], common_chars_df['character']

In [188]:
# Bag-of-words counts over the dialogue, with English stop words removed.
cv = CountVectorizer(stop_words='english')
X_cv = cv.fit_transform(X)
# `get_feature_names` no longer exists in current scikit-learn releases; use
# its replacement when available and fall back only on older installs.
if hasattr(cv, 'get_feature_names_out'):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()

In [189]:
def LDA_batch(X, n, vocab=None, n_top_words=25, random_state=None):
    """Fit an ``n``-topic LDA model on ``X`` and print the top words per topic.

    Parameters
    ----------
    X : array-like or sparse matrix
        Document-term matrix, passed unchanged to
        ``LatentDirichletAllocation.fit``.
    n : int
        Number of topics to fit and to report.
    vocab : sequence of str, optional
        Column labels for the topic-word matrix (one per column of ``X``).
        When omitted, the notebook-level ``feature_names`` is used.
    n_top_words : int, default 25
        How many of the highest-weighted words to print for each topic.
    random_state : optional
        Forwarded to ``LatentDirichletAllocation`` so a run can be repeated.

    Returns
    -------
    None
        The topics are printed, not returned.
    """
    if vocab is None:
        # Backward-compatible default: the vocabulary built alongside X_cv.
        vocab = feature_names
    # `n_components` is the current name of the topic-count argument; the old
    # `n_topics` keyword has been removed from scikit-learn.
    lda = LatentDirichletAllocation(n_components=n, random_state=random_state)
    lda.fit(X)
    # One row per topic, one column per vocabulary word.
    results = pd.DataFrame(lda.components_, columns=vocab)
    print('LDA for {} topics:'.format(n))
    for topic in range(n):
        print('Topic', topic)
        # Select the topic's row directly instead of transposing the whole
        # frame on every iteration.
        top_words = results.iloc[topic].sort_values(ascending=False).index[:n_top_words]
        print(' '.join(top_words), '\n')

In [191]:
# Fit and print a 10-topic model on the count matrix.
LDA_batch(X=X_cv, n=10)



LDA for 10 topics:
Topic 0
time didn station really major sir thank bajor trying kira guess garak ferengi nog haven able matter command heard point rest quite couldn actually killed 

Topic 1
long sisko doing commander talk day new does hear war lot computer glad end chance benjamin aren lost holosuite mission damar different programme trust hard 

Topic 2
just come sure yes cardassian talking place power stay looking night entire klingons weapons gone dead room forget family sir trouble case times week sense 

Topic 3
did tell little starfleet isn sorry hadar jem prophets left coming ships dukat yeah friend means thirty nagus far field waiting easy face turn death 

Topic 4
don want better chief thing quark thought worf course understand mind remember great brien mister wish order rom miles business suppose bring morning looks telling 

Topic 5
know right think way make look mean said life years bajoran wanted julian seven ago space quadrant best ah stand excuse makes true reason deep

In [192]:
# Fit and print a 30-topic model on the count matrix.
# NOTE(review): the recorded run of this cell ended in KeyboardInterrupt.
LDA_batch(X=X_cv, n=30)



KeyboardInterrupt: 