In [53]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.decomposition import LatentDirichletAllocation
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_union, make_pipeline

from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

import spacy
from textblob import TextBlob

In [51]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegressionCV
from sklearn.pipeline import make_pipeline, Pipeline

In [60]:
from sklearn import svm

In [58]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import GridSearchCV

In [4]:
ds9_rating_and_char_count = pd.read_csv('./ds9_rating_and_char_count.csv')

In [6]:
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score

In [5]:
ds9_rating_and_char_count.head()

Unnamed: 0,ep_title_formatted,BAREIL,BASHIR,COMPUTER,DAMAR,DAX,DUKAT,EDDINGTON,EZRI,GARAK,...,WINN,WORF,ZEK,index_x,airdate,ep_name,number,rating,season,index_y
0,emissary,0.0,27.0,36.0,0.0,79.0,25.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,3 Jan. 1993,Emissary,1,7.5,1,1
1,pastprologue,0.0,59.0,0.0,0.0,12.0,0.0,0.0,0.0,65.0,...,0.0,0.0,0.0,2.0,10 Jan. 1993,Past Prologue,2,7.0,1,2
2,amanalone,0.0,80.0,2.0,0.0,46.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,3.0,17 Jan. 1993,A Man Alone,3,6.9,1,3
3,babel,0.0,65.0,33.0,0.0,22.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,4.0,24 Jan. 1993,Babel,4,6.9,1,4
4,captivepursuit,0.0,2.0,7.0,0.0,18.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,5.0,31 Jan. 1993,Captive Pursuit,5,7.7,1,5


In [7]:
ds9_rating_and_char_count.columns

Index(['ep_title_formatted', 'BAREIL', 'BASHIR', 'COMPUTER', 'DAMAR', 'DAX',
       'DUKAT', 'EDDINGTON', 'EZRI', 'GARAK', 'JAKE', 'KASIDY', 'KEIKO',
       'KIRA', 'MARTOK', 'NOG', 'O'BRIEN', 'ODO', 'QUARK', 'ROM', 'SISKO',
       'VIC', 'WEYOUN', 'WINN', 'WORF', 'ZEK', 'index_x', 'airdate', 'ep_name',
       'number', 'rating', 'season', 'index_y'],
      dtype='object')

In [8]:
char_cols = ['BAREIL', 'BASHIR', 'COMPUTER', 'DAMAR', 'DAX',
       'DUKAT', 'EDDINGTON', 'EZRI', 'GARAK', 'JAKE', 'KASIDY', 'KEIKO',
       'KIRA', 'MARTOK', 'NOG', 'O\'BRIEN', 'ODO', 'QUARK', 'ROM', 'SISKO',
       'VIC', 'WEYOUN', 'WINN', 'WORF', 'ZEK']

In [12]:
[col for col in ds9_rating_and_char_count if col in char_cols]

['BAREIL',
 'BASHIR',
 'COMPUTER',
 'DAMAR',
 'DAX',
 'DUKAT',
 'EDDINGTON',
 'EZRI',
 'GARAK',
 'JAKE',
 'KASIDY',
 'KEIKO',
 'KIRA',
 'MARTOK',
 'NOG',
 "O'BRIEN",
 'ODO',
 'QUARK',
 'ROM',
 'SISKO',
 'VIC',
 'WEYOUN',
 'WINN',
 'WORF',
 'ZEK']

In [14]:
X = ds9_rating_and_char_count[[col for col in ds9_rating_and_char_count if col in char_cols]]

In [15]:
X.head()

Unnamed: 0,BAREIL,BASHIR,COMPUTER,DAMAR,DAX,DUKAT,EDDINGTON,EZRI,GARAK,JAKE,...,O'BRIEN,ODO,QUARK,ROM,SISKO,VIC,WEYOUN,WINN,WORF,ZEK
0,0.0,27.0,36.0,0.0,79.0,25.0,0.0,0.0,0.0,25.0,...,113.0,34.0,34.0,0.0,326.0,0.0,0.0,0.0,0.0,0.0
1,0.0,59.0,0.0,0.0,12.0,0.0,0.0,0.0,65.0,0.0,...,34.0,48.0,0.0,0.0,127.0,0.0,0.0,0.0,0.0,0.0
2,0.0,80.0,2.0,0.0,46.0,0.0,0.0,0.0,0.0,12.0,...,29.0,100.0,29.0,25.0,98.0,0.0,0.0,0.0,0.0,0.0
3,0.0,65.0,33.0,0.0,22.0,0.0,0.0,0.0,0.0,8.0,...,61.0,76.0,84.0,0.0,119.0,0.0,0.0,0.0,0.0,0.0
4,0.0,2.0,7.0,0.0,18.0,0.0,0.0,0.0,0.0,0.0,...,255.0,32.0,35.0,0.0,105.0,0.0,0.0,0.0,0.0,0.0


In [34]:
above_average = ds9_rating_and_char_count['rating'].map(lambda x: 1 if x > 7.5 else 0)

In [16]:
y = ds9_rating_and_char_count['rating']

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [20]:
X_train.size

3225

In [21]:
X_test.size

1100

In [22]:
lr = linear_model.LinearRegression()

In [23]:
lr.fit(X_train, y_train)



LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [25]:
lr_preds = lr.predict(X_test)

In [26]:
lr.coef_

array([-0.00587528, -0.00226498,  0.00688276,  0.00890924, -0.00493242,
        0.00034886, -0.0020683 , -0.00526868,  0.00558416, -0.00190882,
       -0.00035684, -0.00529593, -0.00197335,  0.00694431,  0.00367633,
        0.00088791, -0.00069235, -0.00185418,  0.00161492,  0.0011779 ,
        0.00036794,  0.00714722, -0.00417768,  0.00061244, -0.00516768])

In [27]:
mean_squared_error(y_test, lr_preds)

0.53836153259911113

In [28]:
r2_score(y_test, lr_preds)

-0.20796440447351028

In [None]:
ds9_rating_and_char_count

In [30]:
for row in ds9_rating_and_char_count.iterrows():
    print(row)

(0, ep_title_formatted       emissary
BAREIL                          0
BASHIR                         27
COMPUTER                       36
DAMAR                           0
DAX                            79
DUKAT                          25
EDDINGTON                       0
EZRI                            0
GARAK                           0
JAKE                           25
KASIDY                          0
KEIKO                           0
KIRA                          130
MARTOK                          0
NOG                             3
O'BRIEN                       113
ODO                            34
QUARK                          34
ROM                             0
SISKO                         326
VIC                             0
WEYOUN                          0
WINN                            0
WORF                            0
ZEK                             0
index_x                         1
airdate               3 Jan. 1993
ep_name                  Emissary
number    

(113, ep_title_formatted      businessasusual
BAREIL                                0
BASHIR                               18
COMPUTER                              0
DAMAR                                 0
DAX                                  37
DUKAT                                 0
EDDINGTON                             0
EZRI                                  0
GARAK                                 0
JAKE                                 20
KASIDY                                0
KEIKO                                 0
KIRA                                  9
MARTOK                                0
NOG                                   0
O'BRIEN                              79
ODO                                  11
QUARK                               214
ROM                                   0
SISKO                                36
VIC                                   0
WEYOUN                                0
WINN                                  0
WORF                              

In [35]:
y = above_average

In [36]:
X = ds9_rating_and_char_count[[col for col in ds9_rating_and_char_count if col in char_cols]]

In [37]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [38]:
lr = linear_model.LinearRegression()

In [39]:
lr.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [40]:
lr_preds = lr.predict(X_test)

In [41]:
lr.coef_

array([-0.00142305, -0.0012708 , -0.00121524,  0.003762  , -0.00261303,
        0.00124667,  0.00215888, -0.00302592,  0.00166725,  0.00030211,
       -0.00061955, -0.00393634, -0.00136154,  0.00400692,  0.0005098 ,
        0.0010851 , -0.00065409, -0.00038452, -0.00146443, -0.00088572,
        0.00218416,  0.00561201, -0.00251563,  0.0004269 , -0.0030313 ])

In [42]:
mean_squared_error(y_test, lr_preds)

0.29823941863911912

In [43]:
r2_score(y_test, lr_preds)

-0.2028989885111141

In [44]:
log_r = linear_model.LogisticRegression()

In [45]:
log_r.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [47]:
y_preds = log_r.predict(X_test)

In [48]:
lr.coef_

array([-0.00142305, -0.0012708 , -0.00121524,  0.003762  , -0.00261303,
        0.00124667,  0.00215888, -0.00302592,  0.00166725,  0.00030211,
       -0.00061955, -0.00393634, -0.00136154,  0.00400692,  0.0005098 ,
        0.0010851 , -0.00065409, -0.00038452, -0.00146443, -0.00088572,
        0.00218416,  0.00561201, -0.00251563,  0.0004269 , -0.0030313 ])

In [49]:
from sklearn.metrics import accuracy_score

In [50]:
accuracy_score(y_preds, y_test)

0.61363636363636365

In [56]:
rfc_pipe = make_pipeline(
    RandomForestClassifier()
)
classification_scorer(rfc_pipe)

train score: 0.945736434109
accuracy score: 0.636363636364
[[15  9]
 [ 7 13]]


In [57]:
ada_pipe = make_pipeline(
    AdaBoostClassifier()
)
classification_scorer(ada_pipe)

train score: 0.992248062016
accuracy score: 0.568181818182
[[17  7]
 [12  8]]


In [59]:
gbc_pipe = make_pipeline(
    GradientBoostingClassifier()
)
classification_scorer(gbc_pipe)

train score: 1.0
accuracy score: 0.590909090909
[[16  8]
 [10 10]]


In [61]:
svc_pipe = make_pipeline(
    svm.SVC()
)
classification_scorer(svc_pipe)

train score: 1.0
accuracy score: 0.454545454545
[[ 0 24]
 [ 0 20]]


In [54]:
def classification_scorer(pipeline):
    pipeline.fit(X_train, y_train)
    preds = pipeline.predict(X_test)
    print('train score:', pipeline.score(X_train, y_train))
    print('accuracy score:', accuracy_score(y_test, preds))
    print(confusion_matrix(y_test, preds))