# NLP Bake-Off

In [1]:
import pickle
with open('speeches.pkl', 'rb') as f:
    speeches = pickle.load(f)

In [2]:
len(speeches)

58

In [3]:
with open('target.pkl', 'rb') as f:
    target = pickle.load(f)

## Consciously Splitting Speeches

In [4]:
train_indices = [1, 2, 3, 6, 7, 9, 11, 12, 13, 14, 17, 18,
                 21, 22, 23, 24, 28, 29, 30, 31, 34, 35, 36,
                 37, 39, 40, 41, 44, 45, 47, 48, 50, 51, 54,
                 55, 57]
test_indices = [0, 4, 5, 8, 10, 15, 16, 19, 20, 25, 26, 27,
                32, 33, 38, 42, 43, 46, 49, 52, 53, 56]

In [5]:
X_train = [speeches[j] for j in train_indices]
y_train = [target[j] for j in train_indices]

X_test = [speeches[j] for j in test_indices]
y_test = [target[j] for j in test_indices]

## Preprocessing

In [6]:
import nltk

In [7]:
stem = nltk.stem.SnowballStemmer('english')

speeches_cleaned = [[stem.stem(word.lower())
                     for word in nltk.word_tokenize(speech.replace('\'', ''))]\
                     for speech in X_train]

In [39]:
speeches_cleaned

[['fellow',
  'citizen',
  ':',
  'i',
  'am',
  'again',
  'call',
  'upon',
  'by',
  'the',
  'voic',
  'of',
  'my',
  'countri',
  'to',
  'execut',
  'the',
  'function',
  'of',
  'it',
  'chief',
  'magistr',
  '.',
  'when',
  'the',
  'occas',
  'proper',
  'for',
  'it',
  'shall',
  'arriv',
  ',',
  'i',
  'shall',
  'endeavor',
  'to',
  'express',
  'the',
  'high',
  'sens',
  'i',
  'entertain',
  'of',
  'this',
  'distinguish',
  'honor',
  ',',
  'and',
  'of',
  'the',
  'confid',
  'which',
  'has',
  'been',
  'repos',
  'in',
  'me',
  'by',
  'the',
  'peopl',
  'of',
  'unit',
  'america',
  '.',
  'previous',
  'to',
  'the',
  'execut',
  'of',
  'ani',
  'offici',
  'act',
  'of',
  'the',
  'presid',
  'the',
  'constitut',
  'requir',
  'an',
  'oath',
  'of',
  'offic',
  '.',
  'this',
  'oath',
  'i',
  'am',
  'now',
  'about',
  'to',
  'take',
  ',',
  'and',
  'in',
  'your',
  'presenc',
  ':',
  'that',
  'if',
  'it',
  'shall',
  'be',
  'found

In [8]:
speeches_cleaned[3] = speeches_cleaned[3][:-5]

In [9]:
import string

In [10]:
stops = nltk.corpus.stopwords.words('english')
stops.extend(list(string.punctuation))
stops.extend(list(string.digits))

corpus = []
for speech in speeches_cleaned:
    cleaned = []
    for word in speech:
        if word not in stops and not word.isnumeric():
            cleaned.append(word)
    corpus.append(cleaned)

In [11]:
from sklearn.feature_extraction.text import CountVectorizer

In [12]:
cv = CountVectorizer()
cv.fit([' '.join(speech) for speech in corpus])

CountVectorizer()

In [13]:
X = cv.transform([' '.join(speech) for speech in corpus])

## Modeling

### Naive Bayes

In [34]:
from sklearn.naive_bayes import ComplementNB, MultinomialNB

In [15]:
cnb = ComplementNB().fit(X, y_train)

In [16]:
cnb.score(X, y_train)

1.0

In [17]:
stem = nltk.stem.SnowballStemmer('english')

test_cleaned = [[stem.stem(word.lower())
                     for word in nltk.word_tokenize(speech.replace('\'', ''))]\
                     for speech in X_test]

In [18]:
stops = nltk.corpus.stopwords.words('english')
stops.extend(list(string.punctuation))
stops.extend(list(string.digits))

testcorpus = []
for speech in test_cleaned:
    cleaned = []
    for word in speech:
        if word not in stops and not word.isnumeric():
            cleaned.append(word)
    testcorpus.append(cleaned)

In [19]:
Xtest = cv.transform([' '.join(speech) for speech in testcorpus])

In [20]:
cnb.score(Xtest, y_test)

0.5909090909090909

In [35]:
mnb = MultinomialNB(alpha=0.01)

In [37]:
mnb.fit(X, y_train)

MultinomialNB(alpha=0.01)

In [38]:
mnb.score(Xtest, y_test)

0.5454545454545454

### Support Vector Machine

In [39]:
from sklearn.svm import SVC

In [40]:
svc = SVC().fit(X, y_train)

In [41]:
svc.score(Xtest, y_test)

0.45454545454545453

### Logistic Regression

In [42]:
from sklearn.linear_model import LogisticRegression

In [43]:
logreg = LogisticRegression().fit(X, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [44]:
logreg.score(Xtest, y_test)

0.5454545454545454

## GridSearchCV

In [45]:
from sklearn.model_selection import GridSearchCV

In [46]:
cnb_grid = {
    'norm': [True, False],
    'alpha': [1, 0.1, 0.01]
}

In [48]:
gs_cnb = GridSearchCV(ComplementNB(), cnb_grid)
gs_cnb.fit(X, y_train)



GridSearchCV(estimator=ComplementNB(),
             param_grid={'alpha': [1, 0.1, 0.01], 'norm': [True, False]})

In [49]:
gs_cnb.best_params_

{'alpha': 1, 'norm': True}

In [50]:
gs_cnb.score(Xtest, y_test)

0.5454545454545454

In [51]:
mnb_grid = {
    'alpha': [1, 0.1, 0.01]
}

In [52]:
gs_mnb = GridSearchCV(MultinomialNB(), mnb_grid)
gs_mnb.fit(X, y_train)



GridSearchCV(estimator=MultinomialNB(), param_grid={'alpha': [1, 0.1, 0.01]})

In [53]:
gs_mnb.best_params_

{'alpha': 1}

In [55]:
gs_mnb.score(Xtest, y_test)

0.5454545454545454

In [58]:
svc_grid = {
    'C': [10, 1, 0.1],
    'kernel': ['poly', 'rbf', 'sigmoid'],
    'degree': [2, 3 ,4],
    'gamma': ['scale', 'auto'],
    'probability': [True, False]
}

In [59]:
gs_svc = GridSearchCV(SVC(), svc_grid)
gs_svc.fit(X, y_train)



GridSearchCV(estimator=SVC(),
             param_grid={'C': [10, 1, 0.1], 'degree': [2, 3, 4],
                         'gamma': ['scale', 'auto'],
                         'kernel': ['poly', 'rbf', 'sigmoid'],
                         'probability': [True, False]})

In [60]:
gs_svc.best_params_

{'C': 10, 'degree': 2, 'gamma': 'auto', 'kernel': 'poly', 'probability': True}

In [61]:
gs_svc.score(Xtest, y_test)

0.6363636363636364

In [91]:
svc_grid2 = {
    'C': [1, 5, 10, 100],
    'degree': [0, 1, 2],
    'kernel': ['poly', 'linear']
}

In [94]:
gs_svc2 = GridSearchCV(SVC(gamma='auto',
                          probability=True), svc_grid2)
gs_svc2.fit(X, y_train)



GridSearchCV(estimator=SVC(gamma='auto', probability=True),
             param_grid={'C': [1, 5, 10, 100], 'degree': [0, 1, 2],
                         'kernel': ['poly', 'linear']})

In [95]:
gs_svc2.best_params_

{'C': 5, 'degree': 2, 'kernel': 'poly'}

In [96]:
gs_svc2.score(Xtest, y_test)

0.5909090909090909

In [79]:
logreg_grid = {
    'penalty': ['l1', 'l2'],
    'C': [10, 1, 0.1],
    'class_weight': [None, 'balanced']
}

In [80]:
gs_logreg = GridSearchCV(LogisticRegression(max_iter=1e4,
                                           solver='liblinear'), logreg_grid)
gs_logreg.fit(X, y_train)



GridSearchCV(estimator=LogisticRegression(max_iter=10000.0, solver='liblinear'),
             param_grid={'C': [10, 1, 0.1], 'class_weight': [None, 'balanced'],
                         'penalty': ['l1', 'l2']})

In [81]:
gs_logreg.best_params_

{'C': 0.1, 'class_weight': None, 'penalty': 'l2'}

In [82]:
logreg_grid2 = {
    'C': [0.5, 0.1, 0.01]
}

In [83]:
gs_logreg2 = GridSearchCV(LogisticRegression(max_iter=1e4,
                                            solver='liblinear'),
                         logreg_grid2)
gs_logreg2.fit(X, y_train)



GridSearchCV(estimator=LogisticRegression(max_iter=10000.0, solver='liblinear'),
             param_grid={'C': [0.5, 0.1, 0.01]})

In [84]:
gs_logreg2.best_params_

{'C': 0.1}

In [85]:
gs_logreg.score(Xtest, y_test)

0.5

## Voting and Stacking

In [103]:
from sklearn.ensemble import VotingClassifier, StackingClassifier

In [115]:
vc = VotingClassifier(estimators=[
    ('mnb', gs_mnb),
    ('cnb', gs_cnb),
    ('svc', gs_svc),
    ('logreg', gs_logreg)
])

In [116]:
vc.fit(X, y_train)



VotingClassifier(estimators=[('mnb',
                              GridSearchCV(estimator=MultinomialNB(),
                                           param_grid={'alpha': [1, 0.1,
                                                                 0.01]})),
                             ('cnb',
                              GridSearchCV(estimator=ComplementNB(),
                                           param_grid={'alpha': [1, 0.1, 0.01],
                                                       'norm': [True, False]})),
                             ('svc',
                              GridSearchCV(estimator=SVC(),
                                           param_grid={'C': [10, 1, 0.1],
                                                       'degree': [2, 3, 4],
                                                       'gamma': ['scale',
                                                                 'auto'],
                                                       'kernel': ['poly', 'rbf',
  

In [117]:
vc.score(Xtest, y_test)

0.6363636363636364

In [100]:
sc = StackingClassifier(estimators=[
    ('mnb', gs_mnb),
    ('cnb', gs_cnb),
    ('svc', gs_svc),
    ('logreg', gs_logreg)
])

In [101]:
sc.fit(X, y_train)



StackingClassifier(estimators=[('mnb',
                                GridSearchCV(estimator=MultinomialNB(),
                                             param_grid={'alpha': [1, 0.1,
                                                                   0.01]})),
                               ('cnb',
                                GridSearchCV(estimator=ComplementNB(),
                                             param_grid={'alpha': [1, 0.1,
                                                                   0.01],
                                                         'norm': [True,
                                                                  False]})),
                               ('svc',
                                GridSearchCV(estimator=SVC(),
                                             param_grid={'C': [10, 1, 0.1],
                                                         'degree': [2, 3, 4],
                                                         'gamma': ['scale'

In [102]:
sc.score(Xtest, y_test)

0.5454545454545454

***

## Preparing for Pickling

In [27]:
import pandas as pd
import numpy as np
traindf = pd.DataFrame(zip(X_train, y_train))

In [28]:
X_train = traindf.sample(frac=1, random_state=42)[0]

In [29]:
y_train = traindf.sample(frac=1, random_state=42)[1]

In [30]:
testdf = pd.DataFrame(zip(X_test, y_test))

In [31]:
X_test = testdf.sample(frac=1, random_state=42)[0]

In [32]:
y_test = testdf.sample(frac=1, random_state=42)[1]

In [87]:
with open('X_train.pkl', 'wb') as f:
    pickle.dump(X_train, f)

In [88]:
with open('y_train.pkl', 'wb') as f:
    pickle.dump(y_train, f)

In [89]:
with open('X_test.pkl', 'wb') as f:
    pickle.dump(X_test, f)

In [90]:
with open('y_test.pkl', 'wb') as f:
    pickle.dump(y_test, f)

In [118]:
with open('voting_model.pkl', 'wb') as f:
    pickle.dump(vc, f)