In [4]:
import nltk
from nltk.corpus import inaugural

import pandas as pd

### Load inaugural addresses

In [3]:
# File identifiers of the documents in NLTK's `inaugural` corpus.
fileids = inaugural.fileids()

In [31]:
# `re` is imported locally because the notebook's own `import re` lives in a
# later cell; without this line the cell raises NameError on a fresh kernel.
import re

# Full text of every inaugural address, one string per file, in `fileids`
# order. `raw` reads and closes the file itself, replacing the manual
# line-by-line string concatenation.
files = [inaugural.raw(fid) for fid in fileids]

# Flatten the corpus into (document index, sentence) pairs. Sentences are
# obtained by splitting on '.', '!' or '?' and trimming surrounding spaces and
# newlines; empty fragments are kept so positions in `fs` stay stable.
fs = [
    (ix, sentence.strip(" \n"))
    for ix, file in enumerate(files)
    for sentence in re.split('[.!?]', file)
]

In [32]:
# Number of (document index, sentence) pairs produced by the split.
len(fs)

4950

### Translate

In [24]:
import re

In [5]:
import urllib.request
from urllib.parse import quote
import sys

# Encoding Python uses for file-system paths. Kept so the name stays available
# to other cells; `translate` no longer decodes HTTP responses with it.
typ = sys.getfilesystemencoding()

def translate(querystr, to_l="zh", from_l="en", timeout=30):
    '''Translate ``querystr`` by scraping the Google Translate mobile page.

    (Original helper credited to "doom".)

    Parameters
    ----------
    querystr : str
        Text to translate. It is percent-encoded before being put in the URL.
    to_l : str
        Target language code, sent as the ``hl`` query parameter.
    from_l : str
        Source language code, sent as the ``sl`` query parameter.
    timeout : float
        Seconds passed to ``urlopen`` so a stalled request cannot hang the
        notebook forever.

    Returns
    -------
    str
        The text between the ``class="t0">`` marker and the next ``<``, with
        HTML entities decoded. An empty string is returned when ``querystr``
        is blank or when the marker is not present in the response.

    Raises
    ------
    urllib.error.URLError
        Propagated unchanged from ``urlopen`` on network or HTTP failures.
    '''
    import html  # local import: the notebook's import cells do not provide it

    # Nothing to translate: skip the network round trip entirely.
    if not querystr or not querystr.strip():
        return ""

    C_agent = {'User-Agent': "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.165063 Safari/537.36 AppEngine-Google."}
    flag = 'class="t0">'
    tarurl = ("http://translate.google.cn/m?hl=%s&sl=%s&q=%s" % (to_l, from_l, quote(querystr, safe='')))
    request = urllib.request.Request(tarurl, headers=C_agent)

    # Close the connection deterministically and decode with the charset the
    # server declares (UTF-8 if none), not the local file-system encoding.
    with urllib.request.urlopen(request, timeout=timeout) as response:
        charset = response.headers.get_content_charset() or "utf-8"
        page = response.read().decode(charset, errors="replace")

    # str.find returns -1 when the marker is missing; slicing from that value
    # would silently return an arbitrary chunk of the page.
    start = page.find(flag)
    if start == -1:
        return ""

    target = page[start + len(flag):].split("<", 1)[0]
    # The page is HTML, so "&amp;", "&#39;" etc. must be turned back into text.
    return html.unescape(target)

In [44]:
# Translate every sentence that has not been translated yet and collect
# (document index, English sentence, translation) triples in `newversion`.
# Resuming from len(newversion) instead of a hard-coded offset lets this cell
# run on a fresh kernel (where `newversion` does not exist yet) and be re-run
# after an interrupted request without duplicating rows.
# NOTE(review): the original sliced fs[1473:], presumably the resume point of
# an earlier interrupted run -- confirm.
try:
    newversion
except NameError:
    newversion = []

for i, eng in fs[len(newversion):]:
    newversion.append((i, eng, translate(eng)))

In [48]:
# Tabulate the (document index, English sentence, translation) tuples.
temp = pd.DataFrame(newversion)

In [49]:
# Persist the translations to newdata.csv (path is relative to the working directory).
temp.to_csv("newdata.csv")

### Load ground truth unprocessed

In [18]:
# Read the labelled sentences with the "latin" codec and drop rows containing missing values.
raw_data = pd.read_csv("inaug_addr_cleaned.csv", encoding="latin").dropna()

In [20]:
# Preview the first rows to check the columns that were loaded.
raw_data.head()

Unnamed: 0,doc index,text,P1,P2,Final,IsSame
0,0,Fellow-Citizens of the Senate and of the House...,0.0,0.0,0.0,True
1,0,"On the one hand, I was summoned by my Country,...",0.0,0.0,0.0,True
2,0,"On the other hand, the magnitude and difficult...",0.0,0.0,0.0,True
3,0,In this conflict of emotions all I dare aver i...,0.0,0.0,0.0,True
4,0,"All I dare hope is that if, in executing this ...",0.0,0.0,0.0,True


## 1. Baseline

### 1.1 Bag of Words 

In [21]:
from sklearn.feature_extraction.text import CountVectorizer

In [23]:
# Bag-of-words encoder with scikit-learn's default settings.
vectorizer = CountVectorizer()

# Fit on the "text" column and convert the result to an array with
# .toarray(). Note that `text` holds this count matrix, not strings.
text = vectorizer.fit_transform(raw_data["text"]).toarray()

In [30]:
# (n_samples, n_features): 4847 sentences, each encoded as a count vector of
# length 9017.
text.shape

(4847, 9017)

### 1.2 Train Test Split

In [60]:
import numpy as np
from sklearn.model_selection import train_test_split

In [62]:
# Target labels: the "Final" column cast to integers.
ytrue = np.array(raw_data["Final"], dtype=int)

In [63]:
# Hold out 20% of the rows as a test set; random_state=0 fixes the shuffle.
train_x, test_x, train_y, test_y = train_test_split(text, ytrue, test_size=0.2, random_state=0)

### 1.3 Logistic Regression

In [38]:
from sklearn.linear_model import LogisticRegressionCV
from sklearn.linear_model import LogisticRegression

In [34]:
# Logistic regression with built-in 5-fold cross-validation, fitted on the training split.
logi = LogisticRegressionCV(cv=5, random_state=0).fit(train_x, train_y)

In [35]:
# Accuracy on the training data itself (the data the model was fitted on),
# so this is not a held-out estimate.
logi.score(train_x, train_y)

0.9994841372194996

In [36]:
from sklearn.model_selection import KFold
from sklearn.metrics import precision_score, recall_score, f1_score

In [70]:
# Baseline: 5-fold cross-validation of a plain logistic regression on the
# training split, printing precision / recall / F1 for every fold and keeping
# the per-fold values in `precisions`, `recalls` and `f1s`.
# NOTE: this rebinds `logi`, which an earlier cell assigned to the
# LogisticRegressionCV model.
kf = KFold(n_splits=5)
precisions = []
recalls = []
f1s = []
for train_index, test_index in kf.split(train_x):
    X_train, X_test = train_x[train_index], train_x[test_index]
    y_train, y_test = train_y[train_index], train_y[test_index]
    logi = LogisticRegression(random_state=0).fit(X_train, y_train)
    # Predict once per fold and reuse the labels for all three metrics
    # instead of running inference three times.
    y_pred = logi.predict(X_test)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    precisions.append(precision)
    recalls.append(recall)
    f1s.append(f1)
    print("Precision: {:.2f}\nRecall:    {:.2f}\nF1_score:  {:.2f}".format(precision, recall, f1))
    print("----------")

Precision: 0.69
Recall:    0.31
F1_score:  0.42
----------
Precision: 0.60
Recall:    0.29
F1_score:  0.39
----------
Precision: 0.36
Recall:    0.12
F1_score:  0.18
----------
Precision: 0.67
Recall:    0.27
F1_score:  0.38
----------
Precision: 0.54
Recall:    0.24
F1_score:  0.33
----------


In [73]:
# Mean of each per-fold metric collected by the cross-validation loop.
for template, fold_scores in (
    ("Avg precision: {:.2f}", precisions),
    ("Avg recall:    {:.2f}", recalls),
    ("Avg fscore:    {:.2f}", f1s),
):
    print(template.format(np.mean(fold_scores)))

Avg precision: 0.57
Avg recall:    0.24
Avg fscore:    0.34


### 1.4 Naive Bayes

In [79]:
from sklearn.naive_bayes import GaussianNB

In [81]:
# 5-fold cross-validation of Gaussian naive Bayes on the training split,
# printing precision / recall / F1 for every fold and keeping the per-fold
# values in `precisions`, `recalls` and `f1s`.
kf = KFold(n_splits=5)
precisions = []
recalls = []
f1s = []
for train_index, test_index in kf.split(train_x):
    X_train, X_test = train_x[train_index], train_x[test_index]
    y_train, y_test = train_y[train_index], train_y[test_index]
    clf = GaussianNB()
    clf = clf.fit(X_train, y_train)
    # Predict once per fold and reuse the labels for all three metrics
    # instead of running inference three times.
    y_pred = clf.predict(X_test)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    precisions.append(precision)
    recalls.append(recall)
    f1s.append(f1)
    print("Precision: {:.2f}\nRecall:    {:.2f}\nF1_score:  {:.2f}".format(precision, recall, f1))
    print("----------")

Precision: 0.00
Recall:    0.00
F1_score:  0.00
----------
Precision: 0.09
Recall:    0.07
F1_score:  0.08
----------
Precision: 0.11
Recall:    0.09
F1_score:  0.10
----------
Precision: 0.07
Recall:    0.07
F1_score:  0.07
----------
Precision: 0.13
Recall:    0.10
F1_score:  0.12
----------


In [None]:
# Mean of each per-fold metric collected by the cross-validation loop.
for template, fold_scores in (
    ("Avg precision: {:.2f}", precisions),
    ("Avg recall:    {:.2f}", recalls),
    ("Avg fscore:    {:.2f}", f1s),
):
    print(template.format(np.mean(fold_scores)))

### 1.5 Support Vector Machine

In [82]:
from sklearn.svm import SVC

In [83]:
# 5-fold cross-validation of a default SVC on the training split, printing
# precision / recall / F1 for every fold and keeping the per-fold values in
# `precisions`, `recalls` and `f1s`.
kf = KFold(n_splits=5)
precisions = []
recalls = []
f1s = []
for train_index, test_index in kf.split(train_x):
    X_train, X_test = train_x[train_index], train_x[test_index]
    y_train, y_test = train_y[train_index], train_y[test_index]
    clf = SVC()
    clf = clf.fit(X_train, y_train)
    # Predict once per fold and reuse the labels for all three metrics
    # instead of running inference three times.
    y_pred = clf.predict(X_test)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    precisions.append(precision)
    recalls.append(recall)
    f1s.append(f1)
    print("Precision: {:.2f}\nRecall:    {:.2f}\nF1_score:  {:.2f}".format(precision, recall, f1))
    print("----------")

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Precision: 0.00
Recall:    0.00
F1_score:  0.00
----------
Precision: 0.00
Recall:    0.00
F1_score:  0.00
----------
Precision: 0.00
Recall:    0.00
F1_score:  0.00
----------
Precision: 0.00
Recall:    0.00
F1_score:  0.00
----------
Precision: 0.00
Recall:    0.00
F1_score:  0.00
----------


In [84]:
# Mean of each per-fold metric collected by the cross-validation loop.
for template, fold_scores in (
    ("Avg precision: {:.2f}", precisions),
    ("Avg recall:    {:.2f}", recalls),
    ("Avg fscore:    {:.2f}", f1s),
):
    print(template.format(np.mean(fold_scores)))

Avg precision: 0.00
Avg recall:    0.00
Avg fscore:    0.00


### 1.6 Neural Network

In [85]:
from sklearn.neural_network import MLPClassifier

In [86]:
# 5-fold cross-validation of a multi-layer perceptron on the training split,
# printing precision / recall / F1 for every fold and keeping the per-fold
# values in `precisions`, `recalls` and `f1s`.
kf = KFold(n_splits=5)
precisions = []
recalls = []
f1s = []
for train_index, test_index in kf.split(train_x):
    X_train, X_test = train_x[train_index], train_x[test_index]
    y_train, y_test = train_y[train_index], train_y[test_index]
    # random_state=0 (the seed the logistic-regression baseline also uses)
    # makes the network's training reproducible from run to run.
    clf = MLPClassifier(random_state=0)
    clf = clf.fit(X_train, y_train)
    # Predict once per fold and reuse the labels for all three metrics
    # instead of running inference three times.
    y_pred = clf.predict(X_test)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    precisions.append(precision)
    recalls.append(recall)
    f1s.append(f1)
    print("Precision: {:.2f}\nRecall:    {:.2f}\nF1_score:  {:.2f}".format(precision, recall, f1))
    print("----------")

Precision: 0.62
Recall:    0.14
F1_score:  0.23
----------
Precision: 0.73
Recall:    0.26
F1_score:  0.39
----------
Precision: 0.30
Recall:    0.09
F1_score:  0.14
----------
Precision: 0.86
Recall:    0.20
F1_score:  0.32
----------
Precision: 0.50
Recall:    0.21
F1_score:  0.29
----------


In [87]:
# Mean of each per-fold metric collected by the cross-validation loop.
for template, fold_scores in (
    ("Avg precision: {:.2f}", precisions),
    ("Avg recall:    {:.2f}", recalls),
    ("Avg fscore:    {:.2f}", f1s),
):
    print(template.format(np.mean(fold_scores)))

Avg precision: 0.60
Avg recall:    0.18
Avg fscore:    0.27
