In [1]:
import nltk
from nltk.corpus import inaugural

import pandas as pd

### Load inaugural addresses

In [2]:
fileids = inaugural.fileids()

LookupError: 
**********************************************************************
  Resource [93minaugural[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('inaugural')
  [0m
  Searched in:
    - '/Users/winnielee/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
    - '/Users/winnielee/code/.virtualenvs/nlp/bin/../nltk_data'
    - '/Users/winnielee/code/.virtualenvs/nlp/bin/../share/nltk_data'
    - '/Users/winnielee/code/.virtualenvs/nlp/bin/../lib/nltk_data'
**********************************************************************


In [None]:
files = []
for fid in fileids:
    s = ""
    for i in inaugural.open(fid):
        s += i
    files.append(s)

fs = []
for ix, file in enumerate(files):
    fs += [(ix, i.strip(" \n")) for i in re.split('[.!?]', file)]

In [None]:
len(fs)

### Translate

In [None]:
import re

In [None]:
import urllib.request
from urllib.parse import quote
import sys

typ = sys.getfilesystemencoding()

def translate(querystr, to_l="zh", from_l="en"):
    '''for google tranlate by doom
    '''
    C_agent = {'User-Agent': "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.165063 Safari/537.36 AppEngine-Google."}
    flag = 'class="t0">'
    tarurl = ("http://translate.google.cn/m?hl=%s&sl=%s&q=%s" % (to_l, from_l, quote(querystr, safe='')))
    request = urllib.request.Request(tarurl, headers=C_agent)
    page = str(urllib.request.urlopen(request).read().decode(typ))
    target = page[page.find(flag) + len(flag):]
    target = target.split("<")[0]
    return target

In [None]:
for i,eng in fs[1473:]:
    newversion.append((i, eng, translate(eng)))

In [None]:
temp = pd.DataFrame(newversion)

In [None]:
temp.to_csv("newdata.csv")

### Load ground truth unprocessed

In [3]:
raw_data = pd.read_csv("inaug_addr_cleaned.csv", encoding="latin").dropna()

In [4]:
raw_data.head()

Unnamed: 0,doc index,text,P1,P2,Final,IsSame
0,0,Fellow-Citizens of the Senate and of the House...,0.0,0.0,0.0,True
1,0,"On the one hand, I was summoned by my Country,...",0.0,0.0,0.0,True
2,0,"On the other hand, the magnitude and difficult...",0.0,0.0,0.0,True
3,0,In this conflict of emotions all I dare aver i...,0.0,0.0,0.0,True
4,0,"All I dare hope is that if, in executing this ...",0.0,0.0,0.0,True


## 1. Baseline

### 1.1 Bag of Words 

In [5]:
from sklearn.feature_extraction.text import CountVectorizer

In [6]:
vectorizer = CountVectorizer()

text = vectorizer.fit_transform(raw_data["text"]).toarray()

In [7]:
# we have 4847 samples, each is 9017 long
text.shape

(4847, 9017)

### 1.2 Train Test Split

In [8]:
import numpy as np
from sklearn.model_selection import train_test_split
from utilities import train_test_split_common

In [9]:
ytrue = np.array(raw_data["Final"], dtype=int)

In [10]:
train_x, test_x, train_y, test_y = train_test_split_common(text, ytrue)

### 1.3 Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegressionCV
from sklearn.linear_model import LogisticRegression

In [None]:
logi = LogisticRegressionCV(cv=5, random_state=0).fit(train_x, train_y)

In [None]:
# accuracy
logi.score(train_x, train_y)

In [None]:
from sklearn.model_selection import KFold
from sklearn.metrics import precision_score, recall_score, f1_score

In [None]:
# Baseline
kf = KFold(n_splits=5)
precisions = []
recalls = []
f1s = []
for train_index, test_index in kf.split(train_x):
    X_train, X_test = train_x[train_index], train_x[test_index]
    y_train, y_test = train_y[train_index], train_y[test_index]
    logi = LogisticRegression(random_state=0).fit(X_train, y_train)
    precision = precision_score(y_test, logi.predict(X_test))
    recall = recall_score(y_test, logi.predict(X_test))
    f1 = f1_score(y_test, logi.predict(X_test))
    precisions.append(precision)
    recalls.append(recall)
    f1s.append(f1)
    print("Precision: {:.2f}\nRecall:    {:.2f}\nF1_score:  {:.2f}".format(precision, recall, f1))
    print("----------")

In [None]:
print("Avg precision: {:.2f}".format(np.mean(precisions)))
print("Avg recall:    {:.2f}".format(np.mean(recalls)))
print("Avg fscore:    {:.2f}".format(np.mean(f1s)))

### 1.4 Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB

In [None]:
kf = KFold(n_splits=5)
precisions = []
recalls = []
f1s = []
for train_index, test_index in kf.split(train_x):
    X_train, X_test = train_x[train_index], train_x[test_index]
    y_train, y_test = train_y[train_index], train_y[test_index]
    clf = GaussianNB()
    clf = clf.fit(X_train, y_train)
    precision = precision_score(y_test, clf.predict(X_test))
    recall = recall_score(y_test, clf.predict(X_test))
    f1 = f1_score(y_test, clf.predict(X_test))
    precisions.append(precision)
    recalls.append(recall)
    f1s.append(f1)
    print("Precision: {:.2f}\nRecall:    {:.2f}\nF1_score:  {:.2f}".format(precision, recall, f1))
    print("----------")

In [None]:
print("Avg precision: {:.2f}".format(np.mean(precisions)))
print("Avg recall:    {:.2f}".format(np.mean(recalls)))
print("Avg fscore:    {:.2f}".format(np.mean(f1s)))

### 1.5 Support Vector Machine

In [None]:
from sklearn.svm import SVC

In [None]:
kf = KFold(n_splits=5)
precisions = []
recalls = []
f1s = []
for train_index, test_index in kf.split(train_x):
    X_train, X_test = train_x[train_index], train_x[test_index]
    y_train, y_test = train_y[train_index], train_y[test_index]
    clf = SVC()
    clf = clf.fit(X_train, y_train)
    precision = precision_score(y_test, clf.predict(X_test))
    recall = recall_score(y_test, clf.predict(X_test))
    f1 = f1_score(y_test, clf.predict(X_test))
    precisions.append(precision)
    recalls.append(recall)
    f1s.append(f1)
    print("Precision: {:.2f}\nRecall:    {:.2f}\nF1_score:  {:.2f}".format(precision, recall, f1))
    print("----------")

In [None]:
print("Avg precision: {:.2f}".format(np.mean(precisions)))
print("Avg recall:    {:.2f}".format(np.mean(recalls)))
print("Avg fscore:    {:.2f}".format(np.mean(f1s)))

### 1.6 Neural Network

In [None]:
from sklearn.neural_network import MLPClassifier

In [None]:
kf = KFold(n_splits=5)
precisions = []
recalls = []
f1s = []
for train_index, test_index in kf.split(train_x):
    X_train, X_test = train_x[train_index], train_x[test_index]
    y_train, y_test = train_y[train_index], train_y[test_index]
    clf = MLPClassifier()
    clf = clf.fit(X_train, y_train)
    precision = precision_score(y_test, clf.predict(X_test))
    recall = recall_score(y_test, clf.predict(X_test))
    f1 = f1_score(y_test, clf.predict(X_test))
    precisions.append(precision)
    recalls.append(recall)
    f1s.append(f1)
    print("Precision: {:.2f}\nRecall:    {:.2f}\nF1_score:  {:.2f}".format(precision, recall, f1))
    print("----------")

In [None]:
print("Avg precision: {:.2f}".format(np.mean(precisions)))
print("Avg recall:    {:.2f}".format(np.mean(recalls)))
print("Avg fscore:    {:.2f}".format(np.mean(f1s)))