In [1]:
import nltk
from nltk.corpus import inaugural

import pandas as pd

### Load inaugural addresses

In [2]:
fileids = inaugural.fileids()

In [None]:
files = []
for fid in fileids:
    s = ""
    for i in inaugural.open(fid):
        s += i
    files.append(s)

fs = []
for ix, file in enumerate(files):
    fs += [(ix, i.strip(" \n")) for i in re.split('[.!?]', file)]

In [None]:
len(fs)

### Translate

In [None]:
import re

In [None]:
import urllib.request
from urllib.parse import quote
import sys

typ = sys.getfilesystemencoding()

def translate(querystr, to_l="zh", from_l="en"):
    '''for google tranlate by doom
    '''
    C_agent = {'User-Agent': "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.165063 Safari/537.36 AppEngine-Google."}
    flag = 'class="t0">'
    tarurl = ("http://translate.google.cn/m?hl=%s&sl=%s&q=%s" % (to_l, from_l, quote(querystr, safe='')))
    request = urllib.request.Request(tarurl, headers=C_agent)
    page = str(urllib.request.urlopen(request).read().decode(typ))
    target = page[page.find(flag) + len(flag):]
    target = target.split("<")[0]
    return target

In [None]:
for i,eng in fs[1473:]:
    newversion.append((i, eng, translate(eng)))

In [None]:
temp = pd.DataFrame(newversion)

In [None]:
temp.to_csv("newdata.csv")

### Load ground truth unprocessed

In [3]:
raw_data = pd.read_csv("inaug_addr_cleaned.csv", encoding="latin").dropna()

In [4]:
raw_data.head()

Unnamed: 0,doc index,text,P1,P2,Final,IsSame
0,0,Fellow-Citizens of the Senate and of the House...,0.0,0.0,0.0,True
1,0,"On the one hand, I was summoned by my Country,...",0.0,0.0,0.0,True
2,0,"On the other hand, the magnitude and difficult...",0.0,0.0,0.0,True
3,0,In this conflict of emotions all I dare aver i...,0.0,0.0,0.0,True
4,0,"All I dare hope is that if, in executing this ...",0.0,0.0,0.0,True


In [219]:
len(raw_data)

4847

## 1. Baseline

### 1.1 Bag of Words 

In [5]:
from sklearn.feature_extraction.text import CountVectorizer

In [6]:
vectorizer = CountVectorizer()

text = vectorizer.fit_transform(raw_data["text"]).toarray()

In [7]:
# we have 4847 samples, each is 9017 long
text.shape

(4847, 9017)

### 1.2 Train Test Split

In [8]:
import numpy as np
from sklearn.model_selection import train_test_split
from utilities import train_test_split_common

In [10]:
ytrue = np.array(raw_data["Final"], dtype=int)

In [66]:
train_x, test_x, train_y, test_y = train_test_split_common(text, ytrue)

### 1.3 Logistic Regression

In [15]:
from sklearn.linear_model import LogisticRegressionCV
from sklearn.linear_model import LogisticRegression

In [None]:
logi = LogisticRegressionCV(cv=5, random_state=0).fit(train_x, train_y)

In [None]:
# accuracy
logi.score(train_x, train_y)

In [17]:
from sklearn.model_selection import KFold
from sklearn.metrics import precision_score, recall_score, f1_score

In [18]:
# Baseline
kf = KFold(n_splits=5)
precisions = []
recalls = []
f1s = []
for train_index, test_index in kf.split(train_x):
    X_train, X_test = train_x[train_index], train_x[test_index]
    y_train, y_test = train_y[train_index], train_y[test_index]
    logi = LogisticRegression(random_state=0).fit(X_train, y_train)
    precision = precision_score(y_test, logi.predict(X_test))
    recall = recall_score(y_test, logi.predict(X_test))
    f1 = f1_score(y_test, logi.predict(X_test))
    precisions.append(precision)
    recalls.append(recall)
    f1s.append(f1)
    print("Precision: {:.2f}\nRecall:    {:.2f}\nF1_score:  {:.2f}".format(precision, recall, f1))
    print("----------")

Precision: 0.69
Recall:    0.31
F1_score:  0.42
----------
Precision: 0.60
Recall:    0.29
F1_score:  0.39
----------
Precision: 0.36
Recall:    0.12
F1_score:  0.18
----------
Precision: 0.67
Recall:    0.27
F1_score:  0.38
----------
Precision: 0.54
Recall:    0.24
F1_score:  0.33
----------


In [19]:
print("Avg precision: {:.2f}".format(np.mean(precisions)))
print("Avg recall:    {:.2f}".format(np.mean(recalls)))
print("Avg fscore:    {:.2f}".format(np.mean(f1s)))

Avg precision: 0.57
Avg recall:    0.24
Avg fscore:    0.34


### 1.4 Naive Bayes

In [20]:
from sklearn.naive_bayes import GaussianNB

In [21]:
kf = KFold(n_splits=5)
precisions = []
recalls = []
f1s = []
for train_index, test_index in kf.split(train_x):
    X_train, X_test = train_x[train_index], train_x[test_index]
    y_train, y_test = train_y[train_index], train_y[test_index]
    clf = GaussianNB()
    clf = clf.fit(X_train, y_train)
    precision = precision_score(y_test, clf.predict(X_test))
    recall = recall_score(y_test, clf.predict(X_test))
    f1 = f1_score(y_test, clf.predict(X_test))
    precisions.append(precision)
    recalls.append(recall)
    f1s.append(f1)
    print("Precision: {:.2f}\nRecall:    {:.2f}\nF1_score:  {:.2f}".format(precision, recall, f1))
    print("----------")

Precision: 0.00
Recall:    0.00
F1_score:  0.00
----------
Precision: 0.09
Recall:    0.07
F1_score:  0.08
----------
Precision: 0.11
Recall:    0.09
F1_score:  0.10
----------
Precision: 0.07
Recall:    0.07
F1_score:  0.07
----------
Precision: 0.13
Recall:    0.10
F1_score:  0.12
----------


In [22]:
print("Avg precision: {:.2f}".format(np.mean(precisions)))
print("Avg recall:    {:.2f}".format(np.mean(recalls)))
print("Avg fscore:    {:.2f}".format(np.mean(f1s)))

Avg precision: 0.08
Avg recall:    0.07
Avg fscore:    0.07


### 1.5 Support Vector Machine

In [23]:
from sklearn.svm import SVC

In [None]:
kf = KFold(n_splits=5)
precisions = []
recalls = []
f1s = []
for train_index, test_index in kf.split(train_x):
    X_train, X_test = train_x[train_index], train_x[test_index]
    y_train, y_test = train_y[train_index], train_y[test_index]
    clf = SVC()
    clf = clf.fit(X_train, y_train)
    precision = precision_score(y_test, clf.predict(X_test))
    recall = recall_score(y_test, clf.predict(X_test))
    f1 = f1_score(y_test, clf.predict(X_test))
    precisions.append(precision)
    recalls.append(recall)
    f1s.append(f1)
    print("Precision: {:.2f}\nRecall:    {:.2f}\nF1_score:  {:.2f}".format(precision, recall, f1))
    print("----------")

In [None]:
print("Avg precision: {:.2f}".format(np.mean(precisions)))
print("Avg recall:    {:.2f}".format(np.mean(recalls)))
print("Avg fscore:    {:.2f}".format(np.mean(f1s)))

### 1.6 Neural Network

In [25]:
from sklearn.neural_network import MLPClassifier

In [26]:
kf = KFold(n_splits=5)
precisions = []
recalls = []
f1s = []
for train_index, test_index in kf.split(train_x):
    X_train, X_test = train_x[train_index], train_x[test_index]
    y_train, y_test = train_y[train_index], train_y[test_index]
    clf = MLPClassifier()
    clf = clf.fit(X_train, y_train)
    precision = precision_score(y_test, clf.predict(X_test))
    recall = recall_score(y_test, clf.predict(X_test))
    f1 = f1_score(y_test, clf.predict(X_test))
    precisions.append(precision)
    recalls.append(recall)
    f1s.append(f1)
    print("Precision: {:.2f}\nRecall:    {:.2f}\nF1_score:  {:.2f}".format(precision, recall, f1))
    print("----------")

Precision: 0.71
Recall:    0.14
F1_score:  0.23
----------
Precision: 0.71
Recall:    0.24
F1_score:  0.36
----------
Precision: 0.30
Recall:    0.09
F1_score:  0.14
----------
Precision: 0.75
Recall:    0.20
F1_score:  0.32
----------
Precision: 0.50
Recall:    0.24
F1_score:  0.33
----------


In [27]:
print("Avg precision: {:.2f}".format(np.mean(precisions)))
print("Avg recall:    {:.2f}".format(np.mean(recalls)))
print("Avg fscore:    {:.2f}".format(np.mean(f1s)))

Avg precision: 0.60
Avg recall:    0.18
Avg fscore:    0.27


## 2. Feature Engineering

### 2.1 Word Embeddings

In [150]:
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import WordPunctTokenizer
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
from nltk.metrics import TrigramAssocMeasures
from nltk.collocations import TrigramCollocationFinder

In [102]:
import string

In [133]:
def get_bigrams(myString):
    tokenizer = WordPunctTokenizer()
    tokens = tokenizer.tokenize(myString)
    stemmer = PorterStemmer()
    bigram_finder = BigramCollocationFinder.from_words(tokens)
    bigrams = bigram_finder.nbest(BigramAssocMeasures.chi_sq, 500)
    
    for bigram_tuple in bigrams:
        x = "%s %s" % bigram_tuple
        tokens.append(x)

    result = [' '.join([stemmer.stem(w).lower() for w in x.split()]) for x in tokens 
              if x.lower() not in stopwords.words('english') and x.lower() not in string.punctuation]
    return result

In [163]:
def get_trigrams(myString):
    tokenizer = WordPunctTokenizer()
    tokens = tokenizer.tokenize(myString)
    stemmer = PorterStemmer()
    trigram_finder = TrigramCollocationFinder.from_words(tokens)
    trigrams = trigram_finder.nbest(TrigramAssocMeasures.raw_freq, 500)
    tris = []
    for trigram_tuple in trigrams:
        x = "%s %s %s" % trigram_tuple
        tris.append(x)
    
    result = [' '.join([stemmer.stem(w).lower() for w in x.split()]) for x in tris]
    
    return result

In [136]:
# get bigram
X = []
for sentence in raw_data["text"]:
    X.append(get_bigrams(sentence))

In [174]:
# add trigram
for sentence, x in zip(raw_data["text"], X):
    x += get_trigrams(sentence)

In [183]:
from gensim.models import Word2Vec

In [240]:
# word embedding
model = Word2Vec(X, min_count=1, size = 50)

In [241]:
len(X)

4847

In [242]:
res = []
maxLen = 0
for x in X:
    try:
        res.append(np.concatenate([model.wv[w] for w in x]))
        maxLen = max(maxLen, len(res[-1]))
    except:
        res.append(np.zeros(1))
        print("a")

a
a


In [243]:
maxLen

67100

In [244]:
# padd to 150000
res_padded = []
for sentence in res:
    res_padded.append(np.concatenate((sentence, np.zeros(maxLen - sentence.shape[0]))))

In [245]:
sum([len(i) == maxLen for i in res_padded])

4847

In [235]:
from sklearn.decomposition import PCA

In [254]:
pca = PCA(n_components=500)

In [252]:
data = np.array(res_padded)

In [253]:
data.shape

(4847, 67100)

In [256]:
final_data = pca.fit_transform(data)

In [260]:
train_x, test_x, train_y, test_y = train_test_split_common(data, ytrue)

In [261]:
logi = LogisticRegression(random_state=0).fit(train_x, train_y)
precision = precision_score(test_y, logi.predict(test_x))

In [263]:
recall = recall_score(test_y, logi.predict(test_x))

In [259]:
# Baseline
kf = KFold(n_splits=5)
precisions = []
recalls = []
f1s = []
for train_index, test_index in kf.split(train_x):
    X_train, X_test = train_x[train_index], train_x[test_index]
    y_train, y_test = train_y[train_index], train_y[test_index]
    logi = LogisticRegression(random_state=0).fit(X_train, y_train)
    precision = precision_score(y_test, logi.predict(X_test))
    recall = recall_score(y_test, logi.predict(X_test))
    f1 = f1_score(y_test, logi.predict(X_test))
    precisions.append(precision)
    recalls.append(recall)
    f1s.append(f1)
    print("Precision: {:.2f}\nRecall:    {:.2f}\nF1_score:  {:.2f}".format(precision, recall, f1))
    print("----------")

Precision: 0.33
Recall:    0.03
F1_score:  0.05
----------
Precision: 0.00
Recall:    0.00
F1_score:  0.00
----------
Precision: 0.00
Recall:    0.00
F1_score:  0.00
----------
Precision: 0.50
Recall:    0.03
F1_score:  0.06
----------
Precision: 0.00
Recall:    0.00
F1_score:  0.00
----------
