In [48]:
import csv
import string
import sys, itertools
import random
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from mpl_toolkits.mplot3d import Axes3D
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score
%matplotlib inline

N = 1000
alphabet = string.ascii_lowercase + '#' + '.'
D = len(alphabet)
z = np.ones(N)

RI_pres = np.random.rand(D, N)
RI_pres = np.where(RI_pres>0.5, 1, -1)

RI_past = np.random.rand(D, N)
RI_past = np.where(RI_past>0.5, 1, -1)

def read_csv(filepath):
    category2word = {}
    key = 0
    present, past = [], []
    num_words = 0
    with open(filepath, 'rb') as csvfile:
        reader = csv.reader(csvfile, delimiter=',')
        for row in reader:
            if row[0] == "#":
                category2word[key] = [present, past]
                key += 1;
                present, past = [], []
            else:
                present.append(row[0])
                past.append(row[1])
            num_words += 1
    return category2word, num_words

def ngram_encode_cl(ngram_str, letter_vecs, window=3):
    vec = np.zeros(letter_vecs.shape[1])
    full_str = '#' + ngram_str + '.'
    for il, l in enumerate(full_str[:-(window-1)]):
        trivec = letter_vecs[alphabet.find(full_str[il]), :]
        for c3 in range(1, window):
            trivec = trivec * np.roll(letter_vecs[alphabet.find(full_str[il+c3]), :], c3)
        vec += trivec
    return 2* (vec + 0.1*(np.random.rand(letter_vecs.shape[1])-0.5) < 0) - 1

# http://scikit-learn.org/stable/auto_examples/decomposition/plot_pca_vs_lda.html#sphx-glr-auto-examples-decomposition-plot-pca-vs-lda-py
def lda(category2word, selected_categories, num_diffs, ngram_lengths, colors):
    X = np.zeros((num_diffs, N))
    y = np.zeros(num_diffs)
    for ngram_length in ngram_lengths:
        print "ngram_length %d" % ngram_length
        # Generate observations
        for c in selected_categories:
            num_subwords = len(category2word[c][0])
            for i in range(num_subwords):
                X[i] = ngram_encode_cl(category2word[c][1][i], RI_past, ngram_length) - ngram_encode_cl(category2word[c][0][i], RI_pres, ngram_length)
                y[i] = c
        
        clf = LinearDiscriminantAnalysis(n_components=len(selected_categories))
        clf.fit(X, y)
        y_pred = clf.predict(X)
        
        print "expected: "
        print y
        print "actual: "
        print y_pred
        print ("accuracy: ", accuracy_score(y, y_pred))
        # argh I can't project by lda
        # also the graphs are broken lmao
        """
        pca = PCA(n_components=2)
        X_r = pca.fit_transform(X)

        print X_r.shape
        fig, ax = plt.subplots(figsize=(5,5))
        i = 0
        for c in selected_categories:
            for txt in category2word[c][1]:
                ax.annotate(txt, X_r[i])
                ax.scatter(X_r[i][0], X_r[i][1], color=available_colors[c])
                i += 1
        plt.title('LDA expected')
        plt.show()
        
        fig, ax = plt.subplots(figsize=(5,5))
        i = 0
        for c in selected_categories:
            for txt in category2word[c][1]:
                ax.annotate(txt, X_r[i])
                ax.scatter(X_r[i][0], X_r[i][1], color=available_colors[int(y_pred[i])])
                i += 1
        plt.title('LDA actual')
        plt.show()
        """
        
        
    

In [51]:
ngram_lengths = [2, 3, 4]
available_colors = ['green', 'red', 'blue', 'yellow', 'black']
irreg_category2word, irreg_num_words = read_csv("data/cleaned/irregular_verbs_final_categorize.csv")
reg_category2word, reg_num_words = read_csv("data/cleaned/regular_verbs_clean_categorize_small.csv")

In [57]:
# looking only at categories:
irreg_selected_categories = [0, 1, 2, 3]
irreg_num_diffs = 0
for c in irreg_selected_categories:
    irreg_num_diffs += len(irreg_category2word[c][0])
lda(irreg_category2word, irreg_selected_categories, irreg_num_diffs, ngram_lengths, available_colors)

ngram_length 2
expected: 
[ 3.  3.  3.  3.  3.  3.  3.  3.  2.  2.  2.  2.  2.  2.  2.  2.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.]
actual: 
[ 0.  0.  3.  3.  3.  2.  3.  3.  2.  2.  0.  0.  0.  2.  2.  2.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.]
('accuracy: ', 0.84210526315789469)
ngram_length 3
expected: 
[ 3.  3.  3.  3.  3.  3.  3.  3.  2.  2.  2.  2.  2.  2.  2.  2.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.]
actual: 
[ 0.  0.  3.  3.  0.  3.  0.  3.  0.  2.  0.  0.  0.  2.  2.  2.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.]
('accuracy: ', 0.78947368421052633)
ngram_length 4
expected: 
[ 3.  3.  3.  3.  3.  3.  3.  3.  2.  2.  2.  2.  2.  2.  2.  2.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.]
actual: 
[ 0.  0.  0.  0.  3.  3.  3.  3.  0.  0.  0

In [58]:
# looking only at categories:
reg_selected_categories = [0, 1, 2]
reg_num_diffs = 0
for c in reg_selected_categories:
    reg_num_diffs += len(reg_category2word[c][0])
lda(reg_category2word, reg_selected_categories, reg_num_diffs, ngram_lengths, available_colors)

ngram_length 2
expected: 
[ 2.  2.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.
  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
actual: 
[ 1.  0.  1.  0.  2.  1.  1.  1.  1.  1.  0.  0.  1.  1.  1.  1.  1.  0.
  0.  1.  1.  1.  0.  0.  1.  1.  1.  1.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
('accuracy: ', 0.79591836734693877)
ngram_length 3
expected: 
[ 2.  2.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.
  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
actual: 
[ 1.  1.  1.  1.  1.  1.  1.  1.  0.  0.  1.  0.  0.  1.  0.  1.  1.  1.
  1.  1.  1.  1.  1.  1.  0.  0.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.
  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.]
('accuracy: ', 0.38775510204081631)
ngram_length 4
expected: 
[ 2.  2.  1.  1.  1.  1.  1.  1.