In [None]:
import time
import csv
import numpy as np
import math
import matplotlib.pyplot as plt
import string
import random
import scipy
import sklearn.metrics as metrics

from matplotlib import pyplot as plt
from matplotlib.collections import LineCollection
%matplotlib inline

from sklearn import linear_model
from sklearn.metrics import accuracy_score
from sys import stdout
from operator import itemgetter

# Dimensionality of every letter / word vector used below.
N = 1000
# Symbol inventory: the lowercase letters followed by '#' and '.', the two
# markers the n-gram encoders wrap around each word.
alphabet = string.ascii_lowercase + '#.'
D = len(alphabet)
# All-ones vector of length N (not referenced in the visible part of the file).
z = np.ones(N)

# One random +1/-1 vector per symbol (rows follow the order of `alphabet`),
# drawn independently for the present-tense and past-tense codebooks.
RI_pres = np.where(np.random.rand(D, N) > 0.5, 1, -1)
RI_past = np.where(np.random.rand(D, N) > 0.5, 1, -1)
    
# data augmentation:
def random_noise(X):
    """Return a copy of X perturbed by zero-mean Gaussian noise (std 0.01).

    One independent sample is drawn per element of X from the global NumPy
    random state; X itself is not modified.
    """
    return X + np.random.normal(loc=0, scale=.01, size=X.shape)

def read_csv(filepath):
    """Read a comma-delimited file and return its rows.

    Args:
        filepath: path of the CSV file to read.

    Returns:
        A list with one entry per CSV record; each entry is the list of
        string fields produced by ``csv.reader``.
    """
    # Local import: the file only imports `stdout` from sys at the top.
    import sys

    # The csv module wants a binary handle on Python 2 but a text handle
    # opened with newline='' on Python 3; passing 'rb' on Python 3 makes
    # csv.reader raise because it receives bytes instead of str.
    if sys.version_info[0] < 3:
        csvfile = open(filepath, 'rb')
    else:
        csvfile = open(filepath, newline='')

    with csvfile:
        return list(csv.reader(csvfile, delimiter=','))

def ngram_encode(ngram_str, letter_vecs, window=3):
    """Encode a word as the sum of its bound letter n-grams.

    The word is wrapped as ``'#' + ngram_str + '.'``. For every run of
    ``window`` consecutive characters, the row of ``letter_vecs`` for the
    first character is multiplied element-wise with the rows of the following
    characters, each cyclically shifted (``np.roll``) by its offset inside the
    window. The products of all windows are summed.

    Args:
        ngram_str: the word to encode.
        letter_vecs: 2-D array with one row per symbol, indexed by the
            symbol's position in the module-level ``alphabet``.
        window: n-gram length (default 3). Must be at least 1.

    Returns:
        A float array of length ``letter_vecs.shape[1]``. It is all zeros when
        the wrapped word is shorter than ``window``.

    Raises:
        ValueError: if ``window`` is smaller than 1.
    """
    if window < 1:
        raise ValueError("window must be >= 1, got %r" % (window,))

    full_str = '#' + ngram_str + '.'
    vec = np.zeros(letter_vecs.shape[1])

    # Iterate over explicit window start positions. The previous slice
    # `full_str[:-(window-1)]` collapsed to `full_str[:0]` for window=1, so
    # unigram encoding silently produced an all-zero vector.
    for start in range(len(full_str) - window + 1):
        # NOTE: a character missing from `alphabet` makes find() return -1,
        # which selects the last row of letter_vecs.
        gram = letter_vecs[alphabet.find(full_str[start]), :]
        for offset in range(1, window):
            shifted = np.roll(
                letter_vecs[alphabet.find(full_str[start + offset]), :],
                offset)
            gram = gram * shifted
        vec += gram
    return vec

def ngram_encode_cl(ngram_str, letter_vecs, window=3):
    """Encode a word as a two-valued (+1/-1) n-gram vector.

    The word is wrapped as ``'#' + ngram_str + '.'`` and every run of
    ``window`` consecutive characters is bound by multiplying the first
    character's row of ``letter_vecs`` with the following characters' rows,
    each cyclically shifted by its offset in the window. The bound n-grams are
    summed, a small uniform perturbation in (-0.05, 0.05) is added, and the
    result is thresholded.

    Args:
        ngram_str: the word to encode.
        letter_vecs: 2-D array with one row per symbol, indexed by the
            symbol's position in the module-level ``alphabet``.
        window: n-gram length (default 3). Must be at least 1.

    Returns:
        An integer array of length ``letter_vecs.shape[1]`` holding +1 where
        the perturbed sum is negative and -1 everywhere else.

    Raises:
        ValueError: if ``window`` is smaller than 1.
    """
    if window < 1:
        raise ValueError("window must be >= 1, got %r" % (window,))

    full_str = '#' + ngram_str + '.'
    dim = letter_vecs.shape[1]
    vec = np.zeros(dim)

    # Iterate over explicit window start positions. The previous slice
    # `full_str[:-(window-1)]` collapsed to `full_str[:0]` for window=1, so
    # unigram encoding ignored the word entirely.
    for start in range(len(full_str) - window + 1):
        # NOTE: a character missing from `alphabet` makes find() return -1,
        # which selects the last row of letter_vecs.
        gram = letter_vecs[alphabet.find(full_str[start]), :]
        for offset in range(1, window):
            shifted = np.roll(
                letter_vecs[alphabet.find(full_str[start + offset]), :],
                offset)
            gram = gram * shifted
        vec += gram

    # The perturbation is far smaller than 1; presumably it only serves to
    # break ties when an entry of the sum is exactly zero -- this assumes
    # integer-valued letter vectors, TODO confirm.
    jitter = 0.1 * (np.random.rand(dim) - 0.5)
    # Comparison first, then map True -> +1 and False -> -1.
    return 2 * ((vec + jitter) < 0) - 1

def shuffle(Xtrain, ytrain):
    """Randomly split paired samples into training and validation parts.

    Two thirds of the rows (``2 * n // 3``) are drawn without replacement for
    training; the remaining rows form the validation part, kept in their
    original order.

    Args:
        Xtrain: array of inputs, one sample per row.
        ytrain: array of targets, indexed along its first axis in step with
            ``Xtrain``.

    Returns:
        The tuple ``(t_data, v_data, t_y, v_y)``: training inputs, validation
        inputs, training targets, validation targets.
    """
    N_s = Xtrain.shape[0]
    N_t = 2 * N_s // 3
    # An integer population is sampled as if it were np.arange(N_s); this
    # avoids Python 2's `xrange`, which does not exist on Python 3.
    training_indices = np.random.choice(N_s, N_t, replace=False)

    # Boolean mask instead of a per-index `in` scan over the training array,
    # which made the split quadratic in the number of samples.
    held_out = np.ones(N_s, dtype=bool)
    held_out[training_indices] = False
    validation_indices = np.flatnonzero(held_out)

    t_data = Xtrain[training_indices]
    v_data = Xtrain[validation_indices]
    t_y = ytrain[training_indices]
    v_y = ytrain[validation_indices]
    return t_data, v_data, t_y, v_y
    
def load_irreg():
    """Load, encode and split the irregular-verb pairs.

    Reads ``wickle_train/irregular_verbs.csv``, taking column 0 as the present
    form and column 1 as the past form. Present forms are encoded with
    ``RI_pres`` and past forms with ``RI_past`` via ``ngram_encode_cl``.

    Returns:
        Whatever ``shuffle`` returns for the encoded matrices, i.e.
        ``(train_present, valid_present, train_past, valid_past)``.
    """
    rows = read_csv("wickle_train/irregular_verbs.csv")
    present_words = [row[0] for row in rows]
    past_words = [row[1] for row in rows]

    count = len(present_words)
    irreg_present = np.zeros((count, N))
    irreg_past = np.zeros((count, N))
    # Present and past are encoded alternately, word by word, so the global
    # random state is consumed in a fixed order.
    for idx, (present, past) in enumerate(zip(present_words, past_words)):
        irreg_present[idx] = ngram_encode_cl(present, RI_pres)
        irreg_past[idx] = ngram_encode_cl(past, RI_past)

    return shuffle(irreg_present, irreg_past)

def _encode_verb_pairs(entries):
    """Encode ``[count, present, past]`` entries into input/target matrices."""
    data = np.zeros((len(entries), N))
    targets = np.zeros((len(entries), N))
    labels = []
    for row, (_, present, past) in enumerate(entries):
        # Present first, then past, for every entry: keeps the order in which
        # the encoder draws from the global random state unchanged.
        data[row] = ngram_encode_cl(present, RI_pres)
        targets[row] = ngram_encode_cl(past, RI_past)
        labels.append(past)
    return data, targets, labels


def load_reg():
    """Load, split and encode the regular-verb pairs.

    Reads ``wickle_train/present_reg.csv`` (column 0: present form, column 1:
    an integer, presumably the verb's occurrence count -- TODO confirm) and
    ``wickle_train/past_reg.csv`` (column 0: past form). Rows of the two files
    are paired by position. Two thirds of the pairs are drawn at random for
    training, the rest are used for validation, and each part is sorted in
    descending order of the integer column before encoding.

    Returns:
        ``(t_data, t_y, t_ylabels, v_data, v_y, v_ylabels)`` where the
        ``*_data`` / ``*_y`` matrices hold the encoded present / past forms
        (one row per verb, ``N`` columns) and the ``*_ylabels`` lists hold the
        past-tense strings in the same row order. The four matrix shapes are
        printed as a side effect.
    """
    reg_present = read_csv("wickle_train/present_reg.csv")
    reg_past = read_csv("wickle_train/past_reg.csv")

    N_s = len(reg_past)
    N_t = 2 * N_s // 3
    # An integer population is sampled as if it were np.arange(N_s); this
    # avoids Python 2's `xrange`, which does not exist on Python 3.
    training_indices = np.random.choice(N_s, N_t, replace=False)
    # Boolean mask instead of a per-index `in` scan over the training array,
    # which made the split quadratic in the number of verbs.
    held_out = np.ones(N_s, dtype=bool)
    held_out[training_indices] = False
    validation_indices = np.flatnonzero(held_out)

    # introducing present verbs in descending occurrence
    enct = sorted(
        ([int(reg_present[i][1]), reg_present[i][0], reg_past[i][0]]
         for i in training_indices),
        reverse=True)
    encv = sorted(
        ([int(reg_present[i][1]), reg_present[i][0], reg_past[i][0]]
         for i in validation_indices),
        reverse=True)

    t_data, t_y, t_ylabels = _encode_verb_pairs(enct)
    v_data, v_y, v_ylabels = _encode_verb_pairs(encv)
    print (t_data.shape, t_y.shape, v_data.shape, v_y.shape)
    return t_data, t_y, t_ylabels, v_data, v_y, v_ylabels
    

# Disabled stand-alone calls of the two loaders, kept for reference:
#load_irreg()
#load_reg()
# Build the regular-verb data set when this cell runs: encoded inputs/targets
# and the past-tense label strings, for the training and validation parts.
X_train, y_train, labels_train, X_validate, y_validate, labels_validate = load_reg()