In [70]:
from tqdm import tqdm_notebook as tqdm
import nltk
import numpy as np
import pandas as pd
import json
import re
import os
from collections import Counter
from glob import glob
import gensim
import xml.etree.ElementTree as ET
from ast import literal_eval
from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec
import sklearn
from sklearn.linear_model import LinearRegression

# Load the existing gzt and JSON files into a pandas DataFrame

In [6]:
# Load lf.json. The encoding is given explicitly so the result does not depend
# on the platform's locale default.
with open('data/acrolinx_gzt/lf.json', encoding='utf-8') as lfjson:
    lf = json.load(lfjson)

In [10]:
# Load conv-words.json. The encoding is given explicitly so the result does not
# depend on the platform's locale default.
with open('data/acrolinx_gzt/conv-words.json', encoding='utf-8') as cvjson:
    form_with_sugg = json.load(cvjson)

In [17]:
# Collect the raw lines of every plain-text gazetteer into one list; they are
# cleaned and parsed in the next cell. The files are read in this fixed order.
gzt_paths = [
    'data/acrolinx_gzt/archaicWords.gzt',
    'data/acrolinx_gzt/countFormalPhrases.gzt',
    'data/acrolinx_gzt/countLatinExpressions.gzt',
]

unclean_gzt = []
for gzt_path in gzt_paths:
    # Explicit encoding: do not depend on the platform's locale default.
    with open(gzt_path, encoding='utf-8') as file:
        unclean_gzt.extend(file.readlines())

In [69]:
# Parse the raw gazetteer lines into a dict:
#   formal term -> list of suggested replacements, or NaN when the line has no '-->'.
gzt = {}

# things i noticed and don't want: any line containing one of these is dropped
exceptions = ['use either', '(']

for item in unclean_gzt:
    # Lines whose first character is '@' or '#' are skipped. The check runs on
    # the raw line (before stripping), and startswith() is safe on an empty string.
    if item.startswith(('@', '#')):
        continue
    item = item.strip()
    if not item:
        continue
    if any(term in item for term in exceptions):
        continue

    # Remove markup characters. Plain str.replace is used because these are
    # literal characters; the former re.sub('\[', ...) patterns were invalid
    # escape sequences in non-raw string literals.
    for junk in ('[', ']', '\n', ';'):
        item = item.replace(junk, '')

    if '-->' not in item:
        gzt[item] = np.nan
        continue

    # "formal[, formal...] --> suggestion[, suggestion...]"
    pair = [part.strip() for part in item.split('-->')]
    # split(',') on a comma-free string yields the whole string, so one loop
    # covers both the single-term and the multi-term case.
    for word in pair[0].split(','):
        # A fresh list per key so entries never share one list object.
        gzt[word.strip()] = [part.strip() for part in pair[1].split(',')]

In [71]:
# Number of distinct terms (dict keys) parsed from the gazetteer lines.
len(gzt)

597

In [72]:
# Split the gazetteer dict into two parallel lists: each term and the value
# stored for it (a list of suggestions, or NaN when there were none).
formal = list(gzt.keys())
informal = list(gzt.values())

# One row per term.
words = pd.DataFrame({'formal': formal, 'suggestions': informal})
words.head()

Unnamed: 0,formal,suggestions
0,purchase,[buy]
1,caveat,[warning]
2,asserted,
3,thenceforward,
4,Pardon me,[Sorry]


In [162]:
# Checkpoint the formal/suggestions table; the same path is read back with
# pd.read_pickle further down.
words.to_pickle('data/acrolinx_gzt/initial_words.pkl')

NameError: name 'words' is not defined

# next: extrapolate to the other words using embeddings

In [190]:
# NOTE(review): in top-to-bottom order `words_df` is first assigned in the next
# cell (pd.read_pickle), so on a fresh kernel this line raises NameError. Its
# execution count (190) is the highest visible here, which suggests it was run
# last to re-save the frame -- consider moving it to the end of the notebook.
words_df.to_pickle('data/acrolinx_gzt/initial_words.pkl')

In [66]:
# Reload the checkpointed table. The recorded output below shows that the saved
# file already carried the X_w2v / y_w2v columns when this cell was run.
words_df = pd.read_pickle('data/acrolinx_gzt/initial_words.pkl')

In [67]:
# Peek at the first rows of the reloaded table.
words_df.head()

Unnamed: 0,formal,suggestions,X_w2v,y_w2v
0,purchase,[buy],"[0.05419922, -0.16699219, -0.18261719, 0.17089...","[0.060302734, -0.17871094, -0.09716797, 0.2753..."
1,caveat,[warning],"[0.15234375, -0.03515625, 0.059814453, 0.125, ...","[-0.11376953, -0.15136719, 0.16992188, -0.0500..."
2,asserted,,"[-0.106933594, 0.14550781, -0.047851562, -0.08...",
3,thenceforward,,,
4,Pardon me,[Sorry],"[0.255859375, -0.052001953125, 0.168701171875,...","[0.052246094, 0.095703125, -0.0028839111, 0.13..."


In [4]:
# Location of the word2vec binary. Set the W2V_PATH environment variable to
# point at your own copy; the fallback is the path this notebook was originally
# run with, so existing setups keep working.
w2v_path = os.environ.get(
    'W2V_PATH',
    '/home/rebekah/Documents/Word Embeddings/GoogleNews-vectors-negative300.bin',
)
w2v = gensim.models.KeyedVectors.load_word2vec_format(w2v_path, binary=True)

In [5]:
# Analogy-style probe: add the probe word and four words to the positive side,
# put their four counterparts on the negative side, and keep the three nearest
# neighbours of the result.
word = 'oppose'
w2v.most_similar(
    positive=[word, 'peruse', 'penurious', 'amidst', 'endeavor'],
    negative=['read', 'poor', 'among', 'try'],
)[:3]

[('countenance', 0.3350488245487213),
 ('revivified', 0.32268810272216797),
 ('unstinted', 0.32051318883895874)]

In [61]:
def process_word_lists(wl, embed):
    """Turn a tokenised term into a single embedding vector.

    Parameters
    ----------
    wl : list of str
        Tokens of one term (one token for a word, several for a phrase).
    embed : mapping-like
        Anything supporting ``token in embed`` and ``embed[token]``.

    Returns
    -------
    * one token that is in ``embed``: ``embed[token]``, returned unchanged;
    * several tokens: a list of floats holding the element-wise sum of the
      vectors of those tokens found in ``embed``;
    * ``np.nan`` when ``wl`` is empty or none of its tokens are in ``embed``.

    Notes
    -----
    The vector size comes from the first vector found, so ``embed`` does not
    need to contain the literal token ``'word'``. "Nothing found" is tracked
    explicitly instead of comparing the sum with an all-zero vector, so vectors
    that cancel out exactly are still returned as a vector.
    """
    if len(wl) == 1:
        token = wl[0]
        return embed[token] if token in embed else np.nan

    total = None  # stays None until at least one token is found
    for token in wl:
        if token not in embed:
            continue
        vec = np.asarray(embed[token], dtype=float)
        # `total + vec` allocates a new array, so vectors in `embed` are never mutated.
        total = vec if total is None else total + vec

    return np.nan if total is None else total.tolist()

def make_data(df, embed, X, y):
    """Add input/target embedding columns to ``df`` and return it.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have a ``'formal'`` column (strings) and a ``'suggestions'``
        column (a list of strings, or NaN when there is no suggestion).
        The frame is modified in place.
    embed : mapping-like
        Passed through to ``process_word_lists``.
    X, y : str
        Names of the columns to create. ``X`` holds the vector of the formal
        term; ``y`` holds the vector of the FIRST suggestion only.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with both columns (object dtype) filled in.
        Cells stay NaN where no vector could be built.
    """
    # Object dtype so each cell can hold a whole vector.
    for col in (X, y):
        df[col] = pd.Series(np.nan, index=df.index, dtype=object)

    # Iterate over the frame that was passed in, not the notebook-global `words_df`.
    for idx, row in tqdm(df.iterrows(), total=len(df)):
        formal_words = nltk.word_tokenize(row['formal'])
        df.at[idx, X] = process_word_lists(formal_words, embed)

        suggestions = row['suggestions']
        if isinstance(suggestions, float):  # NaN marks "no suggestion"
            continue
        if len(suggestions) > 0:
            # Only the first suggestion is used as the target, so only that
            # one is tokenised and embedded.
            first = nltk.word_tokenize(suggestions[0])
            df.at[idx, y] = process_word_lists(first, embed)

    return df

In [65]:
# Compute the word2vec input/target columns, then checkpoint the enriched frame.
words_df = make_data(df=words_df, embed=w2v, X='X_w2v', y='y_w2v')
words_df.to_pickle(path='data/acrolinx_gzt/initial_words.pkl')




In [68]:
# Training rows are those with both an input and a target vector. Naming the two
# columns keeps the selection independent of any other column the frame carries
# (for example a prediction column saved into the pickle by an earlier session).
train = words_df.dropna(subset=['X_w2v', 'y_w2v'])

In [76]:
# Sanity check: every training input vector has exactly 300 components.
assert all(len(vec) == 300 for vec in train['X_w2v'])

In [90]:
# Stack the per-row vectors into 2-D arrays (one row per training pair).
X = np.array(train['X_w2v'].tolist())
y = np.array(train['y_w2v'].tolist())

In [94]:
# Fit an ordinary least-squares map from the formal-term vectors (X) to the
# vectors of their first suggestion (y).
lr = LinearRegression().fit(X, y)

In [97]:
# Shape of the learned weight matrix (scikit-learn documents coef_ as
# (n_targets, n_features) for multi-target regression).
lr.coef_.shape

(300, 300)

In [95]:
# R^2 on the same (X, y) the model was fitted on, so this measures fit, not
# generalisation.
# NOTE(review): the recorded 1.0 is what an unregularised linear model produces
# when it can reproduce every training pair exactly -- evaluate on held-out
# pairs before trusting this number.
lr.score(X, y)



1.0

In [141]:
# Ten random rows for eyeballing; no random_state is passed, so the rows shown
# differ from run to run.
words_df.sample(10)

Unnamed: 0,formal,suggestions,X_w2v,y_w2v
593,duly observe,,"[-0.259765625, -0.150634765625, 0.21923828125,...",
273,elect,"[chose, pick]","[-0.035888672, -0.008422852, -0.011108398, 0.1...","[0.25976562, 0.359375, 0.16796875, 0.119140625..."
223,beseech,,"[0.17773438, 0.38085938, 0.3125, 0.26953125, -...",
378,make an attempt,[try],"[0.216796875, 0.311767578125, 0.10986328125, 0...","[0.24023438, 0.20117188, 0.16210938, 0.2089843..."
578,consumedly,,,
69,contra,,"[-0.13183594, -0.17578125, 0.022338867, 0.1137...",
225,Mrs.,,"[0.232421875, -0.1875, -0.28125, -0.0654296875...",
452,alack,,"[0.02722168, -0.029541016, -0.061279297, 0.204...",
200,on a basis,,"[-0.0037841796875, -0.2734375, 0.015380859375,...",
57,reflect,"[say, show]","[-0.34765625, 0.12695312, 0.056152344, -0.0158...","[-0.036132812, -0.12109375, 0.13378906, 0.1142..."


In [153]:
def predict(word, topn=10):
    """Map one embedding vector through ``lr`` and return its nearest words.

    Parameters
    ----------
    word : sequence of float
        Despite the name, this is the embedding VECTOR of a term, not the
        string itself.
    topn : int, default 10
        Number of neighbours to return.

    Returns
    -------
    Whatever ``w2v.similar_by_vector`` returns for the predicted vector.

    Notes
    -----
    Relies on the notebook globals ``lr`` (fitted regressor) and ``w2v``
    (embedding model). The prediction is flattened with ``ravel()`` rather than
    reshaped to a hard-coded 300, so the embedding size is not baked in.
    """
    # lr.predict expects a 2-D batch; wrap the single vector, then flatten the
    # one-row result back to 1-D.
    pred = np.asarray(lr.predict([word])).ravel()
    return w2v.similar_by_vector(pred, topn=topn)

In [160]:
# Predict neighbours for every row, including rows that were never training data.
w2v_pred = []
for idx, row in tqdm(words_df.iterrows(), total=len(words_df)):
    vec = row['X_w2v']
    if type(vec) == float:
        # No stored input vector (NaN): try to build one from the term itself.
        formal = nltk.word_tokenize(row['formal'])
        vec = process_word_lists(formal, w2v)
    # Still NaN means the term has no usable vector, so record NaN for it.
    w2v_pred.append(np.nan if type(vec) == float else predict(vec))
words_df['pred_w2v'] = w2v_pred




NameError: name 'word_df' is not defined

In [161]:
# Same assignment as the last line of the previous cell, repeated here before
# peeking at the first rows of the result.
words_df['pred_w2v'] = w2v_pred
words_df.head()

Unnamed: 0,formal,suggestions,X_w2v,y_w2v,pred_w2v
0,purchase,[buy],"[0.05419922, -0.16699219, -0.18261719, 0.17089...","[0.060302734, -0.17871094, -0.09716797, 0.2753...","[(buy, 1.0), (sell, 0.8308461904525757), (purc..."
1,caveat,[warning],"[0.15234375, -0.03515625, 0.059814453, 0.125, ...","[-0.11376953, -0.15136719, 0.16992188, -0.0500...","[(warning, 1.0), (warnings, 0.8184125423431396..."
2,asserted,,"[-0.106933594, 0.14550781, -0.047851562, -0.08...",,"[(think, 0.5262875556945801), (say, 0.49223661..."
3,thenceforward,,,,
4,Pardon me,[Sorry],"[0.255859375, -0.052001953125, 0.168701171875,...","[0.052246094, 0.095703125, -0.0028839111, 0.13...","[(Sorry, 1.0), (Hey, 0.690428614616394), (Okay..."


In [186]:
def display(df):
    """Print a plain-text report for each row of ``df``.

    For every row this prints the formal term, whether the row has a target
    vector (``y_w2v`` is not a float/NaN), the given suggestions if any, and
    the predicted neighbours with their scores rounded to 3 significant digits.

    Expects the columns ``formal``, ``y_w2v``, ``suggestions`` and ``pred_w2v``.
    Note that this name shadows IPython's built-in ``display`` in the notebook.
    """
    for _, entry in df.iterrows():
        print('Original Word:\t' + entry['formal'])

        has_target = type(entry['y_w2v']) is not float
        print('Training Data?:\t' + str(has_target))

        given = entry['suggestions']
        if type(given) is not float:
            print('Given Answer:\t' + str(given))

        neighbours = entry['pred_w2v']
        if type(neighbours) is float:
            continue  # NaN: no prediction for this row

        # One "<word>\t<score>" entry per neighbour; each entry ends with a
        # newline plus two tabs so the next one lines up under the first.
        ans = ''.join(
            item[0] + '\t' + str(float('%.3g' % item[1])) + '\n\t\t'
            for item in neighbours
        )
        print('Pred Answers:\t' + ans)

In [188]:
# Uses the display() function defined in the cell above (not IPython's) on ten
# random rows; the sample is unseeded, so the rows differ from run to run.
display(words_df.sample(10))

Original Word:	instill
Training Data?:	False
Pred Answers:	Where're	0.385
		honest	0.382
		critisize	0.377
		nit_pick	0.375
		oogle	0.372
		showin	0.369
		apreciate	0.368
		d_**_khead	0.367
		complainin	0.366
		boing_boing	0.366
		
Original Word:	entitlement
Training Data?:	True
Given Answer:	['right']
Pred Answers:	right	1.0
		Right	0.57
		wrong	0.553
		##.Help_us	0.55
		Goodwill_Catanese	0.516
		left	0.492
		fielder_Joe_Borchard	0.489
		NOTE_Xactly_Incent	0.489
		fielder_Ambiorix_Concepcion	0.484
		now	0.479
		
Original Word:	perspire
Training Data?:	True
Given Answer:	['sweat']
Pred Answers:	sweat	1.0
		perspiration	0.642
		Mud_resin	0.633
		sweating	0.603
		sweated	0.597
		sweaty	0.552
		sweat_dripping	0.546
		Sweat	0.515
		perspire	0.505
		sweats	0.5
		
Original Word:	discover
Training Data?:	True
Given Answer:	['find out']
Pred Answers:	find	0.819
		out	0.758
		discover	0.569
		finding	0.567
		get	0.554
		see	0.551
		back	0.49
		found	0.489
		tofind	0.489
		in.	0.482
		
Original 

# Neural network (NN)

In [189]:
import tensorflow as tf
import keras.backend as K
from keras.models import Model
from keras.layers import Input, Activation, Embedding, RNN, LSTM, LSTMCell, Dense, Dropout, Concatenate
from keras.layers import TimeDistributed, Bidirectional, Lambda, Layer
from keras.layers import concatenate
from keras.layers.recurrent import Recurrent
from keras.layers.core import Reshape
from keras.activations import tanh, softmax
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from keras import metrics, optimizers

Using TensorFlow backend.
