In [1]:
from tqdm import tqdm_notebook as tqdm
import nltk
import numpy as np
import pandas as pd
import json
import re
import os
from collections import Counter
from glob import glob
import gensim
import xml.etree.ElementTree as ET
from ast import literal_eval
from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec
import sklearn
from sklearn.linear_model import LinearRegression

paramiko missing, opening SSH/SCP/SFTP paths will be disabled.  `pip install paramiko` to suppress


# Put the existing gzt word lists and files into a pandas DataFrame

In [45]:
# Read lf.json with an explicit encoding so the result does not depend on the
# platform's default locale encoding.
with open('data/acrolinx_gzt/lf.json', encoding='utf-8') as lfjson:
    lf = json.load(lfjson)

In [46]:
# Read conv-words.json with an explicit encoding so the result does not depend
# on the platform's default locale encoding.
with open('data/acrolinx_gzt/conv-words.json', encoding='utf-8') as cvjson:
    form_with_sugg = json.load(cvjson)

In [47]:
# Gather the raw lines of every .gzt word list into a single list.
# The lines are kept verbatim here (including trailing newlines).
GZT_FILES = [
    'data/acrolinx_gzt/archaicWords.gzt',
    'data/acrolinx_gzt/countFormalPhrases.gzt',
    'data/acrolinx_gzt/countLatinExpressions.gzt',
]

unclean_gzt = []

for gzt_path in GZT_FILES:
    # Explicit encoding: the entries contain non-ASCII characters (e.g. the
    # curly apostrophe in "It’s"), so relying on the locale default is fragile.
    with open(gzt_path, encoding='utf-8') as file:
        unclean_gzt.extend(file.readlines())

In [48]:
# Build `gzt`: formal term -> list of suggested replacements, or NaN when the
# source line carries no "-->" suggestion part.
gzt = {}

# Fragments the author noticed in unwanted lines; any line containing one of
# them is skipped.
exceptions = ['use either', '(']

# Characters deleted from every kept line. A translation table replaces the
# former re.sub('\[', ...) calls, whose patterns were invalid escape sequences
# in non-raw string literals.
markup_to_delete = str.maketrans('', '', '[]\n;')

for item in unclean_gzt:
    # Checked on the raw (unstripped) line; startswith is also safe on ''.
    if item.startswith(('@', '#')):
        continue
    item = item.strip()
    if not item:
        continue
    if any(term in item for term in exceptions):
        continue
    item = item.translate(markup_to_delete)
    if '-->' in item:
        pair = [part.strip() for part in item.split('-->')]
        suggestions = [part.strip() for part in pair[1].split(',')]
        # The left-hand side may list several comma-separated formal terms
        # that all share the same suggestions; each gets its own list copy.
        for word in (part.strip() for part in pair[0].split(',')):
            gzt[word] = list(suggestions)
    else:
        gzt[item] = np.nan

In [49]:
len(gzt)

597

In [50]:
# Split the gzt mapping into two parallel columns: the formal term and its
# suggestions (a list, or NaN when the source gave none).
formal = list(gzt.keys())
informal = list(gzt.values())

words = pd.DataFrame({'formal': formal, 'suggestions': informal})
words.head()

Unnamed: 0,formal,suggestions
0,set forth,
1,abeyance,
2,in abeyance,
3,afore,
4,afore mentioned,


In [55]:
words.to_pickle('data/acrolinx_gzt/clean_words.pkl')

# next: extrapolate to the other words using embeddings

In [67]:
words_df = pd.read_pickle('data/acrolinx_gzt/clean_words.pkl') # only words

In [68]:
words_df.dropna().sample(10)

Unnamed: 0,formal,suggestions
262,necessitate,"[cause, need]"
137,It’s in regards to,[It’s about]
500,preowned,[used]
432,in some instances,[sometimes]
210,desire,"[want, wish]"
528,than was formerly the case,[now]
523,subsequent to,"[later, next, after, then]"
462,it is requested,"[please, we request, I request]"
291,remain,[stay]
358,by means of,"[by, with]"


In [2]:
# Rebinds `words_df` to a previously saved frame that already carries the
# embedding columns.
# NOTE(review): in the visible cells the only `to_pickle` call that would write
# initial_words.pkl (right after the `make_data` call) is commented out, so this
# file has to exist on disk already — confirm it is up to date before using it.
words_df = pd.read_pickle('data/acrolinx_gzt/initial_words.pkl') # with vectors, determined here

In [4]:
# Location of the pretrained word2vec binary. Set the W2V_PATH environment
# variable to point at a local copy; the original author's path is kept as the
# default so existing setups keep working.
W2V_PATH = os.environ.get(
    'W2V_PATH',
    '/home/rebekah/Documents/Word Embeddings/GoogleNews-vectors-negative300.bin',
)
w2v = gensim.models.KeyedVectors.load_word2vec_format(W2V_PATH, binary=True)

In [58]:
# Analogy probe: combine `word` with four extra positive terms and four negative
# terms, then keep the three nearest neighbours of the result.
# NOTE(review): the positive/negative terms look like formal/informal pairs
# (peruse/read, penurious/poor, amidst/among, endeavor/try) — confirm intent.
word = 'oppose'
w2v.most_similar(positive=[word, 'peruse', 'penurious', 'amidst', 'endeavor'], negative=['read', 'poor', 'among', 'try'])[:3]

[('countenance', 0.3350488245487213),
 ('revivified', 0.32268810272216797),
 ('unstinted', 0.32051318883895874)]

In [98]:
# Same kind of probe with a single positive/negative pair ('caveat' vs
# 'warning'); returns most_similar's default number of neighbours.
word = 'oppose'
w2v.most_similar(positive=[word, 'caveat'], negative=['warning'])

[('opposed', 0.5896098613739014),
 ('opposes', 0.5139723420143127),
 ('vehemently_opposed', 0.5054149031639099),
 ('concur', 0.4950396716594696),
 ('vehemently_oppose', 0.4931677281856537),
 ('disagree', 0.4864474833011627),
 ('adamantly_opposed', 0.48512935638427734),
 ('agree', 0.47763556241989136),
 ('unalterably_opposed', 0.462196409702301),
 ('favor', 0.4584296643733978)]

In [66]:
def process_word_lists(wl, embed, one_word_only = False):
    """Turn a tokenised phrase into a single embedding vector.

    Parameters
    ----------
    wl : list of str
        Tokens of the phrase.
    embed : mapping-like
        Any object supporting ``token in embed`` and ``embed[token]`` that
        yields equal-length numeric vectors (e.g. gensim ``KeyedVectors``).
    one_word_only : bool, optional
        When true, phrases with more than one token are not embedded.

    Returns
    -------
    numpy.ndarray or float
        For a single token: its vector as returned by ``embed``. For several
        tokens: the element-wise sum of the vectors of every token found in
        ``embed``. ``np.nan`` when no token is found, when ``wl`` is empty, or
        when a multi-token phrase is given with ``one_word_only`` set.
    """
    if len(wl) == 1:
        token = wl[0]
        return embed[token] if token in embed else np.nan

    if len(wl) > 1 and not one_word_only:
        # Collect the known vectors first: "nothing found" is then detected
        # directly instead of comparing the sum against zeros, and no probe
        # lookup such as embed['word'] is needed to learn the dimensionality.
        found = [embed[w] for w in wl if w in embed]
        if found:
            return np.sum(found, axis=0)

    return np.nan

def make_data(df, embed, X, y, one_word_only = False):
    """Add embedding columns for each formal term and its first suggestion.

    ``df`` is modified in place and also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have a ``'formal'`` column (strings) and a ``'suggestions'``
        column (a list of strings, or a float NaN when missing).
    embed : mapping-like
        Embedding lookup passed through to ``process_word_lists``.
    X, y : str
        Names of the columns to create: ``X`` receives the embedding of the
        formal term, ``y`` the embedding of the first suggestion.
    one_word_only : bool, optional
        Passed through to ``process_word_lists``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the two object-dtype columns filled;
        cells stay NaN where no embedding could be produced.
    """
    # Object dtype so a single cell can hold a whole vector; cells start as NaN.
    df[X] = np.nan
    df[X] = df[X].astype(object)
    df[y] = np.nan
    df[y] = df[y].astype(object)

    # Iterate the frame that was passed in (not a notebook-level global).
    for idx, row in tqdm(df.iterrows(), total=len(df)):
        formal_words = nltk.word_tokenize(row['formal'])
        df.at[idx, X] = process_word_lists(formal_words, embed, one_word_only)

        suggestions = row['suggestions']
        # Missing suggestions are stored as a float NaN; only the first
        # suggestion is used as the target, so only that one is embedded.
        if not isinstance(suggestions, float) and len(suggestions) > 0:
            first_tokens = nltk.word_tokenize(suggestions[0])
            df.at[idx, y] = process_word_lists(first_tokens, embed, one_word_only)

    return df

In [69]:
words_df = make_data(words_df, w2v, 'X_w2v', 'y_w2v', one_word_only = True)
#words_df.to_pickle('data/acrolinx_gzt/initial_words.pkl')




In [70]:
words_df.dropna()

Unnamed: 0,formal,suggestions,X_w2v,y_w2v
103,whosoever,"[whoever, whomever]","[-0.21972656, -0.19921875, 0.030517578, 0.3574...","[0.045166016, -0.20703125, 0.083984375, 0.0354..."
112,commence,"[begin, start]","[-0.27734375, 0.025756836, 0.115234375, 0.1455...","[0.055664062, 0.12695312, 0.16308594, 0.150390..."
119,depart,"[leave, go]","[0.032226562, 0.140625, -0.053466797, 0.007019...","[0.18554688, 0.008178711, 0.032958984, 0.17675..."
120,retain,[keep],"[0.13769531, -0.064453125, -0.24121094, -0.056...","[0.060546875, -0.012939453, -0.10888672, 0.143..."
121,cease,[stop],"[-0.2109375, -0.20019531, 0.296875, 0.17089844...","[-0.057861328, 0.013183594, 0.115234375, 0.069..."
123,reside,"[live, house]","[-0.091796875, -0.05419922, -0.092285156, -0.0...","[0.016967773, 0.017333984, -0.041748047, 0.126..."
128,inexpensive,[cheap],"[0.0546875, -0.13671875, -0.14746094, 0.227539...","[0.06738281, -0.08105469, -0.103027344, 0.2539..."
129,subsequently,"[next, later]","[-0.0079956055, -0.114746094, 0.107910156, -0....","[0.18261719, -0.044921875, 0.13867188, 0.01165..."
162,abominate,[hate],"[0.028930664, 0.037109375, 0.13378906, 0.11083...","[0.1328125, 0.080078125, 0.28710938, 0.0986328..."
164,accrue,"[add, gain]","[0.12402344, 0.07714844, -0.17773438, 0.324218...","[-0.008728027, 0.1015625, -0.056884766, 0.1416..."


In [71]:
train = words_df.dropna()

In [72]:
# Sanity check: every training input vector must have 300 components.
assert all(len(vec) == 300 for vec in train['X_w2v'])

In [73]:
# Stack the per-row vectors into 2-D arrays, one row per training pair.
X = np.array(train['X_w2v'].tolist())
y = np.array(train['y_w2v'].tolist())

In [74]:
lr = LinearRegression().fit(X, y)

In [75]:
lr.coef_.shape

(300, 300)

In [76]:
lr.score(X, y)



0.9999999999982279

In [77]:
words_df.sample(10)

Unnamed: 0,formal,suggestions,X_w2v,y_w2v
62,sirrah,,,
140,"Kind regards,",,,
17,doth,,"[0.33007812, 0.100097656, -0.016723633, 0.3535...",
138,It concerns,[It’s about],,
144,Mrs,,"[0.23242188, -0.1875, -0.28125, -0.06542969, -...",
411,identical,[same],"[0.08154297, -0.039794922, 0.125, 0.043701172,...","[0.17089844, -0.012084961, 0.036132812, 0.1728..."
35,henceforward,,"[0.0546875, -0.032714844, 0.17382812, 0.053466...",
225,enounce,,,
86,whencesoever,,,
10,anon,,"[0.16015625, 0.027954102, -0.40234375, 0.34570...",


In [78]:
def predict(word, topn = 10):
    """Map an embedding through the fitted regression and list its neighbours.

    Parameters
    ----------
    word : sequence of float
        Embedding vector of the input term (despite the name, not a string).
    topn : int, optional
        Number of neighbours to return; defaults to 10.

    Returns
    -------
    Whatever ``w2v.similar_by_vector`` returns for the predicted vector.

    Relies on the notebook-level ``lr`` (fitted regressor) and ``w2v``
    (embedding model).
    """
    # lr.predict returns one row per input; flatten that single row to 1-D
    # without hard-coding the embedding dimensionality.
    pred = np.asarray(lr.predict([word])).ravel()
    return w2v.similar_by_vector(pred, topn = topn)

In [79]:
# Predict neighbours for every row. Rows without a stored input vector get one
# computed on the fly (multi-word phrases allowed here); if that also fails the
# prediction stays NaN.
w2v_pred = []
for _, row in tqdm(words_df.iterrows(), total=len(words_df)):
    vec = row['X_w2v']
    if type(vec) == float:
        vec = process_word_lists(nltk.word_tokenize(row['formal']), w2v)
    w2v_pred.append(np.nan if type(vec) == float else predict(vec))
words_df['pred_w2v'] = w2v_pred




In [80]:
words_df['pred_w2v'] = w2v_pred
words_df.head()

Unnamed: 0,formal,suggestions,X_w2v,y_w2v,pred_w2v
0,set forth,,,,"[(facililty, 0.28416314721107483), (same, 0.28..."
1,abeyance,,"[-0.115234375, 0.059814453, 0.18066406, 0.0791...",,"[(gonig, 0.4544585347175598), (Oooops, 0.44092..."
2,in abeyance,,,,"[(next, 0.4528557062149048), (just, 0.43177771..."
3,afore,,"[-0.20898438, 0.042236328, 0.0022277832, -0.03...",,"[(hate, 0.5216549634933472), (think, 0.5096905..."
4,afore mentioned,,,,"[(hate, 0.4496977925300598), (thought, 0.43439..."


In [83]:
def display(df):
    """Print a plain-text report for every row of ``df``.

    For each row this prints the formal word, whether the row has a target
    vector in ``'y_w2v'``, the given suggestions (or a blank line when they
    are a float NaN), and - unless ``'pred_w2v'`` is a float NaN - each
    predicted neighbour with its score rounded to three significant digits.
    """
    for _, entry in df.iterrows():
        print('Original Word:\t' + entry['formal'])
        has_target = type(entry['y_w2v']) is not float
        print('Training Data?:\t' + str(has_target))

        given = entry['suggestions']
        if type(given) is float:
            print()
        else:
            print('Given Answer:\t' + str(given))

        predictions = entry['pred_w2v']
        if type(predictions) is float:
            continue
        # One "<word>\t<score>" chunk per neighbour, each followed by the
        # newline + two tabs that indent the next entry.
        chunks = [
            neighbour[0] + '\t' + str(float('%.3g' % neighbour[1])) + '\n\t\t'
            for neighbour in predictions
        ]
        print('Pred Answers:\t' + ''.join(chunks))

In [105]:
words_df.dropna(subset=['suggestions', 'X_w2v'])

Unnamed: 0,formal,suggestions,X_w2v,y_w2v,pred_w2v
103,whosoever,"[whoever, whomever]","[-0.21972656, -0.19921875, 0.030517578, 0.3574...","[0.045166016, -0.20703125, 0.083984375, 0.0354...","[(whoever, 1.0), (whomever, 0.7898032069206238..."
112,commence,"[begin, start]","[-0.27734375, 0.025756836, 0.115234375, 0.1455...","[0.055664062, 0.12695312, 0.16308594, 0.150390...","[(begin, 1.0), (begins, 0.7261765003204346), (..."
115,ascertain,"[find out, learn]","[-0.22460938, -0.05493164, -0.23242188, -0.159...",,"[(understand, 0.4546816945075989), (know, 0.44..."
119,depart,"[leave, go]","[0.032226562, 0.140625, -0.053466797, 0.007019...","[0.18554688, 0.008178711, 0.032958984, 0.17675...","[(leave, 1.0), (leaving, 0.6598549485206604), ..."
120,retain,[keep],"[0.13769531, -0.064453125, -0.24121094, -0.056...","[0.060546875, -0.012939453, -0.10888672, 0.143...","[(keep, 0.9999999403953552), (kept, 0.77060246..."
121,cease,[stop],"[-0.2109375, -0.20019531, 0.296875, 0.17089844...","[-0.057861328, 0.013183594, 0.115234375, 0.069...","[(stop, 1.0), (stopped, 0.6834868788719177), (..."
123,reside,"[live, house]","[-0.091796875, -0.05419922, -0.092285156, -0.0...","[0.016967773, 0.017333984, -0.041748047, 0.126...","[(live, 0.9999999403953552), (living, 0.577273..."
128,inexpensive,[cheap],"[0.0546875, -0.13671875, -0.14746094, 0.227539...","[0.06738281, -0.08105469, -0.103027344, 0.2539...","[(cheap, 1.0), (Cheap, 0.7455264329910278), (i..."
129,subsequently,"[next, later]","[-0.0079956055, -0.114746094, 0.107910156, -0....","[0.18261719, -0.044921875, 0.13867188, 0.01165...","[(next, 0.9999999403953552), (Next, 0.62757861..."
162,abominate,[hate],"[0.028930664, 0.037109375, 0.13378906, 0.11083...","[0.1328125, 0.080078125, 0.28710938, 0.0986328...","[(hate, 1.0), (despise, 0.6712517142295837), (..."


In [100]:
display(words_df.dropna())

Original Word:	whosoever
Training Data?:	True
Given Answer:	['whoever', 'whomever']
Pred Answers:	whoever	1.0
		whomever	0.79
		Whoever	0.764
		Whomever	0.66
		somebody	0.627
		whatever	0.62
		somebody_else	0.607
		someone	0.583
		nobody	0.583
		everybody	0.575
		
Original Word:	commence
Training Data?:	True
Given Answer:	['begin', 'start']
Pred Answers:	begin	1.0
		begins	0.726
		commence	0.724
		start	0.685
		commences	0.63
		began	0.614
		begun	0.608
		resume	0.597
		beginning	0.581
		recommence	0.559
		
Original Word:	depart
Training Data?:	True
Given Answer:	['leave', 'go']
Pred Answers:	leave	1.0
		leaving	0.66
		stay	0.579
		depart	0.556
		Leaving	0.549
		left	0.525
		leaves	0.513
		return	0.507
		vacate	0.494
		quit	0.484
		
Original Word:	retain
Training Data?:	True
Given Answer:	['keep']
Pred Answers:	keep	1.0
		kept	0.771
		keeping	0.754
		keeps	0.728
		stay	0.676
		Keeping	0.657
		Keep	0.633
		maintain	0.576
		Kept	0.52
		remain	0.518
		
Original Word:	cease
Training Data?:

In [107]:
predict(w2v['ascertain'])

[('understand', 0.4546816945075989),
 ('know', 0.4448050558567047),
 ('Unrivalled_insight', 0.42923736572265625),
 ('discern', 0.4218718409538269),
 ('believe', 0.41339924931526184),
 ('slowly_Jasny', 0.41248685121536255),
 ('determine', 0.41123640537261963),
 ('knowing', 0.410343199968338),
 ('clear', 0.409426748752594),
 ('deduce', 0.4068732261657715)]

# Neural network (NN)

In [189]:
import tensorflow as tf
import keras.backend as K
from keras.models import Model
from keras.layers import Input, Activation, Embedding, RNN, LSTM, LSTMCell, Dense, Dropout, Concatenate
from keras.layers import TimeDistributed, Bidirectional, Lambda, Layer
from keras.layers import concatenate
from keras.layers.recurrent import Recurrent
from keras.layers.core import Reshape
from keras.activations import tanh, softmax
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from keras import metrics, optimizers

Using TensorFlow backend.
