In [22]:
from tqdm import tqdm_notebook as tqdm
import nltk
import numpy as np
import pandas as pd
import json
import re
import os
from collections import Counter
from glob import glob
import gensim
import xml.etree.ElementTree as ET
from ast import literal_eval
from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec
import sklearn
import pickle
from sklearn.linear_model import LinearRegression

# Put the existing Acrolinx gzt and files into a pandas DataFrame

# Add the Microsoft words to the DataFrame

In [23]:
# Load the pickled collection of Microsoft words.
# NOTE: unpickling can execute arbitrary code; only load files you trust.
mic_words_path = 'data/microsoft/words.pkl'
with open(mic_words_path, 'rb') as pickle_file:
    mic_words = pickle.load(pickle_file)

In [27]:
# Normalise every entry to a stripped, lower-cased string, then show the result.
mic_words = [str(entry).strip().lower() for entry in mic_words]
mic_words

['make an effort',
 'as soon as possible',
 'provides guidance for',
 'publish',
 'embodies',
 'routing',
 'enabling',
 'enumerates',
 'prioritized',
 'migration',
 'discovery',
 'regulations',
 'modified',
 'compliance',
 'ensures',
 'universally',
 'alternatively',
 'exemplifies',
 'primary',
 'numerous',
 'et al',
 'depict',
 'endeavor',
 'invoked',
 'provisions',
 'on behalf of',
 'analyzer',
 'in excess of',
 'he has',
 'foremost',
 'subsequent',
 'asserted',
 'increment',
 'remediation',
 'e.g.',
 'designates',
 'terminated',
 'completed',
 'herein',
 'attempting',
 'provider',
 'discontinued',
 'cannot',
 'transformations',
 'selected',
 'residing',
 'provisions',
 'regarding',
 'endorse',
 'in the process of',
 'there is',
 'allocating',
 'have not',
 'limits',
 'specifications',
 'modifications',
 'contained',
 'reflected',
 'consolidates',
 'ensuring',
 'exceed',
 'distribution',
 'rendering',
 'terminated',
 'hence',
 'are readable',
 'exhibit',
 'utilization',
 'certificate

In [38]:
# One NaN placeholder per Microsoft word (no suggestion known), followed by
# the suggestions stored in words_df.
# NOTE(review): words_df is not assigned in any cell above this one (it is only
# assigned further down); presumably it came from a cell that has since been
# removed -- TODO restore it so the notebook survives Restart & Run All.
sugg = [np.nan] * len(mic_words)
sugg += list(words_df['suggestions'])

In [39]:
# Append the 'formal' column of words_df so mic_words has one entry per sugg entry.
# NOTE: this extends mic_words in place, so re-running the cell appends the
# same words a second time.
mic_words += list(words_df['formal'])

In [40]:
# Pair every word with its suggestions entry (NaN where none is known).
mic_df = pd.DataFrame({'words': mic_words, 'sugg': sugg})

In [45]:
# Spot-check ten randomly chosen rows (no seed is set, so the rows differ per run).
mic_df.sample(10)

Unnamed: 0,words,sugg
644,compute,
1499,ensuring,
898,integrity,
194,requesters,
1483,e.g.,
90,permitted,
649,invokes,
1254,request,
693,devices,
913,apparent,


In [46]:
# Persist the combined word/suggestion table.
mic_df.to_pickle('data/lexical_repl/all_words_clean.pkl')

# Next: extrapolate to the other words using embeddings

In [47]:
# Read the combined table back from disk. From here on words_df refers to this
# frame, whose columns are 'words' and 'sugg'.
words_df = pd.read_pickle('data/lexical_repl/all_words_clean.pkl')

In [2]:
# Location of the pretrained word2vec binary. Set the W2V_PATH environment
# variable to point at your own copy; the fallback is the original
# machine-specific path, so existing setups keep working unchanged.
w2v_path = os.environ.get(
    'W2V_PATH',
    '/home/rebekah/Documents/Word Embeddings/GoogleNews-vectors-negative300.bin',
)
w2v = gensim.models.KeyedVectors.load_word2vec_format(w2v_path, binary=True)

In [48]:
# The basic way: plain vector arithmetic. Ask for the nearest neighbours of
# word + 'caveat' - 'warning', i.e. shift `word` along the direction that
# leads from 'warning' to 'caveat'.
word = 'oppose'
w2v.most_similar(positive=[word, 'caveat'], negative=['warning'])

[('opposed', 0.5896098613739014),
 ('opposes', 0.5139723420143127),
 ('vehemently_opposed', 0.5054149031639099),
 ('concur', 0.4950396716594696),
 ('vehemently_oppose', 0.4931677281856537),
 ('disagree', 0.4864474833011627),
 ('adamantly_opposed', 0.48512935638427734),
 ('agree', 0.47763556241989136),
 ('unalterably_opposed', 0.462196409702301),
 ('favor', 0.4584296643733978)]

In [51]:
def process_word_lists(wl, embed, one_word_only = False):
    """Turn a tokenised word or phrase into a single embedding vector.

    Parameters
    ----------
    wl : list of str
        Tokens of one word or phrase.
    embed : mapping-like
        Embedding lookup that supports ``token in embed`` and ``embed[token]``
        (the notebook passes a gensim ``KeyedVectors``).
    one_word_only : bool, default False
        When true, phrases with more than one token are not embedded.

    Returns
    -------
    vector or float
        * exactly one token: ``embed[token]``, returned unchanged;
        * several tokens and ``one_word_only`` false: a list holding the
          element-wise sum of the vectors of those tokens found in ``embed``
          (unknown tokens are skipped);
        * ``np.nan`` when nothing could be embedded: empty input, no known
          token, or a multi-token phrase while ``one_word_only`` is true.

    Notes
    -----
    The vector size is never looked up through a fixed probe token, so this
    works for any vocabulary. "Nothing found" is decided by whether any token
    was known, not by comparing the sum against an all-zero vector.
    """
    if len(wl) == 1:
        token = wl[0]
        return embed[token] if token in embed else np.nan

    if len(wl) > 1 and not one_word_only:
        known_vectors = [embed[token] for token in wl if token in embed]
        if known_vectors:
            # Vectorised element-wise sum; returned as a list, matching the
            # container type callers received for phrases before.
            return list(np.sum(known_vectors, axis=0))

    return np.nan

def make_data(df, embed, X, y, one_word_only = False):
    """Add input and target embedding columns to ``df``.

    For every row, the text in ``row['words']`` is tokenised and embedded into
    column ``X``. If ``row['sugg']`` holds suggestions, the first suggestion is
    tokenised and embedded into column ``y``. Cells for which no vector is
    available hold NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'words' column (strings) and a 'sugg' column (an iterable
        of suggestion strings, or NaN when there are none).
    embed : mapping-like
        Embedding lookup handed to ``process_word_lists``.
    X, y : str
        Names of the columns to create (or overwrite) for the input and target
        vectors.
    one_word_only : bool, default False
        Forwarded to ``process_word_lists``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, modified in place.
    """
    # Object dtype lets one cell hold a whole vector; NaN marks "no vector".
    df[X] = np.nan
    df[X] = df[X].astype(object)
    df[y] = np.nan
    df[y] = df[y].astype(object)

    # Iterate over the frame that was passed in, not over a notebook global,
    # so the function also works for frames other than `words_df`.
    for idx, row in tqdm(df.iterrows(), total=len(df)):
        formal_words = nltk.word_tokenize(row['words'])
        df.at[idx, X] = process_word_lists(formal_words, embed, one_word_only)

        suggestions = row['sugg']
        if isinstance(suggestions, float):
            # A float here is the NaN placeholder for "no suggestions".
            continue
        # Only the first suggestion is used as the target, so only that one
        # needs to be tokenised and embedded.
        first_suggestion = next(iter(suggestions), None)
        if first_suggestion is not None:
            informal_words = nltk.word_tokenize(first_suggestion)
            df.at[idx, y] = process_word_lists(informal_words, embed, one_word_only)

    return df

In [52]:
# Embed the words and their first suggestion with word2vec; multi-token
# phrases are left as NaN because one_word_only is set.
words_df = make_data(words_df, w2v, 'X_w2v', 'y_w2v', one_word_only = True)




In [58]:
# Spot-check ten random rows that received an input vector.
words_df.dropna(subset=['X_w2v']).sample(10)

Unnamed: 0,words,sugg,X_w2v,y_w2v
1242,fore,,"[0.018798828, 0.123046875, -0.03100586, -0.125...",
1584,distributions,,"[0.18066406, -0.15429688, -0.024414062, 0.3339...",
1529,invoke,,"[0.2734375, 0.23828125, 0.19726562, 0.078125, ...",
769,attempt,,"[0.20410156, 0.15820312, -0.05419922, -0.00970...",
936,cornerstone,,"[0.01977539, 0.09765625, 0.07373047, 0.203125,...",
1214,pending,,"[0.16601562, 0.06640625, 0.29101562, -0.273437...",
559,item,,"[0.024291992, 0.010803223, -0.107421875, 0.302...",
1465,initiate,,"[-0.21386719, 0.015991211, 0.096191406, 0.1259...",
2046,acknowledge,,"[-0.16601562, -0.13183594, -0.15625, 0.1367187...",
91,decreased,,"[-0.2578125, -0.140625, -0.23925781, 0.0893554...",


# train linear regression

In [65]:
# dropna() without arguments removes every row that has a missing value in any column.
train = words_df.dropna() # only the words that have suggestions and vectors for both

In [66]:
# Sanity check: every training input vector must have 300 components.
assert all(len(vector) == 300 for vector in train['X_w2v'])

In [67]:
# Stack the per-row vectors into 2-D arrays, one row per training word.
X = np.array(train['X_w2v'].tolist())
y = np.array(train['y_w2v'].tolist())

In [68]:
# Fit a linear map from the vectors of the listed words to the vectors of
# their first suggestion.
lr = LinearRegression().fit(X, y)

In [69]:
# Inspect the shape of the fitted coefficient matrix.
lr.coef_.shape

(300, 300)

In [70]:
# NOTE(review): this scores the model on the same X, y it was fitted on, so a
# value this close to 1 says nothing about how well it generalises --
# TODO evaluate on held-out word pairs.
lr.score(X, y)



0.9999999999982279

In [71]:
def predict(word, topn=10, model=None, embed=None):
    """Return the vocabulary entries closest to the model's output for a vector.

    Parameters
    ----------
    word : sequence of float
        Embedding vector of the input word (despite the parameter name, this
        is the vector, not the string).
    topn : int, default 10
        Number of neighbours to return.
    model : object with ``predict``, optional
        Regressor mapping input vectors to output vectors. Defaults to the
        notebook-level ``lr``.
    embed : object with ``similar_by_vector``, optional
        Embedding used for the nearest-neighbour lookup. Defaults to the
        notebook-level ``w2v``.

    Returns
    -------
    Whatever ``embed.similar_by_vector`` returns for the predicted vector.
    """
    if model is None:
        model = lr
    if embed is None:
        embed = w2v
    # The model expects a batch, so wrap the single vector and flatten the
    # single-row result. No fixed dimensionality is assumed, so embeddings of
    # any size work.
    pred = np.asarray(model.predict([word])).reshape(-1)
    return embed.similar_by_vector(pred, topn=topn)

In [80]:
# Scratch check of how np.isnan treats np.nan.
np.isnan(np.nan)

True

In [82]:
# Predict neighbours for every row that has an input vector; rows whose
# 'X_w2v' cell is the float NaN placeholder get None instead.
w2v_pred = [
    None if type(vector) == float else predict(vector)
    for vector in tqdm(words_df['X_w2v'], total=len(words_df))
]

words_df['pred_w2v'] = w2v_pred

In [108]:
def display(df, num = 10):
    """Print predictions for up to ``num`` randomly chosen non-training rows.

    Rows are visited in shuffled order. Rows that have a target vector in
    'y_w2v' (i.e. training rows) are skipped and do not count towards ``num``.
    For each remaining row the original word, the given suggestions (or a
    blank line) and the predicted neighbours with their scores rounded to
    three significant digits are printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with 'words', 'sugg', 'y_w2v' and 'pred_w2v' columns.
    num : int, default 10
        Maximum number of rows to print.

    Notes
    -----
    This function reuses the name of IPython's ``display`` helper, which is
    therefore shadowed in the notebook namespace after this cell runs.
    """
    def _missing(value):
        # Empty cells hold either None or a float (the NaN placeholder).
        return value is None or isinstance(value, float)

    shown = 0
    for _, row in df.sample(frac=1).iterrows():
        # `>=` so that exactly `num` rows are printed, not `num + 1`.
        if shown >= num:
            break
        if not _missing(row['y_w2v']):
            # Has a target vector: a training row, so skip it.
            continue
        print('Original Word:\t' + row['words'])
        if not _missing(row['sugg']):
            print('Given Answer:\t' + str(row['sugg']))
        else:
            print()
        if not _missing(row['pred_w2v']):
            ans = ''.join(
                item[0] + '\t' + str(float('%.3g' % item[1])) + '\n\t\t'
                for item in row['pred_w2v']
            )
            print('Pred Answers:\t' + ans)
        shown += 1

In [110]:
# Show a random handful of predictions, restricted to rows that have one.
display(words_df.dropna(subset=['pred_w2v']))

Original Word:	yon

Pred Answers:	think	0.511
		know	0.51
		choose	0.479
		guess	0.469
		suppose	0.462
		MARK_LATHAM_OPPOSITION_LEADER	0.459
		do	0.458
		BEGALA	0.453
		Mr._NAVARRETTE	0.451
		RENDELL_Well	0.451
		
Original Word:	television
Given Answer:	['T.V.']
Pred Answers:	many	0.563
		nowadays	0.465
		laypersons_alike	0.446
		often	0.437
		simplifiers	0.424
		simplistic_notions	0.423
		loathe	0.422
		Th_ere	0.419
		culturally_ingrained	0.416
		hesays	0.414
		
Original Word:	components

Pred Answers:	Jin_Qi	0.324
		prebuilt_templates	0.317
		also	0.311
		ability	0.309
		ease	0.304
		Strikers_Charged	0.299
		LUCRF	0.292
		electromechanical_steering	0.292
		sturdiness	0.286
		rappel_tower	0.285
		
Original Word:	resource

Pred Answers:	meet	0.382
		ease	0.357
		use	0.354
		utilize	0.352
		purchase	0.345
		outplace	0.341
		select	0.339
		NSLI_Y	0.333
		workwith	0.327
		reasonable	0.326
		
Original Word:	enumerate

Pred Answers:	count	1.0
		counts	0.689
		counted	0.594
		counting	0.533


In [111]:
# Persist the table including the vector and prediction columns.
words_df.to_pickle('data/lexical_repl/all_words_filled.pkl')

In [117]:
# Write one line per predicted word: the upper-cased word followed by its
# predicted neighbours, every field terminated by a tab.
# The explicit encoding keeps the export independent of the platform default,
# since embedding vocabularies can contain non-ASCII tokens.
with open('data/lexical_repl/doccano_to_check.txt', 'w', encoding='utf-8') as f:
    for idx, row in tqdm(words_df.iterrows(), total=len(words_df)):
        candidates = row['pred_w2v']
        # Skip rows without a prediction (None, or NaN after a round trip
        # through pandas).
        if candidates is None or isinstance(candidates, float):
            continue
        ans = ''.join(item[0] + '\t' for item in candidates)
        f.write(row['words'].upper() + '\t' + ans + '\n')

# repeat with glove

# repeat with fasttext