In [33]:
import os
# All paths used below ('output/...') and the local `evaluation` import are
# relative to the parent of a notebook/ folder, so step up one level when the
# kernel was started inside such a folder.
# The last path component is compared (rather than a '/notebook' suffix) so
# the check also works with Windows-style path separators.
if os.path.basename(os.getcwd()) == 'notebook':
    os.chdir(os.pardir)

In [45]:
import pandas as pd
from joblib import dump
import numpy as np
from evaluation import create_test_set_from_positive_examples_same_freq, \
        create_test_set_from_positive_examples_unigram_freq

In [35]:
# Load the filtered triples file (columns: sbj, verb, dobj).
# keep_default_na=False: every cell is a word token, so strings such as "nan",
# "null" or "NA" are legitimate vocabulary items and must not be silently
# converted to float NaN by pandas' default missing-value parsing.
df = pd.read_csv('output/ukwac-triples-filtered.tsv.gz', delimiter='\t',
                 compression='gzip', keep_default_na=False)
# Spot-check a few rows (unseeded, so the sample differs on every run).
df.sample(5)

Unnamed: 0,sbj,verb,dobj
6950493,town,have,bent
571478,driver,eliminate,need
5051387,devil,point,finger
5261930,festival,attract,entry
7133210,pupil,take,pseudonym


In [36]:
# Total number of triples, printed with thousands separators.
print(f'{len(df):,d}')

7,299,599


In [37]:
from sklearn.model_selection import train_test_split

Randomly split the available tuples into a training set and a held-out development set. Following Van de Cruys (2014), we will perform instance-based training and type-based testing. The duplicates in the training set give the model a sense of typicality.

In [38]:
# Hold out 10% of the rows as the dev set; the fixed random_state keeps the
# split identical across re-runs.
train_ds, dev_ds = train_test_split(df, test_size=0.1, random_state=285230)

In [40]:
# Build an evaluation frame from the held-out triples using the project's
# `evaluation` helper. Judging by its name it pairs every positive example with
# negative arguments of matching frequency — TODO confirm the sampling scheme
# in evaluation.py.
dev_ds_with_negs_same_freq = create_test_set_from_positive_examples_same_freq(dev_ds)

In [43]:
# Spot-check a few rows (unseeded, so the sample differs on every run).
dev_ds_with_negs_same_freq.sample(5)

Unnamed: 0,verb,pos_sbj,pos_dobj,neg_sbj,neg_dobj
35072,prevent,forest,flood,tale,estimation
63400,have,reader,impression,supply,secret
159950,see,advisor,client,sin,act
72243,cover,part,letter,failure,programme
296380,form,platform,part,department,distance


In [41]:
# Second evaluation frame built from the same held-out triples. Judging by the
# helper's name, negatives are presumably drawn according to unigram
# frequency — TODO confirm the sampling scheme in evaluation.py.
dev_ds_with_negs_unigram_freq = create_test_set_from_positive_examples_unigram_freq(dev_ds)

In [44]:
# Spot-check a few rows (unseeded, so the sample differs on every run).
dev_ds_with_negs_unigram_freq.sample(5)

Unnamed: 0,verb,pos_sbj,pos_dobj,neg_sbj,neg_dobj
190871,increase,act,penalty,engine,question
283929,provide,company,research,market,time
75499,reach,news,camp,people,open
521806,have,programme,expansion,trend,humour
534693,underline,introduction,importance,reduction,capability


# Examine the datasets

In [16]:
# Row count of the training split.
len(train_ds)

6569639

In [17]:
# Row count of the dev split.
len(dev_ds)

729960

# Encode into numerical values

In [20]:
from sklearn.preprocessing import LabelEncoder
# One encoder is shared by all columns (it is fitted on the flattened training
# frame below), so a word maps to the same integer id in every column.
encoder = LabelEncoder()

In [21]:
# Fit on every token of the training split (all columns flattened) plus the
# out-of-vocabulary placeholder, so '<unkn>' receives an id of its own.
encoder.fit([*train_ds.values.ravel(), '<unkn>'])
# Set of the learned classes, used for fast membership tests.
vocab = set(encoder.classes_)

In [22]:
# Vocabulary size; this count includes the '<unkn>' placeholder.
len(vocab)

12856

In [23]:
def transform_dataset(df, label_encoder=None, unknown_token='<unkn>'):
    """Encode every cell of a token DataFrame as an integer id.

    Cells whose token is not among the encoder's classes are replaced by
    ``unknown_token`` before encoding, so out-of-vocabulary words do not make
    the encoder fail.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose cells are tokens; all columns are encoded with the same
        encoder.
    label_encoder : optional
        Fitted encoder exposing ``classes_`` and ``transform``. When omitted,
        the notebook-level ``encoder`` is used.
    unknown_token : str, optional
        Placeholder substituted for unseen tokens. It must itself be one of
        the encoder's classes.

    Returns
    -------
    numpy.ndarray
        Encoded ids with one row per row of ``df`` and one column per column
        of ``df``, in the original order.
    """
    if label_encoder is None:
        # Fall back to the encoder fitted earlier in the notebook.
        label_encoder = encoder
    # Take the known tokens from the encoder itself rather than from a separate
    # module-level set, which could be stale if the encoder was refitted.
    known_mask = df.isin(list(label_encoder.classes_))
    # Vectorised replacement of unseen (or missing) cells by the placeholder.
    tokens = df.where(known_mask, unknown_token)
    num_cols = len(df.columns)
    flat_ids = label_encoder.transform(tokens.values.reshape(-1))
    return flat_ids.reshape(-1, num_cols)

In [24]:
# Integer-encode both splits; columns keep the frame order (sbj, verb, dobj).
encoded_train_ds = transform_dataset(train_ds)
encoded_dev_ds = transform_dataset(dev_ds)

In [25]:
# Peek at the first encoded training rows.
encoded_train_ds[:5]

array([[12807,  6523,  8275],
       [ 3552,  6102,  9201],
       [ 2432, 10335,  4218],
       [ 4929,  9187,  4929],
       [   78,   441,  3297]])

In [26]:
# Peek at the first encoded dev rows.
encoded_dev_ds[:5]

array([[ 2469,   561,  8813],
       [ 1960, 11460,  3320],
       [12554,  2759,  9044],
       [11607, 12729,  4756],
       [ 1912,  1943,  7542]])

In [29]:
# Persist the fitted encoder with joblib so the word <-> id mapping can be
# reloaded later.
dump(encoder, 'output/ukwac-encoder.pkl')

['output/ukwac-encoder.pkl']

In [30]:
# Write the encoded splits to disk as .npy arrays.
np.save('output/ukwac-train.npy', encoded_train_ds)
np.save('output/ukwac-dev.npy', encoded_dev_ds)

In [47]:
# Encode the evaluation frames with the same encoder. Note the column order
# here is (verb, pos_sbj, pos_dobj, neg_sbj, neg_dobj), which differs from the
# (sbj, verb, dobj) order of the train/dev arrays.
encoded_dev_same_freq = transform_dataset(dev_ds_with_negs_same_freq)
encoded_dev_unigram = transform_dataset(dev_ds_with_negs_unigram_freq)

In [49]:
# Peek at the first encoded evaluation rows (five columns per row).
encoded_dev_same_freq[:5]

array([[ 6011,  1985, 10117,  3858,  8676],
       [10332,  9883,  9254,  8451,  3926],
       [ 2107,  4547, 11824,  7985,  1943],
       [ 4502,  6675,  7912,  2033,  2198],
       [ 4563,   630,  3172,  9059,  1580]])

In [51]:
# Write the encoded evaluation sets to disk as .npy arrays.
np.save('output/ukwac-dev-same-freq.npy', encoded_dev_same_freq)
np.save('output/ukwac-dev-unigram-freq.npy', encoded_dev_unigram)

# Coverage

In [31]:
# Count the dev-set cells (across all columns) whose token is in the
# training vocabulary.
sum(token in vocab for token in dev_ds.values.ravel())

2189879

In [32]:
# Total number of dev-set cells (rows x columns): the denominator for the
# coverage count above.
dev_ds.values.size

2189880