In [4]:
# printf-style template for the gzipped TSV input files; the %02d slot is
# filled with the zero-padded file number when the files are read.
triples_path_pattern = '../output/ukwac-triples/%02d.tsv.gz'

In [5]:
%matplotlib inline
import pandas as pd
from collections import Counter
import seaborn as sns

# Reading data

In [6]:
# Load the numbered triple files (00-04) into one frame per file, with the
# three tab-separated fields named sbj / verb / dobj.
# The fields are word tokens, so every column is read as text and only an empty
# field counts as missing: pandas' default NA markers ("nan", "null", "NA", ...)
# are ordinary words here and would otherwise be silently turned into NaN and
# dropped by the frequency counts and merges further down.
dfs = [pd.read_csv(triples_path_pattern % i, delimiter='\t', compression='gzip',
                   header=None, names=['sbj', 'verb', 'dobj'],
                   dtype=str, keep_default_na=False, na_values=[''])
       for i in range(5)]

In [7]:
# Stack the per-file frames into a single table. ignore_index=True gives the
# result a fresh 0..N-1 index; without it each file's own 0-based row labels
# are kept, so one label would refer to several different rows.
# `df` is kept as a second name for the same frame.
triples_df = df = pd.concat(dfs, ignore_index=True)
triples_df.head()

Unnamed: 0,sbj,verb,dobj
0,child,terrorise,resident
1,citizen,describe,birthplace
2,fan,wear,shirt
3,crew,suffer,embarrassment
4,star,visit,room


In [62]:
# Row count of the concatenated frame, printed with thousands separators.
print("Number of original tuples: {0:,d}".format(triples_df.shape[0]))

Number of original tuples: 8,025,271


"For the multi-way model, we select the 2K most frequent verbs, together with the 10K most frequent subjects and the 10K most frequent objects (that appear within a transitive frame)." (van de Cruys, 2014)

In [45]:
def extract_most_common_values(df, col, n):
    """Return the ``n`` most frequent values of ``df[col]`` as a one-column frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing the column to count.
    col : str
        Name of the column whose values are counted; it is also the name of
        the single column in the result.
    n : int
        Number of top values to keep.

    Returns
    -------
    pandas.DataFrame
        One column named ``col`` holding the kept values, ordered as
        ``value_counts`` orders them (most frequent first).
    """
    frequencies = df[col].value_counts()
    top_values = frequencies.head(n).index
    return pd.DataFrame(top_values, columns=[col])

In [46]:
# Vocabulary cut-offs: the 2000 most frequent verbs and the 10000 most frequent
# subjects and direct objects, each returned as a one-column frame.
mc_verbs, mc_sbjs, mc_dobjs = (
    extract_most_common_values(triples_df, column, size)
    for column, size in (('verb', 2000), ('sbj', 10000), ('dobj', 10000))
)

In [48]:
# Keep only the triples whose verb, subject and object all belong to the
# most-common lists: each inner merge drops the rows whose key is not listed.
filtered_triples_df = (
    triples_df
    .merge(mc_verbs, on='verb', how='inner')
    .merge(mc_sbjs, on='sbj', how='inner')
    .merge(mc_dobjs, on='dobj', how='inner')
)

In [63]:
# Every remaining row is one tuple instance (duplicates included).
print("Number of tuple instances after filtering: {0:,d}".format(filtered_triples_df.shape[0]))

Number of tuple instances after filtering: 7,299,599


In [64]:
# A tuple type is a distinct (sbj, verb, dobj) row, so count rows after de-duplication.
print("Number of tuple types after filtering: {0:,d}".format(
    filtered_triples_df.drop_duplicates().shape[0]))

Number of tuple types after filtering: 4,261,954


Somehow my data has many more tuple instances and types than van de Cruys (2014) reports: "For the three-way model, our corpus consists of about 5,5M tuple instances (750K types)".

# Writing results

In [52]:
# Save the filtered triples as a gzipped, tab-separated file with a header row
# and without the row index.
filtered_triples_df.to_csv(
    '../output/ukwac-triples-filtered.tsv.gz',
    sep='\t',
    compression='gzip',
    index=False,
    header=True,
)