In [None]:
# -*- coding: utf-8 -*-
from __future__ import division
import sys
sys.path.append('hyperwords')
from representations.embedding import Embedding
from representations.matrix_serializer import save_vocabulary
import codecs
import numpy as np
import pandas as pd
from scipy import sparse
import os

In [None]:
# Execute the companion notebook so that the names it defines become available here.
# NOTE(review): presumably this is what provides collate_training_enrich_vocab_counts,
# eval_set_corpus_freq_dist and freq_dists_to_data_frames, which are called further down
# but are not defined or imported in this notebook - confirm.
%run ./frequency_lib.ipynb

In [None]:
# --- Configuration -----------------------------------------------------------
# Corpus names; they determine the two output directories below.
training_corpus = 'bible'
enrich_corpus = 'twitter'
baseline_output_path = './' + training_corpus + '/'
enriched_output_path = './' + training_corpus + '-' + enrich_corpus + '/'

# Sub-directories (relative to the output paths) holding the SGNS vectors.
baseline_sgns_dir = 'sgns'
enriched_sgns_dir = 'sgns_enriched_0'

# Evaluation settings handed to the frequency helpers.
eval_dir = 'wiktionary-eval'
# Pairs of two-letter language codes (presumably (source, target) - confirm).
lang_pairs = [('en','fr'),('en','es'),('en','fi'),('fr','en'),('es','en'),('fi','en')]
eval_subsets = ['inv', 'oov']

# Distinct language codes appearing on either side of any pair.
# Sorted on purpose: the codes are joined into file names below, and plain set
# iteration order is arbitrary (and can change between runs under hash
# randomisation), which would make the output file names unpredictable.
lang_codes = sorted(set([pair[0] for pair in lang_pairs] + [pair[1] for pair in lang_pairs]))

# Locations (relative to the output paths) of the similarity-matrix artefacts.
baseline_sim_dir = baseline_sgns_dir + '/similarity-matrices/'
baseline_sim_set_filepath = baseline_sim_dir + '-'.join(lang_codes) + '.vecs'
baseline_sim_csv_filepath = baseline_sim_dir + '-'.join(lang_codes)
enriched_sim_dir = enriched_sgns_dir + '/similarity-matrices/'
enriched_sim_set_filepath = enriched_sim_dir + '-'.join(lang_codes) + '.vecs'
enriched_sim_csv_filepath = enriched_sim_dir + '-'.join(lang_codes)

# Only scores strictly greater than this value are kept when the similarity
# matrices are thresholded later in the notebook.
similarity_threshold = 0.32

In [None]:
# Make sure both similarity-matrix output directories exist before anything is
# written into them. os.mkdir only creates the last path component, so the
# parent SGNS directories must already be present.
for sim_output_dir in (baseline_output_path + baseline_sim_dir,
                       enriched_output_path + enriched_sim_dir):
    if not os.path.exists(sim_output_dir):
        os.mkdir(sim_output_dir)

In [None]:
def read_vectors(path, skip_header=False):
    """Read a text embedding file into a dictionary of numpy vectors.

    Every non-blank line is expected to hold a word followed by its vector
    components, separated by whitespace, e.g. ``word 0.1 0.2 0.3``.

    Parameters
    ----------
    path : str
        Location of the text file to read.
    skip_header : bool, optional
        When True the first line of the file is ignored (useful for files that
        start with a header line). Defaults to False, i.e. every line is
        treated as a word vector.

    Returns
    -------
    dict
        Maps each word (first token of a line) to a 1-D numpy array built from
        the remaining tokens of that line. If a word occurs more than once the
        last occurrence wins.

    Raises
    ------
    ValueError
        If a vector component cannot be converted to ``float``.
    """
    vectors = {}
    with open(path) as f:
        for line_number, line in enumerate(f):
            if skip_header and line_number == 0:
                continue
            # split() without an argument tolerates tabs and repeated spaces.
            tokens = line.split()
            if not tokens:
                # A blank line would otherwise register an empty-string "word"
                # with a zero-length vector and corrupt the resulting matrix.
                continue
            vectors[tokens[0]] = np.asarray([float(x) for x in tokens[1:]])

    return vectors

In [None]:
def text2numpy_nonewline(path):
    """Convert a text embedding file into ``.npy`` and ``.vocab`` companions.

    The vectors are read with ``read_vectors``, the words are sorted, and the
    vectors are stacked in that order into a float32 matrix (one row per word).
    The matrix is saved to ``path + '.npy'`` and the sorted word list is handed
    to ``save_vocabulary`` together with ``path + '.vocab'``.

    The row width is taken from the first word's vector, so the file must
    contain at least one vector.
    """
    word_vectors = read_vectors(path)

    # Sorted word order defines the row order of the matrix.
    iw = list(word_vectors.keys())
    iw.sort()

    dimension = len(word_vectors[iw[0]])
    dense = np.zeros(shape=(len(iw), dimension), dtype=np.float32)
    for row, word in enumerate(iw):
        dense[row, :] = word_vectors[word]

    np.save(path + '.npy', dense)
    save_vocabulary(path + '.vocab', iw)

In [None]:
def multi2subset(path, outpath, lang_codes, vocab_subset):
    """Copy a filtered subset of a multilingual word-vector text file.

    A line is kept when its first token (the word) starts with one of the
    two-character prefixes in ``lang_codes`` AND the word is a member of
    ``vocab_subset``. Kept lines are stripped of surrounding whitespace and
    written to ``outpath`` joined by newlines, with no trailing newline.

    Parameters
    ----------
    path : str
        UTF-8 text file to read; undecodable bytes are replaced.
    outpath : str
        File to write (overwritten if it exists).
    lang_codes : container of str
        Accepted two-character word prefixes.
    vocab_subset : container of str
        Accepted words (compared against the decoded first token).
    """
    subset = []

    with codecs.open(path, encoding='utf-8', errors='replace') as fin:
        for line in fin:
            stripped = line.strip()
            if not stripped:
                # Blank lines carry no word; indexing their first token
                # would raise IndexError.
                continue
            token = stripped.split()[0]
            if token[:2] in lang_codes and token in vocab_subset:
                subset.append(stripped)

    with codecs.open(outpath, 'w', encoding='utf-8', errors='replace') as fout:
        fout.write('\n'.join(subset))

In [None]:
def combine_lang_vecs(lang_codes, path, sim_set_filename, vocab_subset):
    """Build the vector subset file and its numpy/vocabulary companions.

    Reads ``path + '/words'``, writes the lines selected by ``lang_codes`` and
    ``vocab_subset`` to ``sim_set_filename`` (via ``multi2subset``), then
    converts that file with ``text2numpy_nonewline``.
    """
    # Step 1: filter the full word-vector file down to the requested subset.
    multi2subset(path + '/words', sim_set_filename, lang_codes, vocab_subset)

    # Step 2: produce the .npy / .vocab files next to the subset file.
    text2numpy_nonewline(sim_set_filename)

In [None]:
def similarity_matrix(sim_set_filename):
    """Compute all pairwise dot products between the vectors of one embedding.

    The embedding is loaded once; the original code loaded the same file twice
    and multiplied the two identical copies, which doubled the I/O and memory
    for the same result.

    Parameters
    ----------
    sim_set_filename : str
        Path passed to ``Embedding``.
        NOTE(review): the second positional argument (True) presumably asks
        hyperwords to normalise the vectors, which would make the scores
        cosine similarities - confirm against the Embedding class.

    Returns
    -------
    list
        ``[score_matrix, iw]`` where ``score_matrix`` is the scipy sparse
        product of the embedding matrix with its own transpose and ``iw`` is
        the embedding's word list.
    """
    embedding = Embedding(sim_set_filename, True)

    vectors_sparse = sparse.csr_matrix(embedding.m)

    score_matrix = vectors_sparse.dot(vectors_sparse.T)

    return [score_matrix, embedding.iw]

In [None]:
def get_count_tuple(corpus_vocab, word):
    """Look up the count tuple of ``word`` in ``corpus_vocab``.

    Parameters
    ----------
    corpus_vocab : mapping
        Keyed by decoded (unicode) words.
    word : bytes or text
        UTF-8 encoded bytes are decoded before the lookup; text that is
        already decoded is used as is.

    Returns
    -------
    tuple
        The value stored for the word, or ``(0, 0)`` when the word is unknown.
        Unknown words are also printed so they can be inspected.
    """
    # Only byte strings need decoding; calling .decode on already-decoded text
    # fails (no such method on Python 3, implicit ASCII round-trip on Python 2).
    if isinstance(word, bytes):
        word = word.decode('utf-8')
    if word in corpus_vocab:
        return corpus_vocab[word]
    # Function-call form prints the same text under Python 2 and Python 3.
    print(word)
    return (0, 0)

In [None]:
def sim_matrix_to_edges(df_similarity_matrix, word_index):
    """Flatten a thresholded similarity matrix into an undirected edge table.

    Gephi input seems to require a flat tabular input of nodes and edges, so
    every retained word pair becomes one row.

    Parameters
    ----------
    df_similarity_matrix : pandas.DataFrame
        Square frame whose rows and columns are labelled with the words in
        ``word_index``; cells that should not become edges hold NaN.
    word_index : iterable of str
        Words to consider, in the order in which they are scanned.

    Returns
    -------
    pandas.DataFrame
        Columns ``source``, ``target``, ``similarity`` and ``Type`` (always
        ``'Undirected'``). A pair is emitted for the first word that reaches
        it; a target that already appears as a source is skipped, which
        prevents the mirrored duplicate of an edge.
    """
    columns = ['source', 'target', 'similarity']
    # Collect plain records and build the frame once: growing a DataFrame row
    # by row is quadratic and DataFrame.append no longer exists in pandas 2.x.
    edge_rows = []
    # Words that already appear in the 'source' column (replaces re-scanning
    # the table with .unique() on every inner iteration).
    seen_sources = set()

    for src_word in word_index:
        for trg_word in word_index:
            if src_word == trg_word or trg_word in seen_sources:
                continue
            sim = df_similarity_matrix.loc[src_word, trg_word]
            if pd.isna(sim):
                continue
            edge_rows.append({'source': src_word, 'target': trg_word, 'similarity': sim})
            seen_sources.add(src_word)

    df_edge_table = pd.DataFrame(edge_rows, columns=columns)
    df_edge_table['Type'] = 'Undirected'
    return df_edge_table

In [None]:
def word_index_to_nodes(word_index, corpus_vocab):
    """Build the node table that accompanies the edge table.

    Parameters
    ----------
    word_index : iterable of str
        Words to turn into nodes; repeated words produce a single row.
    corpus_vocab : mapping
        Passed to ``get_count_tuple`` to obtain each word's count tuple.

    Returns
    -------
    pandas.DataFrame
        One row per distinct word with the columns ``Id`` and ``Label`` (the
        word itself), ``language`` (first two characters of the word),
        ``from_corpus`` (``'train'`` when the first count is positive,
        otherwise ``'enrich'``), ``train_count`` and ``enrich_count`` (first
        and second element of the count tuple).
    """
    columns = ['Id', 'Label', 'language', 'from_corpus', 'train_count', 'enrich_count']
    # Collect plain records and build the frame once: growing a DataFrame row
    # by row is quadratic and DataFrame.append no longer exists in pandas 2.x.
    node_rows = []
    seen_ids = set()

    for word in word_index:
        if word in seen_ids:
            continue
        seen_ids.add(word)
        count_tuple = get_count_tuple(corpus_vocab, word)
        node_rows.append(
            {'Id': word, 'Label': word, 'language': word[0:2],
             'from_corpus': 'train' if count_tuple[0] > 0 else 'enrich',
             'train_count': count_tuple[0], 'enrich_count': count_tuple[1]})

    return pd.DataFrame(node_rows, columns=columns)

In [None]:
# Build the combined training + enrichment corpus vocabulary counts and the frequency
# distributions of the evaluation words, so that a smaller subset of interesting words
# can be extracted for investigation in Gephi.
# NOTE(review): neither helper is defined or imported in this notebook; presumably both
# come from frequency_lib.ipynb, which is executed with %run near the top - confirm.
corpus_vocab = collate_training_enrich_vocab_counts(training_corpus, enrich_corpus)
freq_dists = eval_set_corpus_freq_dist(eval_dir, corpus_vocab, lang_pairs, eval_subsets)

In [None]:
# Keep only words whose total count is below 1000 - more frequent words are likely to be
# stop words and not very interesting.
# From every 'oov...src' distribution take at most the 200 words with the highest training
# count, to avoid memory issues when computing the similarity matrix.
df_freq_dists = freq_dists_to_data_frames(freq_dists)
vocab_subset = set()
for dist_name, dist_frame in df_freq_dists.items():
    if not (dist_name.startswith('oov') and dist_name.endswith('src')):
        continue
    below_cutoff = dist_frame.query('total < 1000')
    ranked_by_train = below_cutoff.sort_values(['train'], ascending=False)
    vocab_subset.update(ranked_by_train.head(200).index.values)

In [None]:
# Baseline model: write the vector subset file for the selected vocabulary, together with
# the .npy / .vocab files that combine_lang_vecs derives from it.
combine_lang_vecs(lang_codes, baseline_output_path + baseline_sgns_dir, 
                  baseline_output_path + baseline_sim_set_filepath, vocab_subset)

In [None]:
# Pairwise scores between all selected baseline word vectors, plus the word order that
# labels the rows and columns of the score matrix.
[baseline_score_matrix, baseline_word_index] = similarity_matrix(baseline_output_path + baseline_sim_set_filepath)

In [None]:
# Densify the sparse score matrix and label both axes with the words.
df_baseline_sim_matrix = pd.DataFrame(baseline_score_matrix.toarray(), 
                                      index=baseline_word_index, columns=baseline_word_index)

In [None]:
# Keep only scores strictly greater than similarity_threshold; every other cell becomes NaN.
df_baseline_sim_matrix = df_baseline_sim_matrix[df_baseline_sim_matrix.gt(similarity_threshold)]

In [None]:
# Display the thresholded baseline similarity matrix.
df_baseline_sim_matrix

In [None]:
# Node table: one row per baseline word with its language prefix and corpus counts.
df_baseline_node_table = word_index_to_nodes(baseline_word_index, corpus_vocab)

In [None]:
# Edge table: one row per word pair whose score survived the threshold (non-NaN cells).
df_baseline_edge_table = sim_matrix_to_edges(df_baseline_sim_matrix, baseline_word_index)

In [None]:
# Export the baseline node and edge tables as UTF-8 CSV files ('-node.csv' / '-edge.csv'),
# including the header row and the index column.
df_baseline_node_table.to_csv(baseline_output_path + baseline_sim_csv_filepath + '-node.csv', 
                              encoding='utf-8', header=True, index=True)
df_baseline_edge_table.to_csv(baseline_output_path + baseline_sim_csv_filepath + '-edge.csv', 
                              encoding='utf-8', header=True, index=True)

In [None]:
# Display the baseline node table.
df_baseline_node_table

In [None]:
# Display the baseline edge table.
df_baseline_edge_table

In [None]:
# Enriched model: write the vector subset file for the same vocabulary, together with
# the .npy / .vocab files that combine_lang_vecs derives from it.
combine_lang_vecs(lang_codes, enriched_output_path + enriched_sgns_dir, 
                  enriched_output_path + enriched_sim_set_filepath, vocab_subset)

In [None]:
# Pairwise scores between all selected enriched word vectors, plus the word order that
# labels the rows and columns of the score matrix.
[enriched_score_matrix, enriched_word_index] = similarity_matrix(enriched_output_path + enriched_sim_set_filepath)

In [None]:
# Densify the sparse score matrix and label both axes with the words.
df_enriched_sim_matrix = pd.DataFrame(enriched_score_matrix.toarray(), 
                                      index=enriched_word_index, columns=enriched_word_index)

In [None]:
# Keep only scores strictly greater than similarity_threshold; every other cell becomes NaN.
df_enriched_sim_matrix = df_enriched_sim_matrix[df_enriched_sim_matrix.gt(similarity_threshold)]

In [None]:
# Display the thresholded enriched similarity matrix.
df_enriched_sim_matrix

In [None]:
# Node table: one row per enriched word with its language prefix and corpus counts.
df_enriched_node_table = word_index_to_nodes(enriched_word_index, corpus_vocab)

In [None]:
# Edge table: one row per word pair whose score survived the threshold (non-NaN cells).
df_enriched_edge_table = sim_matrix_to_edges(df_enriched_sim_matrix, enriched_word_index)

In [None]:
# Export the enriched node and edge tables as UTF-8 CSV files ('-node.csv' / '-edge.csv'),
# including the header row and the index column.
df_enriched_node_table.to_csv(enriched_output_path + enriched_sim_csv_filepath + '-node.csv', 
                              encoding='utf-8', header=True, index=True)
df_enriched_edge_table.to_csv(enriched_output_path + enriched_sim_csv_filepath + '-edge.csv', 
                              encoding='utf-8', header=True, index=True)

In [None]:
# Display the enriched node table.
df_enriched_node_table

In [None]:
# Display the enriched edge table.
df_enriched_edge_table