#  Visual Sicilian

###  Eryk Wdowiak

This notebook attempts to illustrate the Sicilian text that we're using to develop a neural machine translator.

In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
from matplotlib import cm

% matplotlib inline

In [2]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from nltk.collocations import *

# import string
# import re

from wordcloud import WordCloud

In [None]:
import mxnet as mx
from mxnet import gluon
from mxnet import nd
import gluonnlp as nlp

from data import transform_data_word2vec, preprocess_dataset
from model import SG, CBOW
from utils import print_time

##  compute context: the CPU (device id 0, which is also the default)
context = mx.cpu(0)

In [None]:
##  on the Sicilian side, this function only runs NLTK tokenizer
##  on the English side, it will do much more
def process_line(line):
    """Return the list of tokens that NLTK's `word_tokenize` finds in `line`."""
    return word_tokenize(line)

In [None]:
##  load the lemmatized text; the file has no header row,
##  so the column is named by hand
df = pd.read_csv(
    'dataset/train-mparamu_v2-lemmatized.sc',
    header=None,
)
df.columns = ['sc_text']
# df.head()

###  frequencies

In [None]:
##  tokenize each line, then flatten the per-line token lists
##  into one long list so that words can be counted
proc_scn = [process_line(line) for line in df.sc_text]
flat_scn = [token for tokens in proc_scn for token in tokens]
freq_scn = FreqDist(flat_scn)

In [None]:
##  show the ten most frequent tokens with their counts
freq_scn.most_common(n=10)

### counts

In [None]:
# split the 25 most frequent (word, count) pairs into two parallel lists
scn_bar_words = [word for word, _count in freq_scn.most_common(25)]
scn_bar_counts = [count for _word, count in freq_scn.most_common(25)]

# word -> count mapping (used for the word cloud)
scn_dict = dict(zip(scn_bar_words, scn_bar_counts))

In [None]:
# bar colors: 30 evenly spaced samples (0.4 to 0.8) of the reversed viridis map
color = cm.viridis_r(np.linspace(0.4, 0.8, num=30))

In [None]:
# bar chart of the most frequent lemmas
fig, axs = plt.subplots(figsize=(8, 4))
axs.bar(scn_bar_words, scn_bar_counts, color=color)
axs.title.set_text('most common Sicilian lemmas')

# rotate the x tick labels on every axes of the figure
for ax in fig.get_axes():
    plt.sca(ax)
    plt.xticks(rotation=45)

plt.tight_layout(pad=0)

# save to disk, then display
plt.savefig('wb-sc_lemmas.png')
plt.show()

In [None]:
# word cloud of the most frequent Sicilian lemmas, built from their counts
wordcloud = WordCloud(colormap='Spectral')
wordcloud.generate_from_frequencies(scn_dict)

# draw the cloud on a black figure without axes
plt.figure(figsize=(10, 10), facecolor='k')
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.tight_layout(pad=0)

# save to disk, then display
plt.savefig('wb-sc_lemma-cloud.png')
plt.show()

### make wordcloud from embeddings

In [None]:
##  data file from which the vocabulary is rebuilt
##  NOTE(review): the frequency cells read the "v2" file, this one is "v3" -- confirm intended
datafile = 'dataset/train-mparamu_v3-lemmatized.sc.tsv'

##  choose ONE of the two models by (un)commenting its three lines

##  CBOW model
# model = CBOW
# parmfile = './logs/sc-cbow-r4-e01.params'
# fname_insert = 'cbow'

##  skipgram model
model = SG
parmfile = './logs/sc-skip-r2-e23.params'
fname_insert = 'skip'  # becomes part of the saved image file names

##  hyperparameters shared by both trained models
output_dim = 300
batch_size = 128

In [None]:
##  read the dataset and derive the vocabulary and per-token counts
data = nlp.data.TSVDataset(datafile)
data, vocab, idx_to_counts = preprocess_dataset(data)

##  rebuild the model, then restore its saved parameters
embedding = model(
    token_to_idx=vocab.token_to_idx,
    output_dim=output_dim,
    batch_size=batch_size,  #num_negatives=num_negatives,
    negatives_weights=mx.nd.array(idx_to_counts),
)
embedding.load_parameters(parmfile)

##  word vectors: the weight matrix of the model's "embedding_out" layer
wvecs = embedding.embedding_out.weight.data()

##  how many tokens appear at least "min_words" times?
min_words = 10
num_over_min = int(np.count_nonzero(np.array(idx_to_counts) >= min_words))

print('vocabulary length:    ', len(vocab), sep='')
print('lemmas over ', min_words, ' times: ', num_over_min, sep='')

In [None]:
##  pairwise cosine similarity
def cos_sim(wordx, wordy):
    """Cosine similarity between the vectors of two vocabulary words.

    Both words are looked up in `vocab.token_to_idx`; the result is the dot
    product of their rows in `wvecs` divided by the product of their norms.
    """
    vec_x = wvecs[vocab.token_to_idx[wordx],]
    vec_y = wvecs[vocab.token_to_idx[wordy],]
    inner = nd.dot(vec_x, vec_y)
    scale = nd.norm(vec_x) * nd.norm(vec_y)
    return inner / scale


##  full matrix of cosine similarity
def cos_mat(vecs):
    """Cosine similarity between every pair of rows of `vecs`.

    Entry (i, j) is the dot product of rows i and j divided by the product
    of their norms.
    """
    ##  all pairwise dot products; the diagonal holds the squared norms
    gram = nd.dot(vecs, vecs.T)
    norms = nd.sqrt(nd.diag(gram)).reshape((-1, 1))
    ##  the outer product of the norms is the matrix of denominators
    return gram / nd.dot(norms, norms.T)

In [None]:
##  create "WC Dict" ("word-to-cosine dictionary") for wordcloud
def mk_wcdict(word, k_words):
    """Map the nearest neighbours of `word` to their cosine similarity.

    Parameters
    ----------
    word : str
        Token to look up in `vocab.token_to_idx`.
    k_words : int
        Number of neighbours to return.

    Returns
    -------
    dict
        `{neighbour_token: cosine}` for the `k_words` tokens whose vectors in
        `wvecs` have the highest cosine similarity to the vector of `word`,
        ordered from most to least similar.  `word` itself is never included.

    Raises
    ------
    KeyError
        If `word` is not in the vocabulary.
    ValueError
        If `word` is one of the leading special tokens, which are excluded
        from the search.
    """
    ##  where to start?  first two tokens are: <BOS> <EOS>
    sv_start = 2

    vocab_idx = vocab.token_to_idx[word]
    if vocab_idx < sv_start:
        raise ValueError('cannot look up a special token: ' + repr(word))
    idx_to_lookup = vocab_idx - sv_start

    ##  every vector after the special tokens, fetched once as a numpy array
    ##  (the slice runs to the end so the last vocabulary token is kept too)
    vecs = wvecs[sv_start:].asnumpy()

    ##  only one row of the cosine matrix is needed, so compute just that row:
    ##  dot products with the query divided by the product of the norms
    norms = np.linalg.norm(vecs, axis=1)
    with np.errstate(divide='ignore', invalid='ignore'):
        cosines = vecs.dot(vecs[idx_to_lookup]) / (norms * norms[idx_to_lookup])

    ##  rank by decreasing cosine (NaN sorts last), then drop the query word
    ##  by index instead of assuming that it is ranked first
    ranked = np.argsort(-cosines, kind='mergesort')
    knn_rows = ranked[ranked != idx_to_lookup][:k_words]

    ##  get the words and cosine measures
    knn_vocab_words = [vocab.idx_to_token[int(row) + sv_start]
                       for row in knn_rows]
    knn_vocab_cosines = [cosines[row] for row in knn_rows]

    ##  return the dictionary for wordcloud
    return dict(zip(knn_vocab_words, knn_vocab_cosines))

In [None]:
# create a cloud of 25 words for Don Chisciotti!
knn_wc_dict = mk_wcdict('chisciotti', 25)
wordcloud = WordCloud(colormap='Spectral')
wordcloud.generate_from_frequencies(knn_wc_dict)

# draw the cloud on a black figure without axes
plt.figure(figsize=(10, 10), facecolor='k')
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.tight_layout(pad=0)

# the file name records which model produced the vectors
fname = 'wc-sc-' + fname_insert + '_chisciotti.png'
plt.savefig(fname)
plt.show()

In [None]:
# create a cloud of 25 words for Sanciu Panza!
knn_wc_dict = mk_wcdict('sanciu', 25)
wordcloud = WordCloud(colormap='Spectral')
wordcloud.generate_from_frequencies(knn_wc_dict)

# draw the cloud on a black figure without axes
plt.figure(figsize=(10, 10), facecolor='k')
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.tight_layout(pad=0)

# the file name records which model produced the vectors
fname = 'wc-sc-' + fname_insert + '_sanciu.png'
plt.savefig(fname)
plt.show()

### bigrams and trigrams

In [None]:
# NLTK scoring functions (raw frequency, PMI, ...) for trigrams and bigrams
trigram_measures = nltk.collocations.TrigramAssocMeasures()
bigram_measures = nltk.collocations.BigramAssocMeasures()

In [None]:
# bigrams ranked by raw frequency
# built line by line (from_documents) so that no bigram pairs the last token
# of one corpus line with the first token of the next, unrelated line
scn_bi_finder = BigramCollocationFinder.from_documents(proc_scn)
# scn_bi_finder.apply_freq_filter(5)
scn_bi_scored = scn_bi_finder.score_ngrams(bigram_measures.raw_freq)
scn_bi_scored[:10]

In [None]:
# bigrams ranked by pointwise mutual information (PMI)
# built line by line (from_documents) so that no bigram pairs the last token
# of one corpus line with the first token of the next, unrelated line
scn_bi_pmi_finder = BigramCollocationFinder.from_documents(proc_scn)
# discard bigrams that occur fewer than 5 times before scoring
scn_bi_pmi_finder.apply_freq_filter(5)
scn_bi_pmi_scored = scn_bi_pmi_finder.score_ngrams(bigram_measures.pmi)
scn_bi_pmi_scored[0:10]

In [None]:
# trigrams ranked by raw frequency
# built line by line (from_documents) so that no trigram spans the boundary
# between one corpus line and the next, unrelated line
scn_tri_finder = TrigramCollocationFinder.from_documents(proc_scn)
# scn_tri_finder.apply_freq_filter(5)
scn_tri_scored = scn_tri_finder.score_ngrams(trigram_measures.raw_freq)
scn_tri_scored[:10]

In [None]:
# trigrams ranked by pointwise mutual information (PMI)
# built line by line (from_documents) so that no trigram spans the boundary
# between one corpus line and the next, unrelated line
scn_tri_pmi_finder = TrigramCollocationFinder.from_documents(proc_scn)
# discard trigrams that occur fewer than 5 times before scoring
scn_tri_pmi_finder.apply_freq_filter(5)
scn_tri_pmi_scored = scn_tri_pmi_finder.score_ngrams(trigram_measures.pmi)
scn_tri_pmi_scored[0:10]