In [100]:
import os
import re
import numpy as np
import pandas as pd

from collections import defaultdict

import ds_utils as ds
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

import tensorflow as tf
from tensorflow.contrib.tensorboard.plugins import projector

In [99]:
# Project-relative locations shared by the cells below.
eda_model_dir = '../models/EDA'
pretrn_model_dir = '../models/pretrained_word_embeddings'
data_dir = '../raw_data/'
# The original joined against `model_dir`, a name that is never defined in this
# notebook, so this cell raised NameError on a fresh kernel.
# NOTE(review): resources are assumed to live under the pretrained-embeddings
# model directory -- confirm this is the intended parent.
resource_path = os.path.join(pretrn_model_dir,'resources')

## Data Prep

In [30]:
# Every entry of the raw-data directory except dot-files, in the order
# os.listdir returns them; printed one name per line.
data_files = list(filter(lambda name: not name.startswith('.'), os.listdir(data_dir)))
print(*data_files, sep='\n')

all_pt_text.txt
all_es_text.txt
all_nl_text.txt
all_pl_text.txt
all_it_text.txt
all_fr_text.txt
all_de_text.txt
all_ja_text.txt
all_ru_text.txt
all_en_text.txt
all_ar_text.txt


In [47]:
# Show the raw contents of the first data file.
# The corpus contains non-ASCII text, so the encoding is stated explicitly
# instead of relying on the platform default.
with open(os.path.join(data_dir, data_files[0]), 'r', encoding='utf-8') as f:
    print(f.read())

Foi laureada com o Prémio Princesa das Astúrias de 2015 na categoria Cooperação Internacional. O júri refere que "figura entre os dez sites mais visitados do mundo, [...] tem crescido continuamente [...], incluíndo um número de línguas indígenas. [...] exemplo importante de cooperação internacional, democrática, aberta e participatória [...]. [250]A Wikipédia também tem sido utilizada como fonte no jornalismo,[238] muitas vezes sem atribuição, sendo que vários jornalistas foram demitidos por plagiar a Wikipédia.[239][240][241] Em julho de 2007, a Wikipédia foi foco de um documentário de 30 minutos da BBC Radio 4[242] que argumentava que, com o aumento do uso, o número de referências da Wikipédia na cultura popular é tal que o termo é um membro do grupo de marcas do século XXI que são tão familiares (Google, Facebook, YouTube) que já não precisam de explicações e estão a par com termos do século XX como Hoovering ou Coca-Cola.Páginas fortemente atacadas, artigos em particular, podem ser

In [53]:
def text_chunks(text, n):
    """Lazily split `text` into consecutive slices of at most `n` items.

    Slices are taken at offsets 0, n, 2n, ...; the last slice is shorter when
    `len(text)` is not a multiple of `n`.
    """
    offsets = range(0, len(text), n)
    yield from (text[offset:offset + n] for offset in offsets)

In [92]:
# read data, clean, create df with language as label
# Matches '[...]' / '(...)' spans and runs of digits. Written as a raw string
# so the backslash escapes reach the regex engine without invalid-escape warnings.
noise_pattern = re.compile(r'([\[\(].+?[\]\)]|\d+)')
df_list = []
for fn in data_files:
    # The label is whatever sits between the first two underscores of the file name.
    lang_match = re.search('_(.+?)_', fn)
    if lang_match is None:
        # This cell writes its own .tsv output into data_dir; on a re-run those
        # files show up in data_files and carry no language tag, so skip them.
        continue
    language = lang_match.group(1)
    # Explicit UTF-8: the corpus is multilingual and must not depend on the
    # platform default encoding.
    with open(os.path.join(data_dir,fn), 'r', encoding='utf-8') as f:
        # get rid of '[...]', '(...)', and digits
        text = noise_pattern.sub(' ', f.read())
    # split text into chunks of text to make more data samples.
    # Floor division leaves a remainder, so a text whose length is not a
    # multiple of 84 produces one extra, shorter chunk. max(1, ...) keeps the
    # chunk size valid for texts shorter than 84 characters.
    chunk_size = max(1, len(text)//84)
    texts = [txt.strip() for txt in text_chunks(text, chunk_size)]
    print(f'# samples for language {language}: {len(texts)} with avg. lengths of {np.mean([len(t) for t in texts])}')
    labels = [language]*len(texts)
    df_list.append(pd.DataFrame({'language':labels, 'content':texts}))
lang_df = pd.concat(df_list, axis=0)
print(f'\nNow DataFrame contains all languages with {lang_df.shape[0]} rows.\n')
print(lang_df.language.value_counts().to_string())
# Write first, then report, so the message is only printed for a file that exists.
lang_df.to_csv(os.path.join(data_dir,'languages.tsv'),sep='\t')
print(f'\nSaved at {os.path.join(data_dir,"languages.tsv")}')

# samples for language pt: 85 with avg. lengths of 762.4470588235295
# samples for language es: 85 with avg. lengths of 681.4470588235295
# samples for language nl: 85 with avg. lengths of 229.96470588235294
# samples for language pl: 85 with avg. lengths of 261.88235294117646
# samples for language it: 85 with avg. lengths of 494.3529411764706
# samples for language fr: 85 with avg. lengths of 666.564705882353
# samples for language de: 85 with avg. lengths of 934.4941176470588
# samples for language ja: 85 with avg. lengths of 145.4
# samples for language ru: 85 with avg. lengths of 386.3764705882353
# samples for language en: 85 with avg. lengths of 916.7529411764706
# samples for language ar: 85 with avg. lengths of 103.47058823529412

Now DataFrame contains all languages with 935 rows.

ar    85
pt    85
fr    85
es    85
ja    85
ru    85
pl    85
de    85
nl    85
en    85
it    85

Saved at ../raw_data/languages.tsv


In [94]:
# List the .tsv files currently present in the raw-data directory.
!ls ../raw_data/*.tsv

../raw_data/languages.tsv


## Checkpoint - Curate Sets

In [95]:
# Checkpoint: rebuild lang_df from the tab-separated file on disk, using the
# first column as the index, then display the first rows.
data_path = os.path.join(data_dir, 'languages.tsv')
lang_df = pd.read_csv(data_path, sep='\t', index_col=0)
lang_df.head()

Unnamed: 0,content,language
0,Foi laureada com o Prémio Princesa das Astúria...,pt
1,não precisam de explicações e estão a par com ...,pt
2,"Bachelet, trazia a notícia sobre a sua vitória...",pt
3,mitida. Artigos disponíveis em mais de uma lí...,pt
4,s sobre a necessidade da liberdade de panorama...,pt


In [120]:
# Fixed seed so the persisted train/val/test membership is the same on every
# re-run; without it each execution wrote a different split to disk.
SPLIT_SEED = 42

# Hold out 20% as test, then 25% of the remainder as validation
# (0.25 * 0.8 = 0.2 of the whole), both stratified by language.
dev, test = train_test_split(lang_df, test_size=0.2, stratify=lang_df.language,
                             random_state=SPLIT_SEED)
train, val = train_test_split(dev, test_size=0.25, stratify=dev.language,
                              random_state=SPLIT_SEED)

# The three parts must partition the data exactly.
assert len(train) + len(val) + len(test) == len(lang_df)

# assign new column 'set' to specify train, test, or val for a sample
test = test.assign(set='test')
train = train.assign(set='train')
val = val.assign(set='val')

print(f'Training Distributions: \n{train.language.value_counts().to_string()}')
print(f'\nValidation Distributions: \n{val.language.value_counts().to_string()}')
print(f'\nTest Distributions: \n{test.language.value_counts().to_string()}')

lang_set_df = pd.concat([train,test,val], axis=0)
print(f'\nSample Size Distribution: \n{lang_set_df.set.value_counts().to_string()}')

lang_set_df.to_csv(os.path.join(data_dir,'language_sets.tsv'),sep='\t')

Training Distributions: 
pl    51
pt    51
de    51
nl    51
en    51
it    51
ar    51
es    51
ja    51
fr    51
ru    51

Validation Distributions: 
ar    17
de    17
es    17
pt    17
en    17
pl    17
nl    17
ja    17
fr    17
ru    17
it    17

Test Distributions: 
pl    17
ar    17
es    17
de    17
nl    17
ja    17
pt    17
fr    17
en    17
ru    17
it    17

Sample Size Distribution: 
train    561
test     187
val      187


## Checkpoint - Visualize Train Set with t-SNE in TensorBoard

In [143]:
# Checkpoint: rebuild lang_set_df (samples plus their 'set' column) from the
# tab-separated file on disk, using the first column as the index, then
# display the first rows.
data_path = os.path.join(data_dir, 'language_sets.tsv')
lang_set_df = pd.read_csv(data_path, sep='\t', index_col=0)
lang_set_df.head()

Unnamed: 0,content,language,set
32,echter geen sprake van een volledige afspiegel...,nl,train
42,chtelijke kennisgevingen niet gerealiseerd wor...,nl,train
45,め一般には大きな問題とされることは少ないだが英語版ではメインペジのいたずら書きが何回も発生し...,ja,train
1,był w języku angielskim .Wikipedia działa w...,pl,train
53,انتونيزية · السوندية · الكيشوية · الزازا...,ar,train


In [144]:
# from https://stackoverflow.com/questions/41258391/tensorboard-embedding-example
# Writes character n-gram count vectors of the training samples, plus a label
# file, into a TensorBoard projector log directory.
log_path = os.path.join(eda_model_dir,'log')
# metadata.tsv is written with a plain open(), which fails if the directory
# does not exist yet.
os.makedirs(log_path, exist_ok=True)

vec = CountVectorizer(analyzer='char_wb', ngram_range=(1,2))

train = lang_set_df[lang_set_df.set == 'train']
labels = train.language
train_X = vec.fit_transform(train.content)
vocab = vec.vocabulary_

dim = len(vocab)
print('Document Vector Dimensions:', dim)

# Dense float32 copy of the count matrix already computed by fit_transform;
# one row per training sample. (The original re-vectorized every row one at a
# time through the sparse `.A` shortcut.)
embedding = train_X.toarray().astype(np.float32)

# write labels (texts): one line per sample, '<language>__<content>' with all
# whitespace in the content collapsed to underscores so each label stays on one line.
# UTF-8 is explicit because the contents are multilingual.
with open(os.path.join(log_path,'metadata.tsv'),'w', encoding='utf-8') as f:
    for language, content in zip(labels, train.content):
        f.write(f'{language}__{"_".join(content.split())}\n')

# setup a TensorFlow session
tf.reset_default_graph()
sess = tf.InteractiveSession()
try:
    # The variable starts as a dummy [0.0]; validate_shape=False lets the assign
    # replace it with the full embedding matrix fed through the placeholder.
    X = tf.Variable([0.0], name='embedding')
    place = tf.placeholder(tf.float32, shape=embedding.shape)
    set_x = tf.assign(X, place, validate_shape=False)
    sess.run(tf.global_variables_initializer())
    sess.run(set_x, feed_dict={place: embedding})

    # create a TensorFlow summary writer
    summary_writer = tf.summary.FileWriter(log_path, sess.graph)
    try:
        config = projector.ProjectorConfig()
        embedding_conf = config.embeddings.add()
        embedding_conf.tensor_name = 'embedding:0'
        embedding_conf.metadata_path = 'metadata.tsv'
        projector.visualize_embeddings(summary_writer, config)
    finally:
        # Close the writer so its event file is flushed and released.
        summary_writer.close()

    # save the model
    saver = tf.train.Saver()
    saver.save(sess, os.path.join(log_path, 'model.ckpt'))
finally:
    # Release the session even when one of the steps above raises.
    sess.close()

Document Vector Dimensions: 7126


In [145]:
# List the files in the EDA projector log directory.
!ls ../models/EDA/log

checkpoint
events.out.tfevents.1526416622.JGIERINGER.local
metadata.tsv
model.ckpt.data-00000-of-00001
model.ckpt.index
model.ckpt.meta
projector_config.pbtxt


In [None]:
# Serve the EDA log directory with TensorBoard. The command keeps running
# (and keeps this cell busy) until it is interrupted.
!tensorboard --logdir=../models/EDA/log

  from ._conv import register_converters as _register_converters
2018-05-24 17:14:09.080667: I tensorflow/core/platform/cpu_feature_guard.cc:140] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 FMA
TensorBoard 1.8.0 at http://JGIERINGER.local:6006 (Press CTRL+C to quit)


In [138]:
# List the EDA projector log directory again.
!ls ../models/EDA/log/

# Pretrained Word Embeddings

In [21]:
# DO NOT RUN IF YOU DON'T WANT TO DOWNLOAD A 4 GIG FILE :)

# Check if the resource directory already has pretrained vectors; download and
# decompress the multilingual Numberbatch file only when none is present.
numbatch_url = 'https://conceptnet.s3.amazonaws.com/downloads/2017/numberbatch/numberbatch-17.06.txt.gz'
numbatch_fn = 'numberbatch-17.06.txt.gz'
numbatch_path = os.path.join(resource_path,numbatch_fn)
# Decompressed target: the archive name without its '.gz' suffix.
numbatch_txt_path = numbatch_path[:-len('.gz')]
# Listing a missing directory raises, so make sure it exists first.
os.makedirs(resource_path, exist_ok=True)
resources = os.listdir(resource_path)
# NOTE(review): any file with 'numberbatch' in its name counts as "already
# downloaded", including a partial archive left by an interrupted run.
if not any('numberbatch' in f for f in resources):
    print('Downloading Numberbatch Multilingual')
    # wget's output-file option is a capital letter O; the original passed the
    # digit zero ('-0'), which wget does not accept.
    if os.system('wget -O {} {}'.format(numbatch_path, numbatch_url)) != 0:
        raise RuntimeError('Download failed: {}'.format(numbatch_url))
    # Redirect into a file path; the original redirected onto the resource
    # directory itself, which cannot be written to as a file.
    if os.system('gunzip -c {} > {}'.format(numbatch_path, numbatch_txt_path)) != 0:
        raise RuntimeError('Decompression failed: {}'.format(numbatch_path))
else:
    print('Woohoo you don\'t need to download a big file!')

Woohoo you don't need to download a big file!


In [None]:
# from https://stackoverflow.com/questions/41258391/tensorboard-embedding-example
# Writes pretrained word vectors, plus a one-word-per-line label file, into a
# TensorBoard projector log directory.
# Log directory is derived from the shared config, matching the EDA section.
log_path = os.path.join(pretrn_model_dir,'log')
# metadata.tsv is written with a plain open(), which fails if the directory
# does not exist yet.
os.makedirs(log_path, exist_ok=True)

# load model
# Project-relative location instead of a user-specific absolute path.
# NOTE(review): the file name suggests an English-only Numberbatch subset; it
# must be placed in resource_path. Use 'numberbatch-17.06.txt' instead to load
# the multilingual vectors.
embedding_fn = os.path.join(resource_path,'numberbatch-en-17.06.txt')
word2vec = ds.load_word_embeddings(embedding_fn,skip_lines=1,binary=False)
# Read the dimensionality from whichever entry comes first rather than
# requiring the vocabulary to contain the key 'a'.
first_word = next(iter(word2vec))
dim = len(word2vec[first_word])

# create a list of vectors and write labels (words)
embedding = np.empty((len(word2vec), dim), dtype=np.float32)
# UTF-8 is explicit so non-ASCII vocabulary entries are written consistently.
with open(os.path.join(log_path,'metadata.tsv'),'w', encoding='utf-8') as f:
    for i, word in enumerate(word2vec):
        embedding[i] = word2vec[word]
        f.write(word + '\n')

# setup a TensorFlow session
tf.reset_default_graph()
sess = tf.InteractiveSession()
try:
    # The variable starts as a dummy [0.0]; validate_shape=False lets the assign
    # replace it with the full embedding matrix fed through the placeholder.
    X = tf.Variable([0.0], name='embedding')
    place = tf.placeholder(tf.float32, shape=embedding.shape)
    set_x = tf.assign(X, place, validate_shape=False)
    sess.run(tf.global_variables_initializer())
    sess.run(set_x, feed_dict={place: embedding})

    # create a TensorFlow summary writer
    summary_writer = tf.summary.FileWriter(log_path, sess.graph)
    try:
        config = projector.ProjectorConfig()
        embedding_conf = config.embeddings.add()
        embedding_conf.tensor_name = 'embedding:0'
        embedding_conf.metadata_path = 'metadata.tsv'
        projector.visualize_embeddings(summary_writer, config)
    finally:
        # Close the writer so its event file is flushed and released.
        summary_writer.close()

    # save the model
    saver = tf.train.Saver()
    saver.save(sess, os.path.join(log_path, 'model.ckpt'))
finally:
    # Release the session even when one of the steps above raises.
    sess.close()

In [8]:
!ls ..models/pretrained_word_embeddings/log

In [None]:
# Serve the pretrained-embeddings log directory with TensorBoard. The command
# keeps running (and keeps this cell busy) until it is interrupted.
!tensorboard --logdir=../models/pretrained_word_embeddings/log