### importing the required libraries

In [3]:
import random, collections, math, os, zipfile, time, re
import numpy as np
import tensorflow as tf

from matplotlib import pylab
%matplotlib inline

from six.moves import range
from six.moves.urllib.request import urlretrieve

### download and extract dataset

In [4]:
# Base URL that the archive name is appended to when downloading.
dataset_link = 'http://mattmahoney.net/dc/'
# Archive name; used both as the remote file name and as the local path.
zip_file = 'text8.zip'

def data_download(zip_file):
    """Download the dataset archive unless it is already present locally.

    Args:
        zip_file: Name of the archive. It is appended to the module-level
            ``dataset_link`` to build the URL and is also used as the local
            path the archive is saved to.

    Returns:
        None. Exactly one status message is printed: either that the
        archive already existed or that it was downloaded.
    """
    # Guard clause: report the existing archive and stop, so the
    # "already exists" message is never printed after a fresh download.
    if os.path.exists(zip_file):
        print('Dataset already exists')
        return None

    urlretrieve(dataset_link + zip_file, zip_file)
    print('Dataset downloaded')
    return None
# Fetch the archive named by `zip_file` if it is not already on disk.
data_download(zip_file)

Dataset already exists


In [5]:
extracted_folder = 'dataset'
# Location of the corpus file inside the extraction folder; derived from
# `extracted_folder` so the two can never drift apart.
text8_path = os.path.join(extracted_folder, 'text8')

# Extract whenever the corpus file itself is missing. Checking only for the
# folder would skip extraction after an interrupted or partial run and then
# fail on the read below.
if not os.path.isfile(text8_path):
    with zipfile.ZipFile(zip_file) as zf:
        zf.extractall(extracted_folder)

# Explicit encoding keeps the read independent of the platform default.
with open(text8_path, encoding='utf-8') as ft_:
    full_text = ft_.read()

### Text processing

In [6]:
def text_processing(ft8_text):
    """Lower-case the text and split it into tokens, with punctuation as markers.

    Each punctuation symbol listed below is replaced by a named marker such
    as ``<period>``. The marker is padded with a space on both sides so that
    it always becomes its own token instead of fusing with the preceding
    word (``"hello."`` -> ``["hello", "<period>"]``).

    Args:
        ft8_text: The raw text as a single string.

    Returns:
        A list of whitespace-separated tokens.
    """
    # (symbol, marker) pairs applied in order. No marker contains any of the
    # symbols, so an earlier replacement is never re-matched by a later one.
    replacements = (
        ('.', ' <period> '),
        (',', ' <comma> '),
        ('"', ' <quotation> '),  # a single quote character, not a doubled one
        (';', ' <semicolon> '),
        ('!', ' <exclamation> '),
        ('?', ' <question> '),
        ('(', ' <paren_l> '),
        (')', ' <paren_r> '),
        ('--', ' <hyphen> '),
        (':', ' <colon> '),
    )

    ft8_text = ft8_text.lower()
    for symbol, marker in replacements:
        ft8_text = ft8_text.replace(symbol, marker)

    # split() with no argument collapses the extra whitespace added above.
    return ft8_text.split()

# Tokenise the full corpus text read from disk.
ft_tokens = text_processing(full_text)

#### Remove noise: drop rare words (those occurring 7 times or fewer)

In [7]:
# Tally how many times each token occurs; the first print is the number of
# distinct tokens.
word_cnt = collections.Counter(ft_tokens)
print(len(word_cnt))

# Keep only the tokens seen more than 7 times, preserving corpus order.
shortlisted_words = [token for token in ft_tokens if word_cnt[token] > 7]
print(shortlisted_words[:10])

# Corpus size after filtering, then the size of the remaining vocabulary.
print(len(shortlisted_words))
print("Unique ones: ", len(set(shortlisted_words)))

253854
['anarchism', 'originated', 'as', 'a', 'term', 'of', 'abuse', 'first', 'used', 'against']
16616688
Unique ones:  53721


#### Create a dictionary of the words in the dataset, indexed in descending order of frequency

In [14]:
def dict_creation(shortlisted_words):
    """Build word<->index lookup tables ordered by descending frequency.

    Index 0 is assigned to the most frequent word. Words with equal counts
    keep the order in which they first appear in ``shortlisted_words``.

    Args:
        shortlisted_words: Iterable of hashable tokens.

    Returns:
        A ``(word_to_index, index_to_word)`` pair of dictionaries.
    """
    frequency = collections.Counter(shortlisted_words)
    # Stable sort: ties retain the Counter's insertion (first-seen) order.
    ranked = sorted(frequency, key=lambda token: frequency[token], reverse=True)

    index_to_word = dict(enumerate(ranked))
    word_to_index = {token: position for position, token in enumerate(ranked)}
    return word_to_index, index_to_word

# Build the word -> index and index -> word lookup tables for the filtered corpus.
dictionary_, rev_dictionary_ = dict_creation(shortlisted_words)

# Encode the corpus as integer indices. Despite its name, `words_cnt` holds
# one dictionary index per token (in corpus order), not counts.
words_cnt = [dictionary_[word] for word in shortlisted_words]

# Index of the first token in the corpus.
print(words_cnt[0])

5233


## Let's start with the Skip-Gram model

### Create a threshold and perform the subsampling

In [18]:
# Subsampling threshold used in the drop-probability formula below.
thresh = 0.00005

# Occurrences of each word index, and the total number of tokens.
word_counts = collections.Counter(words_cnt)
total_count = len(words_cnt)

# Relative frequency of every word index in the corpus.
freqs = {word_id: n_seen / total_count for word_id, n_seen in word_counts.items()}

# Drop probability: 1 - sqrt(thresh / frequency). It is negative for words
# rarer than `thresh`, which therefore are always kept.
p_drop = {word_id: 1 - np.sqrt(thresh / freq) for word_id, freq in freqs.items()}

# One random draw per token; a token survives when the draw exceeds its drop
# probability. The generator is not seeded here, so results vary run to run.
train_words = [word_id for word_id in words_cnt if random.random() > p_drop[word_id]]

print(train_words[0])

5233


In [None]:
def skipG_target_set_generation(batch_, batch_index, word_window):
    """Collect the distinct context words around one position for Skip-Gram.

    A reach is drawn uniformly from 1..word_window (inclusive). Up to that
    many items on each side of ``batch_index`` are gathered, excluding the
    item at ``batch_index`` itself, and duplicates are removed.

    Args:
        batch_: Sequence of words; the two slices are joined with ``+``, so
            this is presumably a list (verify against callers).
        batch_index: Position of the centre word within ``batch_``.
        word_window: Maximum number of neighbours taken on each side.

    Returns:
        A list of the unique neighbouring items (order follows set iteration).
    """
    reach = np.random.randint(1, word_window + 1)

    # Clamp the left edge so a reach past the start does not wrap around.
    left_edge = max(batch_index - reach, 0)
    right_edge = batch_index + reach + 1

    preceding = batch_[left_edge:batch_index]
    following = batch_[batch_index + 1:right_edge]
    return list(set(preceding + following))