# Word2vec - Word Embeddings

In [2]:
!pip install adjustText --quiet

  Building wheel for adjustText (setup.py) ... [?25l[?25hdone


In [3]:
import zipfile
import re
import numpy as np
import pandas as pd
import os
import random
import tensorflow as tf
import matplotlib.pyplot as plt
from six.moves.urllib.request import urlretrieve
from sklearn.manifold import TSNE
from adjustText import adjust_text

## **Understanding the data**
## Downloading the data

In [4]:
url = 'http://mlg.ucd.ie/files/datasets/bbc-fulltext.zip'

def download_data(url, data_dir):
  """Download a file if not present, and make sure it's the right size."""

  os.makedirs(data_dir, exist_ok=True)

  file_path = os.path.join(data_dir, 'bbc-fulltext.zip')

  if not os.path.exists(file_path):
    print('Downloading file...')
    filename, _ = urlretrieve(url, file_path)
  else:
    print("File already exists")

  extract_path = os.path.join(data_dir, 'bbc')

  if not os.path.exists(extract_path):
    with zipfile.ZipFile(os.path.join(data_dir, 'bbc-fulltext.zip'), 'r') as zipf:
      zipf.extractall(data_dir)
  else:
    print("bbc-fulltext.zip has already been extracted")


download_data(url, 'data')

Downloading file...


## Read Data without Preprocessing

Reads data as it is to a string and tokenize it using spaces and returns a list of words


In [5]:
def read_data(data_dir):
    
    # This will contain the full list of stories
    news_stories = []
    
    print("Reading files")
    
    i = 0 # Just used for printing progress
    for root, dirs, files in os.walk(data_dir):
        
        for fi, f in enumerate(files):
            
            # We don't read the readme file
            if 'README' in f:
                continue
            
            # Printing progress
            i += 1
            print("."*i, f, end='\r')
            
            # Open the file
            with open(os.path.join(root, f), encoding='latin-1') as f:
                
                story = []
                # Read all the lines
                for row in f:
                                        
                    story.append(row.strip())
                    
                # Create a single string with all the rows in the doc
                story = ' '.join(story)                        
                # Add that to the list
                news_stories.append(story)  
                
        print('', end='\r')
        
    print(f"\nDetected {len(news_stories)} stories")
    return news_stories
                
  
news_stories = read_data(os.path.join('data', 'bbc'))

# Printing some stats and sample data
print(f"{sum([len(story.split(' ')) for story in news_stories])} words found in the total news set")
print('Example words (start): ',news_stories[0][:50])
print('Example words (end): ',news_stories[-1][-50:])

Reading files
..........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................

## **Build a Tokenizer**

In [6]:
from tensorflow.keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer(
    num_words=None,
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
    lower=True,
    split=' ',
)

tokenizer.fit_on_texts(news_stories)
print("Data fitted on the tokenizer")

Data fitted on the tokenizer


## Exploring the tokenizer

In [7]:
n_vocab = len(tokenizer.word_index.items()) + 1
print(f"Vocabulary size: {n_vocab}")

print("\nWords at the top")
print('\t', dict(list(tokenizer.word_index.items())[:10]))
print("\nWords at the bottom")
print('\t', dict(list(tokenizer.word_index.items())[-10:]))

Vocabulary size: 32360

Words at the top
	 {'the': 1, 'to': 2, 'of': 3, 'and': 4, 'a': 5, 'in': 6, 'for': 7, 'is': 8, 'that': 9, 'on': 10}

Words at the bottom
	 {'shefrin': 32350, 'holly': 32351, 'frankin': 32352, 'bloopers': 32353, "tabloids'": 32354, 'scrapbook': 32355, 'souvenir': 32356, 'stepdaughter': 32357, 'ass': 32358, 'saver': 32359}


## Build a Tokenizer (Refined)

Here, we will restrict the vocabulary to 15000 and eleminate words except the first most common 15000 words


In [11]:
from tensorflow.keras.preprocessing.text import Tokenizer


n_vocab = 15000 + 1
tokenizer = Tokenizer(
    num_words=n_vocab-1,
    lower=True,
    split=' ',
    oov_token=''
)

tokenizer.fit_on_texts(news_stories)
print("Data fitted on the tokenizer")

Data fitted on the tokenizer


## Checking the results of the tokenizer

In [12]:
print(f"Original: {news_stories[0][:100]}")
print(f"Sequence IDs: {tokenizer.texts_to_sequences([news_stories[0][:100]])[0]}")

Original: Real will finish abandoned match  Real Madrid and Real Socieded will play the final six minutes of t
Sequence IDs: [286, 23, 1881, 4501, 328, 286, 1076, 5, 286, 1, 23, 153, 2, 253, 191, 495, 4, 1360]


**Converting all articles to word ID sequences**

In [13]:
news_sequences = tokenizer.texts_to_sequences(news_stories)

## **Generating skip-grams from the corpus**

In TensorFlow you have the convenient `tf.keras.preprocessing.sequence.skipgrams()` function to generate skipgrams.


In [17]:
news_sequences[0][:5]

[286, 23, 1881, 4501, 328]

In [18]:
sample_word_ids = news_sequences[0][:5]
sample_phrase = ' '.join([tokenizer.index_word[wid] for wid in sample_word_ids])
print(f"Sample phrase: {sample_phrase}")
print(f"Sample word IDs: {sample_word_ids}\n")

window_size = 1 # How many words to consider left and right.

inputs, labels = tf.keras.preprocessing.sequence.skipgrams(
    sample_word_ids, 
    vocabulary_size=n_vocab, 
    window_size=window_size, negative_samples=1.0, shuffle=False,
    categorical=False, sampling_table=None, seed=None
)


print("Sample skip-grams")

for inp, lbl in zip(inputs, labels):
    print(f"\tInput: {inp} ({[tokenizer.index_word[wi] for wi in inp]}) / Label: {lbl}")

Sample phrase: real will finish abandoned match
Sample word IDs: [286, 23, 1881, 4501, 328]

Sample skip-grams
	Input: [286, 23] (['real', 'will']) / Label: 1
	Input: [23, 286] (['will', 'real']) / Label: 1
	Input: [23, 1881] (['will', 'finish']) / Label: 1
	Input: [1881, 23] (['finish', 'will']) / Label: 1
	Input: [1881, 4501] (['finish', 'abandoned']) / Label: 1
	Input: [4501, 1881] (['abandoned', 'finish']) / Label: 1
	Input: [4501, 328] (['abandoned', 'match']) / Label: 1
	Input: [328, 4501] (['match', 'abandoned']) / Label: 1
	Input: [4501, 13729] (['abandoned', "cameroon's"]) / Label: 0
	Input: [1881, 5840] (['finish', 'housewives']) / Label: 0
	Input: [23, 7357] (['will', 'advances']) / Label: 0
	Input: [1881, 13328] (['finish', 'barnes']) / Label: 0
	Input: [328, 6533] (['match', 'betamax']) / Label: 0
	Input: [286, 6115] (['real', 'sells']) / Label: 0
	Input: [4501, 9146] (['abandoned', 'ginepri']) / Label: 0
	Input: [23, 11943] (['will', 'discovering']) / Label: 0



## Generating negative candidates

Word2vec algorithms rely on negative candidates to understand words that do not appear in the context of a given target word.


In [19]:
inputs, labels = tf.keras.preprocessing.sequence.skipgrams(
    sample_word_ids, 
    vocabulary_size=len(tokenizer.word_index.items())+1, 
    window_size=window_size, negative_samples=0, shuffle=False,    
)

inputs, labels = np.array(inputs), np.array(labels)

negative_sampling_candidates, true_expected_count, sampled_expected_count = tf.random.log_uniform_candidate_sampler(
    # A true context word that appears in the context of the target
    true_classes=inputs[:1,1:], # [b, 1] sized tensor
    num_true=1, # number of true words per example
    num_sampled=10,
    unique=True,
    range_max=n_vocab,            
    name="negative_sampling"
)

print(f"Positive sample: {inputs[:1,1:]}")
print(f"Negative samples: {negative_sampling_candidates}")
print(f"true_expected_count: {true_expected_count}")
print(f"sampled_expected_count: {sampled_expected_count}")

Positive sample: [[23]]
Negative samples: [  44  832  819  226    1   32    9 2025   23    0]
true_expected_count: [[0.04571897]]
sampled_expected_count: [0.02485704 0.00137159 0.00139331 0.00501681 0.37742305 0.03362463
 0.1037828  0.00056434 0.04571897 0.5608634 ]
