# Word2vec - Word Embeddings

In [2]:
!pip install adjustText --quiet

  Building wheel for adjustText (setup.py) ... [?25l[?25hdone


In [3]:
import zipfile
import re
import numpy as np
import pandas as pd
import os
import random
import tensorflow as tf
import matplotlib.pyplot as plt
from six.moves.urllib.request import urlretrieve
from sklearn.manifold import TSNE
from adjustText import adjust_text

## **Understanding the data**
## Downloading the data

In [4]:
# Location of the zipped BBC full-text news archive that download_data fetches.
url = 'http://mlg.ucd.ie/files/datasets/bbc-fulltext.zip'

def download_data(url, data_dir):
  """Download the news archive into `data_dir` and extract it there.

  Both steps are skipped when their result is already on disk, so the
  function can be re-run safely.

  Args:
    url: Location of the zip archive to download.
    data_dir: Directory that receives `bbc-fulltext.zip` and the extracted
      contents. It is created if it does not exist. Extraction is skipped
      when `<data_dir>/bbc` already exists (the archive is expected to
      unpack into that folder).

  Raises:
    Any exception raised by `urlretrieve` (failed download) or by
    `zipfile.ZipFile` (invalid archive) is propagated to the caller.
  """

  os.makedirs(data_dir, exist_ok=True)

  file_path = os.path.join(data_dir, 'bbc-fulltext.zip')

  if not os.path.exists(file_path):
    print('Downloading file...')
    # Download under a temporary name and rename only on success, so an
    # interrupted download is never mistaken for a complete archive later.
    partial_path = file_path + '.part'
    try:
      urlretrieve(url, partial_path)
      os.replace(partial_path, file_path)
    finally:
      # Only present if the download or the rename failed.
      if os.path.exists(partial_path):
        os.remove(partial_path)
  else:
    print("File already exists")

  extract_path = os.path.join(data_dir, 'bbc')

  if not os.path.exists(extract_path):
    with zipfile.ZipFile(file_path, 'r') as zipf:
      zipf.extractall(data_dir)
  else:
    print("bbc-fulltext.zip has already been extracted")


# Fetch and unpack the dataset into the local 'data' directory.
download_data(url, 'data')

Downloading file...


## Read Data without Preprocessing

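Reads each news story as-is (no preprocessing) into a single string and returns a list of these strings, one per story. The word count printed below is obtained by splitting every story into space-separated tokens.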


In [5]:
def read_data(data_dir):
    """Read every news story found under `data_dir` into memory.

    The directory tree is walked recursively. Each file whose name does not
    contain 'README' is decoded as latin-1, its lines are stripped and joined
    with single spaces, and the resulting string becomes one story.

    Directories and files are visited in sorted order so that the order of
    the returned stories does not depend on the filesystem.

    Args:
        data_dir: Root directory containing the story files.

    Returns:
        A list of strings, one per story file. Empty if no files are found.
    """
    news_stories = []

    print("Reading files")

    n_read = 0  # Only used for the progress line
    for root, dirs, files in os.walk(data_dir):
        # Sorting in place also fixes the order in which os.walk descends
        dirs.sort()

        for name in sorted(files):
            # We don't read the readme file
            if 'README' in name:
                continue

            # Overwrite one short progress line instead of printing an ever
            # longer row of dots for every file
            n_read += 1
            print(f"Reading file {n_read}: {name}", end='\r')

            with open(os.path.join(root, name), encoding='latin-1') as story_file:
                # One string per document: stripped rows joined by spaces
                news_stories.append(' '.join(row.strip() for row in story_file))

    print(f"\nDetected {len(news_stories)} stories")
    return news_stories
                
  
news_stories = read_data(os.path.join('data', 'bbc'))

# Printing some stats and sample data
# split() without an argument skips the empty tokens that split(' ') would
# produce for consecutive spaces, so they are not counted as words.
print(f"{sum(len(story.split()) for story in news_stories)} words found in the total news set")
# Guard the samples so an empty data directory does not raise IndexError.
if news_stories:
    print('Example words (start): ',news_stories[0][:50])
    print('Example words (end): ',news_stories[-1][-50:])

Reading files
..........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................

## **Build a Tokenizer**

In [6]:
from tensorflow.keras.preprocessing.text import Tokenizer

# No vocabulary cap yet (num_words=None): text is lower-cased, split on
# spaces, and the characters listed in `filters` are stripped.
tokenizer = Tokenizer(
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', split=' ', lower=True, num_words=None
)

# Build the word index from the whole corpus
tokenizer.fit_on_texts(news_stories)
print("Data fitted on the tokenizer")

Data fitted on the tokenizer


## Exploring the tokenizer

In [7]:
# Materialise the (word, index) pairs once and reuse them below
vocab_items = list(tokenizer.word_index.items())

# Vocabulary size is the number of indexed words plus one
n_vocab = len(vocab_items) + 1
print(f"Vocabulary size: {n_vocab}")

# First and last ten entries of the word index
print("\nWords at the top")
print('\t', dict(vocab_items[:10]))
print("\nWords at the bottom")
print('\t', dict(vocab_items[-10:]))

Vocabulary size: 32360

Words at the top
	 {'the': 1, 'to': 2, 'of': 3, 'and': 4, 'a': 5, 'in': 6, 'for': 7, 'is': 8, 'that': 9, 'on': 10}

Words at the bottom
	 {'shefrin': 32350, 'holly': 32351, 'frankin': 32352, 'bloopers': 32353, "tabloids'": 32354, 'scrapbook': 32355, 'souvenir': 32356, 'stepdaughter': 32357, 'ass': 32358, 'saver': 32359}


## Build a Tokenizer (Refined)

Here, we will restrict the vocabulary to 15000 entries: only the most common words are kept, and every other word is replaced by the out-of-vocabulary token.


In [11]:
from tensorflow.keras.preprocessing.text import Tokenizer

# Vocabulary budget: 15000 entries plus one extra slot
n_vocab = 15000 + 1

# NOTE(review): oov_token is an empty string - confirm this is intended and
# not a placeholder such as '<unk>' that was lost somewhere.
# NOTE(review): per the Keras docs only word indices below num_words are kept
# and the OOV token takes an index of its own, so fewer than 15000 distinct
# real words may survive - confirm this matches the intent.
tokenizer = Tokenizer(num_words=n_vocab - 1, oov_token='', split=' ', lower=True)

# Re-fit on the same corpus with the capped vocabulary
tokenizer.fit_on_texts(news_stories)
print("Data fitted on the tokenizer")

Data fitted on the tokenizer
