In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
from collections import Counter
import numpy as np
from nltk.tokenize import sent_tokenize
import time
import torch
import spacy
from pathlib import Path
from sklearn.model_selection import train_test_split
import torch.nn.functional as F

import nltk
nltk.download('punkt')


In [None]:
# Prepare the text using both the nltk sentence tokenizer (https://www.nltk.org/api/nltk.tokenize.html)
# AND the spaCy English pipeline (see https://spacy.io/models/en)


def prepare_texts(text, min_frequency=3, nlp=None):
    """Lemmatize `text` and build a frequency-ordered vocabulary with an OOV entry.

    The text is split into sentences with nltk's `sent_tokenize`; every sentence
    is passed through a spaCy pipeline and the lemma of each token is kept unless
    the token is tagged PUNCT / SPACE / SYM / NUM / X or the lemma consists only
    of punctuation characters.  Lemmas that occur fewer than `min_frequency`
    times are replaced by the "<oov>" token.

    Args:
        text: The raw corpus as a single string.
        min_frequency: Minimum number of occurrences a lemma needs in order to
            receive its own vocabulary entry.
        nlp: Optional callable mapping a sentence string to an iterable of
            tokens exposing `pos_` and `lemma_` (for example an already loaded
            spaCy pipeline).  When omitted, "en_core_web_sm" is loaded on every
            call, which is slow if the function is called repeatedly.

    Returns:
        A tuple `(filtered_lemmas, w2i, i2w)`:
          * filtered_lemmas: the lemmas in corpus order, with infrequent ones
            replaced by "<oov>".
          * w2i: dict word -> index; index 0 is the most frequent word and
            "<oov>" gets the last index.
          * i2w: dict index -> word, the inverse of `w2i`.
    """
    # Get a callable object from spacy that processes the text - lemmatizes and
    # determines part of speech (unless the caller already supplied one).
    if nlp is None:
        nlp = spacy.load("en_core_web_sm")

    skipped_pos = {"PUNCT", "SPACE", "SYM", "NUM", "X"}
    punctuation_chars = set("[]|.,/?'\"+-=")

    # Some text cleaning. Do it by sentence, and eliminate punctuation.
    lemmas = []
    for sent in sent_tokenize(text):  # sent_tokenize separates the sentences
        for tok in nlp(sent):
            if tok.pos_ in skipped_pos:
                continue
            lemma = tok.lemma_
            # Drop lemmas made up solely of punctuation characters (this also
            # covers the empty lemma).  A substring test against the literal
            # would only catch contiguous runs such as ".," and would let
            # "--" or "..." through.
            if all(ch in punctuation_chars for ch in lemma):
                continue
            lemmas.append(lemma)

    # Count the frequency of each lemmatized word; most_common() sorts by
    # decreasing frequency and keeps first-seen order among ties.
    freqs = Counter(lemmas)

    # per Mikolov, don't use the infrequent words, as there isn't much to learn in that case
    frequent_words = [word for word, count in freqs.most_common() if count >= min_frequency]

    # Create the dictionaries to go from word to index or vice versa
    w2i = {word: index for index, word in enumerate(frequent_words)}
    i2w = dict(enumerate(frequent_words))

    # Create an Out Of Vocabulary (oov) token as well; it takes the last index.
    oov_index = len(frequent_words)
    w2i["<oov>"] = oov_index
    i2w[oov_index] = "<oov>"

    # Set all of the words not included in the vocabulary to oov
    filtered_lemmas = [lem if lem in w2i else "<oov>" for lem in lemmas]

    return filtered_lemmas, w2i, i2w

### `tokenize_and_preprocess_text` creates the training samples for the model. It walks through each word in the corpus, looks at a window (of size `window`) of words around it, and creates input/output prediction pairs. We need both positive (in-window) samples and negative (out-of-window) samples.

In [None]:
def tokenize_and_preprocess_text(textlist, w2i, window):
    """
    Skip-gram negative sampling: Predict if the target word is in the context.
    Uses binary prediction so we need both positive and negative samples

    Args:
        textlist: Sequence of words in corpus order (presumably the lemma list
            produced by `prepare_texts` - confirm with the caller).
        w2i: dict word -> vocabulary index.  Words missing from it are mapped
            to the index of "<oov>"; a KeyError is raised if that entry is
            missing too.
        window: Total window size including the centre word, i.e.
            `(window - 1) // 2` words are taken on each side (window=5 means
            two words to the left and two to the right).
            NOTE(review): confirm this is the intended meaning of `window`.

    Returns:
        Three parallel lists `(X, T, Y)` of equal length:
          * X[k]: index of the centre word,
          * T[k]: index of the paired word,
          * Y[k]: 1 when T[k] was found inside the window around X[k],
                  0 when it was drawn from outside the window.
        Every positive pair is followed by one negative pair whose word is
        drawn uniformly from the corpus positions outside the window (so
        frequent words are drawn more often).  When the window already covers
        the whole corpus no negative is emitted.  Negatives use
        `np.random`, so call `np.random.seed(...)` first for reproducibility.
    """
    X, T, Y = [], [], []

    # Tokenize the input: words -> vocabulary indices, unknown words -> <oov>.
    tokens = [w2i[word] if word in w2i else w2i["<oov>"] for word in textlist]

    half = max(0, (window - 1) // 2)  # number of context words on each side
    n_tokens = len(tokens)

    # Loop through each token
    for position, center in enumerate(tokens):
        lo = max(0, position - half)
        hi = min(n_tokens, position + half + 1)  # exclusive upper bound
        n_outside = n_tokens - (hi - lo)         # positions available for negatives

        for neighbour in range(lo, hi):
            if neighbour == position:
                continue

            # Positive sample: a word that really occurs inside the window.
            X.append(center)
            T.append(tokens[neighbour])
            Y.append(1)

            if n_outside > 0:
                # Negative sample: pick a position in [0, lo) U [hi, n_tokens).
                # Draw from the compacted range and skip over the window.
                pick = int(np.random.randint(n_outside))
                if pick >= lo:
                    pick += hi - lo
                X.append(center)
                T.append(tokens[pick])
                Y.append(0)

    return X, T, Y

## Define Model that will be trained to produce word vectors

In [None]:
class SkipGramNegativeSampling(torch.nn.Module):
    """Skip-gram with negative sampling: scores (x, t) word-index pairs.

    Two embedding tables are kept, one looked up with `x` and one looked up
    with `t`.  The score of a pair is the dot product of the two vectors.

    Attributes:
        embedding: torch.nn.Embedding(vocab_size, embedding_size) used for `x`;
            `embedding.weight` holds the word vectors.
        target_embedding: torch.nn.Embedding(vocab_size, embedding_size) used
            for `t`.
    """

    def __init__(self, vocab_size, embedding_size):
        """Create the two (vocab_size x embedding_size) embedding tables."""
        super().__init__()

        self.embedding = torch.nn.Embedding(vocab_size, embedding_size)
        self.target_embedding = torch.nn.Embedding(vocab_size, embedding_size)

    def forward(self, x, t):
        """Return one raw score (logit) per (x, t) pair.

        The result is NOT passed through a sigmoid: apply `torch.sigmoid` to
        get a probability, or train with a logit-based loss such as
        `torch.nn.functional.binary_cross_entropy_with_logits`.
        """
        # x: torch.tensor of shape (batch_size), context word
        # t: torch.tensor of shape (batch_size), target ("output") word.

        x_vectors = self.embedding(x)          # (batch_size, embedding_size)
        t_vectors = self.target_embedding(t)   # (batch_size, embedding_size)

        # Dot product of each pair of vectors -> shape (batch_size)
        prediction = torch.sum(x_vectors * t_vectors, dim=-1)

        return prediction

#### The training function - give it the text and it does the rest

In [None]:
def train_sgns(textlist, window, embedding_size):
    """Train a skip-gram-with-negative-sampling model (not implemented yet).

    Intended contract, taken from the skeleton comments below:

    Args:
        textlist: a list of strings (the corpus).
        window: window size handed to `tokenize_and_preprocess_text`.
        embedding_size: dimensionality of the word vectors.

    Returns:
        The trained network.

    Raises:
        NotImplementedError: always, until the steps below are filled in.
            This replaces the NameError that `return network` produced while
            `network` was never assigned.

    NOTE(review): `tokenize_and_preprocess_text` also needs a `w2i` mapping,
    which is not a parameter here - decide whether it is passed in or built
    inside this function before implementing.
    """
    # Set up a model with Skip-gram with negative sampling (predict context with word)
    # textlist: a list of strings

    # TO DO: Create Training Data

    # TO DO: Split the training data

    # TO DO: instantiate the network & set up the optimizer

    raise NotImplementedError(
        "train_sgns is still a skeleton: create the training data, split it, "
        "build the network and optimizer, train, and return the network."
    )

### Run Training and retrieve embedding

In [None]:
# Run the training loop



### Reduce the Dimensionality of Embeddings and Display

In [None]:
from sklearn.decomposition import PCA #see https://towardsdatascience.com/pca-using-python-scikit-learn-e653f8989e60

def visualize_embedding(embedding, most_frequent_from=20, most_frequent_to=80, i2w=None):
    """Project a slice of the embedding matrix to 2-D with PCA and plot it.

    Args:
        embedding: 2-D array indexed as `embedding[row, :]`, one row per
            vocabulary index (rows are expected to be ordered from the most
            to the least frequent word).
        most_frequent_from: first row (inclusive) of the slice to plot.
        most_frequent_to: last row (exclusive) of the slice to plot.
        i2w: optional dict index -> word used to label the points.  When it is
            omitted, or has no entry for a row, the row index is used as label.

    Returns:
        None.  The figure is shown with `plt.show()`.

    Raises:
        ValueError: if the selected slice has fewer than two rows or columns,
            since a 2-component PCA cannot be fitted to it.
    """
    print ("Visualizing the {} to {} most frequent words".format(most_frequent_from, most_frequent_to))

    # since the embeddings are ordered from most frequent words to least frequent,
    # we can easily select a sub range of the most frequent words:

    selected_words = embedding[most_frequent_from:most_frequent_to, :]

    # Vocabulary indices of the selected rows; slicing a range follows the
    # same clipping rules as slicing the array itself.
    selected_indices = range(len(embedding))[most_frequent_from:most_frequent_to]

    if min(selected_words.shape) < 2:
        raise ValueError(
            "need at least 2 words and 2 embedding dimensions to plot, got shape {}".format(
                tuple(selected_words.shape)
            )
        )

    # The function below will reduce a vector to 2 principal components

    pca = PCA(n_components=2)

    # Transform the selected embeddings to have 2 dimensions

    embeddings = pca.fit_transform(selected_words)

    # Plot the reduced embeddings - a point and the word itself

    fig, ax = plt.subplots(figsize=(10, 10))
    ax.scatter(embeddings[:, 0], embeddings[:, 1])
    for (px, py), index in zip(embeddings, selected_indices):
        label = str(index) if i2w is None else i2w.get(index, str(index))
        ax.annotate(label, (px, py))
    ax.set_title("Word embeddings {} to {} (PCA, 2 components)".format(most_frequent_from, most_frequent_to))
    ax.set_xlabel("principal component 1")
    ax.set_ylabel("principal component 2")
    plt.show()
    
# NOTE(review): `embedding` is not assigned anywhere in the visible notebook
# (the "Run the training loop" cell above is still empty), so on a fresh kernel
# this call fails with NameError until that cell defines it.  The
# `.detach().numpy()` chain suggests it is meant to be a torch tensor holding
# the trained word vectors - confirm once training is implemented.
visualize_embedding(embedding.detach().numpy(), most_frequent_from=20, most_frequent_to=80)

