In [1]:
# begin by implementing the simple model provided by the author
# copying from https://github.com/Eligijus112/word-embedding-creation/

import itertools
import pandas as pd
import numpy as np
import re
import os
from tqdm import tqdm

# Drawing the embeddings
import matplotlib.pyplot as plt

In [2]:
# Deep learning: more dependencies
# luckily all the required packages were already installed in my mlenv anaconda environment used in class

from keras.models import Input, Model
from keras.layers import Dense

from scipy import sparse

# Custom functions- my own version saved in my local folder
from utils import text_preprocessing, create_unique_word_dict, clean_text

In [3]:
# Build the (focus word, context word) training pairs from the sample corpus

# Number of neighbouring positions, on each side of a word, that count as its context
window = 2

# Load the documents: one entry per row of the 'text' column of the input CSV
texts = list(pd.read_csv('input/sample.csv')['text'])

# Accumulators: every [focus word, context word] pair, and every token in reading order
word_lists = []
all_text = []

for text in texts:

    # Run the document through the text_preprocessing helper from utils;
    # the result is used below as a sequence of tokens (len() and indexing)
    text = text_preprocessing(text)
    all_text.extend(text)

    n_tokens = len(text)
    for i, word in enumerate(text):
        for offset in range(1, window + 1):
            ahead = i + offset
            behind = i - offset
            # Pair the word with the token `offset` positions after it, if there is one
            if ahead < n_tokens:
                word_lists.append([word, text[ahead]])
            # Pair the word with the token `offset` positions before it, if there is one
            if behind >= 0:
                word_lists.append([word, text[behind]])

# Word -> index mapping built by the create_unique_word_dict helper from utils
unique_word_dict = create_unique_word_dict(all_text)

# Number of features (unique words)
n_words = len(unique_word_dict)

# All the unique words, in the dictionary's key order
words = list(unique_word_dict.keys())

# Empty containers for the one hot encoded X and Y rows (filled in the next cell)
X = []
Y = []

In [4]:
# Build the one-hot matrices X (focus word) and Y (context word), one row per word pair.
# Everything the loop fills is created inside this cell, so the cell can be re-run on its
# own without depending on containers left over from another cell.

# Column index of the focus word / context word for every pair, in pair order
main_word_indices = []
context_word_indices = []

# Passing the list itself (rather than enumerate(...)) lets tqdm read its length
# and display a total on the progress bar
for main_word, context_word in tqdm(word_lists):
    # Plain indexing raises KeyError for a word that is missing from the vocabulary.
    # dict.get() would return None instead, and NumPy treats `row[None] = 1` as
    # "set every element of the row to 1", which corrupts the encoding silently.
    main_word_indices.append(unique_word_dict[main_word])
    context_word_indices.append(unique_word_dict[context_word])

n_pairs = len(word_lists)
row_indices = np.arange(n_pairs)
ones = np.ones(n_pairs)
main_word_indices = np.array(main_word_indices, dtype=np.intp)
context_word_indices = np.array(context_word_indices, dtype=np.intp)

# Build the sparse matrices directly from (row, column) coordinates: row k holds a single
# 1.0 in the column of the k-th pair's word. This avoids allocating a dense n_words-long
# row for every pair when the vast majority of the data are 0s.
X = sparse.csr_matrix((ones, (row_indices, main_word_indices)), shape=(n_pairs, n_words))
Y = sparse.csr_matrix((ones, (row_indices, context_word_indices)), shape=(n_pairs, n_words))


84it [00:00, ?it/s]


In [5]:
# Let's inspect the contents of the variables before moving to the next steps.
# print() on the SciPy sparse matrix lists its stored entries as "(row, column)  value".
# X is built with one row per (word, context word) pair, and the column is the index of
# the pair's first word in unique_word_dict.
# NOTE(review): an earlier note here said X has 1355 data points, but the output recorded
# below ends at row 83 (84 pairs) - confirm which input that figure came from.

print(X)

  (0, 6)	1.0
  (1, 6)	1.0
  (2, 7)	1.0
  (3, 7)	1.0
  (4, 11)	1.0
  (5, 11)	1.0
  (6, 4)	1.0
  (7, 12)	1.0
  (8, 17)	1.0
  (9, 11)	1.0
  (10, 10)	1.0
  (11, 10)	1.0
  (12, 8)	1.0
  (13, 8)	1.0
  (14, 8)	1.0
  (15, 2)	1.0
  (16, 2)	1.0
  (17, 2)	1.0
  (18, 7)	1.0
  (19, 7)	1.0
  (20, 10)	1.0
  (21, 10)	1.0
  (22, 20)	1.0
  (23, 20)	1.0
  (24, 20)	1.0
  :	:
  (59, 7)	1.0
  (60, 7)	1.0
  (61, 7)	1.0
  (62, 7)	1.0
  (63, 13)	1.0
  (64, 13)	1.0
  (65, 13)	1.0
  (66, 13)	1.0
  (67, 19)	1.0
  (68, 19)	1.0
  (69, 19)	1.0
  (70, 3)	1.0
  (71, 3)	1.0
  (72, 11)	1.0
  (73, 11)	1.0
  (74, 10)	1.0
  (75, 10)	1.0
  (76, 10)	1.0
  (77, 1)	1.0
  (78, 1)	1.0
  (79, 1)	1.0
  (80, 9)	1.0
  (81, 9)	1.0
  (82, 1)	1.0
  (83, 8)	1.0


In [None]:
# Next chunk: define and compile the network on its own, so the later model.fit error
# can be walked through one step at a time.
# TODO: stop here for now until more troubleshooting is complete.

# Size of the embedding, i.e. the number of units in the hidden layer
embed_size = 2

# Network: a one-hot input as wide as X, a linear hidden layer of embed_size units,
# and a softmax output layer as wide as Y
inp = Input(shape=(X.shape[1],))
hidden = Dense(units=embed_size, activation='linear')(inp)
x = Dense(units=Y.shape[1], activation='softmax')(hidden)

model = Model(inputs=inp, outputs=x)
model.compile(optimizer='adam', loss='categorical_crossentropy')



In [None]:
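# We get an error here, so maybe it's the version of keras, or maybe something did not install?
# I am going to go back to using the provided sample.csv to train the model and see if that has better luck.
# The required version should be 4.43.0 but my mlenv environment has version 4.59.0 - could that be the reason?
# NOTE: 4.43.0 / 4.59.0 look like tqdm version numbers rather than TensorFlow ones
# (TensorFlow has no 4.x release) - double-check which package this requirement refers to.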

# Optimizing the network weights.
# Keras releases differ in whether fit() accepts SciPy sparse matrices directly, so the
# one-hot matrices are converted to dense NumPy arrays first, which fit() always accepts.
# Each dense array holds n_pairs * n_words floats: fine for a small corpus, but a large
# corpus should be fed in batches instead.
X_dense = X.toarray() if sparse.issparse(X) else np.asarray(X)
Y_dense = Y.toarray() if sparse.issparse(Y) else np.asarray(Y)

model.fit(
    x=X_dense,
    y=Y_dense,
    batch_size=256,
    epochs=1000
    )

# The word embeddings are read from the trained network: the first array returned by
# get_weights() is indexed below by each word's position in unique_word_dict
weights = model.get_weights()[0]

# Map every unique word to its numeric vector (the matching row of `weights`)
embedding_dict = {}
for word in words:
    word_index = unique_word_dict.get(word)
    embedding_dict[word] = weights[word_index]

# Plotting the embeddings: one labelled point per word, positioned by the first two
# components of its embedding vector
fig, ax = plt.subplots(figsize=(10, 10))
for word in list(unique_word_dict.keys()):
    coord = embedding_dict.get(word)
    # One scatter call per word advances matplotlib's colour cycle for each point
    ax.scatter(coord[0], coord[1])
    ax.annotate(word, (coord[0], coord[1]))

# Title and axis labels so the figure can be read on its own
ax.set_title('Word embeddings')
ax.set_xlabel('Embedding dimension 1')
ax.set_ylabel('Embedding dimension 2')
plt.show()

# Saving the embedding vectors to output/embedding.txt under the current working directory.
# os.path.join uses the platform's own separator; a hard-coded '\\' only forms a
# directory path on Windows.
output_dir = os.path.join(os.getcwd(), 'output')
try:
    # exist_ok=True: re-running the cell when the folder already exists is not an error
    os.makedirs(output_dir, exist_ok=True)
except OSError as e:
    print(f'Cannot create output folder: {e}')

# UTF-8 is set explicitly so words outside the platform's default encoding can be written
with open(os.path.join(output_dir, 'embedding.txt'), 'w', encoding='utf-8') as f:
    for key, value in embedding_dict.items():
        # Best effort: report a word that cannot be written and carry on with the rest
        try:
            f.write(f'{key}: {value}\n')
        except Exception as e:
            print(f'Cannot write word {key} to dict: {e}')