<a href="https://colab.research.google.com/github/cr0wley-zz/Embeddings/blob/main/Glove.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import numpy as np
import tensorflow as tf
from tqdm import tqdm
import matplotlib.pyplot as plt

In [7]:
# Small hand-written corpus; it is the input passed to word_processor and
# compute_co_occurrence_matrix in the cells that follow.
data = """
My name is Devjyoti. Your name is Aritra. Aritra loves coding. Devjyoti Loves Sleeping. Aritra loves cricket. Devjyoti loves watching movies. Aritra likes 
western movies. Devjyoti likes eastern movies. 
"""

In [20]:
def word_processor(data):
  """Tokenize raw text and build the vocabulary lookup tables.

  Arguments:
    data (str): Raw text. It is split into word tokens by Keras'
      `text_to_word_sequence`.

  Returns:
    tuple: `(vocab, vocab_size, vocab_to_ix, ix_to_vocab, text_as_int)`
      vocab (list of str): the unique tokens in sorted order.
      vocab_size (int): number of unique tokens.
      vocab_to_ix (dict): maps each token to its position in `vocab`.
      ix_to_vocab (np.ndarray): `vocab` as an array, so `ix_to_vocab[ix]`
        gives back the token for an index.
      text_as_int (np.ndarray): the token sequence encoded as vocab indices.
  """
  # Converts the data into tokens
  tokens = tf.keras.preprocessing.text.text_to_word_sequence(data)
  # Creates a vocab of unique words; sorting makes the index assignment stable
  vocab = sorted(set(tokens))
  vocab_size = len(vocab)
  # Map the vocab words to individual indices
  vocab_to_ix = {word: ix for ix, word in enumerate(vocab)}
  # Map the indices back to the words in vocab
  ix_to_vocab = np.array(vocab)
  # Convert the data into numbers. The explicit dtype keeps the result an
  # integer index array even when there are no tokens at all.
  text_as_int = np.array([vocab_to_ix[word] for word in tokens], dtype=int)
  return vocab, vocab_size, vocab_to_ix, ix_to_vocab, text_as_int

In [21]:
# Display the full tuple returned for `data`:
# (vocab, vocab_size, vocab_to_ix, ix_to_vocab, text_as_int).
word_processor(data)

(['aritra',
  'coding',
  'cricket',
  'devjyoti',
  'eastern',
  'is',
  'likes',
  'loves',
  'movies',
  'my',
  'name',
  'sleeping',
  'watching',
  'western',
  'your'],
 15,
 {'aritra': 0,
  'coding': 1,
  'cricket': 2,
  'devjyoti': 3,
  'eastern': 4,
  'is': 5,
  'likes': 6,
  'loves': 7,
  'movies': 8,
  'my': 9,
  'name': 10,
  'sleeping': 11,
  'watching': 12,
  'western': 13,
  'your': 14},
 array(['aritra', 'coding', 'cricket', 'devjyoti', 'eastern', 'is',
        'likes', 'loves', 'movies', 'my', 'name', 'sleeping', 'watching',
        'western', 'your'], dtype='<U8'),
 array([ 9, 10,  5,  3, 14, 10,  5,  0,  0,  7,  1,  3,  7, 11,  0,  7,  2,
         3,  7, 12,  8,  0,  6, 13,  8,  3,  6,  4,  8]))

In [11]:
# `vocab_size` is otherwise only assigned by a later cell, so take it from
# word_processor here to keep this cell runnable on a fresh kernel.
vocab_size = word_processor(data)[1]
# Square matrix of zeros with one row and one column per vocabulary word.
co_occurance_matrix = np.zeros((vocab_size, vocab_size))
co_occurance_matrix.shape

(15, 15)

In [43]:
def compute_co_occurrence_matrix(corpus, window_size=1):
    """Compute the co-occurrence matrix for a corpus and window size (default of 1).

      Note: Each word in the text is placed at the center of a window in turn.
            Words near the edges have a smaller number of co-occurring words.

            For example, for the token sequence "all that glitters is not gold"
            with a window size of 2, "that" co-occurs with "all", "glitters"
            and "is".

      Params:
          corpus (str): raw text; it is tokenized and indexed by word_processor
          window_size (int): number of tokens considered on each side of the
              center word
      Return:
          M (numpy matrix of shape (vocab_size, vocab_size)):
              Co-occurrence matrix of word counts. M[i, j] counts how often
              word j appears inside the window centered on word i, so M is
              symmetric. Rows/columns follow the index order of vocab_to_ix.
          vocab_to_ix (dict): maps each word to its row/column number in M.
          vocab_size (int): number of unique words in the corpus.
    """
    # Only the size, the lookup table and the encoded text are needed here.
    _, vocab_size, vocab_to_ix, _, text_as_int = word_processor(corpus)

    M = np.zeros((vocab_size, vocab_size))
    doc_len = len(text_as_int)

    for center_pos, center_word in enumerate(text_as_int):
        # Clamp the window to the text so edge words just get fewer neighbours.
        left_boundary = max(center_pos - window_size, 0)
        right_boundary = min(center_pos + window_size + 1, doc_len)

        for outside_pos in range(left_boundary, right_boundary):
            # The center word is not its own context.
            if outside_pos == center_pos:
                continue
            M[center_word, text_as_int[outside_pos]] += 1

    return M, vocab_to_ix, vocab_size




In [44]:
# Build the counts for `data` using one neighbour on each side of every word.
co_occurance_matrix, vocab_to_ix, vocab_size = compute_co_occurrence_matrix(
    data, window_size=1
)

In [35]:
# Show the raw counts: row = center word index, column = neighbouring word index.
co_occurance_matrix

array([[2., 0., 0., 0., 0., 1., 1., 2., 1., 0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 1., 0., 0., 1., 1., 2., 1., 0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 0., 0., 1., 0., 1., 0., 0., 0., 0., 0., 0.],
       [1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 2., 0., 0., 0., 0.],
       [1., 0., 0., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.],
       [2., 1., 1., 2., 0., 0., 0., 0., 0., 0., 0., 1., 1., 0., 0.],
       [1., 0., 0., 1., 1., 0., 0., 0., 0., 0., 0., 0., 1., 1., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 2., 0., 0., 0., 1., 0., 0., 0., 0., 1.],
       [1., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 1., 1., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 1., 0., 1., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0.

In [38]:
# Show the word -> row/column index mapping used by the co-occurrence matrix.
vocab_to_ix

{'aritra': 0,
 'coding': 1,
 'cricket': 2,
 'devjyoti': 3,
 'eastern': 4,
 'is': 5,
 'likes': 6,
 'loves': 7,
 'movies': 8,
 'my': 9,
 'name': 10,
 'sleeping': 11,
 'watching': 12,
 'western': 13,
 'your': 14}

In [39]:
# Column totals: column_totals[j] is the total co-occurrence count of word j.
column_totals = co_occurance_matrix.sum(axis=0)
# A word with no co-occurrences has an all-zero column; dividing by 1 there
# leaves zeros instead of producing NaN from 0/0.
safe_totals = np.where(column_totals > 0, column_totals, 1.0)
# Broadcasting divides every column j by its total, so
# probability_matrix[i, j] = X_ij / X_j and each non-empty column sums to 1.
# NOTE(review): the GloVe paper writes P_ij = X_ij / X_i (row-normalised);
# that is the transpose of this matrix - confirm which orientation is wanted.
probability_matrix = co_occurance_matrix / safe_totals

In [40]:
# Per-column totals of the count matrix (the divisors used for probability_matrix).
co_occurance_matrix.sum(axis = 0)

array([8., 2., 2., 8., 2., 4., 4., 8., 5., 1., 4., 2., 2., 2., 2.])

In [41]:
# Show the normalised co-occurrence matrix computed above from the counts.
probability_matrix

array([[0.25 , 0.   , 0.   , 0.   , 0.   , 0.25 , 0.25 , 0.25 , 0.2  ,
        0.   , 0.   , 0.5  , 0.   , 0.   , 0.   ],
       [0.   , 0.   , 0.   , 0.125, 0.   , 0.   , 0.   , 0.125, 0.   ,
        0.   , 0.   , 0.   , 0.   , 0.   , 0.   ],
       [0.   , 0.   , 0.   , 0.125, 0.   , 0.   , 0.   , 0.125, 0.   ,
        0.   , 0.   , 0.   , 0.   , 0.   , 0.   ],
       [0.   , 0.5  , 0.5  , 0.   , 0.   , 0.25 , 0.25 , 0.25 , 0.2  ,
        0.   , 0.   , 0.   , 0.   , 0.   , 0.5  ],
       [0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.25 , 0.   , 0.2  ,
        0.   , 0.   , 0.   , 0.   , 0.   , 0.   ],
       [0.125, 0.   , 0.   , 0.125, 0.   , 0.   , 0.   , 0.   , 0.   ,
        0.   , 0.5  , 0.   , 0.   , 0.   , 0.   ],
       [0.125, 0.   , 0.   , 0.125, 0.5  , 0.   , 0.   , 0.   , 0.   ,
        0.   , 0.   , 0.   , 0.   , 0.5  , 0.   ],
       [0.25 , 0.5  , 0.5  , 0.25 , 0.   , 0.   , 0.   , 0.   , 0.   ,
        0.   , 0.   , 0.5  , 0.5  , 0.   , 0.   ],
       [0.125, 0.   , 0.

In [47]:
# Dimensionality of every word vector.
EMBEDDING_SIZE = 2
# Fixed seed so the random initial parameters are identical on every run.
SEED = 42
rng = np.random.default_rng(SEED)

# One row per vocabulary word, drawn uniformly from [0, 1) as float64.
context_vector = tf.Variable(rng.random((vocab_size, EMBEDDING_SIZE)))
center_vector = tf.Variable(rng.random((vocab_size, EMBEDDING_SIZE)))
# One scalar bias per word for each of the two roles.
bias_center = tf.Variable(rng.random((vocab_size, 1)))
bias_context = tf.Variable(rng.random((vocab_size, 1)))

In [49]:
def weighted_func(x, x_max=100, alpha=3/4):
  """Weight a co-occurrence count: (x / x_max) ** alpha below x_max, else 1.

  Arguments:
    x (number): the co-occurrence count to weight.
      NOTE(review): assumed non-negative; a negative x with a fractional
      alpha makes np.power return NaN.
    x_max (number): counts at or above this value get the full weight 1.
      Defaults to 100, the value that was previously hard-coded.
    alpha (number): exponent applied below x_max. Defaults to 3/4, the value
      that was previously hard-coded.

  Returns:
    The weight: a NumPy float below x_max, otherwise the integer 1.
  """
  if x < x_max:
    return np.power(x / x_max, alpha)
  return 1

In [None]:
def train_step(indices, loss_list, optimizer=None):
  """Run one GloVe gradient step for a single (center, context) word pair.

  The loss for the pair is the GloVe weighted squared error
  f(X_ij) * (w_i . w~_j + b_i + b~_j - log X_ij) ** 2, where X_ij is read from
  the notebook-level `co_occurance_matrix` and f is `weighted_func`.

  NOTE(review): the original cell had an empty `with` body, so the meaning of
  `indices` is reconstructed here as a (center index, context index) pair -
  confirm against the training loop.

  Arguments:
    indices (sequence of two ints): vocab indices (center_ix, context_ix).
    loss_list (list): the scalar loss of this step is appended to it.
    optimizer: object with `apply_gradients`; when None, the notebook-level
      `opt` is used (it must be defined before calling).

  Returns:
    None. Pairs whose co-occurrence count is zero are skipped without an
    update, because log(0) is undefined.
  """
  center_ix, context_ix = (int(ix) for ix in indices)
  x_ij = co_occurance_matrix[center_ix, context_ix]
  if x_ij <= 0:
    return

  if optimizer is None:
    optimizer = opt

  # Plain Python floats so the tensor arithmetic below stays on the tape.
  weight = float(weighted_func(x_ij))
  log_x_ij = float(np.log(x_ij))
  # The biases are part of the model, so they are updated with the vectors.
  trainable = [context_vector, center_vector, bias_context, bias_center]

  with tf.GradientTape() as tape:
    score = (
        tf.reduce_sum(center_vector[center_ix] * context_vector[context_ix])
        + bias_center[center_ix, 0]
        + bias_context[context_ix, 0]
    )
    log_loss = weight * tf.square(score - log_x_ij)

  loss_list.append(np.array(log_loss))
  grad = tape.gradient(log_loss, trainable)
  optimizer.apply_gradients(zip(grad, trainable))

In [None]:
loss_list = []
for iter in tqdm(range(iterations)):
  # Creation of window
  for i in (vocab):
    for j in rang

In [48]:
# `text_as_int` is only ever a return value of word_processor, so bind it at
# notebook scope before displaying it; otherwise this cell raises NameError
# on a fresh kernel.
text_as_int = word_processor(data)[-1]
text_as_int

array([ 9, 10,  5,  3, 14, 10,  5,  0,  0,  7,  1,  3,  7, 11,  0,  7,  2,
        3,  7, 12,  8,  0,  6, 13,  8,  3,  6,  4,  8])