This experiments solely focuses on handling entity embeddings from glove datasets with only numpys, sklearn and scipy. I wanted to craft networks from scratch rather than using pytorch (like I usually do and didn't want to explode my pc again)


In [1]:

names = [
    "Alice",
    "Bob",
    "Charlie",
    "David",
    "Emma",
    "Fiona",
    "George",
    "Hannah",
    "Isabella",
    "Jack"
]


import numpy as np 
from gloveutils import GloveBox 
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

class Profiles:
    def __init__(self, names: list, **kwargs: dict):
        self.username = names
        self.friends = kwargs['embedding'] # size=[num of entities, 50]
        
        assert isinstance(self.friends, np.ndarray), f'Expected np type, but received {type(self.friends)}'
        assert self.friends.shape[-1] == 50, f"Glove embedding has unusual shape: {self.friends.shape}"
        
    def location(self):
        phone = cosine_similarity(self.friends, self.friends)
        assert phone.shape[0] == phone.shape[-1], f'Expected N by N matrix but received: {phone.shape}'
        data = pd.DataFrame(phone, columns=self.username, index=self.username)
        print(data)

book = GloveBox()
book.enter_basement()

Entering basement now. Fetching books . . .


  from tqdm.autonotebook import tqdm, trange


0it [00:00, ?it/s]

N.o of books loaded in shelf! There are 400001 books on the shelves


In [5]:
name_book = [book.search(word.lower()) for word in names]
name_book

[array([ 0.16386 ,  0.57795 , -0.59197 , -0.32446 ,  0.29762 ,  0.85151 ,
        -0.76695 , -0.20733 ,  0.21491 , -0.51587 , -0.17517 ,  0.94459 ,
         0.12705 , -0.33031 ,  0.75951 ,  0.44449 ,  0.16553 , -0.19235 ,
         0.065533, -0.12394 ,  0.61446 ,  0.89784 ,  0.17413 ,  0.41149 ,
         1.191   , -0.39461 , -0.459   ,  0.022161, -0.50843 , -0.44464 ,
         0.68721 , -0.7167  ,  0.20835 , -0.23437 ,  0.02604 , -0.47993 ,
         0.31873 , -0.29135 ,  0.50273 , -0.55144 , -0.066692,  0.43873 ,
        -0.24293 , -1.0247  ,  0.029375,  0.068499,  0.25451 , -1.9663  ,
         0.26673 ,  0.88486 ], dtype=float32),
 array([-1.075    ,  0.18316  ,  0.32895  ,  0.63907  , -0.56016  ,
         0.57065  , -1.6973   ,  0.23407  , -0.66     , -0.79543  ,
        -0.87456  ,  0.47696  , -0.54104  , -0.010141 ,  0.33098  ,
        -0.023072 ,  0.61555  ,  0.078931 , -0.26537  , -0.673    ,
        -0.47385  ,  0.68288  ,  0.19332  , -0.35322  ,  0.38568  ,
        -1.0739   , -

In [22]:
people = np.stack(name_book)

In [33]:
pd.DataFrame(cosine_similarity(people, people), columns=names, index=names)

Unnamed: 0,Alice,Bob,Charlie,David,Emma,Fiona,George,Hannah,Isabella,Jack
Alice,1.0,0.446056,0.540106,0.469889,0.804588,0.704335,0.509538,0.795454,0.547919,0.605657
Bob,0.446056,1.0,0.729476,0.644884,0.32123,0.312639,0.659827,0.41092,0.026504,0.764855
Charlie,0.540106,0.729476,1.0,0.536244,0.419993,0.373742,0.615513,0.527851,0.187277,0.781151
David,0.469889,0.644884,0.536244,1.0,0.385114,0.264023,0.631671,0.402353,0.130685,0.638008
Emma,0.804588,0.32123,0.419993,0.385114,1.0,0.696199,0.452665,0.705395,0.585676,0.560839
Fiona,0.704335,0.312639,0.373742,0.264023,0.696199,1.0,0.170604,0.62113,0.457493,0.41653
George,0.509538,0.659827,0.615513,0.631671,0.452665,0.170604,1.0,0.461869,0.328048,0.70829
Hannah,0.795454,0.41092,0.527851,0.402353,0.705395,0.62113,0.461869,1.0,0.481333,0.621056
Isabella,0.547919,0.026504,0.187277,0.130685,0.585676,0.457493,0.328048,0.481333,1.0,0.13143
Jack,0.605657,0.764855,0.781151,0.638008,0.560839,0.41653,0.70829,0.621056,0.13143,1.0


In [51]:
profile_app = Profiles(names=names, embedding=people)
profile_app.location.shape

(10, 50)

In [52]:
profile_app.tracker()

             Alice       Bob   Charlie     David      Emma     Fiona  \
Alice     1.000000  0.446056  0.540106  0.469889  0.804588  0.704335   
Bob       0.446056  1.000000  0.729476  0.644884  0.321230  0.312639   
Charlie   0.540106  0.729476  1.000000  0.536244  0.419993  0.373742   
David     0.469889  0.644884  0.536244  1.000000  0.385114  0.264023   
Emma      0.804588  0.321230  0.419993  0.385114  1.000000  0.696199   
Fiona     0.704335  0.312639  0.373742  0.264023  0.696199  1.000000   
George    0.509538  0.659827  0.615513  0.631671  0.452665  0.170604   
Hannah    0.795454  0.410920  0.527851  0.402353  0.705395  0.621130   
Isabella  0.547919  0.026504  0.187277  0.130685  0.585676  0.457493   
Jack      0.605657  0.764855  0.781151  0.638008  0.560839  0.416530   

            George    Hannah  Isabella      Jack  
Alice     0.509538  0.795454  0.547919  0.605657  
Bob       0.659827  0.410920  0.026504  0.764855  
Charlie   0.615513  0.527851  0.187277  0.781151  
Dav

## Custom model

In [130]:

class CustomModel:
    def __init__(self, input_dim: int, output_dim: int, hidden_dim: int, loss_function, **kwargs: dict):
        # Initialize weights and biases
        self.weights = np.random.randn(input_dim, hidden_dim)
        self.biases = np.random.randn(hidden_dim)
        self.w2 = np.random.randn(hidden_dim, output_dim)
        self.b2 = np.random.randn(output_dim)
        self.criterion = loss_function 
        self.learning_rate = 0.05 if 'lr' not in kwargs else kwargs['lr']
        
    def _weight_step(self, weights, biases, inputs, targets, output):
        inputs = inputs.reshape(inputs.shape[0], -1)
        grad_output = 2 * (output - targets) / output.shape[0]
        grad_weights = np.dot(inputs.T, grad_output)
        grad_biases = np.sum(grad_output, axis=0)
        weights -= self.learning_rate * grad_weights 
        biases -= self.learning_rate * grad_biases 
        
    def step(self, inputs, targets):
        assert hasattr(self, 'max_epoch'), 'No max epoch attr found. Config max epoch attr before training...'
        outputs = self.forward(inputs)
        output = np.dot(outputs, self.w2) + self.b2
        print(f"Output shape: {output.shape}")
        loss = self.criterion(output, targets)
        self._weight_step(self.weights, self.biases, inputs, targets, output)
        self._weight_step(self.w2, self.b2, inputs, targets, output)
        print(f"EPOCH {self.max_epoch[0]} \t loss {loss}")
        self.max_epoch = self.max_epoch[1:]
        return self.step(inputs, targets) if len(self.max_epoch) != 0 else None 
    
    def forward(self, inputs):  
        assert inputs.shape[-1] == 50 and len(inputs.shape) == 3, f"Expected input shape (batch_size, 1, 50) but received input shape {inputs.shape}"
        x = np.dot(inputs, self.weights) + self.biases # [10, 1, hidden_dim]
        assert x.shape[0] == inputs.shape[0], f"Expected 1st dim shape of x to be {inputs.shape} but received: {x.shape}"
        return x
    
    def update(self, batch_input: np.ndarray, batch_target: np.ndarray, max_epoch: np.ndarray):
        self.max_epoch = list(range(max_epoch))
        self.step(batch_input, batch_target)
        
def mse_loss(predictions: np.ndarray, targets: np.ndarray) -> float:
    return np.mean((predictions - targets) ** 2)

def target_location(hit_list: list, target_list: list, kill_list: list, x: int, y: int):
    assert len(target_list) == len(kill_list)
    assert len(hit_list) == len(target_list)

    grid = np.zeros((x, y))
    for index in range(len(hit_list)):
        hits = hit_list[index]
        target = target_list[index]
        kills = kill_list[index]
        if x > 1 and y > 1:
            grid[hits, target] = 1
            grid[hits, kills] = -1
            grid[target, hits] = 1
            grid[kills, hits] = -1 
        else: 
            assert x != y, f"this would mean input shape (x, y) == (1, 1) which is invalid"
            if x == 1 and y > 1:
                grid[:, hits] = 1
                grid[:, kills] = -1
            elif x > 1 and y == 1:
                grid[hits, :] = 1
                grid[kills, :] = -1
            
    return grid 

def evaluate_model(model: CustomModel, entity_embedding: np.ndarray, threshold: int = 0.5, name_tags: list = None):
    prev_mat = cosine_similarity(entity_embedding, entity_embedding)
    output = [model.forward(i) for i in entity_embedding]
    output = np.stack(output) # [10, 30]?!
    #assert output.shape == entity_embedding.shape, f"Output shape: {output.shape} Embedding shape: {entity_embedding.shape}"
    new_mat = cosine_similarity(output, output)
    diff_mat = prev_mat - new_mat 
    
    assert new_mat.shape == prev_mat.shape 
    if name_tags:
        import pandas as pd 
        prev_df = pd.DataFrame(np.round(prev_mat, 5), columns=name_tags, index=name_tags)
        new_df = pd.DataFrame(np.round(new_mat, 5), columns=name_tags, index=name_tags)
        #diff_df = pd.DataFrame(diff_mat, columns=name_tags, index=name_tags)
        print(prev_df)
        print(new_df)
    else:
        print(np.round(prev_mat), 5)
        print(np.round(new_mat), 5)
    print(np.where(diff_mat > threshold, 1, 0))
    
model = CustomModel(50, 1, 30, mse_loss)
model.update(batch_input=profile_app.location, batch_target=np.ones((10, 1)), max_epoch=10)

EPOCH 0 	 loss 22.214220176684307
EPOCH 1 	 loss 10.68148720855603
EPOCH 2 	 loss 6.857888446926399
EPOCH 3 	 loss 5.323384460519429
EPOCH 4 	 loss 4.520614143451544
EPOCH 5 	 loss 3.990100361587179
EPOCH 6 	 loss 3.587378390461643
EPOCH 7 	 loss 3.2605692501158132
EPOCH 8 	 loss 2.986626982461855
EPOCH 9 	 loss 2.7526526622685856


In [134]:
[f"Mean: {model.forward(i).mean()}" for i in profile_app.location]

['Mean: 1.0649443524286095',
 'Mean: 0.8311697866211062',
 'Mean: 1.5073113730794947',
 'Mean: 1.4912497850036057',
 'Mean: 0.6693361137691792',
 'Mean: 0.6065770308561704',
 'Mean: 0.6699219604907855',
 'Mean: 0.9044316598610441',
 'Mean: 1.398800954665618',
 'Mean: 0.8264205221334359']

In [139]:
results = [model.forward(i) for i in profile_app.location] 
np.stack(results).shape

(10, 30)

In [161]:
evaluate_model(model=model, entity_embedding=profile_app.location)

[[1. 0. 1. 0. 1. 1. 1. 1. 1. 1.]
 [0. 1. 1. 1. 0. 0. 1. 0. 0. 1.]
 [1. 1. 1. 1. 0. 0. 1. 1. 0. 1.]
 [0. 1. 1. 1. 0. 0. 1. 0. 0. 1.]
 [1. 0. 0. 0. 1. 1. 0. 1. 1. 1.]
 [1. 0. 0. 0. 1. 1. 0. 1. 0. 0.]
 [1. 1. 1. 1. 0. 0. 1. 0. 0. 1.]
 [1. 0. 1. 0. 1. 1. 0. 1. 0. 1.]
 [1. 0. 0. 0. 1. 0. 0. 0. 1. 0.]
 [1. 1. 1. 1. 1. 0. 1. 1. 0. 1.]] 5
[[ 1.  1.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 1.  1.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  1.  0. -0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  1.  0.  0. -0.  0.  0.  0.]
 [ 0.  0. -0.  0.  1.  0. -0.  0. -0.  0.]
 [ 0.  0.  0.  0.  0.  1. -0.  0.  0.  0.]
 [ 0.  0.  0. -0. -0. -0.  1.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  1.  0.  0.]
 [ 0.  0.  0.  0. -0.  0.  0.  0.  1.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  1.]] 5
[[-0.   -0.12  0.12  0.07  0.56  0.5   0.44  0.5   0.2   0.59]
 [-0.12  0.    0.49  0.41  0.13 -0.11  0.49  0.11 -0.27  0.57]
 [ 0.12  0.49 -0.    0.19  0.49  0.3   0.47  0.3  -0.3   0.73]
 [ 0.07  0.41  0.19 -0.    0.    0.02  0.67  

## Trialing out with a actual target vector

Explaination on constructing the target vector
1.	hit_list: These are the indices of entity embeddings you want to change (modify the embeddings).
2.	target_list: These are the indices of the entity embeddings you want the hit_list entities to be closer to.
3.	kill_list: These are the indices of the entity embeddings you want the hit_list entities to be farther from.

Ideally the objective loss function would be able to:

1.	Minimizes the distance between the embeddings in the hit_list and the target_list.
2.	Maximizes the distance between the embeddings in the hit_list and the kill_list.

However, for the sake of simplifying this experiment and my curiousity on other things, I have used MSE as the loss function

In [175]:
model = CustomModel(50, 1, 30, mse_loss)
model.update(batch_input=profile_app.location, batch_target=target_location(hit_list=[1, 2, 3], target_list=[4, 5, 6], kill_list=[7, 8, 9], x=10, y=1), max_epoch=10)

EPOCH 0 	 loss 24.784918799453397
EPOCH 1 	 loss 11.338007865873994
EPOCH 2 	 loss 7.060437121456583
EPOCH 3 	 loss 5.462876413166307
EPOCH 4 	 loss 4.691157749288646
EPOCH 5 	 loss 4.20593135870951
EPOCH 6 	 loss 3.8431567587684823
EPOCH 7 	 loss 3.54767039975097
EPOCH 8 	 loss 3.2971879670920257
EPOCH 9 	 loss 3.0802994211231636


In [180]:
evaluate_model(model, entity_embedding=profile_app.location, name_tags=profile_app.username)

            Alice      Bob  Charlie    David     Emma    Fiona   George  \
Alice     1.00000  0.44606  0.54011  0.46989  0.80459  0.70434  0.50954   
Bob       0.44606  1.00000  0.72948  0.64488  0.32123  0.31264  0.65983   
Charlie   0.54011  0.72948  1.00000  0.53624  0.41999  0.37374  0.61551   
David     0.46989  0.64488  0.53624  1.00000  0.38511  0.26402  0.63167   
Emma      0.80459  0.32123  0.41999  0.38511  1.00000  0.69620  0.45266   
Fiona     0.70434  0.31264  0.37374  0.26402  0.69620  1.00000  0.17060   
George    0.50954  0.65983  0.61551  0.63167  0.45266  0.17060  1.00000   
Hannah    0.79545  0.41092  0.52785  0.40235  0.70540  0.62113  0.46187   
Isabella  0.54792  0.02650  0.18728  0.13069  0.58568  0.45749  0.32805   
Jack      0.60566  0.76485  0.78115  0.63801  0.56084  0.41653  0.70829   

           Hannah  Isabella     Jack  
Alice     0.79545   0.54792  0.60566  
Bob       0.41092   0.02650  0.76485  
Charlie   0.52785   0.18728  0.78115  
David     0.40235 