In [40]:
# Refresher on running Hugging Face embedding models locally (via langchain_huggingface).
import pandas as pd
from langchain_huggingface import HuggingFaceEmbeddings



In [41]:
# Embedding model configuration: load all-MiniLM-L6-v2 on the first CUDA device
# and ask the encoder to normalize the vectors it returns.
model_kwargs = dict(device="cuda:0")
encode_kwargs = dict(normalize_embeddings=True)
embeddings = HuggingFaceEmbeddings(
    model_name="all-MiniLM-L6-v2",
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs,
)

In [42]:
# Read the raw positive pairs (columns include abc_a / abc_b) into a DataFrame.
positive_set = pd.read_csv(filepath_or_buffer="../raw_data/positive_set.csv")

In [43]:
# Preview the first five rows of the freshly loaded positive pairs.
positive_set.head(n=5)

Unnamed: 0.1,Unnamed: 0,tune_a,tune_b,setting_a,setting_b,abc_a,abc_b
0,0,1,1,22061,31475,"ED|""Em""EBBA B2 EB|""Em""B2 AB dBAG|""D""F/E/D AD B...","|:D2|""Em""EBBA B2 EB|B2 AB dBAG|""D""FDAD BDAD|FD..."
1,1,2,2,29662,48833,dB|A2 A2 A2 A2|AFAB ^D3B|A2FA A2Bd|(3efg fd (3...,|:A2 FA A2dB|A2FA BEE2|1 A2 FA A2Bd|egfd edBd:...
2,2,3,3,12349,3,B|GE E/E/E GDDB|GE E/E/E B2 AB|GE E/E/E GDDG|E...,BA|G2 BG AFDF|G2Bd c2BA|G2 BG AFDF|GEEG c2BA|\...
3,3,4,4,42849,45019,ag|:fa df Ad FA|DF Ad f2 ef|gb eg ce AF|GA BG ...,|:ag|(3fga df AdFA|DFAd (3fgf ef|gbec eBAF|GA ...
4,4,5,5,34014,25996,|:DED DEG|A2A ABc|BAG AGE|GEA GED|\r\nDED DEG|...,|:D3 DEG|A3 ABc|BAG AGE|GEA GED|\r\nD3 DEG|A3 ...


In [44]:
# Embed every abc_a string of the positive set; one vector comes back per input.
positive_set_embedding_a = embeddings.embed_documents(positive_set["abc_a"].to_list())

In [45]:
# Number of embedding vectors returned for the abc_a column.
len(positive_set_embedding_a)

9306

In [46]:
# Attach the abc_a vectors as a new column (one list of floats per row).
positive_set["embedding_a"] = [vector for vector in positive_set_embedding_a]

In [47]:
# Embed every abc_b string of the positive set; one vector comes back per input.
positive_set_embedding_b = embeddings.embed_documents(positive_set["abc_b"].to_list())

In [48]:
# Attach the abc_b vectors as a new column (one list of floats per row).
positive_set["embedding_b"] = [vector for vector in positive_set_embedding_b]

In [49]:
# Persist the positive pairs together with their two embedding columns.
# index=False keeps the row index out of the file: writing it adds an unnamed
# leading column that pd.read_csv surfaces as "Unnamed: 0" on the next load.
# NOTE: the list-valued embedding cells are written as their string form, so a
# reader has to parse them back into lists of floats.
positive_set.to_csv("../processed_data/positive_set.csv", index=False)

In [50]:
# Same pipeline for the negative pairs: load the raw CSV, embed both ABC
# columns, attach the vectors as columns, and persist the result.
negative_set = pd.read_csv("../raw_data/negative_set.csv")

negative_set_embedding_a = embeddings.embed_documents(list(negative_set.abc_a))
negative_set["embedding_a"] = list(negative_set_embedding_a)

negative_set_embedding_b = embeddings.embed_documents(list(negative_set.abc_b))
negative_set["embedding_b"] = list(negative_set_embedding_b)

# index=False keeps the row index out of the file: writing it adds an unnamed
# leading column that pd.read_csv surfaces as "Unnamed: 0" on the next load.
# NOTE: the list-valued embedding cells are written as their string form, so a
# reader has to parse them back into lists of floats.
negative_set.to_csv("../processed_data/negative_set.csv", index=False)

In [60]:
import numpy as np
from numpy.linalg import norm


In [65]:
# Take the two embedding vectors of the first negative pair for a one-off check.
A, B = negative_set_embedding_a[0], negative_set_embedding_b[0]

In [66]:
# Cosine similarity of A and B: dot product over the product of the two norms.
np.dot(A, B) / (np.linalg.norm(A) * np.linalg.norm(B))

np.float64(0.645343715135031)

In [69]:
# Cosine similarity between embedding_a and embedding_b for every row of both sets
def cosine_similarity(x,y):
    """Return the cosine of the angle between vectors ``x`` and ``y``.

    Computed as the dot product of the two vectors divided by the product of
    their norms. No special handling is done for zero-length vectors.
    """
    dot_product = np.dot(x, y)
    norm_product = norm(x) * norm(y)
    return dot_product / norm_product

# Score each pair row by row, first for the positive set and then the negative set.
for pair_set in (positive_set, negative_set):
    pair_set["similarity"] = pair_set.apply(
        lambda row: cosine_similarity(row["embedding_a"], row["embedding_b"]),
        axis=1,
    )

In [68]:
# Summary statistics of the positive-pair similarity scores.
positive_set["similarity"].describe()

count    9306.000000
mean        0.816034
std         0.121615
min         0.211253
25%         0.732850
50%         0.832362
75%         0.915044
max         1.000000
Name: similarity, dtype: float64

In [70]:
# Summary statistics of the negative-pair similarity scores.
negative_set["similarity"].describe()

count    9306.000000
mean        0.653411
std         0.097812
min         0.140859
25%         0.593914
50%         0.663395
75%         0.722377
max         0.918185
Name: similarity, dtype: float64

In [71]:
# Preview the first five rows again, now with the embedding and similarity columns.
positive_set.head(n=5)

Unnamed: 0.1,Unnamed: 0,tune_a,tune_b,setting_a,setting_b,abc_a,abc_b,embedding_a,embedding_b,similarity
0,0,1,1,22061,31475,"ED|""Em""EBBA B2 EB|""Em""B2 AB dBAG|""D""F/E/D AD B...","|:D2|""Em""EBBA B2 EB|B2 AB dBAG|""D""FDAD BDAD|FD...","[0.010539693757891655, -0.039622943848371506, ...","[0.0156656913459301, -0.0031629279255867004, -...",0.932536
1,1,2,2,29662,48833,dB|A2 A2 A2 A2|AFAB ^D3B|A2FA A2Bd|(3efg fd (3...,|:A2 FA A2dB|A2FA BEE2|1 A2 FA A2Bd|egfd edBd:...,"[-0.05852976068854332, -0.06965914368629456, -...","[-0.02331891469657421, -0.07907111197710037, -...",0.814261
2,2,3,3,12349,3,B|GE E/E/E GDDB|GE E/E/E B2 AB|GE E/E/E GDDG|E...,BA|G2 BG AFDF|G2Bd c2BA|G2 BG AFDF|GEEG c2BA|\...,"[0.003211917821317911, 0.02360629104077816, -0...","[-0.03601918742060661, -0.02183515951037407, -...",0.836771
3,3,4,4,42849,45019,ag|:fa df Ad FA|DF Ad f2 ef|gb eg ce AF|GA BG ...,|:ag|(3fga df AdFA|DFAd (3fgf ef|gbec eBAF|GA ...,"[-0.08714918792247772, -0.06767382472753525, -...","[-0.08165708184242249, -0.08736856281757355, -...",0.81438
4,4,5,5,34014,25996,|:DED DEG|A2A ABc|BAG AGE|GEA GED|\r\nDED DEG|...,|:D3 DEG|A3 ABc|BAG AGE|GEA GED|\r\nD3 DEG|A3 ...,"[0.024247556924819946, 0.07042411714792252, -0...","[0.012631156481802464, 0.05618187412619591, -0...",0.96097
