In [1]:
import pandas as pd
from transformers import AutoTokenizer, AutoModel
import torch
# from sentence_transformers import SentenceTransformer, util

In [3]:
raw_df = pd.read_csv('./data/raw.csv')

In [4]:
sender_df = raw_df[['sender_id', 'sender_name', 'country_of_origin']].copy()
sender_df.rename(columns={'sender_id':'organization_id', 'sender_name':'name', 'country_of_origin':'country'}, inplace=True)

In [5]:
receiver_df = raw_df[['receiver_id', 'receiver_name', 'country_of_destination']].copy()
receiver_df.rename(columns={'receiver_id':'organization_id', 'receiver_name':'name', 'country_of_destination':'country'}, inplace=True)

In [6]:
branch_df = pd.concat([sender_df, receiver_df], ignore_index=True)

In [8]:
assert branch_df.shape[0] == 2*raw_df.shape[0]

In [9]:
assert branch_df.shape[1] == 3

In [10]:
branch_df.head()

Unnamed: 0,entity_id,name,country
0,52c8642c34649eafe2d044eee3d884e1,Global Fire Protection,US
1,831a31a1466f0ace3eb20b52d4575f92,Carboline (India) Private Limited,
2,f580bc1756d06768c94634b0332e1871,Paladin Paints & Chemicals Private Limited,IN
3,149081be00548b006dc38a88264eae32,Alpha Pacific Group Pte. Ltd.,CN
4,f4864ac3d5d716cc586d60afe8a403ef,TREMCO CPG INDIA PRIVATE LIMITED,IN


In [11]:
branch_df.name = branch_df.name.str.lower()

In [12]:
branch_df

Unnamed: 0,entity_id,name,country
0,52c8642c34649eafe2d044eee3d884e1,global fire protection,US
1,831a31a1466f0ace3eb20b52d4575f92,carboline (india) private limited,
2,f580bc1756d06768c94634b0332e1871,paladin paints & chemicals private limited,IN
3,149081be00548b006dc38a88264eae32,alpha pacific group pte. ltd.,CN
4,f4864ac3d5d716cc586d60afe8a403ef,tremco cpg india private limited,IN
...,...,...,...
199763,66ebf2a3aa71c90b4df3fc93590e22db,tengizchevroil llp,KZ
199764,66ebf2a3aa71c90b4df3fc93590e22db,tengizchevroil llp,KZ
199765,66ebf2a3aa71c90b4df3fc93590e22db,tengizchevroil llp,KZ
199766,66ebf2a3aa71c90b4df3fc93590e22db,tengizchevroil llp,KZ


In [13]:
branch_df.country.isna().mean()

np.float64(0.19092647471066437)

In [25]:
branch_df.drop_duplicates(inplace=True)

In [26]:
branch_df.shape

(2280, 3)

# Transformer

In [14]:
model_name = 'sentence-transformers/all-MiniLM-L6-v2'
tokenizer = AutoTokenizer.from_pretrained(model_name, clean_up_tokenization_spaces=True)
model = AutoModel.from_pretrained(model_name)

In [15]:
def encode_branch_names(sentences):
    inputs = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
    with torch.no_grad():
        embeddings = model(**inputs).last_hidden_state[:, 0, :]  # Get the [CLS] token embeddings
    return embeddings

In [16]:
embeddings = encode_branch_names(branch_df.name.head(10).to_list())

In [17]:
embeddings

tensor([[-0.1593,  0.5316,  0.1424,  ..., -0.2873, -0.5618,  0.0940],
        [-0.3550,  0.0010, -0.2111,  ...,  0.1767, -0.1864, -0.3681],
        [-0.1751,  0.2805,  0.0979,  ...,  0.2655, -0.1599,  0.1864],
        ...,
        [-0.3601,  0.2911,  0.1464,  ..., -0.4111, -0.5530,  0.0299],
        [ 0.0975,  0.0013,  0.1086,  ...,  0.0643, -0.2248, -0.0738],
        [-0.8813, -0.1397, -0.0803,  ..., -0.0224,  0.0229, -0.0959]])

In [18]:
embeddings.shape

torch.Size([10, 384])

In [33]:
embeddings2 = encode_branch_names(['basf hong kong ltd', 'covestro hon kong limited', 'basf hongkong ltd'])
print(cosine_similarity(embeddings2[0], embeddings2[1]))
print(cosine_similarity(embeddings2[0], embeddings2[2]))
print(cosine_similarity(embeddings2[1], embeddings2[2]))

0.7208491563796997
0.992590606212616
0.7118538022041321


In [34]:
embeddings3 = encode_branch_names(['Craig International Supplies Inc located in US', 'Craig International Supplies Inc located in UK'])
print(cosine_similarity(embeddings3[0], embeddings3[1]))

0.976394534111023


In [19]:
similarity_matrix = util.pytorch_cos_sim(embeddings, embeddings)

print(similarity_matrix)

NameError: name 'util' is not defined

In [20]:
def cosine_similarity(a, b):
    return torch.nn.functional.cosine_similarity(a.unsqueeze(0), b.unsqueeze(0)).item()

In [21]:
cosine_similarity(embeddings[0], embeddings[1])

0.5538130402565002

In [22]:
branch_df.head()

Unnamed: 0,entity_id,name,country
0,52c8642c34649eafe2d044eee3d884e1,global fire protection,US
1,831a31a1466f0ace3eb20b52d4575f92,carboline (india) private limited,
2,f580bc1756d06768c94634b0332e1871,paladin paints & chemicals private limited,IN
3,149081be00548b006dc38a88264eae32,alpha pacific group pte. ltd.,CN
4,f4864ac3d5d716cc586d60afe8a403ef,tremco cpg india private limited,IN


In [None]:
branch_similarity_df = pd.DataFrame(columns=['organization_id', 'name_a', 'country_a', 'name_b', 'country_b', 'cosine_similarity'])

In [None]:
embeddings2 = encode_branch_names(['basf hong kong ltd', 'covestro hon kong limited', 'basf hongkong ltd'])

In [1]:
# for name, group in branch_df.groupby('organization_id'):
#     print(f"Group: {name}")
#     print(group)

In [None]:
branch_df.