In [1]:
import pandas as pd

In [2]:
# Read the SNI 2007 code table with every column as object dtype (no numeric
# inference), and rename the 'Detailed group' column to SNI_CODE.
df = pd.read_csv('en-sni2007.csv', dtype='object').rename(
    columns={'Detailed group': 'SNI_CODE'}
)

In [3]:
# Extract the code and description columns as two parallel Python lists
# (element k of each list comes from the same row of df).
list0_sni = df['SNI_CODE'].tolist()
list0_descr = df['Description'].tolist()

In [None]:
# NOTE(review): disabled on purpose; presumably a one-off forced reinstall of
# `transformers` — confirm whether it is still needed. If it is, prefer
# `%pip install` with a pinned version so it targets the kernel's environment.
#!pip install -I transformers --no-cache-dir --force-reinstall

In [4]:
# Sentence Transformer: bring the model class into scope
from sentence_transformers import SentenceTransformer

# Load the pretrained checkpoint identified by the name "all-MiniLM-L6-v2"
model = SentenceTransformer(model_name_or_path="all-MiniLM-L6-v2")

  from tqdm.autonotebook import tqdm, trange


In [6]:
# Encode every SNI description with the loaded model; the result is kept in
# the same order as list0_descr.
embeddings = model.encode(sentences=list0_descr)

In [7]:
#Load utils
from sentence_transformers import util

In [8]:
# Cosine similarity of the embeddings against themselves, so entry [i][j]
# compares description i with description j.
cos_sim = util.cos_sim(a=embeddings, b=embeddings)

In [9]:
# Collect every unordered pair (i < j) once, as (score, i, j) tuples; the
# diagonal and the mirrored lower triangle are skipped.
all_sni_desc_combinations = [
    (cos_sim[row][col], row, col)
    for row in range(len(cos_sim) - 1)
    for col in range(row + 1, len(cos_sim))
]

In [10]:
# Attach the SNI codes and descriptions to every (score, i, j) pair and unwrap
# the score into a plain Python float.
# The score stored in each tuple is the cos_sim[i][j] element captured when the
# pair list was built, so it is used directly instead of indexing cos_sim a
# second time; the list is iterated as-is rather than through a [0:] copy.
all_sni_desc_combinations_text = [
    (list0_sni[i], list0_descr[i], list0_sni[j], list0_descr[j], score.detach().item())
    for score, i, j in all_sni_desc_combinations
]

In [11]:
# Turn the list of pair tuples into a DataFrame, one row per pair
df4 = pd.DataFrame(
    data=all_sni_desc_combinations_text,
    columns=[
        'SNI_CODE1',
        'Description1',
        'SNI_CODE2',
        'Description2',
        'Similarity_Score',
    ],
)

In [12]:
# Show the full text of every cell when a DataFrame is displayed.
# None is the documented "no limit" value for display.max_colwidth; the
# previous value of 1 did not match the stated intent of widening the columns.
pd.set_option('display.max_colwidth', None)

In [13]:
# Pairs whose similarity lies between 0.45 and 0.5 (both ends included),
# listed from the highest score down.
df4.loc[df4['Similarity_Score'].between(0.45, 0.5)].sort_values(
    by='Similarity_Score', ascending=False
)

Unnamed: 0,SNI_CODE1,Description1,SNI_CODE2,Description2,Similarity_Score
125118,20300,"Manufacture of paints, varnishes and similar coatings, printing ink and mastics",26700,Manufacture of optical instruments and photographic equipment,0.499989
137969,23190,"Manufacture and processing of other glass, including technical glassware",32990,Other manufacturing n.e.c.,0.499977
186781,28230,Manufacture of office machinery and equipment (except computers and peripheral equipment),33190,Repair of other equipment,0.499966
134099,22230,Manufacture of builders’ ware of plastic,25910,Manufacture of steel drums and similar containers,0.499966
127765,20520,Manufacture of glues,32400,Manufacture of games and toys,0.499953
...,...,...,...,...,...
142889,23490,Manufacture of other ceramic products,25290,"Manufacture of other tanks, reservoirs and containers of metal",0.450021
105880,17111,Manufacture of mechanical or semi-chemical pulp,28220,Manufacture of lifting and handling equipment,0.450020
138526,23200,Manufacture of refractory products,25940,Manufacture of fasteners and screw machine products,0.450018
288854,47913,Retail sale of books and other media goods via mail order houses or via internet,73112,Delivery of advertising material,0.450018


In [14]:
# Persist the full pair table (all scores, not only the displayed band) as CSV
df4.to_csv(path_or_buf='en_sweden_sni_similarity.csv')