In [11]:
import json
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [12]:
# Load the tab-separated stock metadata file into a DataFrame.
stock_info = pd.read_csv(
    filepath_or_buffer='../data/stock_info.tsv',
    sep='\t',
)

In [13]:
# Show the column labels of the loaded table to see which fields are available.
stock_info.columns

Index(['address1', 'city', 'zip', 'country', 'phone', 'fax', 'website',
       'industry', 'industryKey', 'industryDisp',
       ...
       'shortPercentOfFloat', 'totalAssets', 'navPrice', 'beta3Year',
       'fundFamily', 'fundInceptionDate', 'legalType', 'yield', 'ytdReturn',
       'threeYearAverageReturn'],
      dtype='object', length=146)

In [14]:
# Build the text that is vectorised for each row: sector, industry and country
# separated by single spaces.
#
# Missing values are replaced with '' BEFORE joining. Concatenating strings
# with `+` yields NaN as soon as one operand is NaN, and a later astype(str)
# turns that NaN into the literal word 'nan'. That word would then be treated
# as an ordinary token, so a row with a single missing field would lose its
# other fields and look identical to every other incomplete row.
embedding_parts = stock_info[['sector', 'industry', 'country']].fillna('').astype(str)
stock_info['text_for_embedding'] = (
    embedding_parts['sector']
    + ' '
    + embedding_parts['industry']
    + ' '
    + embedding_parts['country']
).str.strip()  # drop the separators left over at the edges by empty fields

In [15]:
# Fit a default-configured TF-IDF vectorizer on the per-row text and keep both
# the fitted vectorizer and the matrix returned by fit_transform.
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(
    raw_documents=stock_info.loc[:, 'text_for_embedding']
)

In [16]:

# Dense copy of the TF-IDF matrix as a DataFrame, one column per feature name
# reported by the fitted vectorizer.
df_tfidf = pd.DataFrame(
    data=tfidf_matrix.toarray(),
    columns=tfidf_vectorizer.get_feature_names_out(),
)

In [17]:
# Label each TF-IDF row with the ISIN of the stock it was built from.
#
# The index is assigned positionally rather than by adding an 'isin' column
# and calling set_index(inplace=True):
#   * re-running the cell gives the same result (a column assignment would
#     align on the already-replaced index and produce an all-NaN index);
#   * a vocabulary term that happened to be called 'isin' is not overwritten.
# pandas raises ValueError if the two lengths differ, so a row mismatch
# cannot go unnoticed.
df_tfidf.index = pd.Index(stock_info['isin'].to_numpy(), name='isin')

In [18]:
# Pairwise cosine similarity between all rows of the TF-IDF matrix.
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
# Blank out the diagonal (each row compared with itself) on the NumPy array
# BEFORE wrapping it in a DataFrame. Writing through `DataFrame.values` only
# works while pandas hands back a writable view of its data; with
# Copy-on-Write enabled that array is read-only and the write raises.
np.fill_diagonal(cosine_sim, np.nan)
# Label both axes with ISINs so similarities can be looked up by identifier.
cosine_sim_df = pd.DataFrame(cosine_sim, index=stock_info['isin'], columns=stock_info['isin'])

In [19]:
# Ten highest similarity scores in the JP3435000009 column, largest first.
(
    cosine_sim_df.loc[:, 'JP3435000009']
    .sort_values(ascending=False)
    .iloc[:10]
)

isin
US0378331005    0.748241
KR7005930003    0.532537
JP3633400001    0.358295
JP3538800008    0.355751
JP3621000003    0.299956
JP3705200008    0.267820
IE00B4BNMY34    0.262319
JP3566800003    0.256446
JP3890350006    0.250363
JP3802400006    0.226913
Name: JP3435000009, dtype: float64