In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime

In [12]:
# Download the NASDAQ-listed symbol directory (pipe-delimited text).
# skipfooter=1 drops the last line of the download. That option is not
# supported by pandas' default C parser, so the Python engine is requested
# explicitly instead of relying on the fallback that emits a ParserWarning.
stock_list_df = pd.read_csv(
    'http://www.nasdaqtrader.com/dynamic/symdir/nasdaqlisted.txt',
    sep='|',
    skipfooter=1,
    engine='python',
)
# To keep a local snapshot of the download, uncomment:
# stock_list_df.to_csv('data/nasdaq_stock_info/nasdaqtraded.txt', sep='|')


In [13]:
# Load the symbol table from the local pipe-delimited file; this rebinds
# stock_list_df, and the frame is shown as the cell's output.
stock_list_df = pd.read_csv(
    filepath_or_buffer='nasdaq_stock_info/nasdaqtraded.txt',
    sep='|',
)
stock_list_df


In [14]:
# Names of the per-ticker fields to collect, one per line so that additions
# and removals show up as single-line diffs. Order is preserved.
feature_list = [
    'shortRatio',
    'volume',
    'beta',
    'averageVolume',
    'averageVolume10days',
    'averageDailyVolume10Day',
    'marketCap',
    'totalDebt',
    'ebitda',
    'totalCash',
    'totalCashPerShare',
    'currentRatio',
    'totalRevenue',
    'revenuePerShare',
    'returnOnAssets',
    'freeCashflow',
    'operatingCashflow',
    'revenueGrowth',
    'grossMargins',
    'operatingMargins',
    'enterpriseToRevenue',
    'enterpriseToEbitda',
]


In [15]:
import yfinance as yf
from tqdm.auto import tqdm

def get_company_info(stock_code):
    """Return the ``info`` attribute of the yfinance ``Ticker`` built for ``stock_code``."""
    return yf.Ticker(stock_code).info

# Collect one record per unique ticker: the symbol under 'code' plus every
# requested key that the ticker's info dict actually contains. Keys that are
# absent are simply left out of that ticker's record.
wanted_keys = ['longBusinessSummary'] + feature_list  # loop-invariant, built once

description_list = []
for stock_code in tqdm(stock_list_df['Symbol'].unique()):
    try:
        info = get_company_info(stock_code)
        features = {'code': stock_code}
        features.update({key: info[key] for key in wanted_keys if key in info})
        description_list.append(features)
    except Exception as e:
        # Best-effort scrape: skip the ticker, but report which one failed so
        # it can be retried instead of leaving an anonymous error message.
        print(f'{stock_code}: {e}')
        


  0%|          | 0/4819 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [28]:
# Turn the scraped records into a table and write it to disk without the index.
stock_description = pd.DataFrame(data=description_list)
stock_description.to_csv(
    path_or_buf='./data/nasdaq_stock_info/stock_info.csv',
    index=False,
)


## Embedding the sentences

In [35]:
# Keep only the rows whose 'longBusinessSummary' is present.
stock_description = stock_description[stock_description['longBusinessSummary'].notna()]

In [47]:
## Embedding
# Encode every business summary with a pretrained sentence-transformer checkpoint.
from sentence_transformers import SentenceTransformer

logger = []  # not referenced by any cell shown here

model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
sentences = stock_description['longBusinessSummary'].values
embeddings = model.encode(sentences)


In [60]:
## Add the embeddings to the dataframe
# Drop the raw summary text, append the embedding columns side by side (both
# parts get a fresh 0..n-1 index so rows line up by position), then key the
# rows by ticker code.
new_stock_description = pd.concat(
    [
        stock_description.drop(columns=['longBusinessSummary']).reset_index(drop=True),
        pd.DataFrame(embeddings).reset_index(drop=True),
    ],
    axis=1,
).set_index('code')


In [62]:
## fillna
# Replace the missing entries of each column with that column's NaN-ignoring mean.
for column in new_stock_description.columns:
    column_values = new_stock_description[column]
    column_mean = np.nanmean(column_values.values)
    new_stock_description[column] = column_values.fillna(column_mean)


In [69]:
## Final standardization
from sklearn.preprocessing import StandardScaler

# Each column is standardized on its own with a freshly fitted scaler.
for column in new_stock_description.columns:
    single_column = new_stock_description[[column]].values
    new_stock_description[column] = StandardScaler().fit_transform(single_column)



## Get distance matrix (dissimilarity)

In [80]:
## PCA
from sklearn.decomposition import PCA

# PCA cannot extract more components than min(n_samples, n_features), so the
# requested 300 is capped by the shape of the data; a fixed 300 raises when
# fewer rows or columns are available (e.g. after a partial scrape).
n_components = min(300, *new_stock_description.shape)
# random_state pins the projection for the case where scikit-learn selects a
# randomized SVD solver, so re-running the notebook reproduces the features.
pca = PCA(n_components=n_components, random_state=0).fit(new_stock_description.values)
new_features = pd.DataFrame(
    pca.transform(new_stock_description.values),
    index=new_stock_description.index,
)


In [84]:
from scipy.spatial import distance_matrix

# Pairwise distances between all rows of the PCA features
# (p=2 is distance_matrix's default Minkowski order, written out explicitly).
dist_mat = distance_matrix(new_features, new_features, p=2)


In [87]:
# Label both axes of the distance matrix with the index of new_features, then
# save it rounded to three decimals (row labels included).
dist_mat = pd.DataFrame(
    data=dist_mat,
    index=new_features.index,
    columns=new_features.index,
)
dist_mat.round(decimals=3).to_csv(
    './data/nasdaq_stock_info/distance_matrix.csv',
    index=True,
)
