In [1]:
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.cluster import KMeans, AffinityPropagation, MeanShift, SpectralClustering, OPTICS, Birch
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_selection import VarianceThreshold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Normalizer, RobustScaler, StandardScaler, MinMaxScaler, PowerTransformer, MaxAbsScaler, LabelEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mutual_info_score, adjusted_mutual_info_score, rand_score, adjusted_rand_score, completeness_score, fowlkes_mallows_score, homogeneity_score

import warnings
warnings.filterwarnings("ignore")

In [None]:
df = pd.read_csv("/Users/nickdimmitt/Desktop/finance/data/stocks_clean.csv")

In [None]:
df = df.loc[:,~df.columns.str.contains("Unnamed")]

## Feature Addition

In [None]:
df['year_2'] = df.groupby('ticker').cumcount()+1

In [None]:
df = df.replace(-9999, np.nan)
df = df.replace(-np.Inf, np.nan)
df = df.replace(np.Inf, np.nan)

In [None]:
## market cap
mkt_cap = pd.read_csv("/Users/nickdimmitt/Desktop/finance/data/market_cap.csv")
mkt_cap['ticker'] = mkt_cap['Symbol']
mkt_cap = mkt_cap.drop('Symbol', axis=1)

df = df.merge(right=mkt_cap, how='inner', on='ticker')

columns = {
    'Market Cap':'mkt_cap',
    'Country':'country',
    'IPO Year':'ipo_year',
    'Secter': 'sector'}
df = df.rename(columns=columns)

df.head()

In [None]:
df = df[~df['mkt_cap'].isna()]
df["Sector"] = df["Sector"].fillna("N/A")
df = df.fillna(-1)
df[df['year'] == 2023]['mkt_cap'] = 0

In [None]:
df.to_csv("/Users/nickdimmitt/Desktop/finance/data/cluster_df.csv")

## Clustering

In [2]:
mkt_cap = pd.read_csv("/Users/nickdimmitt/Desktop/finance/data/market_cap.csv")
df = pd.read_csv("/Users/nickdimmitt/Desktop/finance/data/cluster_df.csv")

In [3]:
features = ['revenue', 'cogs', 'gross_profit', 'gross_profit_ratio',
       'operating_expenses', 'r_&_d_expenses', 'selling_g_&_a_exp',
       'general_and_admin_exp', 'selling_and_marketing_exp',
       'other_expenses', 'cogs_and_expenses', 'interest_income',
       'interest_expense', 'depreciation_and_amortization', 'ebitda',
       'ebitda_ratio', 'operating_income', 'operating_income_ratio',
       'total_other_income_exp_gains', 'income_before_tax',
       'income_before_tax_ratio', 'income_tax_expense_gain', 'net_income',
       'net_income_ratio', 'eps', 'eps_diluted',
       'weighted_avg_shares_outs', 'weighted_avg_shares_outs_dil',
       'cash_and_cash_equivalents', 'short_term_investments',
       'cash_&_short_term_investments', 'net_receivables', 'inventory',
       'other_current_assets', 'total_current_assets', 'pp_&_e',
       'goodwill', 'intangible_assets', 'goodwill_and_intangible_assets',
       'investments', 'tax_assets', 'other_non_current_assets',
       'total_non_current_assets', 'other_assets', 'total_assets',
       'accounts_payable', 'short_term_debt', 'tax_payable',
       'deferred_revenue', 'other_current_liabilities',
       'total_current_liabilities', 'long_term_debt',
       'deferred_revenue_1', 'deferred_tax_liabilities',
       'other_non_current_liabilities', 'total_non_current_liabilities',
       'other_liabilities', 'capital_lease_obligations',
       'total_liabilities', 'preferred_stock', 'common_stock',
       'retained_earnings', 'other_compreh_income_loss',
       'other_total_stockhold_equity', 'total_stockholders_equity',
       'total_liab_&_stockhold_equity', 'minority_interest',
       'total_liabilities_&_equity', 'net_income_1',
       'depreciation_and_amortization_1', 'deferred_income_tax',
       'stock_based_compensation', 'change_in_working_capital',
       'accounts_receivable', 'inventory_1', 'accounts_payable_1',
       'other_working_capital', 'other_non_cash_items',
       'cash_provided_by_operating_activities', 'capex',
       'acquisitions_net', 'purchases_of_investments',
       'sales_maturities_of_investments', 'other_investing_activities',
       'cash_used_for_investing_activities', 'debt_repayment',
       'common_stock_issued', 'common_stock_repurchased',
       'dividends_paid', 'other_financing_activities',
       'cash_used_provided_by_financing_activities',
       'effect_of_forex_changes_on_cash', 'net_change_in_cash',
       'cash_at_the_end_of_period', 'cash_at_the_beginning_of_period',
       'free_cash_flow', 'forex_rate', 'year_2', 'sector_trans']

In [20]:
le = LabelEncoder()
label = le.fit_transform(df['Sector'])
df['sector_trans'] = label

X = df[features]

X = RobustScaler().fit_transform(X)

### K-Means

In [None]:
Sum_of_squared_distances = []
K = range(1,2000,75)
for k in K:
    km = KMeans(n_clusters=k)
    km = km.fit(X)
    print(km.inertia_)
    Sum_of_squared_distances.append(km.inertia_)

plt.plot(K, Sum_of_squared_distances, 'bx-')
plt.xlabel('k')
plt.ylabel('Sum_of_squared_distances')
plt.title('Elbow Method For Optimal k')
plt.show()

In [22]:
final = pd.DataFrame()
for x in range(250,500,10):
    kmeans = KMeans(n_clusters=x)
    df['labels'] = kmeans.fit_predict(X)


    labels = list(df[df['year'] == 2022]['labels'].unique())
    df['proj_mktcap_lbl'] = df.groupby('labels')['mkt_cap'].transform(lambda x:x.mean())
    final = pd.concat([final, df[df['labels'].isin(labels)]])

### Ward Clustering

In [None]:
from sklearn.cluster import AgglomerativeClustering
from sklearn.preprocessing import RobustScaler, LabelEncoder
from sklearn.metrics import silhouette_score

range_n_clusters = range(100,1000,100)
silhouette_avg = []
for num_clusters in range_n_clusters:
 
 # initialise kmeans
    ward = AgglomerativeClustering(n_clusters=num_clusters, linkage='single')
    ward.fit(X)
    cluster_labels = ward.labels_
    print(num_clusters)
 # silhouette score
    silhouette_avg.append(silhouette_score(X, cluster_labels))
    
plt.plot(range_n_clusters,silhouette_avg,'bx-')
plt.xlabel('Values of K') 
plt.ylabel('Silhouette score') 
plt.title('Silhouette analysis For Optimal k')
plt.show()

In [23]:
from sklearn.cluster import AgglomerativeClustering
from sklearn.preprocessing import RobustScaler, LabelEncoder
from sklearn.metrics import silhouette_score


clusters = range(50,200,25)
for x in clusters:
    ward = AgglomerativeClustering(n_clusters=x, linkage='single')
    df['labels'] = ward.fit_predict(X)

    labels = list(df[df['year'] == 2022]['labels'].unique())
    df['proj_mktcap_lbl'] = df.groupby('labels')['mkt_cap'].transform(lambda x:x.mean())
    final = pd.concat([final, df[df['labels'].isin(labels)]])

In [24]:
final = final[final['year'] == 2022]
final['proj_mktcap_comp'] = final.groupby(['ticker'])['proj_mktcap_lbl'].transform(lambda x:x.mean())
final = final.drop_duplicates(subset='ticker')
final.sort_values(by='proj_mktcap_comp', ascending=False).to_csv("/Users/nickdimmitt/Desktop/finance/data/clustered.csv")

In [29]:
final[['ticker', 'ipo_year', 'Sector', 'proj_mktcap_comp']].sort_values(by='proj_mktcap_comp', ascending=False).to_csv("clustered_1.csv")

In [4]:
df2 = pd.read_csv("clustered_1.csv")

In [15]:
clustered_2 = pd.merge(df2, mkt_cap, on='ticker')

In [16]:
clustered_2

Unnamed: 0.1,Unnamed: 0,ticker,ipo_year_x,Sector_x,proj_mktcap_comp,mkt_cap,country,ipo_year_y,Sector_y
0,60442,MSFT,1986.0,Technology,7.916959e+11,1.850455e+12,United States,1986.0,Technology
1,40604,GOOG,2004.0,Technology,7.471791e+11,1.165565e+12,United States,2004.0,Technology
2,316,AAPL,1980.0,Technology,6.076068e+11,2.574595e+12,United States,1980.0,Technology
3,57538,META,2012.0,Technology,5.087026e+11,4.654047e+11,United States,2012.0,Technology
4,5569,AMZN,1997.0,Consumer Discretionary,4.207077e+11,9.233491e+11,United States,1997.0,Consumer Discretionary
...,...,...,...,...,...,...,...,...,...
2558,87254,SWN,-1.0,Energy,5.569869e+09,5.451679e+09,United States,,Energy
2559,34312,FCNCA,-1.0,Finance,5.119886e+09,8.938430e+09,United States,,Finance
2560,34345,FCNCO,-1.0,Finance,5.119886e+09,0.000000e+00,United States,,Finance
2561,35684,FLNC,2021.0,Utilities,3.097405e+09,3.097405e+09,United States,2021.0,Utilities


In [11]:
columns = {
    'Symbol': 'ticker',
    'Market Cap':'mkt_cap',
    'Country':'country',
    'IPO Year':'ipo_year',
    'Secter': 'sector'}
mkt_cap = mkt_cap.rename(columns=columns)

In [17]:
clustered_2 = clustered_2[['ticker','country','ipo_year_x','Sector_x','proj_mktcap_comp','mkt_cap']]

In [18]:
clustered_2['difference'] = clustered_2['proj_mktcap_comp'] - clustered_2['mkt_cap']

In [20]:
clustered_2.sort_values('difference', ascending=False).to_csv('clustered_2.csv')