In [1]:
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.impute import SimpleImputer
from sklearn.cluster import KMeans, AffinityPropagation, MeanShift, SpectralClustering, OPTICS, Birch
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_selection import VarianceThreshold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Normalizer, RobustScaler, StandardScaler, MinMaxScaler, PowerTransformer, MaxAbsScaler, LabelEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mutual_info_score, adjusted_mutual_info_score, rand_score, adjusted_rand_score, completeness_score, fowlkes_mallows_score, homogeneity_score

In [2]:
df = pd.read_csv("/Users/nickdimmitt/Desktop/finance/data/stocks_clean.csv")

In [3]:
df = df.loc[:,~df.columns.str.contains("Unnamed")]

## Feature Addition

In [4]:
df['year_2'] = df.groupby('ticker').cumcount()+1

In [5]:
df = df.replace(-9999, np.nan)
df = df.replace(-np.Inf, np.nan)
df = df.replace(np.Inf, np.nan)

In [6]:
## market cap
mkt_cap = pd.read_csv("/Users/nickdimmitt/Desktop/finance/data/market_cap.csv")
mkt_cap['ticker'] = mkt_cap['Symbol']
mkt_cap = mkt_cap.drop('Symbol', axis=1)

df = df.merge(right=mkt_cap, how='inner', on='ticker')

columns = {
    'Market Cap':'mkt_cap',
    'Country':'country',
    'IPO Year':'ipo_year',
    'Secter': 'sector'}
df = df.rename(columns=columns)

df.head()

Unnamed: 0,year,revenue,cogs,gross_profit,gross_profit_ratio,operating_expenses,r_&_d_expenses,selling_g_&_a_exp,general_and_admin_exp,selling_and_marketing_exp,...,cash_at_the_end_of_period,cash_at_the_beginning_of_period,free_cash_flow,forex_rate,ticker,year_2,mkt_cap,country,ipo_year,Sector
0,1998.0,7952.0,4035.0,3917.0,0.4926,3475.0,948.0,2050.0,2050.0,,...,,,341.0,,A,1,40154130000.0,United States,1999.0,Industrials
1,1999.0,8331.0,4388.0,3943.0,0.4733,3202.0,997.0,2205.0,2205.0,,...,,,27.0,,A,2,40154130000.0,United States,1999.0,Industrials
2,2000.0,10773.0,5522.0,5251.0,0.4874,4198.0,1258.0,2940.0,2940.0,,...,996.0,,14.0,,A,3,40154130000.0,United States,1999.0,Industrials
3,2001.0,8396.0,5166.0,3230.0,0.3847,4008.0,1349.0,2659.0,2659.0,,...,1170.0,996.0,621.0,,A,4,40154130000.0,United States,1999.0,Industrials
4,2002.0,6010.0,3694.0,2316.0,0.3854,3923.0,1169.0,2754.0,2754.0,,...,1844.0,1170.0,-799.0,,A,5,40154130000.0,United States,1999.0,Industrials


In [7]:
df = df[~df['mkt_cap'].isna()]

In [12]:
df.to_csv("clust_df.csv")

## Clustering

In [8]:
features = ['revenue', 'cogs', 'gross_profit', 'gross_profit_ratio',
       'operating_expenses', 'r_&_d_expenses', 'selling_g_&_a_exp',
       'general_and_admin_exp', 'selling_and_marketing_exp',
       'other_expenses', 'cogs_and_expenses', 'interest_income',
       'interest_expense', 'depreciation_and_amortization', 'ebitda',
       'ebitda_ratio', 'operating_income', 'operating_income_ratio',
       'total_other_income_exp_gains', 'income_before_tax',
       'income_before_tax_ratio', 'income_tax_expense_gain', 'net_income',
       'net_income_ratio', 'eps', 'eps_diluted',
       'weighted_avg_shares_outs', 'weighted_avg_shares_outs_dil',
       'cash_and_cash_equivalents', 'short_term_investments',
       'cash_&_short_term_investments', 'net_receivables', 'inventory',
       'other_current_assets', 'total_current_assets', 'pp_&_e',
       'goodwill', 'intangible_assets', 'goodwill_and_intangible_assets',
       'investments', 'tax_assets', 'other_non_current_assets',
       'total_non_current_assets', 'other_assets', 'total_assets',
       'accounts_payable', 'short_term_debt', 'tax_payable',
       'deferred_revenue', 'other_current_liabilities',
       'total_current_liabilities', 'long_term_debt',
       'deferred_revenue_1', 'deferred_tax_liabilities',
       'other_non_current_liabilities', 'total_non_current_liabilities',
       'other_liabilities', 'capital_lease_obligations',
       'total_liabilities', 'preferred_stock', 'common_stock',
       'retained_earnings', 'other_compreh_income_loss',
       'other_total_stockhold_equity', 'total_stockholders_equity',
       'total_liab_&_stockhold_equity', 'minority_interest',
       'total_liabilities_&_equity', 'net_income_1',
       'depreciation_and_amortization_1', 'deferred_income_tax',
       'stock_based_compensation', 'change_in_working_capital',
       'accounts_receivable', 'inventory_1', 'accounts_payable_1',
       'other_working_capital', 'other_non_cash_items',
       'cash_provided_by_operating_activities', 'capex',
       'acquisitions_net', 'purchases_of_investments',
       'sales_maturities_of_investments', 'other_investing_activities',
       'cash_used_for_investing_activities', 'debt_repayment',
       'common_stock_issued', 'common_stock_repurchased',
       'dividends_paid', 'other_financing_activities',
       'cash_used_provided_by_financing_activities',
       'effect_of_forex_changes_on_cash', 'net_change_in_cash',
       'cash_at_the_end_of_period', 'cash_at_the_beginning_of_period',
       'free_cash_flow', 'forex_rate', 'year_2']

In [9]:
X = df[features]
y = df['mkt_cap']

X_train,X_test, y_train, y_test = train_test_split(X,y)

pipe = Pipeline(steps=[
    ('impute', SimpleImputer()),
    ('scaler', StandardScaler()),
    ('kmeans', KMeans()),
    ('regressor',XGBRegressor())
])

parameters = {
    'impute__strategy':['mean', 'median', 0], 
    'scaler':[StandardScaler(), MinMaxScaler(), Normalizer(), RobustScaler(), MaxAbsScaler()],
    'kmeans__n_clusters':[5,15, 30, 60, 120, 240],
    'regressor__n_estimators':[500, 1000,1500],
    'regressor__learning_rate':[0.005, 0.01],
    'regressor__max_depth':[1,3,5,7,9]
}

grid = GridSearchCV(estimator=pipe, param_grid=parameters,scoring='r2').fit(X_train, y_train)

best_pipe = grid.best_estimator_
best_params = grid.best_params_

print(f'best pipe: {best_pipe}')
print(f'best params: {best_params}')

print('Training set score: ' + str(grid.score(X_train, y_train)))
print('Test set score: ' + str(grid.score(X_test, y_test)))
# df['Groups'] = best_pipe.labels_

