# Financial Model Prediction

In [1]:
import pandas as pd
import numpy as np
import seaborn as sbn
import re

import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import RobustScaler, OneHotEncoder
from sklearn.metrics import classification_report
from sklearn.decomposition import PCA

import lightgbm as lgb

%matplotlib inline

  from numpy.core.umath_tests import inner1d


In [2]:
def add_shifts(df_, col_to_shift, new_col, shift):
    """
    This function add shifted columns to data by ticker.
    
    :param pd.DataFrame df_: Dataframe with financial data.
    :param str col_to_shift: Column over to create the shift.
    :param str new_col: Name of the shifted column.
    :param int shift: Days to use as shift.
    :return pd.DataFrame: Dataframe with the shift added.
    """
    
    for id_ in df_['ticker'].unique():
        df_by_id = df_[df_['ticker'] == id_]
        df_.loc[df_['ticker'] == id_, new_col] = df_by_id[col_to_shift] - df_by_id[col_to_shift].shift(shift)
        
    return df_

In [29]:
def get_non_n_cols(df_, n):
    """
    Get the columns that his window is not n days
    
    :param pd.DataFrame df_: Dataframe with financial data.
    :param int n: n days to get columns with.
    :return list: List with the name of the columns.
    """
    
    return [elem for elem in df_.columns if (re.search(r'\d+$', elem) is not None) and (int(elem[-2:].strip().strip('_')) < n)]

In [4]:
def interpolate_nan_values(df_, to_interpolate):
    """
    Interpolate and extrapolate nan values for numerical columns.
    
    :param pd.DataFrame df_: Dataframe with financial data with NaN values.
    :param list to_interpolate: List with columns to interpolate.
    :return pd.DataFrame: Dataframe with financial data without NaN values.
    """
    
    list_df = []
    for tick in df_['ticker'].unique():
        df_by_ticker = df_[df_['ticker'] == tick]
        for col in to_interpolate:
            df_by_ticker[col] = df_by_ticker[col].interpolate(method='linear', limit_direction='both')
        list_df.append(df_by_ticker)
    return pd.concat(list_df)

In [5]:
def categorize_each_difference(num_list, df_):
    """
    This function categorize the shifted columns in Weak Bull o Bear (W. Bull, W. Bear), 
    Bull or Bear and Strong Bull or Bear (S. Bull, S. Bear) depending on the value of the shifted column and
    his statistics (median, p25, p75) by ticker, year, month and sign.
    
    :param list num_list: List with days to categorize.
    :param pd.DataFrame df_: Dataframe to categorize.
    :return pd.DataFrame: Dataframe recalculated.
    """
    cols_to_keep = list(df_.columns)
    df_['year'], df_['month'] = df_['date'].dt.year, df_['date'].dt.month
    for num_ in num_list:
        df_.loc[df_['close_shifted_%i' % num_] >= 0, 'sign_%i' % num_] = 'Bull'
        df_.loc[df_['close_shifted_%i' % num_] < 0, 'sign_%i' % num_] = 'Bear'
        group = df_.groupby(['ticker', 'year', 'month', 'sign_%i' % num_])['close_shifted_%i' % num_].describe()
        group = group[['25%', '50%', '75%', 'std']].reset_index()
        group.rename({'std': 'std_%i' % num_}, axis='columns', inplace=True)
        df_ = pd.merge(left=df_, right=group, on=['ticker', 'year', 'month', 'sign_%i' % num_], how='inner')
        
        df_.loc[(df_['sign_%i' % num_] == 'Bull') & 
                (df_['close_shifted_%i' % num_] <= df_['50%']), 'cat_close_shifted_%i' % num_] = 'W. ' + df_['sign_%i' % num_]
        df_.loc[(df_['sign_%i' % num_] == 'Bull') & 
                   (df_['close_shifted_%i' % num_] > df_['50%']) &
                   (df_['close_shifted_%i' % num_] < df_['75%']), 'cat_close_shifted_%i' % num_] = df_['sign_%i' % num_]
        df_.loc[(df_['sign_%i' % num_] == 'Bull') &
                   (df_['close_shifted_%i' % num_] >= df_['75%']), 'cat_close_shifted_%i' % num_] = 'S. ' + df_['sign_%i' % num_]
        df_.loc[(df_['sign_%i' % num_] == 'Bear') & 
                (df_['close_shifted_%i' % num_] >= df_['50%']), 'cat_close_shifted_%i' % num_] = 'W. ' + df_['sign_%i' % num_]
        df_.loc[(df_['sign_%i' % num_] == 'Bear') & 
                   (df_['close_shifted_%i' % num_] < df_['50%']) &
                   (df_['close_shifted_%i' % num_] > df_['25%']), 'cat_close_shifted_%i' % num_] = df_['sign_%i' % num_]
        df_.loc[(df_['sign_%i' % num_] == 'Bear') & 
                   (df_['close_shifted_%i' % num_] <= df_['25%']), 'cat_close_shifted_%i' % num_] = 'S. ' + df_['sign_%i' % num_]
        
        df_.drop(['25%', '50%', '75%'], axis='columns', inplace=True)
        cols_to_keep.extend(['cat_close_shifted_%i' % num_, 'std_%i' % num_])
    return df_[cols_to_keep]

In [6]:
df_categorical = pd.read_csv('../data/db_bsm_categorical.csv')
df_financial = pd.read_csv('../data/db_bsm_financial.csv')

In [7]:
df_financial.replace(0, np.NaN, inplace=True)
df_financial.isnull().sum().to_frame('Null Values').loc[['close', 'volume']]

Unnamed: 0,Null Values
close,1081
volume,1996


In [8]:
df_financial_not_nan = interpolate_nan_values(df_financial, ['close', 'volume'])

In [9]:
df_financial_not_nan.isnull().sum().to_frame('Null Values').loc[['close', 'volume']]

Unnamed: 0,Null Values
close,0
volume,0


In [10]:
num_list = [3, 5, 7, 14, 21]

In [11]:
for num_ in num_list:
    df_fin = add_shifts(df_financial_not_nan, 'close', 'close_shifted_%i' % num_, num_)

In [12]:
df_fin.dropna(subset=['close_shifted_21'], inplace=True)

In [13]:
df_fin_not_nan = interpolate_nan_values(df_fin, list(df_fin.select_dtypes(float)))

In [14]:
df_fin_not_nan.isnull().sum().to_frame('Null Values').sort_values(by='Null Values', ascending=False).head(3)

Unnamed: 0,Null Values
ADX 14,0
RSI 21,0
ROCR 14,0


In [15]:
df_fin_not_nan['date'] = pd.to_datetime(df_fin_not_nan['date'])

In [16]:
%time df_final = categorize_each_difference(num_list, df_fin_not_nan)

Wall time: 13min 46s


In [17]:
df_categorical = df_categorical.dropna()
df_categorical = df_categorical.drop_duplicates(subset=['ticker'], keep='first')

In [18]:
df_final = pd.merge(left=df_final, right=df_categorical, how='inner', on='ticker')

In [19]:
df_final['sector_gics'].unique()

array(['Financials', 'Consumer discretionary', 'Communication services',
       'Energy', 'Industrials', 'Healthcare', 'Information technology',
       'Consumer staples', 'Utilities'], dtype=object)

In [23]:
# %time df_final = pd.get_dummies(df_final, columns='sector_gics', prefix='sector_gics_')

In [24]:
# df_final.loc[df_final['sector_gics'] == 'Utilities', 'sector_gics_Utilities'] = 1
# df_final.loc[df_final['sector_gics'] == 'Healthcare', 'sector_gics_Heatlhcare'] = 1
# df_final.loc[df_final['sector_gics'] == 'Financials', 'sector_gics_Financials'] = 1
# df_final.loc[df_final['sector_gics'] == 'Consumer discretionary', 'sector_gics_Consumer_discretionary'] = 1
# df_final.loc[df_final['sector_gics'] == 'Communication services', 'sector_gics_Communication_services'] = 1
# df_final.loc[df_final['sector_gics'] == 'Energy', 'sector_gics_Energy'] = 1
# df_final.loc[df_final['sector_gics'] == 'Industrials', 'sector_gics_Industrials'] = 1
# df_final.loc[df_final['sector_gics'] == 'Information technology', 'sector_gics_Information_technology'] = 1
# df_final.loc[df_final['sector_gics'] == 'Consumer staples', 'sector_gics_Consumer_staples'] = 1

In [25]:
# df_final[['sector_gics_Utilities', 'sector_gics_Heatlhcare',
#           'sector_gics_Financials', 'sector_gics_Consumer_discretionary',
#           'sector_gics_Communication_services', 'sector_gics_Energy',
#           'sector_gics_Industrials', 'sector_gics_Information_technology',
#           'sector_gics_Consumer_staples']] = df_final[['sector_gics_Utilities', 'sector_gics_Heatlhcare',
#           'sector_gics_Financials', 'sector_gics_Consumer_discretionary',
#           'sector_gics_Communication_services', 'sector_gics_Energy',
#           'sector_gics_Industrials', 'sector_gics_Information_technology',
#           'sector_gics_Consumer_staples']].replace(np.NaN, 0)

In [39]:
df_final = df_final.dropna()

In [40]:
df_final_train = df_final[df_final['date'].dt.year < 2019]
df_final_test = df_final[df_final['date'].dt.year >= 2019]

# Inforation Tech

In [47]:
df_final_train_inf_tech = df_final_train[df_final_train['sector_gics'] == 'Information technology']
df_final_test_inf_tech = df_final_test[df_final_test['sector_gics'] == 'Information technology']

In [48]:
df_final_train_inf_tech_filt = df_final_train_inf_tech

In [49]:
list_to_drop = get_non_n_cols(df_final_train_inf_tech_filt, 7)

In [50]:
df_final_train_inf_tech_filt.drop(list_to_drop, axis='columns', inplace=True)
df_final_test_inf_tech.drop(list_to_drop, axis='columns', inplace=True)

# 7 Days prediction

In [51]:
target_train = df_final_train_inf_tech_filt['cat_close_shifted_7']
features_train = df_final_train_inf_tech_filt[df_final_train_inf_tech_filt.select_dtypes(float).columns]
features_test = df_final_test_inf_tech[df_final_test_inf_tech.select_dtypes(float).columns]
target_test = df_final_test_inf_tech['cat_close_shifted_7']

In [52]:
random_forest = GridSearchCV(RandomForestClassifier(criterion='entropy'),
                   param_grid={'n_estimators': range(10, 21),
                              'max_depth': [7, 12, 15, 17]}, 
                   scoring='recall_macro', 
                   cv=5, verbose=10)
%time random_forest.fit(features_train.values, target_train.values)
random_forest = RandomForestClassifier(**random_forest.best_params_)
random_forest.fit(features_train.values, target_train.values)
print(classification_report(target_test.values, random_forest.predict(features_test.values)))

Fitting 5 folds for each of 44 candidates, totalling 220 fits
[CV] max_depth=7, n_estimators=10 ....................................
[CV]  max_depth=7, n_estimators=10, score=0.533663419280502, total=   3.3s
[CV] max_depth=7, n_estimators=10 ....................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    3.5s remaining:    0.0s


[CV]  max_depth=7, n_estimators=10, score=0.5318872924066862, total=   3.0s
[CV] max_depth=7, n_estimators=10 ....................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    6.9s remaining:    0.0s


[CV]  max_depth=7, n_estimators=10, score=0.5285016028267123, total=   3.0s
[CV] max_depth=7, n_estimators=10 ....................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   10.2s remaining:    0.0s


[CV]  max_depth=7, n_estimators=10, score=0.5328195251988442, total=   3.2s
[CV] max_depth=7, n_estimators=10 ....................................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   13.7s remaining:    0.0s


[CV]  max_depth=7, n_estimators=10, score=0.5283981382590321, total=   3.1s
[CV] max_depth=7, n_estimators=11 ....................................


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   17.1s remaining:    0.0s


[CV]  max_depth=7, n_estimators=11, score=0.5290952870336657, total=   3.6s
[CV] max_depth=7, n_estimators=11 ....................................


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:   21.1s remaining:    0.0s


[CV]  max_depth=7, n_estimators=11, score=0.5250574796857622, total=   3.6s
[CV] max_depth=7, n_estimators=11 ....................................


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:   24.9s remaining:    0.0s


[CV]  max_depth=7, n_estimators=11, score=0.5291682547479153, total=   3.5s
[CV] max_depth=7, n_estimators=11 ....................................


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:   28.8s remaining:    0.0s


[CV]  max_depth=7, n_estimators=11, score=0.5376368011811351, total=   3.5s
[CV] max_depth=7, n_estimators=11 ....................................


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:   32.6s remaining:    0.0s


[CV]  max_depth=7, n_estimators=11, score=0.5261514652584481, total=   3.5s
[CV] max_depth=7, n_estimators=12 ....................................
[CV]  max_depth=7, n_estimators=12, score=0.5417330823401131, total=   3.7s
[CV] max_depth=7, n_estimators=12 ....................................
[CV]  max_depth=7, n_estimators=12, score=0.5250548893249803, total=   4.2s
[CV] max_depth=7, n_estimators=12 ....................................
[CV]  max_depth=7, n_estimators=12, score=0.5207855348574771, total=   3.9s
[CV] max_depth=7, n_estimators=12 ....................................
[CV]  max_depth=7, n_estimators=12, score=0.5398364380830121, total=   3.9s
[CV] max_depth=7, n_estimators=12 ....................................
[CV]  max_depth=7, n_estimators=12, score=0.5310183828687846, total=   3.9s
[CV] max_depth=7, n_estimators=13 ....................................
[CV]  max_depth=7, n_estimators=13, score=0.5327236638738555, total=   4.1s
[CV] max_depth=7, n_estimators=13 ........

[Parallel(n_jobs=1)]: Done 220 out of 220 | elapsed: 23.8min finished


Wall time: 23min 58s
             precision    recall  f1-score   support

       Bear       0.15      0.07      0.10        28
       Bull       0.42      0.30      0.35        56
    S. Bear       0.59      0.50      0.54        52
    S. Bull       0.61      0.72      0.66        86
    W. Bear       0.66      0.83      0.74        93
    W. Bull       0.78      0.78      0.78       157

avg / total       0.63      0.65      0.63       472



In [36]:
features_test = df_final_test[df_final_test.select_dtypes(float).columns]
target_test = df_final_test['cat_close_shifted_3']
print(classification_report(target_test.values, knn.predict(features_test.values)))

              precision    recall  f1-score   support

        Bear       0.07      0.11      0.08       226
        Bull       0.11      0.06      0.07       482
     S. Bear       0.13      0.18      0.15       455
     S. Bull       0.18      0.14      0.16       725
     W. Bear       0.20      0.32      0.25       828
     W. Bull       0.34      0.23      0.28      1317

   micro avg       0.20      0.20      0.20      4033
   macro avg       0.17      0.17      0.17      4033
weighted avg       0.22      0.20      0.20      4033

