# Financial Model Prediction

In [58]:
import pandas as pd
import numpy as np
import seaborn as sbn
import re

import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, BaggingClassifier, AdaBoostClassifier
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import precision_score, recall_score
from sklearn.decomposition import PCA

import lightgbm as lgb

%matplotlib inline

In [4]:
def add_shifts(df_, col_to_shift, new_col, shift):
    """
    This function add shifted columns to data by ticker.
    
    :param pd.DataFrame df_: Dataframe with financial data.
    :param str col_to_shift: Column over to create the shift.
    :param str new_col: Name of the shifted column.
    :param int shift: Days to use as shift.
    :return pd.DataFrame: Dataframe with the shift added.
    """
    
    for id_ in df_['ticker'].unique():
        df_by_id = df_[df_['ticker'] == id_]
        df_.loc[df_['ticker'] == id_, new_col] = df_by_id[col_to_shift] - df_by_id[col_to_shift].shift(shift)
        
    return df_

In [5]:
def get_non_n_cols(df_, n):
    """
    Get the columns that his window is not n days
    
    :param pd.DataFrame df_: Dataframe with financial data.
    :param int n: n days to get columns with.
    :return list: List with the name of the columns.
    """
    
    return [elem for elem in df_.columns if (re.search(r'\d+$', elem) is not None) and (int(elem[-2:].strip().strip('_')) < n)]

In [6]:
def get_unwanted_cols(df_):
    
    return [elem for elem in df_.columns if elem.startswith('close_shifted') or elem.startswith('cat_close_shifted')]

In [7]:
def interpolate_nan_values(df_, to_interpolate):
    """
    Interpolate and extrapolate nan values for numerical columns.
    
    :param pd.DataFrame df_: Dataframe with financial data with NaN values.
    :param list to_interpolate: List with columns to interpolate.
    :return pd.DataFrame: Dataframe with financial data without NaN values.
    """
    
    list_df = []
    for tick in df_['ticker'].unique():
        df_by_ticker = df_[df_['ticker'] == tick]
        for col in to_interpolate:
            df_by_ticker[col] = df_by_ticker[col].interpolate(method='linear', limit_direction='both')
        list_df.append(df_by_ticker)
    return pd.concat(list_df)

In [8]:
def categorize_each_difference(num_list, df_):
    """
    This function categorize the shifted columns in Weak Bull o Bear (W. Bull, W. Bear), 
    Bull or Bear and Strong Bull or Bear (S. Bull, S. Bear) depending on the value of the shifted column and
    his statistics (median, p25, p75) by ticker, year, month and sign.
    
    :param list num_list: List with days to categorize.
    :param pd.DataFrame df_: Dataframe to categorize.
    :return pd.DataFrame: Dataframe recalculated.
    """
    cols_to_keep = list(df_.columns)
    df_['year'], df_['month'] = df_['date'].dt.year, df_['date'].dt.month
    for num_ in num_list:
        df_.loc[df_['close_shifted_%i' % num_] >= 0, 'sign_%i' % num_] = 'Bull'
        df_.loc[df_['close_shifted_%i' % num_] < 0, 'sign_%i' % num_] = 'Bear'
        group = df_.groupby(['ticker', 'year', 'month', 'sign_%i' % num_])['close_shifted_%i' % num_].describe()
        group = group[['25%', '50%', '75%', 'std']].reset_index()
        group.rename({'std': 'std_%i' % num_}, axis='columns', inplace=True)
        df_ = pd.merge(left=df_, right=group, on=['ticker', 'year', 'month', 'sign_%i' % num_], how='inner')
        
        df_.loc[(df_['sign_%i' % num_] == 'Bull') & 
                (df_['close_shifted_%i' % num_] <= df_['50%']), 'cat_close_shifted_%i' % num_] = 'W. ' + df_['sign_%i' % num_]
        df_.loc[(df_['sign_%i' % num_] == 'Bull') & 
                   (df_['close_shifted_%i' % num_] > df_['50%']) &
                   (df_['close_shifted_%i' % num_] < df_['75%']), 'cat_close_shifted_%i' % num_] = df_['sign_%i' % num_]
        df_.loc[(df_['sign_%i' % num_] == 'Bull') &
                   (df_['close_shifted_%i' % num_] >= df_['75%']), 'cat_close_shifted_%i' % num_] = 'S. ' + df_['sign_%i' % num_]
        df_.loc[(df_['sign_%i' % num_] == 'Bear') & 
                (df_['close_shifted_%i' % num_] >= df_['50%']), 'cat_close_shifted_%i' % num_] = 'W. ' + df_['sign_%i' % num_]
        df_.loc[(df_['sign_%i' % num_] == 'Bear') & 
                   (df_['close_shifted_%i' % num_] < df_['50%']) &
                   (df_['close_shifted_%i' % num_] > df_['25%']), 'cat_close_shifted_%i' % num_] = df_['sign_%i' % num_]
        df_.loc[(df_['sign_%i' % num_] == 'Bear') & 
                   (df_['close_shifted_%i' % num_] <= df_['25%']), 'cat_close_shifted_%i' % num_] = 'S. ' + df_['sign_%i' % num_]
        
        df_.drop(['25%', '50%', '75%', 'std_%i' % num_], axis='columns', inplace=True)
        cols_to_keep.extend(['cat_close_shifted_%i' % num_])
    return df_[cols_to_keep]

In [25]:
def get_correlated_columns(X_train, features_col):

    df_ = pd.DataFrame(data=X_train, columns=features_col)
    df_ = df_.corr()
    df_ = df_.drop('close', axis=1)
    df_ = df_.drop('close', axis=0)
    dict_ = {}
    df__ = df_[df_.abs() >=.9]
    df__ = df__[df__ < 1].dropna(axis=1, how='all').dropna(axis=0, how='all')
    list_ = list(df__.columns)
    for col in list_:
        dict_[col] = list(df__[df__[col].notnull()].index)
        for elem in dict_[col]:
            if elem in list_:
                list_.remove(elem)
    return dict_

In [57]:
def split_df_train_test_by_date(df_, year_, mode_debug=True):
    """
    This function separe the df_ in test and train by year and month
    
    :param pd.DataFrame df_: Dataframe to split.
    :param int year_: Year to use as split.
    :param bool mode_debug: Choose if return a df_train shorter.
    :return tuple: Tuple which contains both df's (train, test).
    """
    
    df_train = df_[(df_['date'].dt.year < year_)]
    if mode_debug:
        df_train = df_train[df_train['date'].dt.year >= year_ - 2]
    df_test = df_[(df_['date'].dt.year >= year_)]
    return df_train, df_test

In [None]:
def basic_models(X_train, y_train, X_test, y_test, day_, sector):
    scorings = ['recall_macro', 'recall_weighted', 'recall_micro', 
                'precision_macro', 'precision_micro', 'precisison_weighted']
    dict_log_reg = {}
    dict_knn = {}
    dict_dec_tree = {}
    dict_total = {}

    for scoring in scorings:
        log_reg = GridSearchCV(LogisticRegression(),
                               param_grid={'C': [1.2, 1, 0.8],
                                           'tol': [1e-3, 1e-4, 1e-5],
                                           'multi_class': ['ovr', 'multinomial']}, 
                               scoring=scoring, cv=5)
        print('Time to train log_reg for %d days, %s sector and %s' % (day_, sector, scoring))
        %time log_reg.fit(X_train, y_train)
        best_params = log_reg.best_params_
        log_reg = LogisticRegression(**log_reg.best_params_)
        val_score_log_reg = cross_val_score(log_reg, X_train, y_train, cv=5, scoring=scoring).mean()
        report = classification_report(y_test, log_reg.predict(X_test), digits=4, output_dict=True)
        dict_log_reg['log_reg'] = [log_reg, val_score_log_reg, report, best_params]
        
        knn = GridSearchCV(KNeighborsClassifier(),
                           param_grid={'n_neighbors': range(3, 9),
                                          'weights' : ['uniform', 'distance']}, 
                           scoring=scoring, cv=5)
        print('Time to train knn for %d days, %s sector and %s' % (day_, sector, scoring))
        %time knn.fit(X_train, y_train)
        best_params = knn.best_params_
        knn = KNeighborsClassifier(**knn.best_params_)
        val_score_knn = cross_val_score(knn, X_train, y_train, cv=5, scoring=scoring).mean()
        report = classification_report(y_test, knn.predict(X_test), digits=4, output_dict=True)
        dict_knn['knn'] = [knn, val_score_knn, report, best_params]
        
        dec_tree = GridSearchCV(DecisionTreeClassifier(),
                                param_grid={'criterion': ['gini', 'entropy'],
                                           'max_depth': range(10, 17),
                                           'min_samples_split': [2, 4, 6],
                                           'min_samples_leaf': [1, 2, 3]}, 
                                scoring=scoring, cv=5)
        print('Time to train dec_tree for %d days, %s sector and %s' % (day_, sector, scoring))
        %time dec_tree.fit(X_train, y_train)
        best_params = dec_tree.best_params_
        dec_tree = DecisionTreeClassifier(**dec_tree.best_params_)
        val_score_dec_tree = cross_val_score(dec_tree, X_train, y_train, cv=5, scoring=scoring).mean()
        report = classification_report(y_test, dec_tree.predict(X_test), digits=4, output_dict=True)
        dict_dec_tree['dec_tree'] = [dec_tree, val_score_dec_tree, report, best_params]
        
        if dict_log_reg['log_reg'][1] >= dict_knn['knn'][1]:
            if dict_log_reg['log_reg'][1] >= dict_dec_tree['dec_tree'][1]:
                if dict_total['total'][1] < dict_log_reg['log_reg'][1]:
                    dict_total['total'] = dict_log_reg['log_reg']
            else:
                if dict_total['total'][1] < dict_dec_tree['log_reg'][1]:
                    dict_total['total'] = dict_dec_tree['log_reg']
        else:
            if dict_knn['knn'][1] >= dict_dec_tree['dec_tree'][1]:
                if dict_total['total'][1] < dict_knn['log_reg'][1]:
                    dict_total['total'] = dict_knn['knn']
            else:
                if dict_total['total'][1] < dict_dec_tree['log_reg'][1]:
                    dict_total['total'] = dict_dec_tree['dec_tree']
            
                    
        
        
        
        
        


In [None]:
def make_magic(df_, days_, mode_debug=True):
    df_train, df_test = split_df_train_test_by_date(df_, 2019, mode_debug)
    for sector in df_['sector_gics'].unique():
        df_train_sct = df_train[df_train['sector_gics'] == sector]
        df_test_sct = df_test[df_test['sector_gics'] == sector]
        for day_ in days_:
            list_to_drop = get_non_n_cols(df_final_train_inf_tech, int(day_))
            df_train_sct_non_n = df_train_sct.drop(df_train_sct, axis='columns')
            df_test_sct_non_n = df_test_sct.drop(df_test_sct, axis='columns')
            
            target_train = df_train_sct_non_n['cat_close_shifted_%d' % day_]
            feat_train = df_train_sct_non_n[df_train_sct_non_n.select_dtypes(float).columns]
            
            target_test = df_test_sct_non_n['cat_close_shifted_%d' % day_]
            feat_test = df_test_sct_non_n[df_test_sct_non_n.select_dtypes(float).columns]
            
            list_to_drop_2 = get_unwanted_cols(feat_train)

            feat_train = feat_train.drop(list_to_drop_2, axis='columns')
            feat_test = feat_test.drop(list_to_drop_2, axis='columns')
            


    

In [9]:
df_categorical = pd.read_csv('../data/db_bsm_categorical.csv')
df_financial = pd.read_csv('../data/db_bsm_financial.csv')

In [10]:
df_financial.replace(0, np.NaN, inplace=True)
df_financial.isnull().sum().to_frame('Null Values').loc[['close', 'volume']]

Unnamed: 0,Null Values
close,1081
volume,1996


In [11]:
df_financial_not_nan = interpolate_nan_values(df_financial, ['close', 'volume'])

In [12]:
df_financial_not_nan.isnull().sum().to_frame('Null Values').loc[['close', 'volume']]

Unnamed: 0,Null Values
close,0
volume,0


In [13]:
num_list = [3, 5, 7, 14, 21]

In [14]:
df_financial_not_nan = df_financial_not_nan.sort_values(['ticker', 'date'], ascending=[True, False])

In [15]:
for num_ in num_list:
    df_fin = add_shifts(df_financial_not_nan, 'close', 'close_shifted_%i' % num_, num_)

In [16]:
df_fin.dropna(subset=['close_shifted_21'], inplace=True)

In [17]:
df_fin_not_nan = interpolate_nan_values(df_fin, list(df_fin.select_dtypes(float)))

In [18]:
df_fin_not_nan.isnull().sum().to_frame('Null Values').sort_values(by='Null Values', ascending=False).head(3)

Unnamed: 0,Null Values
ADX 14,0
RSI 21,0
ROCR 14,0


In [19]:
df_fin_not_nan['date'] = pd.to_datetime(df_fin_not_nan['date'])

In [48]:
%time df_final = categorize_each_difference(num_list, df_fin_not_nan)

Wall time: 3min 59s


In [49]:
df_categorical = df_categorical.dropna()
df_categorical = df_categorical.drop_duplicates(subset=['ticker'], keep='first')

In [50]:
df_final = pd.merge(left=df_final, right=df_categorical, how='inner', on='ticker')

In [51]:
df_final['sector_gics'].unique()

array(['Consumer discretionary', 'Industrials', 'Financials',
       'Consumer staples', 'Energy', 'Information technology',
       'Healthcare', 'Communication services', 'Utilities'], dtype=object)

In [52]:
df_final = df_final.dropna()

In [53]:
df_train, df_test = split_df_train_test_by_date(df_final, 2019)

In [54]:
df_train.shape[0] + df_test.shape[0]

297064

In [55]:
df_final.shape

(294687, 166)

In [126]:
df_final_train = df_final[df_final['date'].dt.year < 2019]
print(df_final_train.shape)
df_final_test = df_final[df_final['date'].dt.year >= 2019]
print(df_final_test.shape)

(292310, 166)
(2377, 166)


In [77]:
df_final_train = df_final_train[df_final_train['date'].dt.year >= 2017]

# Inforation Tech

In [127]:
df_final_train_inf_tech = df_final_train[df_final_train['sector_gics'] == 'Information technology']
df_final_test_inf_tech = df_final_test[df_final_test['sector_gics'] == 'Information technology']

(31880, 166)
(272, 166)


In [112]:
list_to_drop = get_non_n_cols(df_final_train_inf_tech, 7)

In [113]:
df_final_train_inf_tech.drop(list_to_drop, axis='columns', inplace=True)
df_final_test_inf_tech.drop(list_to_drop, axis='columns', inplace=True)

# 7 Days prediction

In [114]:
target_train = df_final_train_inf_tech['cat_close_shifted_7']
features_train = df_final_train_inf_tech[df_final_train_inf_tech.select_dtypes(float).columns]

In [115]:
target_test = df_final_test_inf_tech['cat_close_shifted_7']
features_test = df_final_test_inf_tech[df_final_test_inf_tech.select_dtypes(float).columns]

In [116]:
list_to_drop_2 = get_unwanted_cols(features_train)

In [117]:
features_train.drop(list_to_drop_2, axis='columns', inplace=True)

In [118]:
features_test.drop(list_to_drop_2, axis='columns', inplace=True)

In [61]:
random_forest = GridSearchCV(RandomForestClassifier(criterion='gini'),
                   param_grid={
                              'max_depth': range(10, 15)}, 
                   scoring='kk', 
                   cv=3, verbose=4)
%time random_forest.fit(features_train.values, target_train.values)
print('Best Params: ', random_forest.best_params_)
random_forest = RandomForestClassifier(**random_forest.best_params_)
random_forest.fit(features_train.values, target_train.values)
print(classification_report(target_test.values, random_forest.predict(features_test.values)))

NameError: name 'features_train' is not defined

AttributeError: 'GridSearchCV' object has no attribute 'best_params_'

In [122]:
random_forest = RandomForestClassifier(max_depth=13, n_estimators=11, criterion='entropy')
random_forest.fit(features_train.values, target_train.values)
print(classification_report(target_test.values, random_forest.predict(features_test.values)))

              precision    recall  f1-score   support

        Bear       0.00      0.00      0.00        29
        Bull       0.00      0.00      0.00        15
     S. Bear       0.46      0.11      0.17        56
     S. Bull       0.20      0.11      0.14        27
     W. Bear       0.41      0.66      0.51        97
     W. Bull       0.17      0.31      0.22        48

   micro avg       0.32      0.32      0.32       272
   macro avg       0.21      0.20      0.17       272
weighted avg       0.29      0.32      0.27       272



In [136]:
knn = GridSearchCV(KNeighborsClassifier(),
                   param_grid={'n_neighbors': range(3, 10)}, 
                   scoring='recall_macro', 
                   cv=3, verbose=4)
%time knn.fit(features_train.values, target_train.values)
print('Best Params: ', knn.best_params_)

Fitting 3 folds for each of 7 candidates, totalling 21 fits
[CV] n_neighbors=3 ...................................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ......... n_neighbors=3, score=0.16195887721603572, total=   0.8s
[CV] n_neighbors=3 ...................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.7s remaining:    0.0s


[CV] ......... n_neighbors=3, score=0.16048265200005754, total=   1.0s
[CV] n_neighbors=3 ...................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    3.7s remaining:    0.0s


[CV] ............ n_neighbors=3, score=0.16030873633901, total=   0.8s
[CV] n_neighbors=4 ...................................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    5.5s remaining:    0.0s


[CV] ......... n_neighbors=4, score=0.16335392291917344, total=   0.8s
[CV] n_neighbors=4 ...................................................
[CV] ......... n_neighbors=4, score=0.15857739053911743, total=   1.1s
[CV] n_neighbors=4 ...................................................
[CV] .......... n_neighbors=4, score=0.1594448884051454, total=   0.9s
[CV] n_neighbors=5 ...................................................
[CV] ......... n_neighbors=5, score=0.16423004389879844, total=   1.0s
[CV] n_neighbors=5 ...................................................
[CV] ......... n_neighbors=5, score=0.15829957612885817, total=   1.1s
[CV] n_neighbors=5 ...................................................
[CV] ......... n_neighbors=5, score=0.15845938140385749, total=   0.9s
[CV] n_neighbors=6 ...................................................
[CV] ......... n_neighbors=6, score=0.16612131631930263, total=   0.9s
[CV] n_neighbors=6 ...................................................
[CV] .

[Parallel(n_jobs=1)]: Done  21 out of  21 | elapsed:   42.2s finished


Wall time: 42.7 s
Best Params:  {'n_neighbors': 9}


In [137]:
knn = KNeighborsClassifier(**knn.best_params_)
knn.fit(features_train.values, target_train.values)
print(classification_report(target_test.values, knn.predict(features_test.values)))

              precision    recall  f1-score   support

        Bear       0.00      0.00      0.00        29
        Bull       0.00      0.00      0.00        15
     S. Bear       0.23      0.25      0.24        56
     S. Bull       0.09      0.15      0.11        27
     W. Bear       0.30      0.25      0.27        97
     W. Bull       0.19      0.27      0.22        48

   micro avg       0.20      0.20      0.20       272
   macro avg       0.13      0.15      0.14       272
weighted avg       0.19      0.20      0.20       272



In [138]:
knn = GridSearchCV(KNeighborsClassifier(weights='distance'),
                   param_grid={'n_neighbors': range(3, 10)}, 
                   scoring='recall_macro', 
                   cv=3, verbose=4)
%time knn.fit(features_train.values, target_train.values)
print('Best Params: ', knn.best_params_)

Fitting 3 folds for each of 7 candidates, totalling 21 fits
[CV] n_neighbors=3 ...................................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] .......... n_neighbors=3, score=0.1621027090500441, total=   0.9s
[CV] n_neighbors=3 ...................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.8s remaining:    0.0s


[CV] ......... n_neighbors=3, score=0.16081995274617153, total=   1.1s
[CV] n_neighbors=3 ...................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    3.8s remaining:    0.0s


[CV] ......... n_neighbors=3, score=0.16457754313528414, total=   0.8s
[CV] n_neighbors=4 ...................................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    5.6s remaining:    0.0s


[CV] ......... n_neighbors=4, score=0.16497095896298683, total=   0.8s
[CV] n_neighbors=4 ...................................................
[CV] ......... n_neighbors=4, score=0.15789179564914255, total=   1.1s
[CV] n_neighbors=4 ...................................................
[CV] ......... n_neighbors=4, score=0.16253280912537707, total=   0.9s
[CV] n_neighbors=5 ...................................................
[CV] ......... n_neighbors=5, score=0.16494521263180248, total=   0.9s
[CV] n_neighbors=5 ...................................................
[CV] .......... n_neighbors=5, score=0.1581398323614827, total=   1.1s
[CV] n_neighbors=5 ...................................................
[CV] ......... n_neighbors=5, score=0.16023776973318316, total=   0.9s
[CV] n_neighbors=6 ...................................................
[CV] .......... n_neighbors=6, score=0.1635097351675134, total=   0.9s
[CV] n_neighbors=6 ...................................................
[CV] .

[Parallel(n_jobs=1)]: Done  21 out of  21 | elapsed:   43.5s finished


Wall time: 44.2 s
Best Params:  {'n_neighbors': 3}


In [144]:
knn = KNeighborsClassifier(n_neighbors=3, weights='distance')
knn.fit(features_train.values, target_train.values)
print(classification_report(target_test.values, knn.predict(features_test.values)))

              precision    recall  f1-score   support

        Bear       0.20      0.07      0.10        29
        Bull       0.10      0.20      0.14        15
     S. Bear       0.33      0.32      0.32        56
     S. Bull       0.06      0.07      0.07        27
     W. Bear       0.42      0.30      0.35        97
     W. Bull       0.16      0.25      0.20        48

   micro avg       0.24      0.24      0.24       272
   macro avg       0.21      0.20      0.20       272
weighted avg       0.28      0.24      0.25       272



In [161]:
features_train_rs = RobustScaler().fit_transform(features_train.values)
features_test_rs = RobustScaler().fit_transform(features_test.values)
knn = KNeighborsClassifier(n_neighbors=10, weights='distance')
knn.fit(features_train_rs, target_train)
print(classification_report(target_test, knn.predict(features_test_rs)))

              precision    recall  f1-score   support

        Bear       0.00      0.00      0.00        29
        Bull       0.00      0.00      0.00        15
     S. Bear       0.29      0.18      0.22        56
     S. Bull       0.23      0.26      0.25        27
     W. Bear       0.39      0.42      0.40        97
     W. Bull       0.19      0.31      0.24        48

   micro avg       0.27      0.27      0.27       272
   macro avg       0.18      0.20      0.18       272
weighted avg       0.26      0.27      0.26       272



In [184]:
pca = PCA(n_components=int(features_train.shape[1]/3))

In [185]:
X_train_pca = pca.fit_transform(features_train)
X_test_pca = pca.transform(features_test)

In [230]:
target_test.loc[target_test.str.endswith('Bear')] = 'Bear'
target_test.loc[target_test.str.endswith('Bull')] = 'Bull'

In [231]:
target_train.loc[target_train.str.endswith('Bear')] = 'Bear'
target_train.loc[target_train.str.endswith('Bull')] = 'Bull'

In [232]:
knn = KNeighborsClassifier(n_neighbors=3, weights='distance')
knn.fit(X_train_pca, target_train)
print(classification_report(target_test, knn.predict(X_test_pca)))

              precision    recall  f1-score   support

        Bear       0.68      0.54      0.60       182
        Bull       0.34      0.48      0.40        90

   micro avg       0.52      0.52      0.52       272
   macro avg       0.51      0.51      0.50       272
weighted avg       0.57      0.52      0.54       272



In [192]:
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train_pca, target_train)
print(classification_report(target_test, knn.predict(X_test_pca)))

              precision    recall  f1-score   support

        Bear       0.11      0.14      0.12        29
        Bull       0.13      0.33      0.19        15
     S. Bear       0.26      0.32      0.29        56
     S. Bull       0.08      0.07      0.08        27
     W. Bear       0.46      0.24      0.31        97
     W. Bull       0.18      0.21      0.19        48

   micro avg       0.23      0.23      0.23       272
   macro avg       0.20      0.22      0.20       272
weighted avg       0.28      0.23      0.24       272



In [220]:
vot = VotingClassifier(estimators=[('RanFor', RandomForestClassifier(max_depth=14, n_estimators=10)),
                                   ('KNN', KNeighborsClassifier(n_neighbors=3, weights='distance'))])

In [221]:
vot.fit(X_train_pca, target_train)

VotingClassifier(estimators=[('RanFor', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=14, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_...i',
           metric_params=None, n_jobs=None, n_neighbors=3, p=2,
           weights='distance'))],
         flatten_transform=None, n_jobs=None, voting='hard', weights=None)

In [222]:
print(classification_report(target_test, vot.predict(X_test_pca)))

              precision    recall  f1-score   support

        Bear       0.15      0.07      0.10        29
        Bull       0.13      0.27      0.17        15
     S. Bear       0.33      0.38      0.35        56
     S. Bull       0.09      0.15      0.11        27
     W. Bear       0.43      0.38      0.40        97
     W. Bull       0.18      0.12      0.15        48

   micro avg       0.27      0.27      0.27       272
   macro avg       0.22      0.23      0.21       272
weighted avg       0.28      0.27      0.27       272



In [219]:
bagg = BaggingClassifier(base_estimator=KNeighborsClassifier(n_neighbors=3, weights='distance'), n_estimators=100)

In [218]:
bagg.fit(X_train_pca, target_train)
print(classification_report(target_test, bagg.predict(X_test_pca)))

              precision    recall  f1-score   support

        Bear       0.11      0.03      0.05        29
        Bull       0.11      0.20      0.14        15
     S. Bear       0.32      0.32      0.32        56
     S. Bull       0.09      0.11      0.10        27
     W. Bear       0.38      0.28      0.32        97
     W. Bull       0.16      0.25      0.19        48

   micro avg       0.24      0.24      0.24       272
   macro avg       0.20      0.20      0.19       272
weighted avg       0.26      0.24      0.24       272



In [204]:
ada = AdaBoostClassifier(n_estimators=75)

In [208]:
cross_val_score(ada, X_train_pca, target_train, scoring='recall_macro')

array([0.18140262, 0.17161353, 0.1878066 ])

In [206]:
ada.fit(X_train_pca, target_train)
print(classification_report(target_test, ada.predict(X_test_pca)))

              precision    recall  f1-score   support

        Bear       0.00      0.00      0.00        29
        Bull       0.00      0.00      0.00        15
     S. Bear       0.39      0.12      0.19        56
     S. Bull       0.00      0.00      0.00        27
     W. Bear       0.39      0.63      0.48        97
     W. Bull       0.10      0.17      0.12        48

   micro avg       0.28      0.28      0.28       272
   macro avg       0.15      0.15      0.13       272
weighted avg       0.23      0.28      0.23       272



In [148]:
light = GridSearchCV(lgb.LGBMClassifier(),
                   param_grid={'max_depth': range(9, 16),
                              'learning_rate': [0.1, 0.3, 0.5, 0.7],
                              'n_estimators': [75, 100, 125]}, 
                   scoring='recall_macro', 
                   cv=3, verbose=4)

In [149]:
%time light.fit(features_train.values, target_train.values)
print('Best params are:', light.best_params_, '\n')
light_best = lgb.LGBMClassifier(**light.best_params_)
light_best.fit(features_train.values, target_train.values)
print(classification_report(target_test.values, light_best.predict(features_test.values)))

Fitting 3 folds for each of 84 candidates, totalling 252 fits
[CV] learning_rate=0.1, max_depth=9, n_estimators=75 .................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  learning_rate=0.1, max_depth=9, n_estimators=75, score=0.18564433488154075, total=   7.1s
[CV] learning_rate=0.1, max_depth=9, n_estimators=75 .................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    7.5s remaining:    0.0s


[CV]  learning_rate=0.1, max_depth=9, n_estimators=75, score=0.17743719501864266, total=   7.6s
[CV] learning_rate=0.1, max_depth=9, n_estimators=75 .................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   15.6s remaining:    0.0s


[CV]  learning_rate=0.1, max_depth=9, n_estimators=75, score=0.1847917905718536, total=   8.6s
[CV] learning_rate=0.1, max_depth=9, n_estimators=100 ................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   25.0s remaining:    0.0s


[CV]  learning_rate=0.1, max_depth=9, n_estimators=100, score=0.18765261929742905, total=   9.5s
[CV] learning_rate=0.1, max_depth=9, n_estimators=100 ................
[CV]  learning_rate=0.1, max_depth=9, n_estimators=100, score=0.17717833575650163, total=  10.4s
[CV] learning_rate=0.1, max_depth=9, n_estimators=100 ................
[CV]  learning_rate=0.1, max_depth=9, n_estimators=100, score=0.18420068490684058, total=   9.8s
[CV] learning_rate=0.1, max_depth=9, n_estimators=125 ................
[CV]  learning_rate=0.1, max_depth=9, n_estimators=125, score=0.18876445076211956, total=  12.0s
[CV] learning_rate=0.1, max_depth=9, n_estimators=125 ................
[CV]  learning_rate=0.1, max_depth=9, n_estimators=125, score=0.1772101561275333, total=  13.4s
[CV] learning_rate=0.1, max_depth=9, n_estimators=125 ................
[CV]  learning_rate=0.1, max_depth=9, n_estimators=125, score=0.18143331959615097, total=  12.9s
[CV] learning_rate=0.1, max_depth=10, n_estimators=75 ..........

[Parallel(n_jobs=1)]: Done 252 out of 252 | elapsed: 58.3min finished


Wall time: 58min 28s
Best params are: {'learning_rate': 0.3, 'max_depth': 15, 'n_estimators': 75} 

              precision    recall  f1-score   support

        Bear       0.00      0.00      0.00        29
        Bull       0.17      0.07      0.10        15
     S. Bear       0.30      0.11      0.16        56
     S. Bull       0.14      0.15      0.15        27
     W. Bear       0.35      0.43      0.39        97
     W. Bull       0.20      0.40      0.27        48

   micro avg       0.26      0.26      0.26       272
   macro avg       0.19      0.19      0.18       272
weighted avg       0.25      0.26      0.24       272

