In [1]:
import numpy as np
import pandas as pd
import sqlite3
import json
from lightgbm import LGBMRegressor
from datetime import datetime, timedelta, date
from collections import namedtuple, defaultdict
from typing import Tuple
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from scipy import stats
import matplotlib.pyplot as plt
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from typing import List, Tuple, Dict
from sklearn.compose import ColumnTransformer
import warnings
import plotly.express as px
import plotly.graph_objects as go
from xgboost import XGBRegressor
from copy import deepcopy
import os
import sys

module_path = os.path.abspath(os.path.join('../..'))
if module_path not in sys.path:
    sys.path.append(module_path)
    
from finance_ml.variants.linear_model.main import main
from finance_ml.variants.linear_model.hyperparams import Hyperparams
from finance_ml.variants.linear_model.preprocessing import (apply_engineered_columns,
    preprocess_data, preprocess_quarterly_data, preprocess_stockpup_data, read_stock_info,
    compare_to_market_indices)
from finance_ml.utils.constants import (
    QuarterlyColumns, StockPupColumns, STOCKPUP_TABLE_NAME, QUARTERLY_DB_FILE_PATH,
    YF_QUARTERLY_TABLE_NAME, INDEX_COLUMNS, MISSING_SECTOR, MISSING_INDUSTRY,
    STOCK_GENERAL_INFO_CSV, FORMULAE, Q_DELTA_PREFIX, YOY_DELTA_PREFIX,
    QUARTER, YEAR, VS_MKT_IDX, CATEGORICAL_COLUMNS, NUMERIC_COLUMNS
)
from finance_ml.variants.linear_model.config import FEATURE_COLUMNS


warnings.simplefilter(action='ignore', category=FutureWarning)

pd.set_option('display.max_columns', None)

%load_ext autoreload
%autoreload 2

In [2]:
from datetime import datetime, timedelta
import pandas as pd
from sklearn.pipeline import Pipeline

from finance_ml.utils.constants import (
    TARGET_COLUMN, QuarterlyColumns, CATEGORICAL_COLUMNS, INDEX_COLUMNS)
from finance_ml.utils.quarterly_index import QuarterlyIndex
from finance_ml.utils.transforms import (
    IndexSwitchTransformer, NumericalScaler, CategoricalToDummy, ColumnFilter, DateFilter,
    QuarterFilter, OutlierExtractor, CategoricalToNumeric, Splitter)

from finance_ml.variants.linear_model.preprocessing import preprocess_data
from finance_ml.variants.linear_model.hyperparams import Hyperparams
from finance_ml.variants.linear_model.train import train_and_evaluate


def _get_target_col_prediction(row: pd.Series, df: pd.DataFrame, hyperparams: Hyperparams):
    try:
        target_prediction_index = QuarterlyIndex(*row.name).time_travel(
            hyperparams.N_QUARTERS_OUT_TO_PREDICT).to_tuple()

        prediction_data = df.loc[target_prediction_index]
        return prediction_data[
            f'{hyperparams.PREDICTION_TARGET_PREFIX}{QuarterlyColumns.PRICE_AVG}']
    except:
        return None


In [3]:
hyperparams = Hyperparams()
df = preprocess_data(hyperparams)



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [5]:
df_copy = df.copy()
print(df.shape)

(62466, 99)


In [6]:
columns_to_drop = list(set(df.columns).difference({*FEATURE_COLUMNS}))
print(f"Dropping columns not in features: {columns_to_drop}")
df.drop(columns=columns_to_drop, inplace=True)
print(f"Shape after dropping columns: {df.shape}")

target_col = df.apply(_get_target_col_prediction, axis=1,
                             df=df, hyperparams=hyperparams)
df[TARGET_COLUMN] = target_col
print(f"Shape after adding Target Column: {df.shape}")

# Get dataframe of rows to make predictions on (most recent rows)
prediction_candidate_df = df[df[TARGET_COLUMN].isnull()]
df.dropna(subset=[TARGET_COLUMN], inplace=True)
print(f"Shape after dropping NA's in target column: {df.shape}")

non_categorical_columns = list(set(df.columns).difference({*CATEGORICAL_COLUMNS}))

numerical_scaler = NumericalScaler(columns=non_categorical_columns)
outlier_extractor = OutlierExtractor(columns=non_categorical_columns)
categorical_to_dummy = CategoricalToDummy(CATEGORICAL_COLUMNS)
categorical_to_numeric = CategoricalToNumeric(CATEGORICAL_COLUMNS)
date_filter = QuarterFilter(
    start_date=hyperparams.START_DATE,
    end_date=hyperparams.END_DATE or datetime.now() - timedelta(days=90))

data_pipeline = Pipeline(steps=[('filter_dates', date_filter)])

if hyperparams.EXTRACT_OUTLIERS:
    data_pipeline.steps.append(('extract_outliers', outlier_extractor))

if hyperparams.ONE_HOT_ENCODE:
    data_pipeline.steps.append(('one_hot_encode', categorical_to_dummy))
elif hyperparams.NUMERIC_ENCODE_CATEGORIES:
    data_pipeline.steps.append(('cat_to_numeric', categorical_to_numeric))

if hyperparams.SCALE_NUMERICS:
    data_pipeline.steps.append(('numeric_scaler', numerical_scaler))

X_transformed = data_pipeline.fit_transform(df)
print(X_transformed.shape)



Dropping columns not in features: ['AvgRecommendationScore', 'Date', 'Revenue', 'AvgRecommendations', 'Investments', 'Split', 'NetBorrowings', 'MarketCap']
Shape after dropping columns: (62466, 91)
Shape after adding Target Column: (62466, 92)
Shape after dropping NA's in target column: (58796, 92)
QuarterFilter removed 9838 rows
QuarterFilter output size: (48958, 92)
(48958, 92)


# LGBMRegressor

In [7]:
from sklearn.metrics import mean_squared_error

X_train, y_train, X_test, y_test = Splitter().transform(X_transformed)


model = LGBMRegressor(boosting_type='gbdt',
                      num_leaves=hyperparams.NUM_LEAVES,
                      max_depth=hyperparams.MAX_DEPTH,
                      learning_rate=hyperparams.LEARNING_RATE,
                      n_estimators=hyperparams.N_ESTIMATORS,
                      random_state=hyperparams.RANDOM_SEED)

model.fit(X_train, y_train,
        eval_set=[(X_test, y_test)],
        eval_metric='l1')

# y_pred = model.predict(X_test)

features = {feature: importance for feature, importance in sorted(zip(X_transformed.columns, model.feature_importances_), key=lambda i: i[1])}
print(f"Feature Importances: {features}")
print(f"Best Score: {model.best_score_}")
# print('The rmse of prediction is:', mean_squared_error(y_test, y_pred) ** 0.5)

[1]	valid_0's l1: 0.27108	valid_0's l2: 0.199382
[2]	valid_0's l1: 0.268545	valid_0's l2: 0.192886
[3]	valid_0's l1: 0.266277	valid_0's l2: 0.187559
[4]	valid_0's l1: 0.264309	valid_0's l2: 0.183523
[5]	valid_0's l1: 0.26302	valid_0's l2: 0.180243
[6]	valid_0's l1: 0.261708	valid_0's l2: 0.177981
[7]	valid_0's l1: 0.260567	valid_0's l2: 0.174814
[8]	valid_0's l1: 0.25959	valid_0's l2: 0.173211
[9]	valid_0's l1: 0.258799	valid_0's l2: 0.172217
[10]	valid_0's l1: 0.258173	valid_0's l2: 0.171121
[11]	valid_0's l1: 0.257389	valid_0's l2: 0.169239
[12]	valid_0's l1: 0.256849	valid_0's l2: 0.168151
[13]	valid_0's l1: 0.256963	valid_0's l2: 0.168753
[14]	valid_0's l1: 0.2563	valid_0's l2: 0.167836
[15]	valid_0's l1: 0.255912	valid_0's l2: 0.167153
[16]	valid_0's l1: 0.255329	valid_0's l2: 0.166171
[17]	valid_0's l1: 0.25482	valid_0's l2: 0.165793
[18]	valid_0's l1: 0.254657	valid_0's l2: 0.165969
[19]	valid_0's l1: 0.254239	valid_0's l2: 0.165698
[20]	valid_0's l1: 0.253864	valid_0's l2: 0.16

In [8]:
y_pred = model.predict(X_test)

In [9]:
y_pred = model.predict(X_test)

from sklearn import metrics
metrics_dict = {
        'Mean Absolute Error': metrics.mean_absolute_error(y_test, y_pred),
        'Mean Squared Error': metrics.mean_squared_error(y_test, y_pred),
        'Root Mean Squared Error': np.sqrt(metrics.mean_squared_error(y_test, y_pred)),
        'Absolute Percentage Error': abs(y_pred - y_test) * 100.
    }

metrics_dict

{'Mean Absolute Error': 0.2430384672649855,
 'Mean Squared Error': 0.15435780806762875,
 'Root Mean Squared Error': 0.39288396259917346,
 'Absolute Percentage Error': TickerSymbol  Quarter  Year
 PETS          4        2017    40.743757
 GLW           2        2019    11.823908
 ALL           4        2015     5.958519
 TIF           2        2011    10.098179
 JNPR          1        2014    26.566640
                                  ...    
 ARW           2        2016    12.259128
 ADM           2        2018    16.996599
 RRD           1        2009    65.765496
 FDX           3        2011     0.719094
 ADBE          4        2009    17.119110
 Name: PredictedPrice, Length: 9791, dtype: float64}

#  XGBRegressor

In [15]:
columns_to_drop = list(set(df.columns).difference({*FEATURE_COLUMNS}))
print(f"Dropping columns not in features: {columns_to_drop}")
df.drop(columns=columns_to_drop, inplace=True)
print(f"Shape after dropping columns: {df.shape}")

target_col = df.apply(_get_target_col_prediction, axis=1,
                             df=df, hyperparams=hyperparams)
df[TARGET_COLUMN] = target_col
print(f"Shape after adding Target Column: {df.shape}")

# Get dataframe of rows to make predictions on (most recent rows)
prediction_candidate_df = df[df[TARGET_COLUMN].isnull()]
df.dropna(subset=[TARGET_COLUMN], inplace=True)
print(f"Shape after dropping NA's in target column: {df.shape}")

non_categorical_columns = list(set(df.columns).difference({*CATEGORICAL_COLUMNS}))

numerical_scaler = NumericalScaler(columns=non_categorical_columns)
outlier_extractor = OutlierExtractor(columns=non_categorical_columns)
categorical_to_dummy = CategoricalToDummy(CATEGORICAL_COLUMNS)
categorical_to_numeric = CategoricalToNumeric(CATEGORICAL_COLUMNS)
date_filter = QuarterFilter(
    start_date=hyperparams.START_DATE,
    end_date=hyperparams.END_DATE or datetime.now() - timedelta(days=90))

data_pipeline_xgb = Pipeline(steps=[
    ('filter_dates', date_filter),
    ('one_hot_encode', categorical_to_dummy)
])


# if hyperparams.EXTRACT_OUTLIERS:
#     data_pipeline.steps.append(('extract_outliers', outlier_extractor))

# if True:  # hyperparams.ONE_HOT_ENCODE:
#     data_pipeline.steps.append(('one_hot_encode', categorical_to_dummy))
# elif hyperparams.NUMERIC_ENCODE_CATEGORIES:
#     data_pipeline.steps.append(('cat_to_numeric', categorical_to_numeric))

# if hyperparams.SCALE_NUMERICS:
#     data_pipeline.steps.append(('numeric_scaler', numerical_scaler))

X_transformed_xgb = data_pipeline_xgb.fit_transform(df)
print(X_transformed_xgb.shape)

Dropping columns not in features: ['PredictedPrice']
Shape after dropping columns: (55426, 91)
Shape after adding Target Column: (55426, 92)
Shape after dropping NA's in target column: (52152, 92)
QuarterFilter removed 9664 rows
QuarterFilter output size: (42488, 92)
Categorical Transform output size: (42488, 232)
(42488, 232)


In [22]:
X_train, y_train, X_test, y_test = Splitter().transform(X_transformed_xgb)

booster = 'gbtree'  #'dart'  #'gblinear' #'gbtree'

xgb_model = XGBRegressor(seed=100,
                         n_estimators=100,
                         max_depth=3,
                         learning_rate=0.1,
                         min_child_weight=1,
                         subsample=1,
                         colsample_bytree=1,
                         colsample_bylevel=1,
                         gamma=0,
                         booster=booster).fit(X_train, y_train)

if booster == 'gblinear':
    for i, col in enumerate(X_train.columns):
        print(f'The coefficient for {col} is {xgb_model.coef_[i]}')
    print(f'The intercept for our model is {xgb_model.intercept_}')

xgb_score = xgb_model.score(X_test, y_test)
print('*'*50)
print(f'The score of our model is {xgb_score}')

y_pred = xgb_model.predict(X_test)

from sklearn import metrics
metrics_dict = {
        'Mean Absolute Error': metrics.mean_absolute_error(y_test, y_pred),
        'Mean Squared Error': metrics.mean_squared_error(y_test, y_pred),
        'Root Mean Squared Error': np.sqrt(metrics.mean_squared_error(y_test, y_pred)),
        'Absolute Percentage Error': abs(y_pred - y_test) * 100.
    }

metrics_dict

**************************************************
The score of our model is 0.4352931906770352


{'Mean Absolute Error': 0.2520980908210366,
 'Mean Squared Error': 0.14870865934351812,
 'Root Mean Squared Error': 0.3856276174543495,
 'Absolute Percentage Error': TickerSymbol  Quarter  Year
 SANM          2        2011    36.379131
 VTR           1        2000    50.450473
 MO            4        2003     0.466646
 MYL           4        1999     3.667933
 KLAC          3        2017     5.451038
                                  ...    
 MTD           4        1999    17.731257
 INCY          2        2005    60.925632
 BWA           3        2007    28.729307
 UDR           4        2010     9.850912
 UNP           3        2012    21.604318
 Name: PredictedPrice, Length: 8497, dtype: float64}

In [None]:
'Mean Absolute Error': 0.2430384672649855,
 'Mean Squared Error': 0.15435780806762875,
 'Root Mean Squared Error': 0.39288396259917346,
 'Absolute Percentage Error': TickerSymbol

In [32]:
import copy
dp = copy.deepcopy(data_pipeline)
dp.steps[1:]

[('one_hot_encode',
  CategoricalToDummy(categorical_columns=['Sector', 'Industry', 'Quarter'],
                     drop_one=True, drop_original=True))]

In [28]:
dat

['__abstractmethods__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__len__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abc_impl',
 '_estimator_type',
 '_final_estimator',
 '_fit',
 '_get_param_names',
 '_get_params',
 '_get_tags',
 '_inverse_transform',
 '_iter',
 '_log_message',
 '_more_tags',
 '_pairwise',
 '_replace_estimator',
 '_required_parameters',
 '_set_params',
 '_transform',
 '_validate_names',
 '_validate_steps',
 'classes_',
 'decision_function',
 'fit',
 'fit_predict',
 'fit_transform',
 'get_params',
 'inverse_transform',
 'memory',
 'named_steps',
 'predict',
 'predict_log_proba',
 'predict_proba',
 'score',
 'score_samples',
 'set_para

In [51]:
y_out = X_test[FEATURE_COLUMNS[0]].reset_index()

y_out[TARGET_COLUMN] = y_pred
y_out = y_out.set_index(INDEX_COLUMNS).drop(columns=[FEATURE_COLUMNS[0]])
y_out

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,PredictedPrice
TickerSymbol,Quarter,Year,Unnamed: 3_level_1
SANM,2,2011,0.208810
VTR,1,2000,0.538723
MO,4,2003,0.094472
MYL,4,1999,0.261469
KLAC,3,2017,0.089598
...,...,...,...
MTD,4,1999,0.245512
INCY,2,2005,0.153492
BWA,3,2007,0.177270
UDR,4,2010,0.112256


In [55]:
a=y_out.nlargest(10, columns=[TARGET_COLUMN])

In [57]:
len(a)

10

In [44]:
y_pred.shape

(8497,)