# Module 3 Homework

In [1]:
# IMPORTS
import numpy as np
import pandas as pd

#Fin Data Sources
import yfinance as yf
import pandas_datareader as pdr

#Data viz
import plotly.graph_objs as go
import plotly.graph_objects as go
import plotly.express as px

import time
from datetime import date

# for graphs
import matplotlib.pyplot as plt

In [2]:
# full dataset for 33 stocks
df_full = pd.read_parquet("../03-modeling/content/stocks_df_combined_2024_05_07.parquet.brotli", )

In [3]:
# features
GROWTH = [g for g in df_full.keys() if (g.find('growth_')==0)&(g.find('future')<0)]
OHLCV = ['Open','High','Low','Close','Adj Close_x','Volume']
CATEGORICAL = ['Month', 'Weekday', 'Ticker', 'ticker_type']
TO_PREDICT = [g for g in df_full.keys() if (g.find('future')>=0)]
TO_DROP = ['Year','Date','index_x', 'index_y', 'index', 'Quarter','Adj Close_y'] + CATEGORICAL + OHLCV

# manually defined features
CUSTOM_NUMERICAL = ['SMA10', 'SMA20', 'growing_moving_average', 'high_minus_low_relative','volatility', 'ln_volume']

# All Supported Ta-lib indicators: https://github.com/TA-Lib/ta-lib-python/blob/master/docs/funcs.md
TECHNICAL_INDICATORS = ['adx', 'adxr', 'apo', 'aroon_1','aroon_2', 'aroonosc',
 'bop', 'cci', 'cmo','dx', 'macd', 'macdsignal', 'macdhist', 'macd_ext',
 'macdsignal_ext', 'macdhist_ext', 'macd_fix', 'macdsignal_fix',
 'macdhist_fix', 'mfi', 'minus_di', 'mom', 'plus_di', 'dm', 'ppo',
 'roc', 'rocp', 'rocr', 'rocr100', 'rsi', 'slowk', 'slowd', 'fastk',
 'fastd', 'fastk_rsi', 'fastd_rsi', 'trix', 'ultosc', 'willr',
 'ad', 'adosc', 'obv', 'atr', 'natr', 'ht_dcperiod', 'ht_dcphase',
 'ht_phasor_inphase', 'ht_phasor_quadrature', 'ht_sine_sine', 'ht_sine_leadsine',
 'ht_trendmod', 'avgprice', 'medprice', 'typprice', 'wclprice']
TECHNICAL_PATTERNS = [g for g in df_full.keys() if g.find('cdl')>=0]
MACRO = ['gdppot_us_yoy', 'gdppot_us_qoq', 'cpi_core_yoy', 'cpi_core_mom', 'FEDFUNDS',
 'DGS1', 'DGS5', 'DGS10']
NUMERICAL = GROWTH + TECHNICAL_INDICATORS + TECHNICAL_PATTERNS + CUSTOM_NUMERICAL + MACRO

# CHECK: NO OTHER INDICATORS LEFT
OTHER = [k for k in df_full.keys() if k not in OHLCV + CATEGORICAL + NUMERICAL + TO_DROP]

In [4]:
# let's define on more custom numerical features
df_full['ln_volume'] = df_full.Volume.apply(lambda x: np.log(x) if x > 0 else 0)

# tickers, min-max date, count of daily observations
# df_full.groupby(['Ticker'])['Date'].agg(['min','max','count'])

In [5]:
# truncated df_full with 25 years of data (and defined growth variables)
df = df_full[df_full.Date >= '2000-01-01']
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 182675 entries, 3490 to 5426
Columns: 203 entries, Open to ln_volume
dtypes: datetime64[ns](3), float64(129), int32(64), int64(5), object(2)
memory usage: 239.7+ MB


## Question 1 (1 point): Dummies on Month and Week-of-Month

**Find the ABSOLUTE CORRELATION VALUE of the most correlated dummy <month-week_of_month> with the binary outcome variable `is_positive_growth_5d_future`?**

You saw in the correlation analysis and modeling that September and October may be important seasonal months. In this task, we'll go futher and try to generate dummies for Month and Week-of-month (starting from 1). For example, the first week of October should be coded similar to this: 'October_w1'.
Once you've generated the new set of variables, find the most correlated (in absolute value) one with `is_positive_growth_5d_future` and round it to 3 digits after the comma.

Suggested path to a solution:
- [[Source](https://stackoverflow.com/questions/25249033/week-of-a-month-pandas)] Use this formula to get the week of month for the datetime variable d: `(d.day-1)//7+1` 
- Define a new string variable for all month-week_of_month combinations. Append it to the CATEGORICAL features set. You should have 5 variables treated as CATEGORICAL now: 'Month', 'Weekday', 'Ticker', 'ticker_type', 'month_wom'. In the end, you should get 115 dummy features, including 60 (=12*5) week_month_of_week dummies.
- Use [pandas.get_dummies()](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.get_dummies.html) to generate dummies.
- Use [pandas.DataFrame.corr()](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.corr.html) function (also used in [Code Snippet 1]) to get correlations with `is_positive_growth_5d_future`, filter out only variables representing the new dummy set, and sort it by absolute values (you can define a new column "abs_corr" in the dataframe with correlations), and find the highest value (among the new dummies features set).

**NOTE**: new dummies will be used as features in the next tasks, please leave them in the dataset.

In [6]:
# generate week of month features
df['month_wom'] = df.Date.dt.strftime('%B')+"_w"+((df.Date.dt.day - 1) // 7 + 1).astype(str)

# dummy variables are not generated from Date and numeric variables
df.loc[:,'Month'] = df.Month.dt.strftime('%B')
df.loc[:,'Weekday'] = df.Weekday.astype(str)

# Generate dummy variables (no need for bool, let's have int32 instead)
CATS = CATEGORICAL + ['month_wom']
dummy_variables = pd.get_dummies(df[CATS], dtype='int32')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['month_wom'] = df.Date.dt.strftime('%B')+"_w"+((df.Date.dt.day - 1) // 7 + 1).astype(str)
  df.loc[:,'Month'] = df.Month.dt.strftime('%B')
  df.loc[:,'Weekday'] = df.Weekday.astype(str)


In [7]:
# get dummies names in a list
DUMMIES = dummy_variables.keys().to_list()

# Concatenate the dummy variables with the original DataFrame
df_with_dummies = pd.concat([df, dummy_variables], axis=1)

In [8]:
df_with_dummies[NUMERICAL+DUMMIES].info()

<class 'pandas.core.frame.DataFrame'>
Index: 182675 entries, 3490 to 5426
Columns: 299 entries, growth_1d to month_wom_September_w5
dtypes: float64(121), int32(177), int64(1)
memory usage: 294.8 MB


In [9]:
# correlation analysis
corr_is_positive_growth_5d_future = df_with_dummies[NUMERICAL+DUMMIES+TO_PREDICT].corr()['is_positive_growth_5d_future']

# create a dataframe for an easy way to sort
corr_is_positive_growth_5d_future_df = pd.DataFrame(corr_is_positive_growth_5d_future)

In [10]:
corr_is_positive_growth_5d_future_df.sort_values(by='is_positive_growth_5d_future').head(5)

Unnamed: 0,is_positive_growth_5d_future
DGS10,-0.036227
month_wom_September_w3,-0.034537
gdppot_us_yoy,-0.034185
gdppot_us_qoq,-0.032138
DGS5,-0.030369


In [11]:
corr_is_positive_growth_5d_future_df.sort_values(by='is_positive_growth_5d_future').tail(8)

Unnamed: 0,is_positive_growth_5d_future
month_wom_October_w5,0.026023
month_wom_March_w4,0.026058
growth_btc_usd_30d,0.027712
growth_future_5d,0.668535
is_positive_growth_5d_future,1.0
cdl3starsinsouth,
cdlconcealbabyswall,
cdlmathold,


In [12]:
corr_is_positive_growth_5d_future_df.sort_values(by='is_positive_growth_5d_future', ascending=False).head(5)

Unnamed: 0,is_positive_growth_5d_future
is_positive_growth_5d_future,1.0
growth_future_5d,0.668535
growth_btc_usd_30d,0.027712
month_wom_March_w4,0.026058
month_wom_October_w5,0.026023


In [13]:
corr_growth_future_5d = df_with_dummies[NUMERICAL+DUMMIES+TO_PREDICT].corr()['growth_future_5d']
corr_growth_future_5d_df = pd.DataFrame(corr_growth_future_5d)

In [14]:
corr_growth_future_5d_df.sort_values(by='growth_future_5d').head(5)

Unnamed: 0,growth_future_5d
growth_7d,-0.05068
growth_3d,-0.049925
growth_brent_oil_365d,-0.041784
roc,-0.040579
rocr,-0.040579


In [15]:
corr_growth_future_5d_df.sort_values(by='growth_future_5d').tail(8)

Unnamed: 0,growth_future_5d
growth_btc_usd_3d,0.032382
natr,0.036051
growth_btc_usd_7d,0.036124
is_positive_growth_5d_future,0.668535
growth_future_5d,1.0
cdl3starsinsouth,
cdlconcealbabyswall,
cdlmathold,


In [16]:
corr = df_with_dummies[NUMERICAL+DUMMIES+TO_PREDICT].corr()['is_positive_growth_5d_future']

In [17]:
corr.abs().sort_values(ascending=False)

is_positive_growth_5d_future    1.000000
growth_future_5d                0.668535
DGS10                           0.036227
month_wom_September_w3          0.034537
gdppot_us_yoy                   0.034185
                                  ...   
month_wom_March_w2              0.000036
month_wom_November_w1           0.000033
cdl3starsinsouth                     NaN
cdlconcealbabyswall                  NaN
cdlmathold                           NaN
Name: is_positive_growth_5d_future, Length: 301, dtype: float64

In [18]:
abs_corr = abs(round(corr['month_wom_September_w3'], 3))
abs_corr

0.035

## Question 2 (2 points): Define new "hand" rules on macro and technical indicators variables

**What is the precision score for the best of the NEW predictions (pred3 or pred4), rounded to 3 digits after the comma?**

Let's utilize the knowledge from the visualised tree (clf10) (Code Snippet 5: 1.4.4 Visualisation):

* You're asked to define two new 'hand' rules (leading to 'positive' subtrees): 
  - `pred3_manual_gdp_fastd`: (gdppot_us_yoy <= 0.027) & (fastd >= 0.251)
  - `pred4_manual_gdp_wti_oil`: (gdppot_us_yoy >= 0.027) & (growth_wti_oil_30d <= 1.005)

* Extend the Code Snippet 3 (Manual "hand rule" predictions): Calculate and add new rules (pred3 and pred4) to the dataframe.You should notice that one of the predictions doesn't have any positive predictions on TEST dataset (while it has many on TRAIN+VALIDATION). 

* Debug: check in the `new_df` and the original dataset/data generation process that we didn't make any mistakes during the data transformation step.

* Explain why this can happen even if there are no errors in the data features.

* As a result, write down the precision score for the remaining predictor (round to three decimal points). E.g. if you have 0.57897, your answer should be 0.579.

In [19]:
def temporal_split(df, min_date, max_date, train_prop=0.7, val_prop=0.15, test_prop=0.15):
    """
    Splits a DataFrame into three buckets based on the temporal order of the 'Date' column.

    Args:
        df (DataFrame): The DataFrame to split.
        min_date (str or Timestamp): Minimum date in the DataFrame.
        max_date (str or Timestamp): Maximum date in the DataFrame.
        train_prop (float): Proportion of data for training set (default: 0.6).
        val_prop (float): Proportion of data for validation set (default: 0.2).
        test_prop (float): Proportion of data for test set (default: 0.2).

    Returns:
        DataFrame: The input DataFrame with a new column 'split' indicating the split for each row.
    """
    # Define the date intervals
    train_end = min_date + pd.Timedelta(days=(max_date - min_date).days * train_prop)
    val_end = train_end + pd.Timedelta(days=(max_date - min_date).days * val_prop)

    # Assign split labels based on date ranges
    split_labels = []
    for date in df['Date']:
        if date <= train_end:
            split_labels.append('train')
        elif date <= val_end:
            split_labels.append('validation')
        else:
            split_labels.append('test')

    # Add 'split' column to the DataFrame
    df['split'] = split_labels

    return df

In [20]:
min_date_df = df_with_dummies.Date.min()
max_date_df = df_with_dummies.Date.max()

df_with_dummies = temporal_split(
    df_with_dummies,
    min_date = min_date_df,
    max_date = max_date_df,
)
df_with_dummies['split'].value_counts()/len(df_with_dummies)

# remove the "segmentation" problem (warning message on df performance after many joins and data transformations)
new_df = df_with_dummies.copy()

# Full dataframe (transformed and truncated to 25 years)
new_df.info()

# check one record: it has abs. values, text, and numbers
new_df.head(1)

<class 'pandas.core.frame.DataFrame'>
Index: 182675 entries, 3490 to 5426
Columns: 320 entries, Open to split
dtypes: datetime64[ns](2), float64(129), int32(178), int64(5), object(6)
memory usage: 323.3+ MB


Unnamed: 0,Open,High,Low,Close,Adj Close_x,Volume,Ticker,Year,Month,Weekday,...,month_wom_October_w2,month_wom_October_w3,month_wom_October_w4,month_wom_October_w5,month_wom_September_w1,month_wom_September_w2,month_wom_September_w3,month_wom_September_w4,month_wom_September_w5,split
3490,58.6875,59.3125,56.0,58.28125,36.065567,53228400.0,MSFT,2000,January,0,...,0,0,0,0,0,0,0,0,0,train


In [21]:
# time split on train/validation/test: FIXED dates of split, approx. 70%, 15%, 15% split
new_df.groupby(['split'])['Date'].agg({'min','max','count'})

Unnamed: 0_level_0,min,max,count
split,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
test,2020-09-14,2024-05-07,29829
train,2000-01-03,2017-01-16,123458
validation,2017-01-17,2020-09-11,29388


In [22]:
# what we try to predict
new_df[TO_PREDICT].head(1)

Unnamed: 0,growth_future_5d,is_positive_growth_5d_future
3490,0.963003,0


In [23]:
# to be used as features
new_df[NUMERICAL+DUMMIES].head(1)

Unnamed: 0,growth_1d,growth_3d,growth_7d,growth_30d,growth_90d,growth_365d,growth_dax_1d,growth_dax_3d,growth_dax_7d,growth_dax_30d,...,month_wom_October_w1,month_wom_October_w2,month_wom_October_w3,month_wom_October_w4,month_wom_October_w5,month_wom_September_w1,month_wom_September_w2,month_wom_September_w3,month_wom_September_w4,month_wom_September_w5
3490,0.998394,0.988341,0.991494,1.372333,1.222951,2.063053,0.970196,0.983855,1.051736,1.134572,...,0,0,0,0,0,0,0,0,0,0


In [24]:
new_df['gdppot_us_yoy']

3490    0.044886
3491    0.044886
3492    0.044886
3493    0.044886
3494    0.044886
          ...   
5422    0.022151
5423    0.022151
5424    0.022151
5425    0.022151
5426    0.022151
Name: gdppot_us_yoy, Length: 182675, dtype: float64

In [25]:
# generate manual predictions
# Let's label all prediction features with prefix "pred"
new_df['pred0_manual_cci'] = (new_df.cci>200).astype(int)
new_df['pred1_manual_prev_g1'] = (new_df.growth_1d>1).astype(int)
new_df['pred2_manual_prev_g1_and_snp'] = ((new_df['growth_1d'] > 1) & (new_df['growth_snp500_1d'] > 1)).astype(int)

In [26]:
# TODO 2: find more "hand rules" - can get it from decision trees important factors, or randomly build on other most popular macro/tech indicators/ manual_features
new_df['pred3_manual_gdp_fastd'] = (new_df['gdppot_us_yoy'] <= 0.027) & (new_df['fastd'] >= 0.251).astype(int)
new_df['pred4_manual_gdp_wti_oil'] = (new_df['gdppot_us_yoy'] >= 0.027) & (new_df['growth_wti_oil_30d'] <= 1.005).astype(int)

In [27]:
new_df[['cci','growth_1d','growth_3d','growth_snp500_1d','growth_snp500_3d', 'pred0_manual_cci','pred1_manual_prev_g1','pred2_manual_prev_g1_and_snp','pred3_manual_gdp_fastd','pred4_manual_gdp_wti_oil','is_positive_growth_5d_future']]

Unnamed: 0,cci,growth_1d,growth_3d,growth_snp500_1d,growth_snp500_3d,pred0_manual_cci,pred1_manual_prev_g1,pred2_manual_prev_g1_and_snp,pred3_manual_gdp_fastd,pred4_manual_gdp_wti_oil,is_positive_growth_5d_future
3490,26.847237,0.998394,0.988341,0.990451,0.994370,0,0,0,False,False,0
3491,-34.319663,0.966220,0.957492,0.961655,0.955581,0,0,0,False,False,0
3492,-97.318008,1.010544,0.974839,1.001922,0.954303,0,1,1,False,False,0
3493,-169.947507,0.966502,0.943700,1.000956,0.964425,0,0,0,False,False,0
3494,-142.142685,1.013068,0.989456,1.027090,1.030048,0,1,1,False,False,1
...,...,...,...,...,...,...,...,...,...,...,...
5422,-29.424989,0.988994,0.984120,0.984269,0.997478,0,0,0,True,False,0
5423,-26.657181,1.001447,0.998419,1.009128,0.989842,0,1,1,True,False,0
5424,-123.785473,0.972302,0.962991,1.012557,1.018289,0,0,0,True,False,0
5425,-181.986224,0.989571,0.963553,1.010326,1.032351,0,0,0,True,False,0


In [28]:
PREDICTIONS = [k for k in new_df.keys() if k.startswith('pred')]
PREDICTIONS

['pred0_manual_cci',
 'pred1_manual_prev_g1',
 'pred2_manual_prev_g1_and_snp',
 'pred3_manual_gdp_fastd',
 'pred4_manual_gdp_wti_oil']

In [29]:
p = PREDICTIONS[0]
part1 = p.split('_')[0] # first prefix before '_'
print(f'Full column name: {p}, only first part: {part1}')

Full column name: pred0_manual_cci, only first part: pred0


In [30]:
# One prediction: do we predict correctly?
new_df['is_correct_prediction'] = (new_df.pred0_manual_cci == new_df.is_positive_growth_5d_future)
new_df[['cci','pred0_manual_cci','is_positive_growth_5d_future','is_correct_prediction']]

# check "Precision" : the percentage of "correct" predictions , WHEN we predict "1" (POSITIVE future growth)
filter = (new_df.split=='test') & (new_df.pred0_manual_cci==1)
new_df[filter].is_correct_prediction.value_counts()

is_correct_prediction
True     455
False    344
Name: count, dtype: int64

In [31]:
# %% of correct predictions : 54%
new_df[filter].is_correct_prediction.value_counts() / len(new_df[filter])

is_correct_prediction
True     0.569462
False    0.430538
Name: count, dtype: float64

In [32]:
# delete this column
del new_df["is_correct_prediction"]

In [33]:
# generate columns is_correct_
for pred in PREDICTIONS:
  part1 = pred.split('_')[0] # first prefix before '_'
  new_df[f'is_correct_{part1}'] =  (new_df[pred] == new_df.is_positive_growth_5d_future).astype(int)

In [34]:
# IS_CORRECT dataset
IS_CORRECT =  [k for k in new_df.keys() if k.startswith('is_correct_')]
IS_CORRECT

['is_correct_pred0',
 'is_correct_pred1',
 'is_correct_pred2',
 'is_correct_pred3',
 'is_correct_pred4']

In [35]:
new_df[PREDICTIONS+IS_CORRECT+['is_positive_growth_5d_future']]
len(new_df[new_df.split=='test'])

29829

In [36]:
# define "Precision" for ALL predictions on a Test dataset (~4 last years of trading)
for i,column in enumerate(IS_CORRECT):
  prediction_column = PREDICTIONS[i]
  is_correct_column = column
  filter = (new_df.split=='test') & (new_df[prediction_column]==1)
  print(f'Prediction column:{prediction_column} , is_correct_column: {is_correct_column}')
  print(new_df[filter][is_correct_column].value_counts())
  print(new_df[filter][is_correct_column].value_counts()/len(new_df[filter]))

  print('---------')

Prediction column:pred0_manual_cci , is_correct_column: is_correct_pred0
is_correct_pred0
1    455
0    344
Name: count, dtype: int64
is_correct_pred0
1    0.569462
0    0.430538
Name: count, dtype: float64
---------
Prediction column:pred1_manual_prev_g1 , is_correct_column: is_correct_pred1
is_correct_pred1
1    8621
0    6980
Name: count, dtype: int64
is_correct_pred1
1    0.552593
0    0.447407
Name: count, dtype: float64
---------
Prediction column:pred2_manual_prev_g1_and_snp , is_correct_column: is_correct_pred2
is_correct_pred2
1    5726
0    4729
Name: count, dtype: int64
is_correct_pred2
1    0.547681
0    0.452319
Name: count, dtype: float64
---------
Prediction column:pred3_manual_gdp_fastd , is_correct_column: is_correct_pred3
is_correct_pred3
1    16560
0    13262
Name: count, dtype: int64
is_correct_pred3
1    0.555295
0    0.444705
Name: count, dtype: float64
---------
Prediction column:pred4_manual_gdp_wti_oil , is_correct_column: is_correct_pred4
Series([], Name: coun

## Question 3 (1 point): Unique correct predictions from a 10-levels deep Decision Tree Classifier (pred5_clf_10) 

**What is the total number of records in the TEST dataset when the new prediction pred5_clf_10 is better than all 'hand' rules (pred0..pred4)?**

NOTE: please include `random_state=42` to Decision Tree Classifier init function (line `clf = DecisionTreeClassifier(max_depth=max_depth, random_state=42)`) to ensure everyone gets the same results.

Suggested solution:
* Step1: Rewrite the '1.4.3 Inference for a decision tree' piece for the Decision Tree Classifier with max_depth=10 (clf_10), so that you fit the model on TRAIN+VALIDATION sets (unchanged from the lecture), but predict on the whole set X_all (to be able to define a new column 'pred5_clf_10' in the dataframe new_df). Here is the [link](https://stackoverflow.com/questions/40729162/merging-results-from-model-predict-with-original-pandas-dataframe) with explanation. It will solve the problem in 1.4.5 when predictions were made only for Test dataset and couldn't be easily joined with the full dataset. 

* Step2: Once you have it, define a new column 'only_pred5_is_correct' similar to 'hand' prediction rules with several conditions: is_positive_growth_5d_future AND is_correct_pred5 should be equal 1, while all other predictions is_correct_pred0..is_correct_pred4 should be equal to 0.

* Step3: Convert 'only_pred5_is_correct' column from bool to int, and find how many times it is equal to 1 in the TEST set. Write down this as an answer.

ADVANCED: define a function that can be applied to the whole row of predictions ([a few examples of pandas-apply-row-functions](https://sparkbyexamples.com/pandas/pandas-apply-function-to-every-row/)) and can find whether some prediction 'predX' (where X is one of the predictions) is uniquely correct. It should work even if there are 100 predictions available, so that you don't define manually the condition for 'predX'.  

In [37]:
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score

# set random state to 42
np.random.seed(42)


def remove_infinite_values(X):
    """
    Remove infinite values from the input array.

    Parameters:
    - X: Input array (NumPy array or array-like)

    Returns:
    - Array with infinite values removed
    """
    return X[np.isfinite(X).all(axis=1)]

In [38]:
# look carefully for 'count' to be close to total values (or you need to replace NaNs/remove NaNs), and min/max doesn't equal to -+inf.
#  it will give you an idea to dig deeper into some features to understand the 'nature' of a problem
pd.set_option('display.max_rows', None)

new_df[NUMERICAL+DUMMIES].describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
growth_1d,182660.0,1.000868,0.02356674,0.46011,0.9906461,1.000473,1.010716,3.018887
growth_3d,182630.0,1.002558,0.0394759,0.4170567,0.9842405,1.00212,1.020217,3.018887
growth_7d,182570.0,1.005875,0.05838238,0.4065424,0.9774801,1.005428,1.033406,3.018887
growth_30d,182225.0,1.025093,0.1180397,0.3054439,0.9622796,1.022171,1.08399,3.99331
growth_90d,181325.0,1.077823,0.2209361,0.1890111,0.9587507,1.061788,1.175049,5.970002
growth_365d,176915.0,1.350498,0.6155275,0.09783037,1.015438,1.239138,1.527952,9.819906
growth_dax_1d,179673.0,1.000258,0.01419365,0.8776139,0.9937851,1.000761,1.007159,1.11402
growth_dax_3d,179673.0,1.000813,0.02417672,0.8374862,0.9891694,1.002138,1.013879,1.144124
growth_dax_7d,179673.0,1.001993,0.03587574,0.7318924,0.9843275,1.004785,1.021502,1.23246
growth_dax_30d,179673.0,1.008638,0.07189179,0.6263172,0.972715,1.014873,1.050698,1.288371


In [39]:
# Split the data into training and testing sets based on the split date
features_list = NUMERICAL+DUMMIES
to_predict = 'is_positive_growth_5d_future'

train_df = new_df[new_df.split.isin(['train','validation'])].copy(deep=True)
test_df = new_df[new_df.split.isin(['test'])].copy(deep=True)

# ONLY numerical Separate features and target variable for training and testing sets
# need Date and Ticker later when merging predictions to the dataset
X_train = train_df[features_list+[to_predict,'Date','Ticker']]
X_test = test_df[features_list+[to_predict,'Date','Ticker']]

print(f'length: X_train {X_train.shape},  X_test {X_test.shape}')

length: X_train (152846, 302),  X_test (29829, 302)


In [40]:
# Can't have +-inf values . E.g. ln(volume)=-inf when volume==0 => substitute with 0

# Disable SettingWithCopyWarning
pd.options.mode.chained_assignment = None  # default='warn'

X_train.replace([np.inf, -np.inf], np.nan, inplace=True)
X_test.replace([np.inf, -np.inf], np.nan, inplace=True)

# Need to fill NaNs somehow
X_train.fillna(0, inplace=True)
X_test.fillna(0, inplace=True)

print(f'length: X_train_imputed {X_train.shape},  X_test_imputed {X_test.shape}')

length: X_train_imputed (152846, 302),  X_test_imputed (29829, 302)


In [41]:
# you may want to remove 1-2% outliers based on percentile ==> not used here in Decision Trees
def remove_outliers_percentile(X, lower_percentile=1, upper_percentile=99):
    """
    Remove outliers from the input array based on percentiles.

    Parameters:
    - X: Input array (NumPy array or array-like)
    - lower_percentile: Lower percentile threshold (float, default=1)
    - upper_percentile: Upper percentile threshold (float, default=99)

    Returns:
    - Array with outliers removed
    """
    try:
        lower_bound = np.percentile(X, lower_percentile, axis=0)
        upper_bound = np.percentile(X, upper_percentile, axis=0)
        mask = np.logical_and(np.all(X >= lower_bound, axis=1), np.all(X <= upper_bound, axis=1))
        return X[mask]
    except TypeError:
        return X

In [42]:
X_train_imputed = X_train # we won't use outliers removal to save more data to train: remove_outliers_percentile(X_train)
X_test_imputed = X_test # we won't use outliers removal to save more data to test: remove_outliers_percentile(X_test)

In [43]:
# same shape
print(f'length: X_train_imputed {X_train_imputed.shape},  X_test_imputed {X_test_imputed.shape}')

length: X_train_imputed (152846, 302),  X_test_imputed (29829, 302)


In [44]:
y_train = X_train_imputed[to_predict]
y_test = X_test_imputed[to_predict]

# remove y_train, y_test from X_ dataframes
del X_train_imputed[to_predict]
del X_test_imputed[to_predict]

In [45]:
# estimation/fit function (using dataframe of features X and what to predict y) --> optimising total accuracy
# max_depth is hyperParameter
def fit_decision_tree(X, y, max_depth=10):
    # Initialize the Decision Tree Classifier
    clf = DecisionTreeClassifier(max_depth=max_depth)
    
    # Fit the classifier to the training data
    clf.fit(X, y)
    
    return clf, X.columns

In [46]:
%%time
# drop 2 columns before fitting the tree, but we need those columns later for joins
clf_20, train_columns = fit_decision_tree(
    X=X_train_imputed.drop(['Date','Ticker'], axis=1),
    y=y_train,
    max_depth=20,
)

CPU times: user 29 s, sys: 99.8 ms, total: 29.1 s
Wall time: 29.1 s


In [47]:
%%time
clf_10, train_columns = fit_decision_tree(
    X=X_train_imputed.drop(['Date','Ticker'],axis=1),
    y=y_train,
    max_depth=10,
)

CPU times: user 16.1 s, sys: 40 ms, total: 16.1 s
Wall time: 16.1 s


In [48]:
# TODO 3: TRAIN only on train dataset, experiment with trees with depth 1..20 --> find the best one on VALID dataset
#       for the "best" tree model: find precision on the TEST set
def predict_decision_tree(clf:DecisionTreeClassifier, df_X:pd.DataFrame, y_true: pd.Series):
    # Predict the target variable on the test data
    y_pred = clf.predict(df_X)
    
    max_depth = clf.tree_.max_depth
    random_state=42
    
    # Print the maximum depth
    print("Maximum depth of the decision tree:", max_depth)
    
    # Calculate the accuracy/precision of the model
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    print(f'Accuracy = {accuracy}, precision = {precision}')
    
    # resulting df
    result_df = pd.concat([df_X, y_true, pd.Series(y_pred, index=df_X.index, name='pred_')], axis=1)

    return result_df

In [51]:
def predict_df(df, tree):
    df_ = df.copy()
    df_ = df_[features_list + [to_predict, 'Date','Ticker']]
    df_.replace([np.inf, -np.inf], np.nan, inplace=True)
    df_.fillna(0, inplace=True)
    df_ = remove_outliers_percentile(df_)
    
    df_test = df_[to_predict]
    # df_test = remove_outliers_percentile(df_test)
    
    del df_[to_predict]
    print(f'length: df {df_.shape},  df_test {df_test.shape}')

    X_all = predict_decision_tree(tree, df_.drop(['Date','Ticker'], axis=1), df_test)

    return X_all

In [52]:
pred5_clf_10 = predict_df(new_df, clf_10)

length: df (182675, 301),  df_test (182675,)
Maximum depth of the decision tree: 10


ValueError: Found input variables with inconsistent numbers of samples: [29829, 182675]

In [None]:
pred5_clf_10.tail()

In [None]:
# Predictions of a decision tree of depth "10"
pred5_clf_10.pred_.value_counts()

In [None]:
X_test_imputed.join(pred5_clf_10['pred_']).head()

In [None]:
# Predictions of a decision tree of depth "10" : many more "positive" predictions
pred5_clf_10.pred_.value_counts()

In [None]:
# define a new DF with the SAME index (used for joins)
pred5_clf_10 = pred5[['pred_']].rename(columns={'pred_': 'pred_tree_clf10'})
pred5_clf_10.head(1)

In [None]:
for cat in list(new_df.columns):
    print(cat)