In [178]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import category_encoders as ce
from sklearn.model_selection import train_test_split
import plotly.express as px
import plotly.graph_objects as go
from sklearn.pipeline import make_pipeline
from sklearn.inspection import plot_partial_dependence
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
pd.options.plotting.backend="plotly"


## Split data into Train n Test 
### split_data give X_train, y_train,  X_test, y_test

In [121]:
def split_data(df, split_frac=0.3, return_val=False, rand_state=7):
    X  = df.drop('30DayFwd', axis=1)
    X  = df.drop('Date', axis=1)
    y  = df['30DayFwd']
    #print(y)
    '''stratify = y'''
    return train_test_split(X, y, test_size = split_frac, random_state = rand_state)

## get_model_score returns:
### X_train, y_train score if val_score && test_score == False
### X_train, y_train , X_val & y_val score  if Val_score == True & test_score == False
### X_train, y_train , X_val & y_val , X_test, y_test score if Val_score == True & test_score == True

In [182]:
'''stratify = y_train,'''
def get_model_scores(mod, X_train, y_train, X_test, y_test, val_score = True, test_score=False):
    
    '''
    #if X_train.isna() or X_test.isna() or y_train.isna() or y_test.isna():
        #X_train.filln(0)
        #X_test.fillna(0)
        #y_train.fillna(0)
        #y_test.fillna(0)
    #X_train = X_train.drop('Date', axis=1)
    #X_test = X_test.drop('Date', axis=1)
    '''
    if val_score:
        X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, 
                                                          test_size = 0.7, 
                                                          train_size = 0.3,                                                          
                                                          random_state= 9)        
    mod.fit(X_train, y_train)
    
    results = {}
    
    results['train_score'] = mod.score(X_train, y_train)
    if val_score:
        results['val_score'] = mod.score(X_val, y_val)
        
    if test_score:
        results['test_score'] = mod.score(X_test, y_test)
        
    return results


# Get data locally as csv file
## With addition of 3 columns

In [190]:
def local_dataset():
    data_set = pd.read_csv(r"C:\Users\samina\Desktop\GA_DATA_SCIENCE\Data_Sci\Homework\Unit4\Final_project\google.csv", parse_dates=['Date'])
    
    #If the folowing lines in this func-> ValueError: continuous is not supported
    #if the following lines add after func->ValueError: Input contains NaN, infinity or a value too large for dtype('float64').
    
    #data_set = data_set.fillna(0)
    #data_set['year'] = data_set['Date'].dt.year
    #data_set['month'] = data_set['Date'].dt.month
    #data_set['day'] = data_set['Date'].dt.day
    #data_set['30DayFwd']     = (data_set['Close'].shift(-30) - data_set['Close']) / data_set['Close']

    #VOLUME CHANGE 
    #data_set['VolPctChange'] = data_set[['Volume']].pct_change()
    #data_set['5DayVol']      = data_set['Volume'].pct_change().rolling(5).mean().values
    #data_set['30DayVol']     =  data_set['Volume'].pct_change().rolling(30).mean().values
    #data_set['252DayVol']    =  data_set['Volume'].pct_change().rolling(252).mean().values

    #CLOSING STOCK RATIO
    #data_set['Close30DRatio']  = data_set['Close'] / data_set['Close'].rolling(30).mean().values
    #data_set['Close60DRatio']  = data_set['Close'] / data_set['Close'].rolling(60).mean().values
    #data_set['Close252DRatio'] = data_set['Close'] / data_set['Close'].rolling(252).mean().values

    #CLOSE VALUE CHANGE
    #data_set['CloseChange']      = data_set['Close'].pct_change()
    #data_set['Close5DayChange']  = data_set['Close'].pct_change().rolling(5).mean().values
    #data_set['Close10DayChange'] = data_set['Close'].pct_change().rolling(10).mean().values
    #data_set['Close60DayChange'] = data_set['Close'].pct_change().rolling(60).mean().values

    #data_set = data_set.fillna(0)
    
    return data_set

# Data frame ready to go

In [210]:
df = local_dataset()

df['30DayFwd']     = (df['Close'].shift(-30) - df['Close']) / df['Close']

#VOLUME CHANGE 
df['VolPctChange'] = df[['Volume']].pct_change()
df['5DayVol']      = df['Volume'].pct_change().rolling(5).mean().values
df['30DayVol']     =  df['Volume'].pct_change().rolling(30).mean().values
df['252DayVol']    =  df['Volume'].pct_change().rolling(252).mean().values

#CLOSING STOCK RATIO
df['Close30DRatio']  = df['Close'] / df['Close'].rolling(30).mean().values
df['Close60DRatio']  = df['Close'] / df['Close'].rolling(60).mean().values
df['Close252DRatio'] = df['Close'] / df['Close'].rolling(252).mean().values

#CLOSE VALUE CHANGE
df['CloseChange']      = df['Close'].pct_change()
df['Close5DayChange']  = df['Close'].pct_change().rolling(5).mean().values
df['Close10DayChange'] = df['Close'].pct_change().rolling(10).mean().values
df['Close60DayChange'] = df['Close'].pct_change().rolling(60).mean().values


In [203]:
df.sort_values(by='Date', ascending=True)
df.tail(20)

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,year,month,day,...,5DayVol,30DayVol,252DayVol,Close30DRatio,Close60DRatio,Close252DRatio,CloseChange,Close5DayChange,Close10DayChange,Close60DayChange
4291,2021-09-03,2882.919922,2907.540039,2870.100098,2895.5,2895.5,955200,2021,9,3,...,0.023365,0.044194,0.058289,1.037946,1.079461,1.372507,0.003855,0.000329,0.004525,0.002347
4292,2021-09-07,2894.98999,2916.47998,2890.820068,2910.379883,2910.379883,758500,2021,9,7,...,0.044438,0.041533,0.058109,1.041817,1.082342,1.376145,0.005139,8.5e-05,0.003116,0.002483
4293,2021-09-08,2907.870117,2911.02002,2884.0,2897.669922,2897.669922,774300,2021,9,8,...,-0.067735,0.014591,0.058188,1.035269,1.075146,1.366634,-0.004367,-0.000778,0.001759,0.002323
4294,2021-09-09,2897.669922,2913.389893,2888.679932,2898.27002,2898.27002,739900,2021,9,9,...,0.005096,0.003209,0.059283,1.033384,1.072863,1.363495,0.000207,-0.001259,0.001392,0.002369
4295,2021-09-10,2908.870117,2920.379883,2834.830078,2838.419922,2838.419922,1643500,2021,9,10,...,0.173258,0.065497,0.064478,1.010751,1.048609,1.332089,-0.02065,-0.003163,-9.5e-05,0.002069
4296,2021-09-13,2864.02002,2883.820068,2845.649902,2869.300049,2869.300049,1008800,2021,9,13,...,0.121107,0.04459,0.062998,1.019752,1.05779,1.343208,0.010879,-0.001758,-0.000715,0.002161
4297,2021-09-14,2883.219971,2894.550049,2858.110107,2868.120117,2868.120117,945800,2021,9,14,...,0.149802,0.04779,0.062503,1.017544,1.055042,1.3393,-0.000411,-0.002868,-0.001392,0.00226
4298,2021-09-15,2875.179932,2911.629883,2845.120117,2904.120117,2904.120117,1032400,2021,9,15,...,0.163949,0.052623,0.063721,1.028146,1.065835,1.352695,0.012552,0.000515,-0.000131,0.002352
4299,2021-09-16,2902.419922,2904.0,2868.326904,2887.469971,2887.469971,1014600,2021,9,16,...,0.169386,0.056482,0.063711,1.020242,1.057476,1.341551,-0.005733,-0.000673,-0.000966,0.002184
4300,2021-09-17,2875.969971,2884.98999,2821.22998,2829.27002,2829.27002,3002000,2021,9,17,...,0.316897,0.131178,0.069765,0.998614,1.034268,1.311286,-0.020156,-0.000574,-0.001869,0.001919


# Model Selection and train, test, val baseline scores

In [204]:
mod = xgb.XGBClassifier()           #xgb.XGBRegressor(verbosity=1)
te = ce.TargetEncoder()
pipe = make_pipeline(te, mod)

print(pipe)        #debugging need to remove

Pipeline(steps=[('targetencoder', TargetEncoder()),
                ('xgbclassifier',
                 XGBClassifier(base_score=None, booster=None,
                               colsample_bylevel=None, colsample_bynode=None,
                               colsample_bytree=None, gamma=None, gpu_id=None,
                               importance_type='gain',
                               interaction_constraints=None, learning_rate=None,
                               max_delta_step=None, max_depth=None,
                               min_child_weight=None, missing=nan,
                               monotone_constraints=None, n_estimators=100,
                               n_jobs=None, num_parallel_tree=None,
                               random_state=None, reg_alpha=None,
                               reg_lambda=None, scale_pos_weight=None,
                               subsample=None, tree_method=None,
                               validate_parameters=None, verbosity=None))])


## split data in train n test sets

In [205]:
X_train, X_test, y_train, y_test = split_data(df)

In [206]:
print( X_train.info() )    #debugging ststement remove
#print( y_test.isna() )

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3017 entries, 184 to 4271
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Open              3017 non-null   float64
 1   High              3017 non-null   float64
 2   Low               3017 non-null   float64
 3   Close             3017 non-null   float64
 4   Adj Close         3017 non-null   float64
 5   Volume            3017 non-null   int64  
 6   year              3017 non-null   int64  
 7   month             3017 non-null   int64  
 8   day               3017 non-null   int64  
 9   30DayFwd          3001 non-null   float64
 10  VolPctChange      3016 non-null   float64
 11  5DayVol           3013 non-null   float64
 12  30DayVol          2992 non-null   float64
 13  252DayVol         2836 non-null   float64
 14  Close30DRatio     2993 non-null   float64
 15  Close60DRatio     2969 non-null   float64
 16  Close252DRatio    2837 non-null   float6

In [207]:
y_train      #debugging statement remove

184     0.299624
2661   -0.000458
824    -0.039847
386     0.068296
3113    0.046564
          ...   
919     0.271686
2550   -0.070737
537     0.204196
1220    0.100345
4271    0.014456
Name: 30DayFwd, Length: 3017, dtype: float64

In [208]:
pipe.fit(X_train, y_train)

#pipe.fit(X_train, y_train)

  elif pd.api.types.is_categorical(cols):




Pipeline(steps=[('targetencoder', TargetEncoder(cols=[])),
                ('xgbclassifier',
                 XGBClassifier(base_score=0.5, booster='gbtree',
                               colsample_bylevel=1, colsample_bynode=1,
                               colsample_bytree=1, gamma=0, gpu_id=-1,
                               importance_type='gain',
                               interaction_constraints='',
                               learning_rate=0.300000012, max_delta_step=0,
                               max_depth=6, min_child_weight=1, missing=nan,
                               monotone_constraints='()', n_estimators=100,
                               n_jobs=8, num_parallel_tree=1,
                               objective='multi:softprob', random_state=0,
                               reg_alpha=0, reg_lambda=1, scale_pos_weight=None,
                               subsample=1, tree_method='exact',
                               validate_parameters=1, verbosity=None))])

In [209]:
'''
X_train = X_train.fillna(0)
X_test.fillna(0)
y_train.fillna(0)
y_test.fillna(0)
'''
baseline_score = get_model_scores(pipe, X_train, y_train, X_test, y_test, test_score=True)



ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [198]:
baseline_score

{'train_score': 0.9999995348639846,
 'val_score': 0.999088975470272,
 'test_score': 0.9991423142517383}

In [211]:
imp = pd.DataFrame(
    {
        'Col': X_train.columns,
       'Imp': pipe[-1].feature_importances_}
)
imp.sort_values(by='Imp', ascending=False)


Unnamed: 0,Col,Imp
9,30DayFwd,0.32623
1,High,0.317722
16,Close252DRatio,0.276123
0,Open,0.075513
5,Volume,0.004412
12,30DayVol,0.0
19,Close10DayChange,0.0
18,Close5DayChange,0.0
17,CloseChange,0.0
15,Close60DRatio,0.0


# USE OLD FSIONED FOR loops for paameters

In [200]:
n_estimators = [100, 200, 300, 400, 500]
learning_rate = [0.1, 0.2, 0.3, 0.4]
tree_depth = [3, 4, 5, 6]
subsample = [1, 0.8, 0.6, 0.3]
cv_scores = []

for estimator in n_estimators:
    for rate in learning_rate:
        for depth in tree_depth:
            for sample in subsample:
                print("Fitting new model")
                pipe[-1].set_params(n_estimators=estimator, learning_rate=rate, max_depth=depth, subsample=sample)
                scores = get_model_scores(pipe, X_train, y_train, X_test, y_test)
                cv_scores.append((scores['val_score'], scores['train_score'], estimator, rate, depth, sample))

Fitting new model


  elif pd.api.types.is_categorical(cols):




ValueError: continuous is not supported

In [180]:
cv_scores

[]

In [None]:
max(cv_scores)

## Setting best params to the model

In [181]:
pipe[-1].set_params(n_estimators=200, learning_rate=0.2, max_depth=4, subsample=0.3)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.2, max_delta_step=0, max_depth=4,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=200, n_jobs=8, num_parallel_tree=1,
              objective='multi:softprob', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=None, subsample=0.3,
              tree_method='hist', validate_parameters=1, verbosity=None)

In [None]:
pipe.fit(X_train, y_train)
pipe.score(X_test, y_test)

In [201]:
#ERROR CELL

# list the parameters we want to load in the xgbclassifier__ notation is because we want to refer to
# items inside of a pipeline
param_grid = {
    'xgbclassifier__n_estimators': [100, 200, 300, 400],
    'xgbclassifier__max_depth': [3, 4, 5, 6],
    'xgbclassifier__max_features': [0.3,0.6, 0.8, 1], 
    'xgbclassifier__subsample': [0.3,0.6, 0.8, 0.1]
}

# we'll apply this option for faster fitting -- a nice feature of xgboost
pipe[-1].set_params(tree_method = 'hist')

# import a splitter
#from sklearn.model_selection import StratifiedKFold
kfold = StratifiedKFold(n_splits = 3)

grid = GridSearchCV(pipe, param_grid, cv = kfold, verbose = 1)
grid.fit(X_train, y_train)

Fitting 3 folds for each of 256 candidates, totalling 768 fits


ValueError: Supported target types are: ('binary', 'multiclass'). Got 'continuous' instead.