In [10]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split 
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, make_scorer, mean_absolute_error
from sklearn.model_selection import GridSearchCV

In [11]:
data = pd.read_csv('CF_data.csv')
data

Unnamed: 0,Company,Time,Net Income,Cash from Operations,Cash from Investing,Cash from Financing,Net Change in Cash,Levered Free Cash Flow,Unlevered Free Cash Flow,Free Cash Flow / Share,Price
0,YUM,2015-12-01,1283.0,1260.0,-199.0,-1089.0,-28.0,402.4,498.0,1.88,52.53
1,VAR,2010-10-01,360.4,460.8,-74.8,-422.2,-33.3,360.7,363.3,3.23,55.98
2,HRL,2015-10-01,686.1,992.0,-900.9,-70.6,13.1,826.2,834.4,1.61,33.77
3,EPD,2010-12-01,320.8,2300.0,-3251.6,961.1,10.2,-338.8,124.9,0.47,20.80
4,EONGY,2010-12-01,7848.5,14864.2,1013.7,-13307.4,2592.0,-1843.9,606.7,2.24,30.41
5,CF,2012-12-01,1848.7,2375.6,-513.5,-796.8,1067.9,1616.9,1701.4,5.80,40.63
6,BDX,2014-09-01,1185.0,1746.0,-948.0,-807.0,-29.0,949.6,1034.0,5.65,113.81
7,BIIB,2013-12-01,1862.3,2345.1,-1604.7,-716.5,31.9,-1067.7,-1047.8,-4.91,257.51
8,MTRN,2010-12-01,46.4,31.0,-46.5,19.8,3.9,-27.4,-26.3,-1.12,38.64
9,NOC,2010-12-01,2053.0,2453.0,-760.0,-1266.0,427.0,-637.3,-469.1,6.29,58.77


In [12]:
prices = data['Price'] 
data = data.drop(['Company', 'Time', 'Price'], axis = 1)

col_list = []
for col in data.columns:
    col_list.append(col)
    
for col in col_list:
    print(col)
    data[col] = data[col].astype(float) # Converting columns to floats
    print("Done")

Net Income
Done
Cash from Operations
Done
Cash from Investing
Done
Cash from Financing
Done
Net Change in Cash
Done
Levered Free Cash Flow
Done
Unlevered Free Cash Flow
Done
Free Cash Flow / Share
Done


In [13]:
# Normalising the data
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler() 
scaled_values = scaler.fit_transform(data) 
data.loc[:,:] = scaled_values

data.insert(data.shape[1], 'Price', prices)
data

Unnamed: 0,Net Income,Cash from Operations,Cash from Investing,Cash from Financing,Net Change in Cash,Levered Free Cash Flow,Unlevered Free Cash Flow,Free Cash Flow / Share,Price
0,0.195162,0.101904,0.584679,0.668242,0.474163,0.459175,0.452369,0.003819,52.53
1,0.187545,0.092933,0.585543,0.673199,0.474108,0.458964,0.451685,0.003884,55.98
2,0.190234,0.098895,0.579795,0.675813,0.474591,0.461320,0.454078,0.003806,33.77
3,0.187218,0.113578,0.563438,0.683483,0.474561,0.455424,0.450474,0.003751,20.80
4,0.249369,0.254616,0.593118,0.577408,0.501464,0.447806,0.452921,0.003837,30.41
5,0.199833,0.114427,0.582491,0.670415,0.485583,0.465322,0.458481,0.004008,40.63
6,0.194353,0.107359,0.579467,0.670339,0.474153,0.461945,0.455091,0.004001,113.81
7,0.199945,0.114084,0.574898,0.671011,0.474787,0.451734,0.444518,0.003492,257.51
8,0.184952,0.088108,0.585740,0.676485,0.474495,0.457000,0.449706,0.003675,38.64
9,0.201519,0.115296,0.580776,0.666926,0.478904,0.453913,0.447457,0.004032,58.77


In [14]:
# Converting Data to Numpy Arrays
NpMatrix = data.to_numpy(dtype = None, copy = False)
X = NpMatrix[:,0:8] # Parameters
y = NpMatrix[:,8] # Price
print("X:", X)
print("\n")
print("Type X:", type(X))
print("Length of Individual X_train Vector:", len(X[1]))
print("Total Number of Training instances:", len(X))
print("\n")
print(y)
print("\n")
print("Type y:", type(y))
print("Length of Individual y_train vector", y[0])
print("Total number of y values", len(y))

X: [[0.19516198 0.10190382 0.58467919 ... 0.45917516 0.45236896 0.00381927]
 [0.18754474 0.09293251 0.58554341 ... 0.4589641  0.45168483 0.00388429]
 [0.19023381 0.09889542 0.57979515 ... 0.46132016 0.4540775  0.00380627]
 ...
 [0.19813425 0.10176912 0.58603606 ... 0.45923792 0.45299671 0.00382891]
 [0.20573002 0.12755377 0.58208374 ... 0.45472016 0.44804477 0.00381687]
 [0.19787004 0.0888824  0.57738688 ... 0.46291752 0.45705425 0.00369453]]


Type X: <class 'numpy.ndarray'>
Length of Individual X_train Vector: 8
Total Number of Training instances: 5000


[52.53 55.98 33.77 ... 63.33 53.15 40.29]


Type y: <class 'numpy.ndarray'>
Length of Individual y_train vector 52.53
Total number of y values 5000


In [15]:
#X_train, X_test, y_train, y_test = train_test_split(X , y, test_size = 0.2, random_state = 42) # 42, 56, 66, 87

In [16]:
# Hyperparameter Grid Search
#criterion = {'mse', 'mae'}
#'criterion' : ['mse', 'mae'],
params = {
        'splitter' : ['best', 'random'],
          'max_depth': [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15], 
         }

decision_tree = DecisionTreeRegressor()
#mse = make_scorer(mean_squared_error, greater_is_better= False)
mae = make_scorer(mean_absolute_error, greater_is_better= False)

#gs_decision_tree = GridSearchCV(decision_tree, params, scoring = mae,  cv=10)
gs_decision_tree = GridSearchCV(decision_tree, params, scoring = mae,  cv=5)
grid_results = gs_decision_tree.fit(X , y)

# Best parameters, best test score using these parameters, optimal KNN Configuration
print("Best Parameters", gs_decision_tree.best_params_)
print("Best score on Test Data", gs_decision_tree.best_score_)
print("Optimal Configuration", gs_decision_tree.best_estimator_)

Best Parameters {'max_depth': 6, 'splitter': 'best'}
Best score on Test Data -36.014924978319286
Optimal Configuration DecisionTreeRegressor(criterion='mse', max_depth=6, max_features=None,
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=None, splitter='best')


In [17]:
# Summarize the results in a readable format
print("Best: {0}, using {1}".format(grid_results.cv_results_['mean_test_score'], grid_results.best_params_))
results_df = pd.DataFrame(grid_results.cv_results_)
results_df

Best: [-51.2805294  -53.82290621 -43.79626355 -52.94519401 -38.71383546
 -53.73949319 -37.19519463 -51.28875465 -36.67182642 -52.16703366
 -36.01492498 -47.92573347 -36.39629185 -48.24682082 -36.37114306
 -46.17948188 -36.61841206 -43.06402613 -38.20350579 -46.71971979
 -38.48956304 -41.34708228 -39.28616793 -41.30745055 -39.6946587
 -41.78524926 -41.06398075 -40.64494187 -41.24335413 -39.84789822], using {'max_depth': 6, 'splitter': 'best'}


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_splitter,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.008263,0.000553,0.0,0.0,1,best,"{'max_depth': 1, 'splitter': 'best'}",-50.278533,-52.038238,-55.328099,-50.63662,-48.121156,-51.280529,2.381479,25
1,0.001599,0.001958,0.000799,0.001598,1,random,"{'max_depth': 1, 'splitter': 'random'}",-53.533302,-51.619356,-57.197865,-55.106197,-51.65781,-53.822906,2.129053,30
2,0.011396,0.003075,0.002598,0.003069,2,best,"{'max_depth': 2, 'splitter': 'best'}",-44.656109,-42.142536,-48.05501,-42.261398,-41.866264,-43.796264,2.353371,20
3,0.000799,0.001598,0.0,0.0,2,random,"{'max_depth': 2, 'splitter': 'random'}",-53.971031,-51.59543,-51.668951,-54.574529,-52.91603,-52.945194,1.196544,28
4,0.009595,0.00195,0.000798,0.001595,3,best,"{'max_depth': 3, 'splitter': 'best'}",-39.599374,-36.867703,-44.45034,-35.778525,-36.873235,-38.713835,3.133807,9
5,0.000776,0.000442,0.000226,0.000452,3,random,"{'max_depth': 3, 'splitter': 'random'}",-52.969117,-50.044431,-58.997645,-54.733805,-51.952468,-53.739493,3.035695,29
6,0.010595,0.001733,0.000806,0.001612,4,best,"{'max_depth': 4, 'splitter': 'best'}",-38.301293,-35.891267,-41.607568,-35.113395,-35.062449,-37.195195,2.501008,6
7,0.000799,0.001598,0.0,0.0,4,random,"{'max_depth': 4, 'splitter': 'random'}",-52.846871,-43.633325,-56.475795,-55.058758,-48.429025,-51.288755,4.69744,26
8,0.014384,0.001973,0.000799,0.001598,5,best,"{'max_depth': 5, 'splitter': 'best'}",-37.676621,-35.834889,-38.096541,-36.198829,-35.552252,-36.671826,1.021475,5
9,0.001598,0.001957,0.0,0.0,5,random,"{'max_depth': 5, 'splitter': 'random'}",-49.311801,-50.517135,-56.04337,-51.715837,-53.247025,-52.167034,2.3354,27


In [18]:
results_df = results_df.sort_values(['rank_test_score'])
results_df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_splitter,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
10,0.016984,0.001528,0.0,0.0,6,best,"{'max_depth': 6, 'splitter': 'best'}",-37.70502,-35.168342,-36.314623,-34.937378,-35.949262,-36.014925,0.982686,1
14,0.020997,0.002662,0.0,0.0,8,best,"{'max_depth': 8, 'splitter': 'best'}",-37.666682,-36.943198,-35.880453,-34.966265,-36.399118,-36.371143,0.918794,2
12,0.019486,0.000982,0.0,0.0,7,best,"{'max_depth': 7, 'splitter': 'best'}",-36.96352,-35.68106,-37.300191,-35.835189,-36.2015,-36.396292,0.632948,3
16,0.021574,0.001954,0.000799,0.001597,9,best,"{'max_depth': 9, 'splitter': 'best'}",-37.494744,-37.417596,-37.536404,-34.20109,-36.442226,-36.618412,1.274665,4
8,0.014384,0.001973,0.000799,0.001598,5,best,"{'max_depth': 5, 'splitter': 'best'}",-37.676621,-35.834889,-38.096541,-36.198829,-35.552252,-36.671826,1.021475,5
6,0.010595,0.001733,0.000806,0.001612,4,best,"{'max_depth': 4, 'splitter': 'best'}",-38.301293,-35.891267,-41.607568,-35.113395,-35.062449,-37.195195,2.501008,6
18,0.023968,3.6e-05,0.0,0.0,10,best,"{'max_depth': 10, 'splitter': 'best'}",-38.082487,-39.427637,-39.263439,-37.155577,-37.088389,-38.203506,0.997822,7
20,0.031159,0.002997,0.0,0.0,11,best,"{'max_depth': 11, 'splitter': 'best'}",-38.722459,-39.997109,-39.080437,-36.453696,-38.194115,-38.489563,1.175424,8
4,0.009595,0.00195,0.000798,0.001595,3,best,"{'max_depth': 3, 'splitter': 'best'}",-39.599374,-36.867703,-44.45034,-35.778525,-36.873235,-38.713835,3.133807,9
22,0.039007,0.005488,0.0,0.0,12,best,"{'max_depth': 12, 'splitter': 'best'}",-39.369414,-40.573117,-37.96823,-40.382235,-38.137844,-39.286168,1.088131,10
