In [1]:

from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn import metrics
import pandas as pd
import numpy as np 
import math

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV

In [30]:
features_df = pd.read_csv("bitcoin_train.csv")
features_df.head()

Unnamed: 0.1,Unnamed: 0,Date,Open,High,Low,Close,Volume,Daily_Change,Daily_Change_Ind,MACD,...,Open_Gold,Daily_Change_Gold,Daily_Change_Perc_Gold,Increased_Gold,Close/Last_SP500,Open_SP500,Daily_Change_SP500,Daily_Change_Perc_SP500,Increased_SP500,label
0,793,2017-12-28,15864.099609,15888.400391,13937.299805,14606.5,12336499712,-1232.0,0.0,636.405515,...,1292.0,5.2,0.004025,1,2687.54,2686.1,1.44,0.000536,1,0.0
1,908,2018-06-14,6342.75,6707.140137,6334.459961,6675.350098,5138710016,325.450196,1.0,-383.20157,...,1303.1,5.2,0.00399,1,2782.49,2783.21,-0.72,-0.000259,0,0.0
2,224,2015-09-25,234.362,237.427002,233.684006,235.143997,22363600,0.61499,1.0,-1.731076,...,1151.0,-5.4,-0.004692,0,1931.34,1935.93,-4.59,-0.002371,0,1.0
3,1042,2018-12-26,3819.666748,3893.359619,3769.86377,3857.297607,5326547918,41.806884,1.0,-83.043066,...,1273.5,-0.5,-0.000393,0,2467.7,2363.12,104.58,0.044255,1,1.0
4,563,2017-01-31,920.958984,972.018982,920.958984,970.403015,164582000,50.020996,1.0,12.846431,...,1197.7,13.7,0.011439,1,2278.87,2274.02,4.85,0.002133,1,1.0


In [31]:
# create lag variable for date/outcome when data is sorted going up by day
features_df = features_df.sort_values(by ="Date")
features_df["lag_label"] = features_df["label"].shift(1)


# drop first row since cannot predict first day of data
features_df.iloc[0]
features_df = features_df.iloc[1:, :]
features_df.head()

features_df.iloc[0]

Unnamed: 0                             5
Date                          2014-10-21
Open                           382.42099
High                          392.645996
Low                           380.834015
Close                         386.475006
Volume                          14188900
Daily_Change                    3.630005
Daily_Change_Ind                     1.0
MACD                           -3.175317
PROC_3                         -0.012689
PROC_5                          0.010244
PROC_10                         0.066729
wpr                           -22.254677
sto_os                         77.745323
goog_trend_score                      40
count                                  1
compound                             0.0
retweets_count                        27
likes_count                           32
replies_count                          2
compound_weighted_replies            0.0
compound_weighted_likes              0.0
compound_weighted_retweets           0.0
Daily_Change_Per

In [32]:
features_df.columns

feature_lst = ['Daily_Change', 'Daily_Change_Ind', 'MACD', 'PROC_3', 'PROC_5','PROC_10', 'wpr',\
                'sto_os', 'goog_trend_score', 'count', 'compound', 'retweets_count', 'likes_count', 'replies_count',\
                'compound_weighted_replies', 'compound_weighted_likes','compound_weighted_retweets',\
                'Daily_Change_Perc', 'Daily_Change_Gold', 'Daily_Change_Perc_Gold', 'Increased_Gold', \
                'Daily_Change_SP500', 'Daily_Change_Perc_SP500', 'Increased_SP500']

#outcome = features_df.columns[-1]

# use lag variable for outcome
outcome = features_df.columns[-1]

print(feature_lst)
print(outcome)

['Daily_Change', 'Daily_Change_Ind', 'MACD', 'PROC_3', 'PROC_5', 'PROC_10', 'wpr', 'sto_os', 'goog_trend_score', 'count', 'compound', 'retweets_count', 'likes_count', 'replies_count', 'compound_weighted_replies', 'compound_weighted_likes', 'compound_weighted_retweets', 'Daily_Change_Perc', 'Daily_Change_Gold', 'Daily_Change_Perc_Gold', 'Increased_Gold', 'Daily_Change_SP500', 'Daily_Change_Perc_SP500', 'Increased_SP500']
lag_label


In [33]:
X = features_df[feature_lst]
y = features_df[outcome]
print(X.shape, y.shape)

(1252, 24) (1252,)


In [34]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, random_state=1234)

In [35]:
# Figure out what the tree depth should be (maximum score)
results_dict = {}
cx_validation = KFold(n_splits = 10, shuffle = True, random_state = 1)
for depth in range(1, 11):
    base_estimator = DecisionTreeClassifier(max_depth = depth, random_state = 1234)
    if base_estimator.fit(X,y).tree_.max_depth < depth:
        break
    accuracy_score = np.mean(cross_val_score(base_estimator, X, y, cv = cx_validation, scoring = 'accuracy', n_jobs = 1))
    print("Depth:", depth, ", Score:", accuracy_score)

    results_dict[depth] = accuracy_score

max_val = -1
max_depth_val = None
for key, val in results_dict.items():
    if val > max_val:
        max_val = val
        max_depth_val = key

print("Max depth should be:", max_depth_val)

    # https://educationalresearchtechniques.com/2019/01/02/adaboost-classification-in-python/
    # https://towardsdatascience.com/boosting-and-adaboost-clearly-explained-856e21152d3e
    # https://python-bloggers.com/2019/01/adaboost-classification-in-python/ # this one for 
    

Depth: 1 , Score: 0.6389587301587301
Depth: 2 , Score: 0.6318158730158729
Depth: 3 , Score: 0.6047301587301586
Depth: 4 , Score: 0.614215873015873
Depth: 5 , Score: 0.6046285714285714
Depth: 6 , Score: 0.5886666666666667
Depth: 7 , Score: 0.595879365079365
Depth: 8 , Score: 0.5663174603174603
Depth: 9 , Score: 0.5615174603174603
Depth: 10 , Score: 0.5567047619047619
Max depth should be: 1


In [51]:
# Create adaboost classifer object
base_estimator = DecisionTreeClassifier(max_depth = max_depth_val, random_state = 1234)

adaboost = AdaBoostClassifier(base_estimator = base_estimator, n_estimators = 20,
                              learning_rate = 1, random_state = 1234)

# Train Adaboost Classifer
model = adaboost.fit(X_train, y_train)

# Predict the response for valid dataset
y_hat = model.predict(X_valid)

In [52]:
# Model Accuracy with chosen decision tree depth and randomly chosen hyperparameters
accuracy_score = metrics.accuracy_score(y_valid, y_hat)
mse = metrics.mean_squared_error(y_valid, y_hat)
confusion = metrics.confusion_matrix(y_valid, y_hat) 

print("Accuracy:", accuracy_score)
print("MSE:", mse)
print("CONFUSION:", confusion)


Accuracy: 0.6230031948881789
MSE: 0.3769968051118211
CONFUSION: [[ 72  67]
 [ 51 123]]


In [50]:
# ask irsa about this part
new = model.predict(adaboost.feature_importances_)
#for i in range(len(feature_lst)):
    #print(f"{feature_lst[i]}: {feature_import[i]}")



ValueError: Expected 2D array, got 1D array instead:
array=[0.   0.   0.1  0.05 0.15 0.05 0.   0.05 0.05 0.   0.05 0.05 0.   0.
 0.1  0.05 0.   0.15 0.05 0.05 0.   0.05 0.   0.  ].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

hyper parametertuning (number estimators and learning rate)


In [38]:
# hyper parametertuning (number estimators and learning rate)
ada = AdaBoostClassifier()
search_grid = {'n_estimators':[500,1000,2000], 'learning_rate':[.001,0.01,.1]}
search = GridSearchCV(estimator = ada, param_grid = search_grid, scoring = 'accuracy', n_jobs = 1, cv = cx_validation)

#https://python-bloggers.com/2019/01/adaboost-classification-in-python/

In [None]:
search.fit(X,y)
search.best_params_
print(search.best_params_)
search.best_score_
print(search.best_score_)
#Out[34]: 0.7425149700598802



# https://python-bloggers.com/2019/01/adaboost-classification-in-python/

In [196]:
score = np.mean(cross_val_score(ada, X, y, scoring='accuracy', cv = cx_validation, n_jobs = 1))
score


#https://python-bloggers.com/2019/01/adaboost-classification-in-python/

0.6472126984126984

In [40]:
# https://machinelearningmastery.com/adaboost-ensemble-in-python/

# alternative way to check n_estiamtors and learning_Rate
model = AdaBoostClassifier()
# define the grid of values to search
grid = dict()
grid['n_estimators'] = [10, 50, 100, 500, 1000]
grid['learning_rate'] = [0.001, 0.01, 0.1, 1.0]
# define the evaluation procedure
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# define the grid search procedure
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy')
# execute the grid search
grid_result = grid_search.fit(X, y)
# summarize the best score and configuration
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
# summarize all scores that were evaluated
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Best: 0.645663 using {'learning_rate': 0.01, 'n_estimators': 500}
0.636055 (0.043355) with: {'learning_rate': 0.001, 'n_estimators': 10}
0.637122 (0.041014) with: {'learning_rate': 0.001, 'n_estimators': 50}
0.637122 (0.041014) with: {'learning_rate': 0.001, 'n_estimators': 100}
0.636330 (0.040705) with: {'learning_rate': 0.001, 'n_estimators': 500}
0.634461 (0.040174) with: {'learning_rate': 0.001, 'n_estimators': 1000}
0.637122 (0.041838) with: {'learning_rate': 0.01, 'n_estimators': 10}
0.636597 (0.040281) with: {'learning_rate': 0.01, 'n_estimators': 50}
0.634728 (0.040395) with: {'learning_rate': 0.01, 'n_estimators': 100}
0.645663 (0.040959) with: {'learning_rate': 0.01, 'n_estimators': 500}
0.641386 (0.039317) with: {'learning_rate': 0.01, 'n_estimators': 1000}
0.634197 (0.040456) with: {'learning_rate': 0.1, 'n_estimators': 10}
0.644326 (0.043006) with: {'learning_rate': 0.1, 'n_estimators': 50}
0.640861 (0.040113) with: {'learning_rate': 0.1, 'n_estimators': 100}
0.627018 (0.0

Rerun adaboost with chosen hyperparameters

In [41]:
base_estimator = DecisionTreeClassifier(max_depth = max_depth_val, random_state = 1234)

chosen_learning_rate = list(grid_result.best_params_.values())[0]
chosen_n = list(grid_result.best_params_.values())[1]

print("chosen_n", chosen_n)
print("chosen_learing_rate", chosen_learning_rate)

adaboost = AdaBoostClassifier(base_estimator = base_estimator, n_estimators = chosen_n,
                              learning_rate = chosen_learning_rate, random_state = 1234)

# Train Adaboost Classifer
model = adaboost.fit(X_train, y_train)

# Predict the response for valid dataset
y_hat = model.predict(X_valid)

chosen_n 500
chosen_learing_rate 0.01


In [42]:
# Model Accuracy with chosen decision tree depth and chosen hyperparameters
accuracy_score = metrics.accuracy_score(y_valid, y_hat)
mse = metrics.mean_squared_error(y_valid, y_hat)
confusion = metrics.confusion_matrix(y_valid, y_hat) 

print("Accuracy:", accuracy_score)
print("MSE:", mse)
print("CONFUSION:", confusion)

Accuracy: 0.6325878594249201
MSE: 0.36741214057507987
CONFUSION: [[ 65  74]
 [ 41 133]]


In [None]:
''' other links
https://inria.github.io/scikit-learn-mooc/python_scripts/ensemble_adaboost.html
https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostClassifier.html
 https://scikit-learn.org/stable/tutorial/statistical_inference/model_selection.html
 https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_val_score.html
 
 https://towardsdatascience.com/how-do-you-implement-adaboost-with-python-a76427b0fa7a
 https://towardsdatascience.com/machine-learning-part-17-boosting-algorithms-adaboost-in-python-d00faac6c464
 
 https://machinelearningmastery.com/adaboost-ensemble-in-python/
 https://machinelearningmastery.com/k-fold-cross-validation/

 https://medium.datadriveninvestor.com/understanding-adaboost-and-scikit-learns-algorithm-c8d8af5ace10
 https://blog.paperspace.com/adaboost-optimizer/
 https://www.datacamp.com/tutorial/adaboost-classifier-python
 https://educationalresearchtechniques.com/2019/01/02/adaboost-classification-in-python/

# Repeat for Dogecoin


In [None]:
features_df = pd.read_csv("bitcoin_train.csv")
features_df.head()