# Apply Decision Tree methods

There are two types of decision tree methods: classification and regression.

Because the required output resolution is only 1 kWh, it is of interest to see whether either of these methods gives an accurate prediction of the solar energy produced.

In [None]:
# Load the CSV (path is relative to the notebook's working directory) into a DataFrame.
DAY_AVERAGED_CSV = 'day_averaged.csv'
day_df = pd.read_csv(DAY_AVERAGED_CSV)

## Implement Test-Train Split 

In [None]:
from sklearn.model_selection import train_test_split

# Columns used as model inputs and the single column used as the target.
feature_columns = [
    'Day',
    'temp_mean',
    'pressure_mean',
    'humidity_mean',
    'wind_speed_mean',
    'wind_deg_mean',
    'clouds_all_mean',
]
target_columns = ['solar_discrete']

inputs = day_df[feature_columns].to_numpy()
# Selecting with a list (not a bare label) keeps the target as a 2-D, one-column array.
output = day_df[target_columns].to_numpy()

# Hold out 25% of the rows for testing; random_state is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    inputs,
    output,
    test_size=0.25,
    random_state=42,
)

## Implementing Decision Tree Regressor

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeRegressor

# Sweep the maximum depth of a decision tree *regressor* and score each fit on
# the held-out test set. This cell imports everything it uses so that it runs
# on a fresh kernel.
#
# A regressor returns continuous values, so each prediction is rounded to the
# nearest whole number before it is compared with the discrete target.
# NOTE(review): assumes `solar_discrete` holds integer (1 kWh step) levels - confirm.
accuracy = []

for i in range(1,20):

    tree_reg = DecisionTreeRegressor(max_depth=i)

    tree_reg.fit(X_train,y_train)

    # Round to the nearest integer level so exact-match accuracy is meaningful.
    y_pred = tree_reg.predict(X_test).round().astype(int)

    accuracy.append(accuracy_score(y_test,y_pred))

max_accuracy = max(accuracy)
# list.index returns the first occurrence, i.e. the shallowest depth with the best score.
max_index = accuracy.index(max_accuracy)

print('Max Accuracy  = ', max_accuracy)
# Depths start at 1 while list positions start at 0.
print('Optimal Max depth = ', max_index + 1)

## Implementing Decision Tree Classifier 

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Fit one classifier per maximum depth (1 to 19) and record its accuracy on the
# held-out test set.
candidate_depths = range(1, 20)
accuracy = []

for i in candidate_depths:
    tree_clf = DecisionTreeClassifier(max_depth=i).fit(X_train, y_train)
    y_pred = tree_clf.predict(X_test)
    accuracy += [accuracy_score(y_test, y_pred)]

# max() keeps the first of any tied entries, so the shallowest best depth wins.
max_index, max_accuracy = max(enumerate(accuracy), key=lambda pair: pair[1])

print('Max Accuracy  = ', max_accuracy)
print('Optimal Max depth = ', candidate_depths[max_index])

In [None]:
## Implementing Cross Validation rather than test-train split

In [None]:
from sklearn.model_selection import cross_val_score

# function for fitting trees of various depths on the training data using cross-validation
def run_cross_validation_on_trees(X, y, tree_depths, cv=5, scoring='accuracy', random_state=None):
    """Cross-validate one decision tree classifier per candidate depth.

    For every depth in ``tree_depths`` a ``DecisionTreeClassifier`` with that
    ``max_depth`` is evaluated with ``cross_val_score``, then fitted on all of
    ``X``/``y`` and scored on that same data.

    Parameters
    ----------
    X, y : passed unchanged to ``cross_val_score``, ``fit`` and ``score``.
    tree_depths : iterable of ``max_depth`` values to try.
    cv : forwarded to ``cross_val_score`` (default 5).
    scoring : forwarded to ``cross_val_score`` (default ``'accuracy'``).
    random_state : forwarded to ``DecisionTreeClassifier``. The default
        ``None`` matches the previous behaviour; pass an int for repeatable trees.

    Returns
    -------
    tuple of three numpy arrays, each with one entry per depth, in order:
    mean cross-validation score, standard deviation of the cross-validation
    scores, and the score of the tree on the data it was fitted on.
    """
    cv_scores_mean = []
    cv_scores_std = []
    accuracy_scores = []
    for depth in tree_depths:
        tree_model = DecisionTreeClassifier(max_depth=depth, random_state=random_state)
        cv_scores = cross_val_score(tree_model, X, y, cv=cv, scoring=scoring)
        cv_scores_mean.append(cv_scores.mean())
        cv_scores_std.append(cv_scores.std())
        # Fit on the full data and score on that same data (uses the estimator's
        # own ``score`` method, not ``scoring``).
        accuracy_scores.append(tree_model.fit(X, y).score(X, y))
    return np.array(cv_scores_mean), np.array(cv_scores_std), np.array(accuracy_scores)
  
# function for plotting cross-validation results
def plot_cross_validation_on_trees(depths, cv_scores_mean, cv_scores_std, accuracy_scores, title):
    """Plot mean cross-validation accuracy and train accuracy against tree depth.

    Draws the mean cross-validation curve with a shaded band of two standard
    deviations either side, then overlays the train-accuracy curve. The y-axis
    limits are those produced by the cross-validation curve and band alone.
    Nothing is returned; the figure is left for the notebook to render.
    """
    fig, ax = plt.subplots(1, 1, figsize=(15, 5))

    # Cross-validation curve plus a +/- 2 standard deviation band.
    band_half_width = 2 * cv_scores_std
    ax.plot(depths, cv_scores_mean, '-o', label='mean cross-validation accuracy', alpha=0.9)
    ax.fill_between(
        depths,
        cv_scores_mean - band_half_width,
        cv_scores_mean + band_half_width,
        alpha=0.2,
    )

    # Remember the limits now, before the train curve can stretch the axis,
    # and restore them once it has been drawn.
    cv_only_ylim = ax.get_ylim()
    ax.plot(depths, accuracy_scores, '-*', label='train accuracy', alpha=0.9)
    ax.set_ylim(cv_only_ylim)

    ax.set_title(title, fontsize=16)
    ax.set_xlabel('Tree depth', fontsize=14)
    ax.set_ylabel('Accuracy', fontsize=14)
    ax.set_xticks(depths)
    ax.legend()

# Candidate depths 1 through 24 (the upper bound of range is exclusive).
sm_tree_depths = range(1,25)

# Cross-validate every candidate depth on the training split only.
sm_cv_results = run_cross_validation_on_trees(X_train, y_train, sm_tree_depths)
sm_cv_scores_mean, sm_cv_scores_std, sm_accuracy_scores = sm_cv_results

# Plot cross-validation and train accuracy against depth.
plot_cross_validation_on_trees(
    depths=sm_tree_depths,
    cv_scores_mean=sm_cv_scores_mean,
    cv_scores_std=sm_cv_scores_std,
    accuracy_scores=sm_accuracy_scores,
    title='Accuracy per decision tree depth on training data',
)

In [None]:
# Pick the depth with the highest mean cross-validation score
# (argmax returns the first position if several depths tie).
idx_max = sm_cv_scores_mean.argmax()
sm_best_tree_depth = sm_tree_depths[idx_max]
sm_best_tree_cv_score, sm_best_tree_cv_score_std = (
    sm_cv_scores_mean[idx_max],
    sm_cv_scores_std[idx_max],
)

# Report the score and its spread as percentages rounded to 5 decimal places.
summary_template = 'The depth-{} tree achieves the best mean cross-validation accuracy {} +/- {}% on training dataset'
print(
    summary_template.format(
        sm_best_tree_depth,
        round(sm_best_tree_cv_score*100, 5),
        round(sm_best_tree_cv_score_std*100, 5),
    )
)


In [None]:
from sklearn.tree import plot_tree

# Train a classifier at the depth chosen by cross-validation and evaluate it
# on the held-out test set.
tree_clf = DecisionTreeClassifier(max_depth=sm_best_tree_depth)

tree_clf.fit(X_train,y_train)

y_pred = tree_clf.predict(X_test)

# Test-set accuracy of the selected tree; kept in a variable for inspection.
accuracy = accuracy_score(y_test,y_pred)

# Draw the tree that produced `y_pred`. The model is already fitted, and no
# random_state is set, so fitting it again here could yield a different tree
# from the one that was evaluated above.
plt.figure(figsize=(20,10))
plot_tree(tree_clf);

In [None]:
# Signed error per test sample: positive means the prediction was above the
# true value. y_test is collapsed to 1-D so the subtraction is element-wise.
error = y_pred - y_test.ravel()

# Scatter the errors against their position in the test set.
fig, ax = plt.subplots(figsize=(10, 10))
ax.scatter(range(len(error)), error)

In [None]:
# Summarise the errors: any non-zero entry is a wrong prediction, and a negative
# entry is a prediction below the true value.
n_predictions = len(error)
n_underestimated = np.sum(np.array(error) < 0, axis=0)

print('Percentage of incorrect predictions  = ', np.count_nonzero(error)/n_predictions * 100)
print('Number of values which are underestimated = ', n_underestimated, 'Out of ', n_predictions)
print('Percentage of underestimated predictions  = ', round((n_underestimated/n_predictions * 100),2))