# Decision Tree Example using atmospheric data from Christman Field
[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/eabarnes1010/course_ml_ats/blob/main/code/tree_classifier_christman_hourly_competition.ipynb)

* Iris example adapted from: https://www.datacamp.com/community/tutorials/random-forests-classifier-python
* Further modified by: Aaron Hill and Wei-Ting Hsiao (Dept. of Atmospheric Science, Colorado State University), January 2020
* Further adapted by: Prof. Elizabeth Barnes for ATS 780A7 Spring 2022 at Colorado State University

Let's import some libraries we will need throughout this tutorial:



In [1]:
# Detect whether the notebook is running inside Google Colab by probing for
# the `google.colab` package. Only a failed import means "not in Colab", so
# catch ImportError specifically; a bare `except:` would also swallow
# KeyboardInterrupt/SystemExit and hide unrelated errors.
try:
    import google.colab  # noqa: F401  (imported only to test availability)
    IN_COLAB = True
except ImportError:
    IN_COLAB = False
print('IN_COLAB = ' + str(IN_COLAB))

IN_COLAB = True


In [2]:
import sys
import numpy as np
import matplotlib.pyplot as plt

import pandas as pd
import datetime
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn.inspection import permutation_importance
import pydot
import matplotlib.pyplot as plt
# %matplotlib inline

In [3]:
# Record the interpreter and library versions used for this run so results
# can be reproduced later. One print call, one line per component.
print(
    f"python version = {sys.version}",
    f"numpy version = {np.__version__}",
    f"scikit-learn version = {sklearn.__version__}",
    sep="\n",
)

python version = 3.7.12 (default, Jan 15 2022, 18:48:18) 
[GCC 7.5.0]
numpy version = 1.19.5
scikit-learn version = 1.0.2


In [None]:
# Choose the directory (`local_path`, always ending in '/') where exported
# files such as the graphviz .dot file are written.
import os

if IN_COLAB:
    try:
        from google.colab import drive
        drive.mount('/content/drive', force_remount=True)
        local_path = '/content/drive/My Drive/Colab Notebooks/'
    except Exception as mount_error:
        # Best effort: if Drive cannot be mounted, fall back to the current
        # working directory, but say so instead of failing silently.
        print(f'Could not mount Google Drive ({mount_error}); using ./ instead')
        local_path = './'
else:
    local_path = 'figures/'
    # Make sure the output directory exists so later file writes do not fail.
    os.makedirs(local_path, exist_ok=True)

# 1. Data Preparation

### 1.1 Data overview

We have stored a .csv file on a CSU drive, accessible via URL. This will be the basis for our tutorial. This file contains Fort Collins weather data from 2020, and we will use these data to classify the temperature (`Temp [C]`) into three categories with a tree-based classifier.

In [None]:
# Read in data from url
# The "Date" column is parsed into datetimes so that `.dt` accessors work below.
url = "https://raw.githubusercontent.com/eabarnes1010/course_ml_ats/main/data/fccwx_data_2020.csv"
data = pd.read_csv(url,parse_dates=["Date"],infer_datetime_format=True)
# Add the day of the year (derived from "Date") as an extra column; it becomes
# the last column of the table.
data['dayofyear'] = data['Date'].dt.dayofyear
# NOTE(review): the result of `reindex` is not assigned, so `data` itself keeps
# its original row order; as the last expression of the cell this presumably
# only displays a row-reversed view of the table.
data.reindex(index=data.index[::-1])

Let's look at our data to see what we are working with.

In [None]:
# Report the dimensions of the loaded table as (rows, columns).
# Note: at this point `data` still contains the target and date columns,
# not only the features.
print('The shape of our features is:', data.shape)

In [None]:
# A handy tool in pandas: descriptive statistics for each column
# (as the last expression of the cell, the resulting summary table is displayed)
data.describe()

### 1.2 Targets and features

The pandas table is handy for a quick glance, but we need to organize some numpy arrays that separately contain our features and labels.

In [None]:
# Classification setup: the target column is binned into three classes
# using THRESHOLD_TEMP and -THRESHOLD_TEMP/2 as the class boundaries.
THRESHOLD_TEMP = 10   # default = 10
TARGET_VAR = 'Temp [C]'

# Labels are the values we want to predict:
#   0 -> everything not matched below (start value)
#   1 -> target below THRESHOLD_TEMP
#   2 -> target below -THRESHOLD_TEMP/2 (overrides class 1)
target_values = data[TARGET_VAR]
labels = np.zeros(np.shape(target_values))
labels[np.flatnonzero(target_values < THRESHOLD_TEMP)] = 1
labels[np.flatnonzero(target_values < -THRESHOLD_TEMP/2)] = 2

# Build the feature table by dropping, column-wise:
#   - the target itself,
#   - 'DewPt [C]' (remove it from this list if you want the prediction task to be easy),
#   - 'Date'.
excluded_columns = [TARGET_VAR, 'DewPt [C]', 'Date']
features = data.drop(columns=excluded_columns)

# Keep the column names for later use (e.g. when visualizing the tree)
feature_list = features.columns.tolist()

# From here on the features are a plain numpy array
features = np.array(features)

### 1.3 Splitting training and testing datasets

Assuming we have no feature data available from 2019 that we could use to test our trained models against, we will want to split up our dataset into training and testing portions (the code below calls the held-out portion the validation set). A standard proportion is 3/4 for training, 1/4 for testing, although this is somewhat arbitrary here.

In [None]:
# Hold out part of the data for validation.

# Tunable parameter: fraction of the samples reserved for validation;
# the remaining (1 - split_size) is used for training.
split_size = 0.25

# train_test_split arguments:
#     test_size:    fraction of samples placed in the held-out set
#     random_state: seed passed to the splitter
(train_features, val_features,
 train_labels, val_labels) = train_test_split(
    features,
    labels,
    test_size=split_size,
    random_state=42,
)

Let's quickly check that the sizes of our training and testing arrays are what we expect (and that we didn't do something wrong).

In [None]:
# Print the shape of each array produced by the split as a sanity check.
for description, array in (
    ('Training Features Shape:', train_features),
    ('Training Labels Shape:', train_labels),
    ('Validation Features Shape:', val_features),
    ('Validation Labels Shape:', val_labels),
):
    print(description, array.shape)

In [None]:
# Scatter the target variable against the last feature column, with the two
# class boundaries drawn as dashed horizontal lines.
observed_target = np.array(data[TARGET_VAR])
plt.plot(features[:,-1], observed_target, '.')
for boundary in (THRESHOLD_TEMP, -THRESHOLD_TEMP/2):
    plt.axhline(y=boundary, linestyle='--', color='k')
plt.xlabel('day of year')
plt.ylabel(TARGET_VAR)
plt.show()

# 2. Creating a decision tree

### Train the model and visualize it

In [None]:
# SINGLE DECISION TREE

# Tunable Parameters for Model
# tree_depth = 2 
# node_split = 2       # minimum number of training samples needed to split a node
# leaf_samples = 1     # minimum number of training samples required to make a leaf node
# RAND_STATE = 42

# tree_clf = DecisionTreeClassifier(max_depth=tree_depth, 
#                                   min_samples_split=node_split,
#                                   min_samples_leaf=leaf_samples,
#                                   random_state=RAND_STATE,
#                                   criterion='gini',  #can also set to 'entropy'
#                                  )
# tree_clf.fit(train_features,train_labels)

In [None]:
# RANDOM FOREST
# NOTE(review): the original header read "DO NO UCOMMENT THIS BLOCK" (sic,
# presumably "DO NOT UNCOMMENT"), yet this block is live code — confirm intent.

# Tunable parameters for the model. Each value was previously assigned twice
# in a row (2 then 15, 2 then 10, 2 then 20); only the second assignment ever
# took effect, so the dead first assignments have been removed.
tree_depth = 15      # maximum depth of each tree
node_split = 10      # minimum number of training samples needed to split a node
leaf_samples = 20    # minimum number of training samples required to make a leaf node
RAND_STATE = 42      # seed so that the fit is repeatable

number_of_trees = 1  # number of trees in the forest

tree_clf = RandomForestClassifier(n_estimators = number_of_trees,
                           random_state = RAND_STATE,
                           min_samples_split = node_split,
                           min_samples_leaf = leaf_samples,
                           criterion = 'gini',
                           max_depth = tree_depth)
tree_clf.fit(train_features,train_labels)


In [None]:
# Export one decision tree to a graphviz .dot file and render it.
from graphviz import Source

fig_savename = 'tree_classifier_christman'
dot_path = local_path + fig_savename + '.dot'

# If `tree_clf` is an ensemble, visualize its first member tree; otherwise
# visualize the model itself. Ensembles are detected through their
# `estimators_` attribute rather than by coercing the model to an array with
# np.shape(), which only worked as a side effect of ensembles being sequences.
member_trees = getattr(tree_clf, 'estimators_', None)
if member_trees is not None:
    tree_to_viz = member_trees[0]
else:
    tree_to_viz = tree_clf

# class_names are listed in the same order as the label values 0, 1, 2.
export_graphviz(tree_to_viz,
                out_file=dot_path,
                filled=True,
                proportion=False,
                leaves_parallel=False,
                class_names=('above ' + str(THRESHOLD_TEMP) + 'C', 'below ' + str(THRESHOLD_TEMP) + 'C', 'really below ' + str(THRESHOLD_TEMP) + 'C'),
                feature_names=feature_list)
Source.from_file(dot_path)

### Make predictions

In [None]:
# Predicted class labels for the training and validation samples, using the
# model currently bound to `tree_clf`.
y_pred_train = tree_clf.predict(train_features)
y_pred_val = tree_clf.predict(val_features)
# Last expression of the cell: show the validation predictions.
y_pred_val

In [None]:
# Per-class predicted probabilities for the first five validation samples.
tree_clf.predict_proba(val_features)[:5]

### Visualization of predictions

In [None]:
# Shape of the validation feature array.
val_features.shape

In [None]:
# Overlay the predicted classes (red) on the observed target (default color),
# once for the training samples and once for the validation samples.
# Predicted classes 0/1/2 are drawn at y = 15/0/-15 via (-pred*15)+15.
observed_target = np.array(data[TARGET_VAR])
for subset_features, subset_pred in (
    (train_features, y_pred_train),
    (val_features, y_pred_val),
):
    plt.plot(features[:,-1], observed_target, '.')
    plt.plot(subset_features[:,-1], (-subset_pred*15)+15, '.r')
    for boundary in (THRESHOLD_TEMP, -THRESHOLD_TEMP/2):
        plt.axhline(y=boundary, linestyle='--', color='k')
    plt.xlabel('day of year')
    plt.ylabel(TARGET_VAR)
    plt.show()

### Evaluate the classification predictions

In [None]:
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

In [None]:
# y_probs = tree_clf.predict_proba(val_features)
# y_scores = y_probs[:,1]
# fpr_tree, tpr_tree, thresholds_tree = roc_curve(val_labels,y_scores)
# auc_tree = roc_auc_score(val_labels,y_scores)

In [None]:
# def plot_roc_curve(fpr, tpr, label=None):
#     plt.plot(fpr, tpr, linewidth=2, label=label)
#     plt.plot([0, 1], [0, 1], 'k--') # dashed diagonal
#     plt.axis([0, 1.01, 0, 1.01])                                    # Not shown in the book
#     plt.xlabel('False Positive Rate (Fall-Out)', fontsize=16) # Not shown
#     plt.ylabel('True Positive Rate (Recall)', fontsize=16)    # Not shown
#     plt.grid(True)                                            # Not shown
    
# plt.figure(figsize=(8, 6))
# plot_roc_curve(fpr_tree, tpr_tree, "Decision Tree")
# plt.title('AUC = ' + str(auc_tree))
# plt.grid(True)
# plt.legend(loc="lower right", fontsize=16)
# plt.show()

In [None]:
# Confusion matrix for the training samples: raw counts are printed, and a
# row-normalized (normalize='true') version is drawn as a figure.
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix

y_pred_train = tree_clf.predict(train_features)
train_counts = confusion_matrix(train_labels, y_pred_train)
print('training confusion matrix')
print(train_counts)
ConfusionMatrixDisplay.from_predictions(
    train_labels,
    y_pred_train,
    normalize='true',
)
plt.title('Training Data')

In [None]:
# Confusion matrix for the validation samples: raw counts are printed, and a
# row-normalized (normalize='true') version is drawn as a figure.
y_pred_val = tree_clf.predict(val_features)
val_counts = confusion_matrix(val_labels, y_pred_val)
print('validation confusion matrix')
print(val_counts)

ConfusionMatrixDisplay.from_predictions(
    val_labels,
    y_pred_val,
    normalize='true',
)
plt.title('Validation Data')

In [None]:
# Overall accuracy: training first, then validation.
from sklearn.metrics import accuracy_score

for true_labels, predicted_labels in (
    (train_labels, y_pred_train),
    (val_labels, y_pred_val),
):
    print(accuracy_score(true_labels, predicted_labels))

In [None]:
# from sklearn.metrics import precision_score, recall_score

# print(precision_score(train_labels, y_pred_train))
# print(precision_score(val_labels, y_pred_val))

In [None]:
# print(recall_score(train_labels, y_pred_train))
# print(recall_score(val_labels, y_pred_val))

In [None]:
# F1 scores under two averaging schemes. For each scheme the training score
# is printed first, then the validation score; a blank line separates schemes.
from sklearn.metrics import f1_score

f1_sections = (
    ('Macro F1-Score', 'macro'),
    ('Weighted F1-Score', 'weighted'),
)
for section_index, (heading, averaging) in enumerate(f1_sections):
    if section_index > 0:
        print('')
    print(heading)
    print(f1_score(train_labels, y_pred_train, average=averaging))
    print(f1_score(val_labels, y_pred_val, average=averaging))

## Iterate through possibilities and plot

In [None]:
# Sweep a small grid of hyperparameters. For every combination: fit a model,
# print prediction counts and F1 scores, and draw a three-panel figure
# (training predictions, validation predictions, training confusion matrix).
import itertools

# Imports are hoisted out of the loop body; they do not change per iteration.
from sklearn.metrics import ConfusionMatrixDisplay, f1_score

tree_depths = (5, 10)
node_splits = (2, 5, 10)
leaf_samples = (2, 5, 10)

number_of_trees = 1  # same for every combination, so set once
observed_target = np.array(data[TARGET_VAR])

# NOTE: this loop rebinds `tree_clf`, `y_pred_train` and `y_pred_val`; after it
# finishes they refer to the LAST combination tried.
for tree_depth, node_split, leaf_sample in itertools.product(
    tree_depths, node_splits, leaf_samples
):
    print('')
    label = f'Depth: {tree_depth} Split: {node_split} Leaf: {leaf_sample}'
    print(label)

    tree_clf = RandomForestClassifier(n_estimators=number_of_trees,
                                      random_state=RAND_STATE,
                                      min_samples_split=node_split,
                                      min_samples_leaf=leaf_sample,
                                      criterion='gini',
                                      max_depth=tree_depth)
    tree_clf.fit(train_features, train_labels)

    # Predict once per subset and reuse the results for plots and metrics.
    y_pred_train = tree_clf.predict(train_features)
    y_pred_val = tree_clf.predict(val_features)
    print('Counts:', (y_pred_val == 0).sum(), (y_pred_val == 1).sum(), (y_pred_val == 2).sum())

    fig, axs = plt.subplots(ncols=3, figsize=(12, 4))
    fig.suptitle(label)

    # Panels 0 and 1: predicted classes (red) over the observed target.
    # Predicted classes 0/1/2 are drawn at y = 15/0/-15 via (-pred*15)+15.
    # Titles are added so the two otherwise identical panels can be told apart.
    prediction_panels = (
        (axs[0], train_features, y_pred_train, 'Training Predictions'),
        (axs[1], val_features, y_pred_val, 'Validation Predictions'),
    )
    for ax, subset_features, subset_pred, panel_title in prediction_panels:
        ax.plot(features[:, -1], observed_target, '.')
        ax.plot(subset_features[:, -1], (-subset_pred*15)+15, '.r')
        ax.axhline(y=THRESHOLD_TEMP, linestyle='--', color='k')
        ax.axhline(y=-THRESHOLD_TEMP/2, linestyle='--', color='k')
        ax.set_xlabel('day of year')
        ax.set_ylabel(TARGET_VAR)
        ax.set_title(panel_title)

    # Panel 2: row-normalized confusion matrix for the training samples.
    ax = axs[2]
    ConfusionMatrixDisplay.from_predictions(train_labels, y_pred_train, normalize='true', ax=ax)
    ax.set_title('Training Data')

    # For each averaging scheme: training score first, then validation score.
    print('Macro F1-Score')
    print(f1_score(train_labels, y_pred_train, average='macro'))
    print(f1_score(val_labels, y_pred_val, average='macro'))
    print('Weighted F1-Score')
    print(f1_score(train_labels, y_pred_train, average='weighted'))
    print(f1_score(val_labels, y_pred_val, average='weighted'))

# EVALUATE YOUR MODEL ON TRUE TESTING DATA

In [None]:
# Deliberate stop: raising here halts execution at this cell, so the cells
# below it must be run explicitly.
raise ValueError('do not go below this line!')

In [None]:
# Fit the model with the hyperparameters chosen for the final evaluation.
# `number_of_trees` and `RAND_STATE` are reused from earlier cells.
tree_depth, node_split, leaf_sample = 20, 10, 2

final_model_settings = dict(
    n_estimators=number_of_trees,
    random_state=RAND_STATE,
    min_samples_split=node_split,
    min_samples_leaf=leaf_sample,
    criterion='gini',
    max_depth=tree_depth,
)
tree_clf = RandomForestClassifier(**final_model_settings)
tree_clf.fit(train_features,train_labels)

In [None]:
# Evaluate the model currently bound to `tree_clf` on the 2021 data, which is
# prepared exactly like the training data (same target, thresholds and
# dropped columns).
# Everything used from sklearn.metrics is imported here so the cell does not
# depend on imports made in other cells.
from sklearn.metrics import accuracy_score, classification_report, f1_score

# `y_pred_train` / `y_pred_val` may be left over from a different model (for
# example the last combination of a hyperparameter sweep). Recompute them with
# the current model so the accuracies printed below describe the same model as
# the test score.
y_pred_train = tree_clf.predict(train_features)
y_pred_val = tree_clf.predict(val_features)

# Read in data from url
# (A former `data.reindex(index=data.index[::-1])` statement was dropped: its
# result was never assigned, so it had no effect.)
url = "https://raw.githubusercontent.com/eabarnes1010/course_ml_ats/main/data/fccwx_data_2021.csv"
data = pd.read_csv(url,parse_dates=["Date"],infer_datetime_format=True)
data['dayofyear'] = data['Date'].dt.dayofyear

# Labels are the values we want to predict:
#   0 -> everything not matched below (start value)
#   1 -> target below THRESHOLD_TEMP
#   2 -> target below -THRESHOLD_TEMP/2 (overrides class 1)
labels = np.zeros((np.shape(data[TARGET_VAR])))
i = np.where(data[TARGET_VAR] < THRESHOLD_TEMP)[0]
labels[i] = 1
i = np.where(data[TARGET_VAR] < -THRESHOLD_TEMP/2)[0]
labels[i] = 2

# Remove the labels from the features
# axis 1 refers to the columns
features = data.drop(TARGET_VAR, axis = 1)

# Also remove DewPt and Date
features = features.drop('DewPt [C]', axis = 1)   # comment out if you want the prediction task to be easy
features = features.drop('Date', axis = 1)

# Saving feature names for later use
# NOTE(review): assumes the 2021 file has the same columns, in the same order,
# as the data the model was trained on — confirm.
feature_list = list(features.columns)

# Convert to numpy array
features = np.array(features)

# make the predictions
y_pred_test = tree_clf.predict(features)

# print the metrics report
print(classification_report(labels,y_pred_test))

# print final f1 score
print('---------------------------------------')
print('Macro F1-Score   : ' + str(f1_score(labels, y_pred_test,average='macro')))
print('Weighted F1-Score: ' + str(f1_score(labels, y_pred_test,average='weighted')))

# print accuracies
print('---------------------------------------')
print('TRAINING ACCURACY  : ' + str(accuracy_score(train_labels, y_pred_train)))
print('VALIDATION ACCURACY: ' + str(accuracy_score(val_labels, y_pred_val)))
print('TESTING ACCURACY   : ' + str(accuracy_score(labels, y_pred_test)))

# Overlay predicted classes (red) on the observed target. Classes 0/1/2 are
# drawn at y = 15/0/-15 via (-pred*15)+15, the same mapping used by the
# training/validation plots. The previous (pred*50)-25 mapping drew the classes
# in the opposite vertical order and placed class 2 at y = 75.
plt.plot(features[:,-1],np.array(data[TARGET_VAR]),'.')
plt.plot(features[:,-1],(-y_pred_test*15)+15,'.r')
plt.axhline(y=THRESHOLD_TEMP,linestyle='--',color='k')
plt.axhline(y=-THRESHOLD_TEMP/2,linestyle='--',color='k')
plt.xlabel('day of year')
plt.ylabel(TARGET_VAR)
plt.title('Testing Year 2021')
plt.show()