# 1.  Import packages

In [1]:

import os
import io

import earthpy as et
import pandas as pd
import numpy as np
# Using Skicit-learn to split data into training and testing sets
from sklearn.model_selection import train_test_split
# Import the model we are using
from sklearn.ensemble import RandomForestRegressor
# Import tools needed for visualization
from sklearn.tree import export_graphviz
import pydot
import graphviz


# 2. Make/set working directory

In [2]:
# Set working directory

# if the desired path exists:
data_dir = os.path.join(et.io.HOME, 'Dropbox',
                        'cu_earthdata_certificate_2021', 'earthlab_project', 'data')
if os.path.exists(data_dir):
    # set working directory:
    os.chdir(data_dir)
    print("path exists")
else:
    print("path does not exist, making new path")
    os.makedirs(data_dir)
    os.chdir(data_dir)

path exists


# 3. Download and organize data

In [3]:
# 1.  Download landcover data generated in script 06
file_path = os.path.join("lake_landcover.csv")
landcover_stats = pd.read_csv(file_path)

landcover_stats = landcover_stats.drop(
    ['source', 'TZID', 'clouds', 'type', 'path', 'system_ind', 'date_utc', 'timediff', 'date_unity', 'pixelCount', 'id', 'sat', 'qa_sd', 'endtime', 'row', 'swir1_sd', 'swir2_sd', 'date_only', 'blue', 'blue_sd', 'red', 'red_sd', 'green', 'green_sd'], axis=1)


# 2.  Download color stats collected in script 03
file_path = os.path.join("lake_mtbs_merged_tab.csv")
color_stats = pd.read_csv(file_path)

color_stats = color_stats.drop(['Aerosol', 'pCount_dswe1', 'pCount_dswe3', 'sat', 'Event_ID', 'irwinID', 'Incid_Name',
                               'Map_ID', 'Map_Prog', 'Asmnt_Type', 'Ig_Date', 'Pre_ID', 'Post_ID', 'Perim_ID', 'dNBR_stdDv'], axis=1)


# Merge the first two datasets
new_df = pd.merge(landcover_stats, color_stats)  #how='left', left_on=[
    #'landsat_id', 'Hylak_id', 'date'], right_on=['LandsatID', 'Hylak_id', 'date'])

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
# 3. Download topography data generated in script 05
file_path = os.path.join("lake_topography.csv")
topography_stats = pd.read_csv(file_path)

topography_stats = topography_stats.drop(['Unnamed: 0', 'source', 'TZID', 'clouds', 'type', 'path', 'system_ind', 'date_utc', 'timediff',
                                         'date_unity', 'pixelCount', 'id', 'sat', 'qa_sd', 'endtime', 'row', 'swir1_sd', 'swir2_sd', 'date_only', 'geometry', 'blue', 'blue_sd', 'red', 'red_sd', 'green', 'green_sd'], axis=1)

more_df = pd.merge(new_df, topography_stats)

In [5]:
# 4. Download area data generated in script 01
file_path = os.path.join("polygon_areas_tab.csv")
area_stats = pd.read_csv(file_path)

area_stats = area_stats.drop(['Unnamed: 0'], axis=1)

bigger_df = pd.merge(more_df, area_stats) #547 entries

In [6]:
# 5. Download time since fire data from script 02
file_path = os.path.join("burned_lakes_tabular.csv")
time_fire_stats = pd.read_csv(file_path)

small_time_stats = time_fire_stats[['Hylak_id', 'LandsatID', 'date', 'pre_post', 'days_since', 'img_month', 'years_since', "fui"]]

full_df = pd.merge(bigger_df, small_time_stats,  how='left', left_on=[
    'Hylak_id', 'date'], right_on=['Hylak_id', 'date'])

  interactivity=interactivity, compiler=compiler, result=result)


In [7]:
# Remove unwanted columns
full_df = full_df.drop(['Hylak_id', 'landsat_id', 'date', 'LandsatID_y', 'Unnamed: 0', 'system:index', 'SiteID',
                       'LandsatID_x', 'NoData_T', 'Mod_T', 'High_T', 'Low_T', 'IncGreen_T', 'time', 'tis'], axis=1)

# Remove columns with insufficuent in-situ data
full_df = full_df.drop(['doc', 'secchi', 'tss', 'p_sand'], axis=1)

# Remove columns that may be correlated with color 
full_df = full_df.drop(['nir_sd', 'Blue', 'Red', 'Green', 'fui', 'Swir1', 'Swir2', 'Nir', 'swir2', 'nir', 'TIR2', 'TIR1', 'swir1'], axis=1)

# Lets drop columns where chla data is empty.  This should leave about 150 images for analysis
full_df.dropna(subset=["chl_a"], inplace=True)

In [8]:
full_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 237 entries, 0 to 1709
Data columns (total 33 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Grassland/Herbaceous       237 non-null    float64
 1   Cultivated_Crops           237 non-null    float64
 2   Shrub/Scrub                237 non-null    float64
 3   BarrenLand_Rock/Sand/Clay  237 non-null    float64
 4   Evergreen Forest           237 non-null    float64
 5   Developed_OpenSpace        237 non-null    float64
 6   Open_water                 237 non-null    float64
 7   Class_sum                  237 non-null    float64
 8   Woody_Wetlands             237 non-null    float64
 9   qa                         237 non-null    int64  
 10  chl_a                      237 non-null    float64
 11  pwater                     237 non-null    float64
 12  year                       237 non-null    int64  
 13  dWL                        237 non-null    float6

In [9]:
# Export file to local drive
out_path = os.path.join("randomforest_df.csv")
full_df.to_csv(out_path)

## One-Hot Encoding
Transformation of classification data to binomial

In [10]:
# One-hot encode the data using pandas get_dummies
formatted_df = pd.get_dummies(full_df)

# Display some object columns to check transformation (fire type and pre or post fire)
print(formatted_df.iloc[:,50:].head(5))

formatted_df.info()

Empty DataFrame
Columns: []
Index: [0, 1, 4, 511, 512]
<class 'pandas.core.frame.DataFrame'>
Int64Index: 237 entries, 0 to 1709
Data columns (total 37 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Grassland/Herbaceous          237 non-null    float64
 1   Cultivated_Crops              237 non-null    float64
 2   Shrub/Scrub                   237 non-null    float64
 3   BarrenLand_Rock/Sand/Clay     237 non-null    float64
 4   Evergreen Forest              237 non-null    float64
 5   Developed_OpenSpace           237 non-null    float64
 6   Open_water                    237 non-null    float64
 7   Class_sum                     237 non-null    float64
 8   Woody_Wetlands                237 non-null    float64
 9   qa                            237 non-null    int64  
 10  chl_a                         237 non-null    float64
 11  pwater                        237 non-null    float64
 12  year    

## Features and Targets and Convert Data to Arrays
Separate the data into the features and targets. The target, 
also known as the label, is the value we want to predict, in this case the DWL (color) and the features are all the columns the model uses to make a prediction.
We will also convert the Pandas dataframes to Numpy arrays because that is the way the algorithm works. 
(Save the column headers, which are the names of the features, to a list to use for later visualization).

In [11]:
# Use numpy to convert to arrays
# Labels are the values we want to predict
labels = np.array(formatted_df['dWL'])

# Remove the labels from the features
# axis 1 refers to the columns
features = formatted_df.drop('dWL', axis = 1)

# Saving feature names for later use
feature_list = list(features.columns)

# Convert to numpy array
features = np.array(features)


In [12]:
features

array([[0., 0., 0., ..., 1., 1., 0.],
       [0., 0., 0., ..., 1., 1., 0.],
       [0., 0., 1., ..., 0., 1., 0.],
       ...,
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 1., ..., 0., 1., 0.],
       [0., 0., 1., ..., 0., 1., 0.]])

### Training and Testing Sets
There is one final step of data preparation: splitting data into training and testing sets. 
During training, we let the model ‘see’ the answers, in this case the actual dWL, so it can learn 
how to predict the temperature from the features. We expect there to be some relationship between 
all the features and the target value, and the model’s job is to learn this relationship during training. 
Then, when it comes time to evaluate the model, we ask it to make predictions on a testing set where 
it only has access to the features (not the answers)! Because we do have the actual answers for 
the test set, we can compare these predictions to the true value to judge how accurate the model is. 
Generally, when training a model, we randomly split the data into training and testing sets to get a 
representation of all data points (if we trained on the first nine months of the year and then used the 
final three months for prediction, our algorithm would not perform well because it has not seen any data 
from those last three months.) I am setting the random state to 42 which means the results will be the same 
each time I run the split for reproducible results.
The following code splits the data sets with another single line:


In [13]:
# Split the data into training and testing sets
train_features, test_features, train_labels, test_labels = train_test_split(
    features, labels, test_size=0.25, random_state=42)

We can look at the shape of all the data to make sure we did everything correctly. We expect the training features number of columns to match the testing feature number of columns and the number of rows to match for the respective training and testing features and the labels 

In [14]:
print('Training Features Shape:', train_features.shape)
print('Training Labels Shape:', train_labels.shape)
print('Testing Features Shape:', test_features.shape)
print('Testing Labels Shape:', test_labels.shape)

Training Features Shape: (177, 36)
Training Labels Shape: (177,)
Testing Features Shape: (60, 36)
Testing Labels Shape: (60,)


Depending on the initial data set, there may be extra work involved such as removing outliers, imputing missing values, or converting temporal variables into cyclical representations. These steps may seem arbitrary at first, but once you get the basic workflow, it will be generally the same for any machine learning problem. It’s all about taking human-readable data and putting it into a form that can be understood by a machine learning model.

# Establish Baseline
Before we can make and evaluate predictions, we need to establish a baseline, a sensible measure that we hope to beat with our model. If our model cannot improve upon the baseline, then it will be a failure and we should try a different model or admit that machine learning is not right for our problem. The baseline prediction for our case can be the historical max temperature averages. In other words, our baseline is the error we would get if we simply predicted the average color for all images.


In [15]:
# The baseline predictions are the historical averages
#baseline_preds = test_features[:, feature_list.index('Green')]

# Baseline errors, and display average baseline error
#baseline_errors = abs(baseline_preds - test_labels)
#print('Average baseline error: ', round(np.mean(baseline_errors), 2))

We now have our goal! If we can’t beat an average error of 5 degrees, then we need to rethink our approach.

## Train Model
After all the work of data preparation, creating and training the model is pretty simple using Scikit-learn. We import the random forest regression model from skicit-learn, instantiate the model, and fit (scikit-learn’s name for training) the model on the training data. (Again setting the random state for reproducible results). This entire process is only 3 lines in scikit-learn!

In [16]:
# Instantiate model with 1000 decision trees
rf = RandomForestRegressor(n_estimators = 1000, random_state = 42)

# Train the model on training data
rf.fit(train_features, train_labels);

## Make Predictions on the Test Set
Our model has now been trained to learn the relationships between the features and the targets. The next step is figuring out how good the model is! To do this we make predictions on the test features (the model is never allowed to see the test answers). We then compare the predictions to the known answers. When performing regression, we need to make sure to use the absolute error because we expect some of our answers to be low and some to be high. We are interested in how far away our average prediction is from the actual value so we take the absolute value (as we also did when establishing the baseline).
Making predictions with out model is another 1-line command in Skicit-learn.

In [17]:
# Use the forest's predict method on the test data
predictions = rf.predict(test_features)

# Calculate the absolute errors
errors = abs(predictions - test_labels)

# Print out the mean absolute error (mae)
print('Mean Absolute Error:', round(np.mean(errors), 2), 'nm.')

Mean Absolute Error: 8.92 nm.


In [22]:
# Calculate mean absolute percentage error (MAPE)
mape = 100 * (errors / test_labels)
print(mape)
# Calculate and display accuracy
accuracy = 100 - np.mean(mape)
print('Accuracy:', round(accuracy, 2), '%.')

[1.68560606e-02 3.11815945e-01 1.54637097e-01 5.66308316e+00
 5.04472272e-02 1.48221344e-02 4.26515303e-01 1.32382892e-01
 7.28977273e-01 1.22403259e-01 6.68326613e+00 4.30970149e-02
 7.92545432e+00 2.85923695e+00 7.70643939e-01 1.26353791e-03
 7.24955752e-01 6.49857904e+00 0.00000000e+00 1.89203540e-01
 1.45564516e-01 1.68375637e+00 1.32462687e-02 3.73216495e+00
 2.13014761e-01 1.09164969e-01 9.88142292e-04 5.23262787e+00
 4.14285714e-02 8.35183312e-01 1.17003891e+00 5.73709677e+00
 1.16849315e+00 3.26548673e-01 5.95437650e+00 1.26353791e-03
 2.74685817e-02 6.14031972e+00 2.14015152e-02 3.28571429e-02
 2.75000000e-02 0.00000000e+00 1.12997248e-01 4.21428571e-02
 3.03044355e+00 7.31225296e-03 1.53952641e+00 2.34850843e+00
 6.66975806e+00 2.88223801e+00 3.19013646e-01 0.00000000e+00
 3.27630522e+00 3.67857143e-02 0.00000000e+00 1.01977459e+01
 1.56193896e-02 5.73165323e+00 0.00000000e+00 1.61579892e-02]
Accuracy: 98.3 %.


In [23]:
# Pull out one tree from the forest
tree = rf.estimators_[5]

# if the desired path exists:
data_dir = os.path.join(et.io.HOME, 'Dropbox',
                        'cu_earthdata_certificate_2021', 'earthlab_project', 'ea-2021-final-project-wildfire_lake_productivity', 'images')

os.chdir(data_dir)
    
# Export the image to a dot file
export_graphviz(tree, out_file = 'tree.dot', feature_names = feature_list, rounded = True, precision = 1)

# Use dot file to create a graph
(graph, ) = pydot.graph_from_dot_file('tree.dot')

# Write graph to a png file
graph.write_png('tree.png')

In [24]:
# Limit depth of tree to 3 levels
rf_small = RandomForestRegressor(n_estimators=10, max_depth = 3)
rf_small.fit(train_features, train_labels)
# Extract the small tree
tree_small = rf_small.estimators_[5]
# Save the tree as a png image
export_graphviz(tree_small, out_file = 'small_tree.dot', feature_names = feature_list, rounded = True, precision = 1)
(graph, ) = pydot.graph_from_dot_file('small_tree.dot')
graph.write_png('small_tree.png');

In [25]:
# Get numerical feature importances
importances = list(rf.feature_importances_)
# List of tuples with variable and importance
feature_importances = [(feature, round(importance, 2)) for feature, importance in zip(feature_list, importances)]
# Sort the feature importances by most important first
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)
# Print out the feature and importances 
[print('Variable: {:20} Importance: {}'.format(*pair)) for pair in feature_importances];

Variable: chl_a                Importance: 0.45
Variable: year                 Importance: 0.13
Variable: BurnBndLat           Importance: 0.07
Variable: elevation            Importance: 0.06
Variable: qa                   Importance: 0.03
Variable: BurnBndAc            Importance: 0.03
Variable: wholearea            Importance: 0.03
Variable: days_since           Importance: 0.03
Variable: BurnBndLon           Importance: 0.02
Variable: dNBR_offst           Importance: 0.02
Variable: burnedarea           Importance: 0.02
Variable: percentburned        Importance: 0.02
Variable: img_month            Importance: 0.02
Variable: pwater               Importance: 0.01
Variable: chili                Importance: 0.01
Variable: landform             Importance: 0.01
Variable: physd                Importance: 0.01
Variable: slope                Importance: 0.01
Variable: tdiv                 Importance: 0.01
Variable: years_since          Importance: 0.01
Variable: Grassland/Herbaceous Importanc