# Classification Using Random Forests (CR)
                         

In [None]:
import os
from matplotlib import pyplot as plt
from sklearn.feature_extraction import DictVectorizer
import pandas as pd
import numpy as np
from sklearn import datasets
from sklearn.tree import DecisionTreeRegressor
from sklearn import tree

# Acquire data in an accessible format


In [None]:
# Load the cleaned data set from ./Cleaned_Data/chromatic.csv and preview
# its first rows.
csv_path = os.path.join(".", "Cleaned_Data", "chromatic.csv")
df = pd.read_csv(csv_path)
df.head()

# Prepare data for the machine learning model Using the 'Sub_Region' Column as Input

In [None]:
# Remove the Latitude, Longitude, Country and Region columns.
# DataFrame.drop returns a new frame rather than modifying df in place, so
# the result must be assigned back; a bare call whose result is discarded
# does nothing useful and has been removed.
df = df.drop(columns=['Latitude', 'Longitude', 'Country', 'Region'])

In [None]:
# Collect each distinct 'Sub_Region' value (the first occurrence of every
# value is kept, with its original row index) and print the resulting Series.
Sub_Region_List = df['Sub_Region'].drop_duplicates(keep='first')

print(Sub_Region_List)

# One-Hot Encoding

In [None]:
# Step 0: Reformat data
# Select the target and the features by column name instead of by hard-coded
# position. The positional slices (0:115 for X, 116 for y) skipped column 115
# and break silently whenever the column layout changes.
# NOTE(review): assumes the label column previously read from position 116 is
# 'Sub_Region', as the rest of this notebook implies — confirm against the CSV.
data = df.values
X = df.drop(columns='Sub_Region').values
y = df['Sub_Region'].values


In [None]:
from sklearn.preprocessing import LabelEncoder

# Step 1: Label-encode data set
# fit_transform learns the set of distinct classes in y and maps every entry
# of y to its integer code in a single call.
label_encoder = LabelEncoder()
encoded_y = label_encoder.fit_transform(y)

In [None]:
# Step 2: One-hot encoding
# Build the indicator matrix with NumPy (already imported by this notebook)
# instead of importing TensorFlow only for to_categorical.
# Row i of an identity matrix is the one-hot vector for class i, so indexing
# the identity matrix with the integer labels yields one row per sample.
num_classes = int(encoded_y.max()) + 1  # labels are 0..max, hence max + 1 columns
one_hot_y = np.eye(num_classes, dtype="float32")[encoded_y]
one_hot_y

In [None]:
# Print every sample's original class next to the integer label assigned to
# it, with a dashed rule after each pair.
separator = '-' * 15
for original_class, label in zip(y, encoded_y):
    print(f'Original Class: {original_class}')
    print(f'Encoded Label: {label}')
    print(separator)

# Separate The Data into Features & Targets

In [None]:
# The model target is the one-hot matrix built from the encoded labels.
target = one_hot_y
# Take the class names from the fitted encoder so they always match the
# columns of the one-hot target: label_encoder.classes_[i] is the original
# label that was mapped to integer i. This replaces a hard-coded
# ["negative", "positive"] placeholder that was unrelated to the data.
target_names = [str(name) for name in label_encoder.classes_]

In [None]:
# Feature table: every column of df except the 'Sub_Region' label.
# The column index is kept separately so features can be referred to by name.
data = df.drop(columns="Sub_Region")
feature_names = data.columns
data.head()

# Convert DataTypes For The Training & Testing Data Sets

In [None]:
# Display the dtype pandas assigned to each column of df.
df.dtypes

In [None]:
# Convert 'Sub_Region' Column to Float
# NOTE(review): errors='coerce' replaces every value that cannot be parsed as
# a number with NaN. If 'Sub_Region' holds text labels, this overwrites them
# in df — confirm that is intended.
sub_region_numeric = pd.to_numeric(df["Sub_Region"], errors='coerce')
df["Sub_Region"] = sub_region_numeric
df.dtypes

# Train and Split Model with Random Forest Regression Model from scikit-learn

In [None]:
# Imports for the model and for the train/test split helper
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

# Random forest regressor with 1000 decision trees and a fixed seed.
# It is only instantiated here, not fitted.
rf = RandomForestRegressor(random_state=1234, n_estimators=1000)

# Split the feature table and the one-hot target into training and testing
# subsets; the fixed seed keeps the split the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    data, target, random_state=1234
)

# Train & Split Model with Random Forest Classification Model 

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest classifier with 200 trees.
# random_state is pinned to the same seed used elsewhere in this notebook so
# the fitted forest, and therefore the score below, is identical on every run.
rf = RandomForestClassifier(n_estimators=200, random_state=1234)
rf = rf.fit(X_train, y_train)
# Score on the held-out split.
rf.score(X_test, y_test)

# Shape of all the Data


In [None]:
# Report the dimensions of each of the four arrays produced by the split.
for split_name, split in (
    ('X_train', X_train),
    ('y_train', y_train),
    ('X_test', X_test),
    ('y_test', y_test),
):
    print(f'{split_name} Shape:', split.shape)

# Metrics and Scoring For Classification Model 

In [None]:
# Single decision tree classifier trained on the training split; fit()
# returns the estimator itself, so construction and fitting can be chained.
clf = tree.DecisionTreeClassifier().fit(X_train, y_train)
# Score on the held-out split.
clf.score(X_test, y_test)

In [None]:
# Train the model on training data
# fit() updates the estimator in place and returns it, so rebinding rf is
# not required.
rf.fit(X_train, y_train)
rf.score(X_test, y_test)

# Make Predictions & Calculate Errors

In [None]:
# NumPy import (it is also imported at the top of the notebook)
import numpy as np

# Use the forest's predict method on the test data and display the result
predictions = rf.predict(X=X_test)
predictions

In [None]:
# Use the forest's predict method on the test data
predictions = rf.predict(X_test)
# Calculate the absolute errors: element-wise |prediction - truth|
errors = np.abs(predictions - y_test)
errors

In [None]:
# Print out the mean absolute error (mae)
# errors holds absolute differences between predicted and true one-hot class
# indicators, so the mean is a unitless value; the former 'degrees.' suffix
# did not apply to this quantity and has been dropped.
print('Mean Absolute Error:', round(np.mean(errors), 2))

In [None]:
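# Calculate mean absolute percentage error (MAPE)
# NOTE(review): keep this disabled — MAPE divides by y_test, which is one-hot
# encoded in this notebook and therefore contains zeros.
# mape = 100 * (errors / y_test)
# Calculate and display accuracy
# accuracy = 100 - np.mean(mape)
# print('Accuracy:', round(accuracy, 2), '%.')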

In [None]:
# Rank the features from most to least important.
# feature_importances_ has one entry per input feature, so it must be paired
# with feature_names (the columns of the feature table), not with the target
# class names; zip() stops at the shorter input, so pairing with the class
# names truncated the list and mislabeled the entries.
sorted(zip(rf.feature_importances_, feature_names), reverse=True)

# Visualizing The Decision Tree in Regression Task

In [None]:
# Fit the regressor, set max_depth = 3
regr = DecisionTreeRegressor(random_state=1234, max_depth=3)
regr.fit(X, one_hot_y)
# fit() returns the estimator itself, so model is the same fitted object.
model = regr


In [None]:
# Render the fitted regression tree's decision rules as plain text and
# print them.
text_representation = tree.export_text(decision_tree=regr)
print(text_representation)

In [None]:
# Note that color of the leaf corresponds to the predicted value.
# Create the figure together with its single axes and draw the tree on it.
fig, ax = plt.subplots(figsize=(25, 20))
_ = tree.plot_tree(regr, feature_names=data.columns, filled=True, ax=ax)

fig.savefig("static/images/random_trees_DF")

# Random Trees Data Structure and Visualization

In [None]:
# Total Count of Nodes

from sklearn.ensemble import RandomForestClassifier

# random_state is pinned to the seed used elsewhere in this notebook so the
# forest, and with it the node count reported below, is the same on every run.
rf = RandomForestClassifier(random_state=1234)
rf.fit(X_train, y_train)
# Number of nodes in the first tree of the fitted forest.
n_nodes = rf.estimators_[0].tree_.node_count
n_nodes

In [None]:
# Visualize (5) Random Trees
# Note ALSO that color of the leaf corresponds to the predicted value.
fn = list(data.columns)
# class_names expects one label per class, not the one-hot target matrix;
# label_encoder.classes_[i] is the original label for encoded class i.
cn = [str(name) for name in label_encoder.classes_]

fig, axes = plt.subplots(nrows=1, ncols=5, figsize=(12, 2), dpi=720)
for index, ax in enumerate(axes):
    tree.plot_tree(
        rf.estimators_[index],
        feature_names=fn,
        class_names=cn,
        filled=True,
        ax=ax,
    )
    # The title is set inside the loop so that every subplot is labeled,
    # not only the last one.
    ax.set_title('Estimator: ' + str(index), fontsize=11)

fig.savefig("static/images/rt_5trees-CR.png")