<a href="https://colab.research.google.com/github/cm-int/machine-learning-fundamentals/blob/main/module_2/Labs/Lab_2_1_NearEarthObjects_solution.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Lab 2.1: Creating a Classification Machine Learning Model

In this lab, you’ll build and test several binary classification models to classify Near Earth Objects (NEOs) as Hazardous or Not Hazardous, depending on whether they are likely to collide with the Earth.

You’ll build a Decision Tree model, a Random Forest model, and a Gradient Boosted Tree model. You’ll compare the performance of all three models, and examine how adjusting the probability threshold used for predictions affects the recall of a model.




**About the data:**

(From https://www.kaggle.com/code/elnahas/nasa-nearest-earth-objects/data)

There is an infinite number of objects in the outer space. Some of them are closer than we think. Even though we might think that a distance of 70,000 Km can not potentially harm us, but at an astronomical scale, this is a very small distance and can disrupt many natural phenomena. These objects/asteroids can thus prove to be harmful. Hence, it is wise to know what is surrounding us and what can harm us amongst those. This dataset compiles the list of NASA certified asteroids that are classified as the nearest earth object.

# Read the Data

In [None]:
# Upload the neo_v2.csv file from Github

!wget 'https://raw.githubusercontent.com/cm-int/machine-learning-fundamentals/main/module_2/Labs/neo_v2.csv'

In [None]:
# Load neo_v2.csv into a Pandas DataFrame named neos, then print the DataFrame

import numpy as np
import pandas as pd

csv_file = "neo_v2.csv"
neos = pd.read_csv(csv_file)
print(neos)

In [None]:
# The value to predict is whether an object is hazardous.
# Build a DataFrame named hazardous holding the dummy (indicator) variables for the 'hazardous' column, then print it

hazardous = pd.get_dummies(neos['hazardous'])
print(hazardous)

In [None]:
# Give the two dummy columns the names 'No' (first column) and 'Yes' (second column).
# The default names are 'True' and 'False', which are also the names of Python constants and can cause problems later.

new_names = {hazardous.columns[0]: "No", hazardous.columns[1]: "Yes"}
hazardous = hazardous.rename(columns=new_names)
print(hazardous)

In [None]:
# Drop the 'No' column from the DataFrame, then flatten what remains into a one-dimensional NumPy array

without_no = hazardous.drop(columns='No')
hazardous = without_no.to_numpy().flatten()

In [None]:
# Display the first few values from the hazardous array
# (slice explicitly so that only the leading values are printed, as the description says)

print(hazardous[:10])

In [None]:
# Is 'sentry_object' useful in the neos DataFrame? Count how often each of its values occurs. (Answer: Just one - False repeated 90836 times)

neos.value_counts('sentry_object')

In [None]:
# Similarly, is 'orbiting_body' useful? Count how often each of its values occurs.

neos.value_counts('orbiting_body') 

In [None]:
# Strip the 'hazardous' column out of the neos DataFrame (it is the value being predicted),
# together with the columns that probably won't help with classification (orbiting_body, sentry_object)

unused_columns = ['hazardous', 'orbiting_body', 'sentry_object']
neos = neos.drop(columns=unused_columns)
neos

# Visualize the Data

In [None]:
# Create a TSNE model and transform a sample of the neos DataFrame into a 2D representation of the data
# NOTE: This step uses a random sample of 2000 rows from neos, to save time.
# random_state is fixed for both the sample and the TSNE model so that the plot drawn from
# this data is the same on every run (13 matches the seed used for train_test_split later on).

from sklearn.manifold import TSNE

sample_data = neos.sample(2000, random_state=13)
# Pick out the label of each sampled row by using the row's index label as a position in the hazardous array
sample_hazardous = hazardous[sample_data.index]

visual_model = TSNE(learning_rate = 100, init='pca', random_state=13)
visual_transformation = visual_model.fit_transform(sample_data)


In [None]:
# Take the first and second columns of the transformed array and assemble them into a Pandas DataFrame with columns 'x' and 'y'

x_data, y_data = visual_transformation[:, 0], visual_transformation[:, 1]
transformed_data = pd.DataFrame(dict(x=x_data, y=y_data))

In [None]:
# Display the results as a Matplotlib graph
# The clustering doesn't look promising!

import matplotlib.pyplot as plt

# Separate the transformed points by label: 0 is Not Hazardous, 1 is Hazardous
not_hazardous_points = transformed_data.loc[sample_hazardous == 0]
hazardous_points = transformed_data.loc[sample_hazardous == 1]

plt.figure(figsize=(10, 10))
plt.scatter(not_hazardous_points['x'], not_hazardous_points['y'], c='red')
plt.scatter(hazardous_points['x'], hazardous_points['y'], c='blue')
plt.legend(loc='lower left', labels=['Not Hazardous', 'Hazardous'])
plt.show()

In [None]:
# Try creating a 3D TSNE model
# NOTE: This step takes between 4 and 5 minutes to run
# random_state fixes the TSNE model's own randomness, so the same sample_data gives the same embedding on every run

visual_model = TSNE(learning_rate = 100, init='pca', n_components=3, random_state=13)
visual_transformation = visual_model.fit_transform(sample_data)

In [None]:
# Take the three columns of the transformed array and assemble them into a Pandas DataFrame with columns 'x', 'y' and 'z'
x_data, y_data, z_data = (visual_transformation[:, column] for column in range(3))
transformed_data = pd.DataFrame(dict(x=x_data, y=y_data, z=z_data))

In [None]:
# This time the clustering looks better, so there's a chance we can come up with a decent model

fig = plt.figure(figsize=(10, 10))
ax = fig.add_subplot(projection='3d')

# Draw each class as its own series: label 0 (Not Hazardous) in red first, then label 1 (Hazardous) in blue
for label, colour in ((0, 'red'), (1, 'blue')):
    points = transformed_data.loc[sample_hazardous == label]
    ax.scatter(points['x'], points['y'], points['z'], c=colour)
ax.view_init(5, 5)

plt.legend(loc='lower left', labels=['Not Hazardous', 'Hazardous'])
plt.show()

# Classification using a Decision Tree Model

In [None]:
# Divide the features (neos) and the labels (hazardous) into training and test datasets.
# 33% of the rows are held back for testing; random_state makes the split repeatable.

from sklearn.model_selection import train_test_split

features_train, features_test, predictions_train, predictions_test = train_test_split(
    neos,
    hazardous,
    test_size=0.33,
    random_state=13,
)
print(f'features_train: {features_train.shape}\npredictions_train: {predictions_train.shape}')

In [None]:
# Build a Decision Tree model with the default hyperparameter values and fit it to the training data.

from sklearn.tree import DecisionTreeClassifier

classifier_model = DecisionTreeClassifier()
classifier_model = classifier_model.fit(features_train, predictions_train)

In [None]:
# Review the Decision Tree
# NOTE: This step takes a minute or two to run. Scroll the results pane to see the tree when it is complete.

import graphviz
from sklearn.tree import export_graphviz

# Pass the feature names as a plain list rather than a pandas Index: a list of strings is the
# form accepted by every scikit-learn release, whereas other sequence types are not accepted by all of them.
view_data = export_graphviz(classifier_model, out_file=None, feature_names=list(neos.columns), class_names=['Not Hazardous', 'Hazardous'])
graph = graphviz.Source(view_data)
graph

In [None]:
# Use the fitted Decision Tree model to make predictions for the test features

test_results = classifier_model.predict(features_test)

In [None]:
# Show the first 30 test predictions, then the first 30 actual test results,
# then whether each of those 30 pairs agrees

for values in (test_results, predictions_test):
    print(values[0:30])
print(f'\n\n({(test_results == predictions_test)[0:30]})')

In [None]:
# Measure the accuracy, precision and recall of the model with K-Fold cross-validation, with K set to 5

from sklearn.model_selection import cross_validate

# Each key is a display name and each value the scikit-learn scorer to use;
# the results are read back below under 'test_<display name>'
scoring_metrics = dict(Accuracy='accuracy', Precision='precision', Recall='recall')

scores = cross_validate(
    classifier_model,
    features_train,
    predictions_train,
    cv=5,
    scoring=scoring_metrics,
)

print(f'Accuracy: {scores["test_Accuracy"]}\n')
print(f'Precision: {scores["test_Precision"]}\n')
print(f'Recall: {scores["test_Recall"]}\n')

In [None]:
# How many potentially Hazardous NEOs have been misclassified as Not Hazardous (false negatives)?
# Draw the confusion matrix for the model from the actual test labels and the model's predictions.

from sklearn.metrics import ConfusionMatrixDisplay

class_labels = ['Not Hazardous', 'Hazardous']
_ = ConfusionMatrixDisplay.from_predictions(y_true=predictions_test, y_pred=test_results, display_labels=class_labels)

In [None]:
# Generate the ROC curve and calculate the AUC. Is this model better than random guesswork?
# from_predictions takes the true labels first and the model's predictions second.

from sklearn import metrics

_ = metrics.RocCurveDisplay.from_predictions(predictions_test, test_results)

In [None]:
# Probabilities generated by the Decision Tree Model are either 1 or 0
# Compute the class probabilities for the test features and show the first 30 rows

test_probs = classifier_model.predict_proba(features_test)
test_probs[0:30]

# Classification using a Random Forest

In [None]:
# Build a Random Forest of 100 trees and fit it to the same training dataset as before

from sklearn.ensemble import RandomForestClassifier

forest_model = RandomForestClassifier(n_estimators=100).fit(features_train, predictions_train)

In [None]:
# Predict the test data with the Random Forest, then show the first 30 predictions,
# the first 30 actual results, and whether each of those pairs agrees

test_results = forest_model.predict(features_test)

first_30 = slice(0, 30)
print(test_results[first_30])
print(predictions_test[first_30])
print(f'\n\n({(test_results == predictions_test)[0:30]})')

In [None]:
# Run 5-fold cross-validation on the Random Forest and compare its accuracy, precision, and recall against the Decision Tree model

scoring_metrics = dict(Accuracy='accuracy', Precision='precision', Recall='recall')
scores = cross_validate(
    estimator=forest_model,
    X=features_train,
    y=predictions_train,
    cv=5,
    scoring=scoring_metrics,
)

print(f'Accuracy: {scores["test_Accuracy"]}\n')
print(f'Precision: {scores["test_Precision"]}\n')
print(f'Recall: {scores["test_Recall"]}\n')

In [None]:
# Compare Confusion Matrix and ROC curves to the Decision Tree Model

_ = ConfusionMatrixDisplay.from_predictions(predictions_test, test_results, display_labels=['Not Hazardous', 'Hazardous'])
# ROC curve drawn from the fitted model and the test features
_ = metrics.RocCurveDisplay.from_estimator(forest_model, features_test, predictions_test)
# ROC curve drawn from the predicted class labels; the true labels go first and the predictions second
_ = metrics.RocCurveDisplay.from_predictions(predictions_test, test_results)

# Note: The number of FP and FNs have both dropped slightly. The model is still more likely to label a Hazardous NEO as Not Hazardous.
# Is this a good thing? The default threshold of 0.5 is probably too high for this dataset, so investigate further ...

In [None]:
# Find the probabilities for each prediction in the test sample
# Show the first 30 rows, starting at row 0 so they line up with the first 30 predictions printed earlier

test_probs = forest_model.predict_proba(features_test)
test_probs[0:30]

In [None]:
# Keep the probabilities for the positive outcome (Hazardous) and discard those for the negative outcome (Not Hazardous)
# Print the first 30 values, starting at index 0 so they line up with the first 30 predictions printed earlier

test_probs_hazardous = test_probs[:, 1]
print(f'{test_probs_hazardous[0:30]}')

In [None]:
from sklearn.metrics import roc_curve

# ROC points (false/true positive rates) and the probability threshold that produces each point
fpr, tpr, thresholds = roc_curve(predictions_test, test_probs_hazardous)

# The J statistic is the gap between the true and false positive rates;
# the threshold at which the gap is widest is taken as the optimal threshold for this model
J = tpr - fpr
idx = J.argmax()
optimal_threshold = thresholds[idx]

fig, ax = plt.subplots(figsize=(10, 10))
ax.plot([0,1], [0,1], linestyle='--', label='Random')
ax.plot(fpr, tpr, marker='.', markersize=5, label='Model')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
# Mark the ROC point that corresponds to the optimal threshold
ax.scatter(fpr[idx], tpr[idx], marker='X', s=200, color='Blue', label=f'Optimal threshold at P={optimal_threshold}')
ax.legend()
plt.show()

In [None]:
# Report the optimal threshold and what it means for classifying a prediction
print(f'Predictions with a probability >= {optimal_threshold} should be classed as Hazardous')

In [None]:
# Examine how using the optimal threshold impacts the quality of the predictions made using the model

# Flag every prediction whose Hazardous probability is >= optimal_threshold, and record where those predictions are
meets_threshold = test_probs_hazardous >= optimal_threshold
indexes = np.where(meets_threshold)

# Start the adjusted predictions as all zeros (Not Hazardous), one entry per original test result,
# then set the entries at the recorded positions to 1 (Hazardous)
adjusted_test_results = np.zeros(test_results.size)
adjusted_test_results[indexes] = 1

# Display the results
print(adjusted_test_results)

In [None]:
# Draw a confusion matrix comparing the actual test labels with the adjusted test results

# The number of false negatives should have dropped considerably, although there is now an increased level of false positives. Are false positives better than false negatives?

_ = ConfusionMatrixDisplay.from_predictions(
    y_true=predictions_test,
    y_pred=adjusted_test_results,
    display_labels=['Not Hazardous', 'Hazardous'],
)

In [None]:
# Work out the accuracy, precision, and recall of the model when the new threshold is applied
# Recall should be much improved from before. Why is this more important than precision for this model?

from sklearn.metrics import accuracy_score, precision_score, recall_score

# Apply each scoring function in turn to the actual test labels and the adjusted predictions
accuracy, precision, recall = (
    score(predictions_test, adjusted_test_results)
    for score in (accuracy_score, precision_score, recall_score)
)

print(f'Accuracy: {accuracy}\n')
print(f'Precision: {precision}\n')
print(f'Recall: {recall}\n')

# Classification using Gradient Boosted Decision Tree

In [None]:
# Build a Gradient Boosted Tree model (100 estimators, learning rate 0.1, maximum depth 3) and fit it to the training data

from sklearn.ensemble import GradientBoostingClassifier

gbt_model = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
)
gbt_model = gbt_model.fit(features_train, predictions_train)

In [None]:
# Test predictions
# Predict the test data with the Gradient Boosted Tree model, then show whether each of the first 30 predictions matches the actual result

test_results = gbt_model.predict(features_test)
print(f'{(test_results == predictions_test)[0:30]}')

In [None]:
# Run 5-fold cross-validation on the Gradient Boosted Tree model and report its accuracy, precision and recall

scoring_metrics = {
    'Accuracy': 'accuracy',
    'Precision': 'precision',
    'Recall': 'recall',
}
cross_validation_options = dict(cv=5, scoring=scoring_metrics)
scores = cross_validate(gbt_model, features_train, predictions_train, **cross_validation_options)

print(f'Accuracy: {scores["test_Accuracy"]}\n')
print(f'Precision: {scores["test_Precision"]}\n')
print(f'Recall: {scores["test_Recall"]}\n')

In [None]:
# Draw the Confusion Matrix (actual test labels against predictions) and the ROC curve for the Gradient Boosted Tree model

_ = ConfusionMatrixDisplay.from_predictions(
    y_true=predictions_test,
    y_pred=test_results,
    display_labels=['Not Hazardous', 'Hazardous'],
)
_ = metrics.RocCurveDisplay.from_estimator(estimator=gbt_model, X=features_test, y=predictions_test)

# This model has even more FNs than the original Random Forest, but far fewer FPs

In [None]:
# Find the optimal threshold for this model, following the same steps as for the Random Forest model

# Keep only the second column of the predicted probabilities (the Hazardous outcome)
gbt_probs = gbt_model.predict_proba(features_test)
test_probs = gbt_probs[:, 1]
fpr, tpr, thresholds = roc_curve(predictions_test, test_probs)

# The J statistic is the gap between the true and false positive rates;
# the threshold at which the gap is widest is taken as the optimal threshold for this model
J = tpr - fpr
idx = J.argmax()
optimal_threshold = thresholds[idx]

plt.figure(figsize=(10, 10))
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.plot([0,1], [0,1], linestyle='--', label='Random')
plt.plot(fpr, tpr, marker='.', markersize=5, label='Model')
# Mark the ROC point that corresponds to the optimal threshold
plt.scatter(fpr[idx], tpr[idx], marker='X', s=200, color='Blue', label=f'Optimal threshold at P={optimal_threshold}')
plt.legend()
plt.show()

In [None]:
# Report the optimal threshold and what it means for classifying a prediction
print(f'Predictions with a probability >= {optimal_threshold} should be classed as Hazardous')

In [None]:
# Follow the same process as before to see how amending the threshold affects the quality of the model

# Find the index for every prediction with a probability >= optimal_threshold.
# Use test_probs here: it holds the Hazardous probabilities produced by the Gradient Boosted Tree model,
# whereas test_probs_hazardous still holds the probabilities from the Random Forest model.
indexes = np.where(test_probs >= optimal_threshold)

# Create a new array for holding the adjusted predictions after applying the new threshold. Initialize it with zeros, and make it the same length as the original set of test results
adjusted_test_results = np.zeros(test_results.size)

# Set the value in the adjusted predictions array to 1 for each item indicated by the indexes array
adjusted_test_results[indexes] = 1

# Display the results
print(adjusted_test_results)

In [None]:
# Work out the accuracy, precision, and recall of the model when the new threshold is applied
# Again, recall should be much improved from before.

from sklearn.metrics import accuracy_score, precision_score, recall_score

# Every score compares the actual test labels with the adjusted predictions
accuracy = accuracy_score(y_true=predictions_test, y_pred=adjusted_test_results)
precision = precision_score(y_true=predictions_test, y_pred=adjusted_test_results)
recall = recall_score(y_true=predictions_test, y_pred=adjusted_test_results)

print(f'Accuracy: {accuracy}\n')
print(f'Precision: {precision}\n')
print(f'Recall: {recall}\n')