<a href="https://colab.research.google.com/github/emilybguo/CS229/blob/main/softmax_adaboost_randomforest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This notebook holds code for data preprocessing and 3 out of our 4 overall ML algorithms: softmax regression, AdaBoost, and Random Forest.

Import relevant libraries

In [None]:
from google.colab import files
import numpy as np
import pandas as pd 
import sklearn 

# For standardizing dataset
from sklearn import preprocessing

# library for multiclass model metrics
!pip install disarray
import disarray

import matplotlib.pyplot as plt

 # For splitting of data into train and test set
from sklearn.model_selection import train_test_split
 
# For metrics and confusion matrix
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn import metrics

# Scikit-learn ML models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier


# Data Preprocessing

In this file, we preprocess the data to prepare it for use with machine learning algorithms. This includes controlling for variables, removing those we do not want to consider, and splitting categorical data into one-hot vectors so that they can be included in the algorithms.

In [None]:
# Attach Google Drive to the Colab filesystem; the project CSVs are read from
# and written to paths under this mount point.
from google.colab import drive

GDRIVE_MOUNT_POINT = '/content/gdrive'
drive.mount(GDRIVE_MOUNT_POINT)

In [None]:
# Load the cleaned dataset from the mounted Drive folder and preview the first rows.
cleaned_data_path = '/content/gdrive/MyDrive/CS229 project/cleaned_data_12_09.csv'
df = pd.read_csv(cleaned_data_path)
df.head()

Create train, validation, and test sets.

In [None]:
# Separate the target column (`action_taken`) from the feature columns.
y = df['action_taken']
x = df.drop(columns=['action_taken']).copy()

# Stage 1: keep 80% of the rows for training and hold out the remaining 20%.
x_train, x_rem, y_train, y_rem = train_test_split(
    x, y,
    train_size=0.8,
    shuffle=True,
    random_state=0,
)

# Stage 2: cut the held-out 20% in half, giving validation and test sets that
# are each 10% of the original dataset.
x_valid, x_test, y_valid, y_test = train_test_split(
    x_rem, y_rem,
    test_size=0.5,
    shuffle=True,
    random_state=0,
)

Print frequency of each class in each dataset.

In [None]:
# Print how many examples of each `action_taken` class landed in each split.
#
# Fixes relative to the earlier version of this cell:
#   * the third line is now labelled y_test (it used to be reported as y_valid);
#   * the loop no longer uses `x` as its loop variable, which overwrote the
#     feature DataFrame `x` created in the train/validation/test split cell.
label_splits = (
    ("y_train", y_train),
    ("y_valid", y_valid),
    ("y_test", y_test),
)

for split_name, split_labels in label_splits:
    # value_counts() tallies each distinct label; to_dict() gives {label: count}.
    class_counts = split_labels.value_counts().to_dict()
    print(f"The list frequency of elements in {split_name} is : {class_counts}")

The list frequency of elements in y_train is : {3: 16869, 6: 3866, 2: 3776}
The list frequency of elements in y_valid is : {3: 2118, 2: 487, 6: 459}
The list frequency of elements in y_valid is : {2: 459, 3: 2069, 6: 536}


Run to save all datasets to Drive.

In [None]:
# save x_train
with open('/content/gdrive/MyDrive/CS229 project/x_train.csv', 'w', encoding = 'utf-8-sig') as f:
  x_train.to_csv(f)

# save x_valid
with open('/content/gdrive/MyDrive/CS229 project/x_valid.csv', 'w', encoding = 'utf-8-sig') as f:
  x_valid.to_csv(f)

# save x_test
with open('/content/gdrive/MyDrive/CS229 project/x_test.csv', 'w', encoding = 'utf-8-sig') as f:
  x_test.to_csv(f)

# save y_train
with open('/content/gdrive/MyDrive/CS229 project/y_train.csv', 'w', encoding = 'utf-8-sig') as f:
  y_train.to_csv(f)

# save y_valid
with open('/content/gdrive/MyDrive/CS229 project/y_valid.csv', 'w', encoding = 'utf-8-sig') as f:
  y_valid.to_csv(f)

# save y_test
with open('/content/gdrive/MyDrive/CS229 project/y_test.csv', 'w', encoding = 'utf-8-sig') as f:
  y_test.to_csv(f)

Standardize datasets based on x_train.

In [None]:
# Fit the scaler on the training split only, then apply that same fitted scaler
# to all three splits so validation/test data never influence the scaling.
scaler = preprocessing.StandardScaler()
scaler.fit(x_train)

# The split names are rebound to the scaler's transformed output.
x_train, x_valid, x_test = [
    scaler.transform(split) for split in (x_train, x_valid, x_test)
]

# Machine learning algorithms

## Model 1: Softmax

In [None]:
# Softmax regression baseline with no regularisation penalty.
# `penalty=None` replaces the string 'none', which scikit-learn deprecated in
# 1.2 and removed in 1.4 (requires scikit-learn >= 1.2).
softmax_baseline = LogisticRegression(max_iter=1000, penalty=None).fit(x_train, y_train)

# Accuracy on the validation split.
softmax_baseline.score(x_valid, y_valid)


In [None]:
# Rank features by exp(coefficient) for the first class in
# `softmax_baseline.classes_` (row 0 of `coef_`).
#
# Feature names are obtained by dropping the target column by name, mirroring
# how `x` was built from `df`. The previous positional slice
# (`df.columns.values[:-1]`) only lined up with the coefficients when
# `action_taken` happened to be the last column of the CSV.
feature_names = df.columns.drop('action_taken')

feature_importance = pd.DataFrame({
    "feature": feature_names,
    "importance": np.exp(softmax_baseline.coef_[0]),
}).sort_values(by="importance", ascending=False)

feature_importance.head(10)

In [None]:
# Confusion matrix of the softmax baseline on the validation split, with rows
# and columns ordered by the model's own class labels.
class_labels = softmax_baseline.classes_
cm = confusion_matrix(y_valid, softmax_baseline.predict(x_valid), labels=class_labels)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_labels)
disp.plot()

plt.show()

# Per-class metrics through the disarray `.da` DataFrame accessor.
# The frame is called `cm_df` rather than `metrics` so it does not shadow the
# `sklearn.metrics` module imported at the top of the notebook, and it is
# labelled with the class values so each metric row maps to a class.
cm_df = pd.DataFrame(cm, index=class_labels, columns=class_labels, dtype=int)
cm_df.da.accuracy, cm_df.da.precision, cm_df.da.recall

## Model 2: AdaBoost

In [None]:
# AdaBoost over shallow decision trees.
# * `estimator=` replaces `base_estimator=`, which scikit-learn renamed in 1.2
#   and removed in 1.4 (requires scikit-learn >= 1.2).
# * `random_state=0` (the same seed used for the data split) makes the fitted
#   ensemble reproducible across kernel restarts.
ada = AdaBoostClassifier(
    estimator=DecisionTreeClassifier(max_depth=6, min_samples_leaf=5),
    n_estimators=10,
    learning_rate=0.1,
    random_state=0,
).fit(x_train, y_train)

In [None]:
# Accuracy of the AdaBoost model on the validation split.
ada_valid_accuracy = ada.score(x_valid, y_valid)
ada_valid_accuracy

In [None]:
# Confusion matrix of the AdaBoost model on the validation split, with rows
# and columns ordered by the model's own class labels.
class_labels = ada.classes_
cm = confusion_matrix(y_valid, ada.predict(x_valid), labels=class_labels)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_labels)
disp.plot()

plt.show()

# Per-class metrics through the disarray `.da` DataFrame accessor.
# The frame is called `cm_df` rather than `metrics` so it does not shadow the
# `sklearn.metrics` module imported at the top of the notebook, and it is
# labelled with the class values so each metric row maps to a class.
cm_df = pd.DataFrame(cm, index=class_labels, columns=class_labels, dtype=int)
cm_df.da.accuracy, cm_df.da.precision, cm_df.da.recall

In [None]:
# Ten features with the highest AdaBoost importance scores.
# Feature names come from dropping the target column by name (the same way `x`
# was built), instead of assuming `action_taken` is the last column of `df`.
feature_names = df.columns.drop('action_taken')
feature_imp = pd.Series(ada.feature_importances_, index=feature_names).sort_values(ascending=False)
feature_imp.head(10)

## Model 3: Random Forest

In [None]:
# Random Forest without class re-weighting ("unbalanced").
# * `random_state=0` (the same seed used for the data split) makes the forest
#   reproducible; without it each run draws different random feature subsets.
# * `n_jobs=-1` fits the 1800 trees on all available cores; it affects only
#   run time, not the fitted model.
rf = RandomForestClassifier(
    n_estimators=1800,
    max_depth=30,
    min_samples_split=10,
    min_samples_leaf=1,
    max_features='sqrt',
    bootstrap=False,
    random_state=0,
    n_jobs=-1,
).fit(x_train, y_train)

# Accuracy on the validation split.
rf.score(x_valid, y_valid)



In [None]:
# Random Forest with class re-weighting ("balanced"): bootstrap sampling is on
# and `class_weight='balanced_subsample'` is passed to the classifier.
# * `random_state=0` (the same seed used for the data split) makes the forest
#   reproducible; without it each run draws different bootstrap samples and
#   feature subsets.
# * `n_jobs=-1` fits the 1800 trees on all available cores; it affects only
#   run time, not the fitted model.
rf_b = RandomForestClassifier(
    n_estimators=1800,
    max_depth=30,
    min_samples_split=10,
    min_samples_leaf=1,
    max_features='sqrt',
    bootstrap=True,
    class_weight='balanced_subsample',
    random_state=0,
    n_jobs=-1,
).fit(x_train, y_train)

# Accuracy on the validation split.
rf_b.score(x_valid, y_valid)



In [None]:
# Confusion matrix of the unbalanced Random Forest on the validation split,
# with rows and columns ordered by the model's own class labels.
class_labels = rf.classes_
cm = confusion_matrix(y_valid, rf.predict(x_valid), labels=class_labels)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_labels)
disp.plot()

plt.show()

# Per-class metrics through the disarray `.da` DataFrame accessor.
# The frame is called `cm_df` rather than `metrics` so it does not shadow the
# `sklearn.metrics` module imported at the top of the notebook, and it is
# labelled with the class values so each metric row maps to a class.
cm_df = pd.DataFrame(cm, index=class_labels, columns=class_labels, dtype=int)
cm_df.da.accuracy, cm_df.da.precision, cm_df.da.recall

In [None]:
# Ten features with the highest importance scores in the unbalanced Random Forest.
# Feature names come from dropping the target column by name (the same way `x`
# was built), instead of assuming `action_taken` is the last column of `df`.
feature_names = df.columns.drop('action_taken')
feature_imp = pd.Series(rf.feature_importances_, index=feature_names).sort_values(ascending=False)
feature_imp.head(10)

## Testing the Models

In [None]:
# Report accuracy on the held-out test split for each fitted model, one line per model.
fitted_models = [
    ("Baseline: ", softmax_baseline),
    ("AdaBoost: ", ada),
    ("RF (unbalanced): ", rf),
    ("RF (balanced): ", rf_b),
]

for label, model in fitted_models:
    print(label, model.score(x_test, y_test))



Baseline

In [None]:
# Confusion matrix of the softmax baseline on the test split, with rows and
# columns ordered by the model's own class labels.
class_labels = softmax_baseline.classes_
cm = confusion_matrix(y_test, softmax_baseline.predict(x_test), labels=class_labels)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_labels)
disp.plot()
# Set the title on the display's own axes rather than on pyplot's "current" axes.
disp.ax_.set_title("Baseline Confusion Matrix")
plt.show()

# Per-class precision/recall through the disarray `.da` DataFrame accessor.
# The frame is called `cm_df` rather than `metrics` so it does not shadow the
# `sklearn.metrics` module imported at the top of the notebook, and it is
# labelled with the class values so each metric row maps to a class.
cm_df = pd.DataFrame(cm, index=class_labels, columns=class_labels, dtype=int)
cm_df.da.precision, cm_df.da.recall

AdaBoost

In [None]:
# Confusion matrix of the AdaBoost model on the test split, with rows and
# columns ordered by the model's own class labels.
class_labels = ada.classes_
cm = confusion_matrix(y_test, ada.predict(x_test), labels=class_labels)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_labels)
disp.plot()
# Set the title on the display's own axes rather than on pyplot's "current" axes.
disp.ax_.set_title("AdaBoost Confusion Matrix")
plt.show()

# Per-class precision/recall through the disarray `.da` DataFrame accessor.
# The frame is called `cm_df` rather than `metrics` so it does not shadow the
# `sklearn.metrics` module imported at the top of the notebook, and it is
# labelled with the class values so each metric row maps to a class.
cm_df = pd.DataFrame(cm, index=class_labels, columns=class_labels, dtype=int)
cm_df.da.precision, cm_df.da.recall

Random Forest (Unbalanced)

In [None]:
# Confusion matrix of the unbalanced Random Forest on the test split, with rows
# and columns ordered by the model's own class labels.
class_labels = rf.classes_
cm = confusion_matrix(y_test, rf.predict(x_test), labels=class_labels)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_labels)
disp.plot()
# Set the title on the display's own axes rather than on pyplot's "current" axes.
disp.ax_.set_title("Random Forest Confusion Matrix")
plt.show()

# Per-class precision/recall through the disarray `.da` DataFrame accessor.
# The frame is called `cm_df` rather than `metrics` so it does not shadow the
# `sklearn.metrics` module imported at the top of the notebook, and it is
# labelled with the class values so each metric row maps to a class.
cm_df = pd.DataFrame(cm, index=class_labels, columns=class_labels, dtype=int)
cm_df.da.precision, cm_df.da.recall

Random Forest (Balanced)

In [None]:
# Confusion matrix of the balanced Random Forest on the test split, with rows
# and columns ordered by the model's own class labels.
class_labels = rf_b.classes_
cm = confusion_matrix(y_test, rf_b.predict(x_test), labels=class_labels)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_labels)
disp.plot()
# Set the title on the display's own axes rather than on pyplot's "current" axes.
disp.ax_.set_title("Random Forest (Balanced) Confusion Matrix")
plt.show()

# Per-class precision/recall through the disarray `.da` DataFrame accessor.
# The frame is called `cm_df` rather than `metrics` so it does not shadow the
# `sklearn.metrics` module imported at the top of the notebook, and it is
# labelled with the class values so each metric row maps to a class.
cm_df = pd.DataFrame(cm, index=class_labels, columns=class_labels, dtype=int)
cm_df.da.precision, cm_df.da.recall