In [1]:
# Mount Google Drive into the Colab filesystem at the path given below.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import pandas as pd
import torch
from torch import nn, optim
from torch.utils.data import DataLoader, TensorDataset
import torch.nn.functional as F
from sklearn.model_selection import train_test_split
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import clear_output
import gc

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report

In [3]:
# Load the survey-response table from the mounted Drive folder and report its size.
# NOTE(review): the saved cell output echoes the read_csv line, which looks like a
# pandas warning (possibly a DtypeWarning about mixed-type columns) — confirm, and
# consider passing an explicit `dtype=` mapping if so.
SURVEY_CSV_PATH = '/content/drive/MyDrive/COG403/data/SharedResponsesSurvey.csv'
survey_df = pd.read_csv(SURVEY_CSV_PATH)
print(f"survey_df: {survey_df.shape}")

  survey_df = pd.read_csv('/content/drive/MyDrive/COG403/data/SharedResponsesSurvey.csv')


survey_df: (11286141, 27)


In [4]:
# Discard incomplete responses in one chain:
#   1. drop rows that already contain NaN,
#   2. turn empty-string cells into NaN,
#   3. drop the rows that step 2 just marked.
survey_df = (
    survey_df
    .dropna()
    .replace('', np.nan)
    .dropna()
)
print(f"survey_df (dropped nan): {survey_df.shape}")

survey_df (dropped nan): (8833729, 27)


In [5]:
SAMPLE_FRACTION = 0.1  # keep 10% of the cleaned rows
SAMPLE_SEED = 1        # fixed seed so the same subset is drawn on every run

headers = survey_df.columns
print(f"Header names: {headers}")

# Draw the random working subset from the cleaned frame.
sampled_df = survey_df.sample(frac=SAMPLE_FRACTION, random_state=SAMPLE_SEED)
print(f"sampled_df: {sampled_df.shape}")

# Drop the reference to the full frame and force a collection to free System RAM.
del survey_df
gc.collect()

Header names: Index(['ResponseID', 'ExtendedSessionID', 'UserID', 'ScenarioOrder',
       'Intervention', 'PedPed', 'Barrier', 'CrossingSignal', 'AttributeLevel',
       'ScenarioTypeStrict', 'ScenarioType', 'DefaultChoice',
       'NonDefaultChoice', 'DefaultChoiceIsOmission', 'NumberOfCharacters',
       'DiffNumberOFCharacters', 'Saved', 'Template', 'DescriptionShown',
       'LeftHand', 'UserCountry3', 'Review_age', 'Review_education',
       'Review_gender', 'Review_income', 'Review_political',
       'Review_religious'],
      dtype='object')
sampled_df: (883373, 27)


0

In [6]:
target_column = ['Saved']
# Columns removed as unneeded features (the target 'Saved' is removed from X here too).
drop_columns = ['ResponseID', 'ExtendedSessionID', 'UserID', 'Saved', 'Template']
# Categorical columns that are expanded into 1-hot indicator columns.
encode_columns = [
    'AttributeLevel', 'ScenarioTypeStrict', 'ScenarioType', 'DefaultChoice',
    'NonDefaultChoice', 'UserCountry3', 'Review_education', 'Review_gender', 'Review_income',
]

# Features = the remaining columns as-is, followed by the 1-hot encoded categoricals.
one_hot_encoded_df = pd.get_dummies(sampled_df[encode_columns])
passthrough_df = sampled_df.drop(columns=encode_columns + drop_columns)
X = pd.concat([passthrough_df, one_hot_encoded_df], axis=1)
y = sampled_df[target_column]

print(f"X: {X.shape}")
print(f"y: {y.shape}")

# X and y now hold everything needed; release the intermediates to free System RAM.
del one_hot_encoded_df, passthrough_df, sampled_df
gc.collect()

X: (883373, 285)
y: (883373, 1)


0

In [7]:
# Preview the first rows of the feature frame X (rendered as the cell output).
X.head()

Unnamed: 0,ScenarioOrder,Intervention,PedPed,Barrier,CrossingSignal,DefaultChoiceIsOmission,NumberOfCharacters,DiffNumberOFCharacters,DescriptionShown,LeftHand,...,Review_income_15000,Review_income_25000,Review_income_35000,Review_income_5000,Review_income_50000,Review_income_80000,Review_income_above100000,Review_income_default,Review_income_over10000,Review_income_under5000
9465311,6,1,0,1,0,0.0,2,0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,1
10462302,3,1,0,1,0,1.0,1,0,0.0,1.0,...,0,0,0,0,0,0,0,1,0,0
6201309,11,1,1,0,0,1.0,2,0,0.0,0.0,...,0,0,0,0,0,0,0,1,0,0
2780663,6,0,0,1,0,0.0,5,0,0.0,1.0,...,0,0,0,0,0,0,0,0,0,0
1111090,3,0,0,1,0,1.0,4,0,1.0,1.0,...,0,0,0,0,0,0,0,0,0,1


In [8]:
# Preview the first rows of the target frame y (rendered as the cell output).
y.head()

Unnamed: 0,Saved
9465311,1
10462302,0
6201309,0
2780663,1
1111090,1


In [30]:
# 60/20/20 train/validation/test split, done in two passes with the same seed:
#   pass 1 holds out 20% of all rows as the test set;
#   pass 2 takes 25% of the remaining 80% (0.25 x 0.8 = 0.2) as the validation set.
X_rest, X_test, y_rest, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1
)
X_train, X_val, y_train, y_val = train_test_split(
    X_rest, y_rest, test_size=0.25, random_state=1
)
# The 80% intermediate is only needed to derive train/val; do not keep it alive.
del X_rest, y_rest

In [31]:
def _to_float32_arrays(features, labels):
    """Convert one (features, labels) pair to float32 NumPy arrays.

    The labels are reshaped to a column of shape (n, 1) for consistency
    across the three splits.
    """
    feature_array = features.to_numpy(dtype=np.float32)
    label_array = labels.to_numpy(dtype=np.float32).reshape(-1, 1)
    return feature_array, label_array


X_train, y_train = _to_float32_arrays(X_train, y_train)
X_val, y_val = _to_float32_arrays(X_val, y_val)
X_test, y_test = _to_float32_arrays(X_test, y_test)

# One line per array, in train / val / test order.
print(
    f"X_train: {X_train.shape}",
    f"y_train: {y_train.shape}",
    f"X_val: {X_val.shape}",
    f"y_val: {y_val.shape}",
    f"X_test: {X_test.shape}",
    f"y_test: {y_test.shape}",
    sep="\n",
)

X_train: (530023, 285)
y_train: (530023, 1)
X_val: (176675, 285)
y_val: (176675, 1)
X_test: (176675, 285)
y_test: (176675, 1)


In [32]:
def remove_nan_rows(X, y):
    """Drop every sample whose feature row contains a non-finite value.

    Despite the name, a row is removed when it contains NaN, +inf, or -inf
    in any column; the matching row of ``y`` is removed with it so the two
    stay aligned.

    Args:
        X: 2-D array-like of features, one row per sample.
        y: Array of labels with the same number of rows as ``X``.

    Returns:
        A ``(X_clean, y_clean)`` tuple containing only the rows of ``X``
        (and the matching rows of ``y``) whose values are all finite.
        The inputs are not modified.
    """
    X = np.asarray(X)
    # isfinite is False for NaN and both infinities, so one pass builds the
    # keep-mask without first copying the whole array to rewrite inf as NaN.
    mask = np.isfinite(X).all(axis=1)
    return X[mask], y[mask]

In [33]:
# Filter each split independently (train, then val, then test) so that features
# and labels stay row-aligned within every split.
(X_train, y_train), (X_val, y_val), (X_test, y_test) = (
    remove_nan_rows(features, labels)
    for features, labels in (
        (X_train, y_train),
        (X_val, y_val),
        (X_test, y_test),
    )
)

In [37]:
# Report the shape of every split after filtering, one line per array.
print(
    f"X_train: {X_train.shape}",
    f"y_train: {y_train.shape}",
    f"X_val: {X_val.shape}",
    f"y_val: {y_val.shape}",
    f"X_test: {X_test.shape}",
    f"y_test: {y_test.shape}",
    sep="\n",
)

X_train: (529881, 285)
y_train: (529881, 1)
X_val: (176619, 285)
y_val: (176619, 1)
X_test: (176631, 285)
y_test: (176631, 1)


In [34]:
# Fit a decision tree with default hyper-parameters on the training split;
# the fixed seed keeps the fitted tree reproducible between runs.
TREE_SEED = 42
model = DecisionTreeClassifier(random_state=TREE_SEED)
model.fit(X_train, y_train)

In [35]:
# Score the fitted tree on the validation split.
val_prediction = model.predict(X_val)
val_accuracy = accuracy_score(y_val, val_prediction)
val_report = classification_report(y_val, val_prediction)

# Headline accuracy first, then the per-class precision/recall/F1 table.
print(f"Decision Tree Validation Accuracy: {val_accuracy:.4f}")
print(val_report)

Decision Tree Validation Accuracy: 0.6655
              precision    recall  f1-score   support

         0.0       0.66      0.66      0.66     87103
         1.0       0.67      0.67      0.67     89516

    accuracy                           0.67    176619
   macro avg       0.67      0.67      0.67    176619
weighted avg       0.67      0.67      0.67    176619



In [36]:
# Score the fitted tree on the held-out test split.
test_prediction = model.predict(X_test)
test_accuracy = accuracy_score(y_test, test_prediction)
test_report = classification_report(y_test, test_prediction)

# Headline test accuracy first, then the per-class precision/recall/F1 table.
print(f"Decision Tree Test Accuracy: {test_accuracy:.4f}")
print(test_report)

Decision Tree Test Accuracy: 0.6651
              precision    recall  f1-score   support

         0.0       0.66      0.66      0.66     87434
         1.0       0.67      0.67      0.67     89197

    accuracy                           0.67    176631
   macro avg       0.67      0.67      0.67    176631
weighted avg       0.67      0.67      0.67    176631

