In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, ConfusionMatrixDisplay
import torch
from ucimlrepo import fetch_ucirepo 

# Notes on Results:

Although we expect random forest to perform better on our datasets as stated by the paper, when the dataset is low-dimensional and has less features, logisitc regression appears to perform slightly better. This is most likely due to the fact that in our Sepsis Dataset, there are only 3 features to go off of, meaking the data low-dimensional

# Occupation Detection

In [2]:
#Occupancy
occupancy_detection = fetch_ucirepo(id=357) 
X_occupancy = occupancy_detection.data.features 
y_occupancy = occupancy_detection.data.targets 
display(X_occupancy)
display(y_occupancy)

Unnamed: 0,date,Temperature,Humidity,Light,CO2,HumidityRatio
0,2015-02-04 17:51:00,23.18,27.272,426,721.25,0.00479298817650529
1,2015-02-04 17:51:59,23.15,27.2675,429.5,714,0.00478344094931065
2,2015-02-04 17:53:00,23.15,27.245,426,713.5,0.00477946352442199
3,2015-02-04 17:54:00,23.15,27.2,426,708.25,0.00477150882608175
4,2015-02-04 17:55:00,23.1,27.2,426,704.5,0.00475699293331518
...,...,...,...,...,...,...
20557,2015-02-18 09:15:00,20.815,27.7175,429.75,1505.25,0.00421296819328694
20558,2015-02-18 09:16:00,20.865,27.745,423.5,1514.5,0.00423026193160229
20559,2015-02-18 09:16:59,20.89,27.745,423.5,1521.5,0.00423681810140671
20560,2015-02-18 09:17:59,20.89,28.0225,418.75,1632,0.0042794854718673


Unnamed: 0,Occupancy
0,1.0
1,1.0
2,1.0
3,1.0
4,1.0
...,...
20557,1.0
20558,1.0
20559,1.0
20560,1.0


In [3]:
#mismatched data in certain rows where it says 'Temperature' instead of the date. Identify and drop the corresponding rows
temperature_rows = X_occupancy[X_occupancy['date'].str.contains('Temperature', na=False)]

X_occupancy = X_occupancy.drop(temperature_rows.index) #drops mismatched types
y_occupancy = y_occupancy.drop(temperature_rows.index) #drops the corresponding mistmatched types in the gt
display(X_occupancy)
display(y_occupancy)

Unnamed: 0,date,Temperature,Humidity,Light,CO2,HumidityRatio
0,2015-02-04 17:51:00,23.18,27.272,426,721.25,0.00479298817650529
1,2015-02-04 17:51:59,23.15,27.2675,429.5,714,0.00478344094931065
2,2015-02-04 17:53:00,23.15,27.245,426,713.5,0.00477946352442199
3,2015-02-04 17:54:00,23.15,27.2,426,708.25,0.00477150882608175
4,2015-02-04 17:55:00,23.1,27.2,426,704.5,0.00475699293331518
...,...,...,...,...,...,...
20557,2015-02-18 09:15:00,20.815,27.7175,429.75,1505.25,0.00421296819328694
20558,2015-02-18 09:16:00,20.865,27.745,423.5,1514.5,0.00423026193160229
20559,2015-02-18 09:16:59,20.89,27.745,423.5,1521.5,0.00423681810140671
20560,2015-02-18 09:17:59,20.89,28.0225,418.75,1632,0.0042794854718673


Unnamed: 0,Occupancy
0,1.0
1,1.0
2,1.0
3,1.0
4,1.0
...,...
20557,1.0
20558,1.0
20559,1.0
20560,1.0


In [4]:
# # Ensure the 'date' column is in datetime format
X_occupancy['date'] = pd.to_datetime(X_occupancy.loc[:,'date'], errors='coerce')

# # Split the 'date' column into 'date' and 'time' columns
X_occupancy['date_only'] = X_occupancy.loc[:,'date'].dt.date  # Extract just the date (YYYY-MM-DD)
X_occupancy['time_only'] = X_occupancy.loc[:,'date'].dt.time  # Extract just the time (HH:MM:SS)

# Convert date to ordinal
X_occupancy['date_only'] = X_occupancy.loc[:,'date_only'].map(lambda x: x.toordinal())

# Convert time to seconds since midnight
X_occupancy['time_only'] = X_occupancy.loc[:,'time_only'].map(
    lambda x: x.hour * 3600 + x.minute * 60 + x.second
)

X_occupancy = X_occupancy.drop('date', axis = 1)

# Check the resulting DataFrame
print(X_occupancy[['date_only', 'time_only']].head())
display(X_occupancy)
display(y_occupancy)

   date_only  time_only
0     735633      64260
1     735633      64319
2     735633      64380
3     735633      64440
4     735633      64500


Unnamed: 0,Temperature,Humidity,Light,CO2,HumidityRatio,date_only,time_only
0,23.18,27.272,426,721.25,0.00479298817650529,735633,64260
1,23.15,27.2675,429.5,714,0.00478344094931065,735633,64319
2,23.15,27.245,426,713.5,0.00477946352442199,735633,64380
3,23.15,27.2,426,708.25,0.00477150882608175,735633,64440
4,23.1,27.2,426,704.5,0.00475699293331518,735633,64500
...,...,...,...,...,...,...,...
20557,20.815,27.7175,429.75,1505.25,0.00421296819328694,735647,33300
20558,20.865,27.745,423.5,1514.5,0.00423026193160229,735647,33360
20559,20.89,27.745,423.5,1521.5,0.00423681810140671,735647,33419
20560,20.89,28.0225,418.75,1632,0.0042794854718673,735647,33479


Unnamed: 0,Occupancy
0,1.0
1,1.0
2,1.0
3,1.0
4,1.0
...,...
20557,1.0
20558,1.0
20559,1.0
20560,1.0


In [5]:
non_numeric_rows = X_occupancy[X_occupancy.map(lambda x: not isinstance(x, (int, float))).any(axis=1)]
print(non_numeric_rows)
X_occupancy = X_occupancy.astype(float)
y_occupancy = y_occupancy.astype(int)

print(X_occupancy.dtypes)
print(X_occupancy)

      Temperature Humidity   Light      CO2        HumidityRatio  date_only  \
0           23.18   27.272     426   721.25  0.00479298817650529     735633   
1           23.15  27.2675   429.5      714  0.00478344094931065     735633   
2           23.15   27.245     426    713.5  0.00477946352442199     735633   
3           23.15     27.2     426   708.25  0.00477150882608175     735633   
4            23.1     27.2     426    704.5  0.00475699293331518     735633   
...           ...      ...     ...      ...                  ...        ...   
20557      20.815  27.7175  429.75  1505.25  0.00421296819328694     735647   
20558      20.865   27.745   423.5   1514.5  0.00423026193160229     735647   
20559       20.89   27.745   423.5   1521.5  0.00423681810140671     735647   
20560       20.89  28.0225  418.75     1632   0.0042794854718673     735647   
20561          21     28.1     409     1864  0.00432073200293677     735647   

       time_only  
0          64260  
1          64

In [6]:
#Initialize scaler
scaler = StandardScaler()

# Apply scaling
X_occupancy = pd.DataFrame(scaler.fit_transform(X_occupancy), columns=X_occupancy.columns)

# Verify the scaling
print("Scaled feature preview (first 5 rows):")
display(X_occupancy.head())


Scaled feature preview (first 5 rows):


Unnamed: 0,Temperature,Humidity,Light,CO2,HumidityRatio,date_only,time_only
0,2.154659,-0.077062,1.403076,0.098642,0.735396,-1.284171,0.833914
1,2.126231,-0.077965,1.419709,0.075344,0.722962,-1.284171,0.836228
2,2.126231,-0.082481,1.403076,0.073738,0.717782,-1.284171,0.83862
3,2.126231,-0.091514,1.403076,0.056867,0.707423,-1.284171,0.840974
4,2.07885,-0.091514,1.403076,0.044817,0.688518,-1.284171,0.843327


In [7]:
display(y_occupancy)

Unnamed: 0,Occupancy
0,1
1,1
2,1
3,1
4,1
...,...
20557,1
20558,1
20559,1
20560,1


# Sepsis Survival

In [8]:
# fetch dataset 
sepsis_survival_minimal_clinical_records = fetch_ucirepo(id=827) 
  
# data (as pandas dataframes) 
X_sepsis = sepsis_survival_minimal_clinical_records.data.features 
y_sepsis = sepsis_survival_minimal_clinical_records.data.targets 
  
# metadata 
# print(sepsis_survival_minimal_clinical_records.metadata) 
  
# variable information 
# print(sepsis_survival_minimal_clinical_records.variables) 
# y_sepsis.loc[:, "hospital_outcome_1alive_0dead"] = np.where(y_sepsis["hospital_outcome_1alive_0dead"] == 0, -1, 1) #-1 represents empty and 1 represents occupied
display(X_sepsis)
display(y_sepsis)

Unnamed: 0,age_years,sex_0male_1female,episode_number
0,21,1,1
1,20,1,1
2,21,1,1
3,77,0,1
4,72,0,1
...,...,...,...
110336,47,0,1
110337,50,0,1
110338,62,0,1
110339,58,0,1


Unnamed: 0,hospital_outcome_1alive_0dead
0,1
1,1
2,1
3,1
4,1
...,...
110336,1
110337,0
110338,1
110339,0


In [9]:
#Scale the data
X_sepsis = pd.DataFrame(scaler.fit_transform(X_sepsis), columns=X_sepsis.columns)
print(X_sepsis)

        age_years  sex_0male_1female  episode_number
0       -1.730274           1.053878       -0.464617
1       -1.771736           1.053878       -0.464617
2       -1.730274           1.053878       -0.464617
3        0.591613          -0.948877       -0.464617
4        0.384302          -0.948877       -0.464617
...           ...                ...             ...
110336  -0.652255          -0.948877       -0.464617
110337  -0.527868          -0.948877       -0.464617
110338  -0.030321          -0.948877       -0.464617
110339  -0.196170          -0.948877       -0.464617
110340  -0.320557           1.053878       -0.464617

[110341 rows x 3 columns]


# Coupon Acceptance

In [10]:
#Used to handle the non numerical age values
def transform_age(age):
    if age == '50plus':
        return 50
    elif age == 'below21':
        return 20
    else:
        return int(age) 
        
#Coupon dataset
in_vehicle_coupon_recommendation = fetch_ucirepo(id=603) 
X_coupons = in_vehicle_coupon_recommendation.data.features 
y_coupons = in_vehicle_coupon_recommendation.data.targets 
display(X_coupons)
display(y_coupons)

Unnamed: 0,destination,passenger,weather,temperature,time,coupon,expiration,gender,age,maritalStatus,...,Bar,CoffeeHouse,CarryAway,RestaurantLessThan20,Restaurant20To50,toCoupon_GEQ5min,toCoupon_GEQ15min,toCoupon_GEQ25min,direction_same,direction_opp
0,No Urgent Place,Alone,Sunny,55,2PM,Restaurant(<20),1d,Female,21,Unmarried partner,...,never,never,,4~8,1~3,1,0,0,0,1
1,No Urgent Place,Friend(s),Sunny,80,10AM,Coffee House,2h,Female,21,Unmarried partner,...,never,never,,4~8,1~3,1,0,0,0,1
2,No Urgent Place,Friend(s),Sunny,80,10AM,Carry out & Take away,2h,Female,21,Unmarried partner,...,never,never,,4~8,1~3,1,1,0,0,1
3,No Urgent Place,Friend(s),Sunny,80,2PM,Coffee House,2h,Female,21,Unmarried partner,...,never,never,,4~8,1~3,1,1,0,0,1
4,No Urgent Place,Friend(s),Sunny,80,2PM,Coffee House,1d,Female,21,Unmarried partner,...,never,never,,4~8,1~3,1,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12679,Home,Partner,Rainy,55,6PM,Carry out & Take away,1d,Male,26,Single,...,never,never,1~3,4~8,1~3,1,0,0,1,0
12680,Work,Alone,Rainy,55,7AM,Carry out & Take away,1d,Male,26,Single,...,never,never,1~3,4~8,1~3,1,0,0,0,1
12681,Work,Alone,Snowy,30,7AM,Coffee House,1d,Male,26,Single,...,never,never,1~3,4~8,1~3,1,0,0,1,0
12682,Work,Alone,Snowy,30,7AM,Bar,1d,Male,26,Single,...,never,never,1~3,4~8,1~3,1,1,1,0,1


Unnamed: 0,Y
0,1
1,0
2,1
3,0
4,0
...,...
12679,1
12680,1
12681,0
12682,0


In [11]:
# need to apply one-hot encoding to categorical data
categorical_columns = [
    'destination', 'passenger', 'weather', 'time', 'coupon', 'expiration', 
    'gender', 'maritalStatus', 'education', 'occupation', 'income', 'car', 
    'Bar', 'CoffeeHouse', 'CarryAway', 'RestaurantLessThan20', 'Restaurant20To50'
]
X_coupons = pd.get_dummies(X_coupons, columns=categorical_columns)
display(X_coupons)

Unnamed: 0,temperature,age,has_children,toCoupon_GEQ5min,toCoupon_GEQ15min,toCoupon_GEQ25min,direction_same,direction_opp,destination_Home,destination_No Urgent Place,...,RestaurantLessThan20_1~3,RestaurantLessThan20_4~8,RestaurantLessThan20_gt8,RestaurantLessThan20_less1,RestaurantLessThan20_never,Restaurant20To50_1~3,Restaurant20To50_4~8,Restaurant20To50_gt8,Restaurant20To50_less1,Restaurant20To50_never
0,55,21,1,1,0,0,0,1,False,True,...,False,True,False,False,False,True,False,False,False,False
1,80,21,1,1,0,0,0,1,False,True,...,False,True,False,False,False,True,False,False,False,False
2,80,21,1,1,1,0,0,1,False,True,...,False,True,False,False,False,True,False,False,False,False
3,80,21,1,1,1,0,0,1,False,True,...,False,True,False,False,False,True,False,False,False,False
4,80,21,1,1,1,0,0,1,False,True,...,False,True,False,False,False,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12679,55,26,0,1,0,0,1,0,True,False,...,False,True,False,False,False,True,False,False,False,False
12680,55,26,0,1,0,0,0,1,False,False,...,False,True,False,False,False,True,False,False,False,False
12681,30,26,0,1,0,0,1,0,False,False,...,False,True,False,False,False,True,False,False,False,False
12682,30,26,0,1,1,1,0,1,False,False,...,False,True,False,False,False,True,False,False,False,False


In [12]:
# Need to properly encode the age as integers
X_coupons.loc[:,'age'] = X_coupons.loc[:,'age'].apply(transform_age)

In [13]:
X_coupons = pd.DataFrame(scaler.fit_transform(X_coupons), columns=X_coupons.columns)

# Verify the scaling
print("Scaled feature preview (first 5 rows):")
display(X_coupons.head())

Scaled feature preview (first 5 rows):


Unnamed: 0,temperature,age,has_children,toCoupon_GEQ5min,toCoupon_GEQ15min,toCoupon_GEQ25min,direction_same,direction_opp,destination_Home,destination_No Urgent Place,...,RestaurantLessThan20_1~3,RestaurantLessThan20_4~8,RestaurantLessThan20_gt8,RestaurantLessThan20_less1,RestaurantLessThan20_never,Restaurant20To50_1~3,Restaurant20To50_4~8,Restaurant20To50_gt8,Restaurant20To50_less1,Restaurant20To50_never
0,-0.43343,-1.108935,1.189378,0.0,-1.131581,-0.367745,-0.522967,0.522967,-0.585362,1.009347,...,-0.85769,1.594684,-0.335752,-0.444545,-0.132856,1.689769,-0.246759,-0.145795,-0.959053,-0.450003
1,0.871799,-1.108935,1.189378,0.0,-1.131581,-0.367745,-0.522967,0.522967,-0.585362,1.009347,...,-0.85769,1.594684,-0.335752,-0.444545,-0.132856,1.689769,-0.246759,-0.145795,-0.959053,-0.450003
2,0.871799,-1.108935,1.189378,0.0,0.88372,-0.367745,-0.522967,0.522967,-0.585362,1.009347,...,-0.85769,1.594684,-0.335752,-0.444545,-0.132856,1.689769,-0.246759,-0.145795,-0.959053,-0.450003
3,0.871799,-1.108935,1.189378,0.0,0.88372,-0.367745,-0.522967,0.522967,-0.585362,1.009347,...,-0.85769,1.594684,-0.335752,-0.444545,-0.132856,1.689769,-0.246759,-0.145795,-0.959053,-0.450003
4,0.871799,-1.108935,1.189378,0.0,0.88372,-0.367745,-0.522967,0.522967,-0.585362,1.009347,...,-0.85769,1.594684,-0.335752,-0.444545,-0.132856,1.689769,-0.246759,-0.145795,-0.959053,-0.450003


# Running Models on Datasets:

In [14]:
# from sklearn.linear_model import LogisticRegression
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.tree import DecisionTreeClassifier
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import accuracy_score
# import numpy as np

# # 3 splits x 3 classifiers x 3 datasets x 3 trials
# splits = {
#     "20/80": 0.2,  # 20% test, 80% train
#     "50/50": 0.5,  # 50% test, 50% train
#     "80/20": 0.8   # 80% test, 20% train
# }

# classifiers = {
#     "Logistic Regression": None,
#     "Random Forest": None,
#     "Decision Tree": None 
# }

# datasets = {
#     "Occupancy Dataset": (X_occupancy, y_occupancy),
#     "Sepsis Dataset": (X_sepsis, y_sepsis),
#     "Coupon Dataset": (X_coupons, y_coupons)
# }

# num_trials = 3
# results = []

# for dataset_name, (X, y) in datasets.items():
#     print(f"\n--- Processing {dataset_name} ---")
    
#     for split_name, test_size in splits.items():
#         print(f"\n  Split: {split_name}")
        
#         # Logistic Regression Experiment
#         if "Logistic Regression" in classifiers:
#             for regularization in [10**-8, 10**-6, 10**-4, 10**-2, 1, 10**2, 10**4]:
#                 print(f"    Classifier: Logistic Regression (C={regularization})")
                
#                 log_model = LogisticRegression(max_iter=2000, solver="lbfgs", C=regularization)
                
#                 train_accuracies = []
#                 test_accuracies = []
                
#                 for trial in range(num_trials):
#                     X_train, X_test, y_train, y_test = train_test_split(
#                         X, y.values.ravel(), test_size=test_size, random_state=42 + trial, stratify=y
#                     )

#                     log_model.fit(X_train, y_train)

#                     y_train_pred = log_model.predict(X_train)
#                     y_test_pred = log_model.predict(X_test)

#                     train_accuracies.append(accuracy_score(y_train, y_train_pred))
#                     test_accuracies.append(accuracy_score(y_test, y_test_pred))

#                 avg_train = np.mean(train_accuracies)
#                 avg_test = np.mean(test_accuracies)

#                 print(f"      Avg Train Accuracy: {avg_train:.4f}")
#                 print(f"      Avg Test Accuracy: {avg_test:.4f}")

#                 results.append({
#                     "Dataset": dataset_name,
#                     "Split": split_name,
#                     "Classifier": f"Logistic Regression (C={regularization})",
#                     "Train Accuracy": avg_train,
#                     "Test Accuracy": avg_test
#                 })

#         # Random Forest Experiment
#         if "Random Forest" in classifiers:
#             # Loop over different max_features values
#             for max_features in [1, 2, 4, 6, 8, 12, 16, 20]:
#                 print(f"    Classifier: Random Forest (max_features={max_features})")
#                 random_forest_model = RandomForestClassifier(n_estimators=1024, max_features=max_features, random_state=42)

#                 # Accumulate accuracies over trials
#                 train_accuracies = []
#                 val_accuracies = []
#                 test_accuracies = []

#                 for trial in range(num_trials):
#                     X_train, X_test, y_train, y_test = train_test_split(
#                         X, y.values.ravel(), test_size=test_size, random_state=42 + trial, stratify=y
#                     )

#                     # Fit the model on the training set
#                     random_forest_model.fit(X_train, y_train)

#                     # Make predictions for train, validation, and test sets
#                     y_train_pred = random_forest_model.predict(X_train)
#                     y_test_pred = random_forest_model.predict(X_test)

#                     # Evaluate accuracies
#                     train_accuracy = accuracy_score(y_train, y_train_pred)
#                     test_accuracy = accuracy_score(y_test, y_test_pred)

#                     # Store accuracies
#                     train_accuracies.append(train_accuracy)
#                     test_accuracies.append(test_accuracy)

#                 # Compute average accuracies over trials
#                 avg_train = np.mean(train_accuracies)
#                 avg_test = np.mean(test_accuracies)

#                 # Output results
#                 print(f"      Avg Train Accuracy: {avg_train:.4f}")
#                 print(f"      Avg Test Accuracy: {avg_test:.4f}")

#                 # Store results
#                 results.append({
#                     "Dataset": dataset_name,
#                     "Split": split_name,
#                     "Classifier": f"Random Forest (max_features={max_features})",
#                     "Train Accuracy": avg_train,
#                     "Test Accuracy": avg_test
#                 })

#         # Decision Tree Experiment
#         if "Decision Tree" in classifiers:
#             for criterion in ["gini", "entropy"]:
#                 print(f"    Classifier: Decision Tree (Criterion={criterion})")

#                 train_accuracies = []
#                 test_accuracies = []

#                 for trial in range(num_trials):
#                     X_train, X_test, y_train, y_test = train_test_split(
#                         X, y.values.ravel(), test_size=test_size, random_state=42 + trial, stratify=y
#                     )

#                     dt_model = DecisionTreeClassifier(random_state=42, criterion=criterion)
#                     dt_model.fit(X_train, y_train)

#                     y_train_pred = dt_model.predict(X_train)
#                     y_test_pred = dt_model.predict(X_test)

#                     train_accuracies.append(accuracy_score(y_train, y_train_pred))
#                     test_accuracies.append(accuracy_score(y_test, y_test_pred))

#                 avg_train = np.mean(train_accuracies)
#                 avg_test = np.mean(test_accuracies)

#                 print(f"      Avg Train Accuracy: {avg_train:.4f}")
#                 print(f"      Avg Test Accuracy: {avg_test:.4f}")

#                 results.append({
#                     "Dataset": dataset_name,
#                     "Split": split_name,
#                     "Classifier": f"Decision Tree (Criterion={criterion})",
#                     "Train Accuracy": avg_train,
#                     "Test Accuracy": avg_test
#                 })


In [15]:
# from sklearn.linear_model import LogisticRegression
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.model_selection import train_test_split
# from sklearn.tree import DecisionTreeClassifier
# from sklearn.metrics import accuracy_score
# import numpy as np122

# # 3 splits x 3 classifiers x 3 datasets x 3 trials
# splits = {
#     "20/80": 0.2,  # 20% test, 80% train
#     "50/50": 0.5,  # 50% test, 50% train
#     "80/20": 0.8   # 80% test, 20% train
# }


# classifiers = {
#     # For now don't initialize the classifiers to anything so we can adjust the parameters according to that of the paper
#     "Logistic Regression": None,  # 
#     # "Random Forest": None,  
#     "Decision Tree": None 
# }

# datasets = {
#     "Occupancy Dataset": (X_occupancy, y_occupancy),
#     "Sepsis Dataset": (X_sepsis, y_sepsis),
#     "Coupon Dataset": (X_coupons, y_coupons)
# }

# num_trials = 3

# results = []

# for dataset_name, (X, y) in datasets.items():
#     print(f"\n--- Processing {dataset_name} ---")
    
#     for split_name, test_size in splits.items():
#         print(f"\n  Split: {split_name}")
        
#         # Logistic Regression Experiment
#         if "Logistic Regression" in classifiers:
#             # Following papers different C values
#             for regularization in [10**-8, 10**-6, 10**-4, 10**-2, 1, 10**2, 10**4]:
#                 print(f"    Classifier: Logistic Regression (C={regularization})")
                
                
#                 log_model = LogisticRegression(max_iter=2000, solver="lbfgs", C=regularization)
                
#                 # Accumulate accuracies over trials so we can compute the average
#                 train_accuracies = []
#                 val_accuracies = []
#                 test_accuracies = []
                
#                 for trial in range(num_trials):
#                     # Training and Testing Splits, need stratify parameters to ensure equal splitting between trials
#                     X_train_full, X_test, y_train_full, y_test = train_test_split(
#                         X, y.values.ravel(), test_size=test_size, random_state=42 + trial, stratify=y
#                     )

#                     # validation split 
#                     X_train, X_val, y_train, y_val = train_test_split(
#                         X_train_full, y_train_full, test_size=0.2, random_state=42 + trial, stratify=y_train_full
#                     )

#                     # Fit the model on the training set
#                     log_model.fit(X_train, y_train)

#                     # Make predictions for train, validation, and test sets
#                     y_train_pred = log_model.predict(X_train)
#                     y_val_pred = log_model.predict(X_val)
#                     y_test_pred = log_model.predict(X_test)

#                     # Evaluate accuracies
#                     train_accuracy = accuracy_score(y_train, y_train_pred)
#                     val_accuracy = accuracy_score(y_val, y_val_pred)
#                     test_accuracy = accuracy_score(y_test, y_test_pred)

#                     # Store accuracies
#                     train_accuracies.append(train_accuracy)
#                     val_accuracies.append(val_accuracy)
#                     test_accuracies.append(test_accuracy)

#                 # Compute average accuracies over trials
#                 avg_train = np.mean(train_accuracies)
#                 avg_val_accuracy = np.mean(val_accuracies)
#                 avg_test = np.mean(test_accuracies)

#                 # Output results
#                 print(f"      Avg Train Accuracy: {avg_train:.4f}")
#                 print(f"      Avg Validation Accuracy: {avg_val_accuracy:.4f}")
#                 print(f"      Avg Test Accuracy: {avg_test:.4f}")

#                 # Store results
#                 results.append({
#                     "Dataset": dataset_name,
#                     "Split": split_name,
#                     "Classifier": f"Logistic Regression (C={regularization})",
#                     "Train Accuracy": avg_train,
#                     "Validation Accuracy": avg_val_accuracy,
#                     "Test Accuracy": avg_test
#                 })


#         if "Decision Tree" in classifiers:
#             for criterion in ["gini", "entropy"]: #testing the different splitting criterion according to paper
#                 print(f"    Classifier: Decision Tree (Criterion={criterion})")
        
#                 train_accuracies = []
#                 val_accuracies = []
#                 test_accuracies = []
        
#                 for trial in range(num_trials):
#                     X_train_full, X_test, y_train_full, y_test = train_test_split(
#                         X, y.values.ravel(), test_size=test_size, random_state=42 + trial, stratify=y
#                     )

#                     X_train, X_val, y_train, y_val = train_test_split(
#                         X_train_full, y_train_full, test_size=0.2, random_state=42 + trial, stratify=y_train_full
#                     )
        
#                     dt_model = DecisionTreeClassifier(random_state=42, criterion=criterion)
#                     dt_model.fit(X_train, y_train)
        
#                     y_train_pred = dt_model.predict(X_train)
#                     y_val_pred = dt_model.predict(X_val)
#                     y_test_pred = dt_model.predict(X_test)
        
#                     train_accuracies.append(accuracy_score(y_train, y_train_pred))
#                     val_accuracies.append(accuracy_score(y_val, y_val_pred))
#                     test_accuracies.append(accuracy_score(y_test, y_test_pred))
        
#                 avg_train = np.mean(train_accuracies)
#                 avg_val_accuracy = np.mean(val_accuracies)
#                 avg_test = np.mean(test_accuracies)
        
#                 print(f"      Avg Train Accuracy: {avg_train:.4f}")
#                 print(f"      Avg Validation Accuracy: {avg_val_accuracy:.4f}")
#                 print(f"      Avg Test Accuracy: {avg_test:.4f}")
        
#                 results.append({
#                     "Dataset": dataset_name,
#                     "Split": split_name,
#                     "Classifier": f"Decision Tree (Criterion={criterion})",
#                     "Train Accuracy": avg_train,
#                     "Validation Accuracy": avg_val_accuracy,
#                     "Test Accuracy": avg_test
#                 })


In [16]:
# from sklearn.model_selection import train_test_split, StratifiedKFold
# from sklearn.linear_model import LogisticRegression
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.tree import DecisionTreeClassifier
# from sklearn.metrics import accuracy_score
# import numpy as np

# splits = {
#     "20/80": 0.2,
#     "50/50": 0.5,
#     "80/20": 0.8
# }

# classifiers = {
#     "Logistic Regression": LogisticRegression(max_iter=2000, solver="lbfgs"),
#     "Random Forest": RandomForestClassifier(n_estimators=1024, random_state=42),
#     "Decision Tree": DecisionTreeClassifier(random_state=42)
# }

# datasets = {
#     "Occupancy Dataset": (X_occupancy, y_occupancy),
#     "Sepsis Dataset": (X_sepsis, y_sepsis),
#     "Coupon Dataset": (X_coupons, y_coupons)
# }

# num_trials = 3
# n_splits = 5  # For cross-validation
# results = []

# for dataset_name, (X, y) in datasets.items():
#     print(f"\n--- Processing {dataset_name} ---")

#     for split_name, test_size in splits.items():
#         print(f"\n  Split: {split_name}")

#         for classifier_name, classifier in classifiers.items():
#             print(f"    Classifier: {classifier_name}")

#             if classifier_name == "Logistic Regression":
#                 hyper_params = {"C": [10**-8, 10**-6, 10**-4, 10**-2, 1, 10**2, 10**4]}
#             elif classifier_name == "Random Forest":
#                 hyper_params = {"max_features": [1, 2, 4, 6, 8, 12, 16, 20]}
#             elif classifier_name == "Decision Tree":
#                 hyper_params = {"criterion": ["gini", "entropy"]}

#             # Store best results for each classifier
#             best_val_accuracy = -1
#             best_params = None
#             avg_train, avg_val_accuracy, avg_test = 0, 0, 0

#             for param_name, param_values in hyper_params.items():
#                 for param_value in param_values:
#                     # Update hyper-parameter
#                     classifier.set_params(**{param_name: param_value})

#                     train_accuracies, val_accuracies, test_accuracies = [], [], []

#                     for trial in range(num_trials):
#                         X_train_full, X_test, y_train_full, y_test = train_test_split(
#                             X, y.values.ravel(), test_size=test_size, random_state=42 + trial, stratify=y
#                         )

#                         X_train, X_val, y_train, y_val = train_test_split(
#                             X_train_full, y_train_full, test_size=0.2, random_state=42 + trial, stratify=y_train_full
#                         )

#                         # Cross-validation
#                         k_fold = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42 + trial)
#                         fold_train_acc, fold_val_acc = [], []

#                         for train_index, val_index in k_fold.split(X_train, y_train):
#                             X_train_fold, X_val_fold = X_train[train_index], X_train[val_index]
#                             y_train_fold, y_val_fold = y_train[train_index], y_train[val_index]

#                             classifier.fit(X_train_fold, y_train_fold)
#                             fold_train_acc.append(accuracy_score(y_train_fold, classifier.predict(X_train_fold)))
#                             fold_val_acc.append(accuracy_score(y_val_fold, classifier.predict(X_val_fold)))

#                         # Use average validation accuracy across folds
#                         train_accuracy = np.mean(fold_train_acc)
#                         val_accuracy = np.mean(fold_val_acc)

#                         # Evaluate on held-out test set
#                         classifier.fit(X_train, y_train)
#                         test_accuracy = accuracy_score(y_test, classifier.predict(X_test))

#                         train_accuracies.append(train_accuracy)
#                         val_accuracies.append(val_accuracy)
#                         test_accuracies.append(test_accuracy)

#                     # Average over trials
#                     avg_train = np.mean(train_accuracies)
#                     avg_val_accuracy = np.mean(val_accuracies)
#                     avg_test = np.mean(test_accuracies)

#                     if avg_val_accuracy > best_val_accuracy:
#                         best_val_accuracy = avg_val_accuracy
#                         best_params = {param_name: param_value}

#             print(f"      Best Params: {best_params}")
#             print(f"      Avg Train Accuracy: {avg_train:.4f}")
#             print(f"      Avg Validation Accuracy: {avg_val_accuracy:.4f}")
#             print(f"      Avg Test Accuracy: {avg_test:.4f}")

#             results.append({
#                 "Dataset": dataset_name,
#                 "Split": split_name,
#                 "Classifier": classifier_name,
#                 "Best Params": best_params,
#                 "Train Accuracy": avg_train,
#                 "Validation Accuracy": avg_val_accuracy,
#                 "Test Accuracy": avg_test
#             })


In [17]:
# from sklearn.linear_model import LogisticRegression
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.tree import DecisionTreeClassifier
# from sklearn.model_selection import train_test_split, GridSearchCV
# from sklearn.metrics import accuracy_score
# import pandas as pd

# # Define datasets and splits
# datasets = {
#     "Occupancy Dataset": (X_occupancy, y_occupancy),
#     "Sepsis Dataset": (X_sepsis, y_sepsis),
#     "Coupon Dataset": (X_coupons, y_coupons)
# }
# splits = {"20/80": 0.2, "50/50": 0.5, "80/20": 0.8}

# # Hyperparameter grids
# hyperparams = {
#     "Logistic Regression": {
#         "C": [10**-8, 10**-6, 10**-4, 10**-2, 1, 10**2, 10**4],
#         "solver": ["lbfgs"],
#         "max_iter": [2000]
#     },
#     # "Random Forest": {
#     #     "n_estimators": [1024],
#     #     "max_features": [1, 2, 4, 6, 8, 12, 16, 20]
#     # },
#     "Decision Tree": {
#         "criterion": ["gini", "entropy"]
#     }
# }

# # Initialize results list
# results = []

# for dataset_name, (X, y) in datasets.items():
#     print(f"\n--- Processing {dataset_name} ---")

#     for split_name, test_size in splits.items():
#         print(f"  Split: {split_name}")

#         # Split the dataset into training and testing sets
#         X_train, X_test, y_train, y_test = train_test_split(
#             X, y.values.ravel(), test_size=test_size, random_state=42, stratify=y
#         )

#         # Loop through classifiers
#         for clf_name, param_grid in hyperparams.items():
#             print(f"    Classifier: {clf_name}")

#             # Initialize the model
#             if clf_name == "Logistic Regression":
#                 model = LogisticRegression()
#             elif clf_name == "Random Forest":
#                 model = RandomForestClassifier(random_state=42)
#             elif clf_name == "Decision Tree":
#                 model = DecisionTreeClassifier(random_state=42)

#             # Use GridSearchCV function to perform 5-fold cross validation as stated by the paper, finds the optimal hyper parameters
#             grid_search = GridSearchCV(model, param_grid, cv=5, scoring="accuracy", n_jobs=-1)
#             grid_search.fit(X_train, y_train)

#             # Get the best model and parameters
#             best_model = grid_search.best_estimator_
#             best_params = grid_search.best_params_
#             print(f"      Best Hyperparameters: {best_params}")

#             # Evaluate the model on the test set
#             y_test_pred = best_model.predict(X_test)
#             test_accuracy = accuracy_score(y_test, y_test_pred)
#             print(f"      Test Accuracy: {test_accuracy:.4f}")

#             # Save results
#             results.append({
#                 "Dataset": dataset_name,
#                 "Split": split_name,
#                 "Classifier": clf_name,
#                 "Best Hyperparameters": best_params,
#                 "Test Accuracy": test_accuracy
#             })

# # Convert results to a DataFrame and display
# results_df = pd.DataFrame(results)
# print("\n--- Cross-Validation Results ---")
# print(results_df)


In [23]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
import numpy as np

# 3 splits x 3 classifiers x 3 datasets x 3 trials
splits = {
    "20/80": 0.2,  # 20% test, 80% train
    "50/50": 0.5,  # 50% test, 50% train
    "80/20": 0.8   # 80% test, 20% train
}

#To be later initialized so can tune hyper parameters
classifiers = {
    "Logistic Regression": None,
    "Random Forest": None,
    "Decision Tree": None
}

datasets = {
    "Occupancy Dataset": (X_occupancy, y_occupancy),
    "Sepsis Dataset": (X_sepsis, y_sepsis),
    "Coupon Dataset": (X_coupons, y_coupons)
}

num_trials = 3

#results store values for later use when creating heatmap
results = []

for dataset_name, (X, y) in datasets.items():
    print(f"\n{dataset_name}")
    
    for split_name, test_size in splits.items():
        print(f"\n{split_name}")
        
        if "Logistic Regression" in classifiers:
            optimal_log_param = None
            opt_log_cross_val = -1  # dummy value place holder

            #iterates through all possible regularization parameters
            for regularization in [10**-8, 10**-6, 10**-4, 10**-2, 1, 10**2, 10**4]:
                print(f"Logistic Regression (C={regularization})")
                
                log_model = LogisticRegression(max_iter=2000, solver="lbfgs", C=regularization)

     
                trial_results = {
                    "Train Accuracy": [],
                    "Cross Validation Accuracy": [],
                    "Test Accuracy": []
                }

                #Iterates for 3 trials as indicated
                for trial in range(num_trials):
                    X_train_full, X_test, y_train_full, y_test = train_test_split(
                        X, y.values.ravel(), test_size=test_size, random_state=42 + trial, stratify=y
                    )
                    
                    # 5-fold cross validation
                    k_fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42 + trial)
                    cross_val_results = cross_val_score(log_model, X_train_full, y_train_full, cv=k_fold, scoring="accuracy")
                    avg_cross_val = np.mean(cross_val_results)
                    trial_results["Cross Validation Accuracy"].append(avg_cross_val)

                    #training the model
                    log_model.fit(X_train_full, y_train_full)

                    y_train_full_pred = log_model.predict(X_train_full)
                    train_accuracy = accuracy_score(y_train_full, y_train_full_pred)
                    trial_results["Train Accuracy"].append(train_accuracy)

                    #evaluating tet
                    y_test_pred = log_model.predict(X_test)
                    test_accuracy = accuracy_score(y_test, y_test_pred)
                    trial_results["Test Accuracy"].append(test_accuracy)

                # taking the average over the 3 trials
                avg_train = np.mean(trial_results["Train Accuracy"])
                avg_cross_val = np.mean(trial_results["Cross Validation Accuracy"])
                avg_test = np.mean(trial_results["Test Accuracy"])

                # select optimal parameter 
                if avg_cross_val > opt_log_cross_val:
                    opt_log_cross_val = avg_cross_val
                    optimal_log_param = regularization

                # Output results
                print(f"      Avg Train Accuracy: {avg_train:.4f}")
                print(f"      Avg Cross-Validation Accuracy: {avg_cross_val:.4f}")
                print(f"      Avg Test Accuracy: {avg_test:.4f}")

                results.append({
                    "Dataset": dataset_name,
                    "Split": split_name,
                    "Classifier": f"Logistic Regression (C={regularization})",
                    "Train Accuracy": avg_train,
                    "Cross Validation Accuracy": avg_cross_val,
                    "Test Accuracy": avg_test
                })

            print(f"  Optimal Regularization Parameter for Logistic Regression: C={optimal_log_param}\n Optimal Cross Validation Accuracy{opt_log_cross_val:.4f}")

        if "Random Forest" in classifiers:
            optimal_max_feat = None
            opt_random_forest_cross_val = -1  
            
            # Iterate over different max_features values
            for max_features in [1, 2, 4, 6, 8, 12, 16, 20]:
                print(f"    Classifier: Random Forest (max_features={max_features})")
                random_forest_model = RandomForestClassifier(n_estimators=1024, max_features=max_features, random_state=42)
        
                trial_results = {
                    "Train Accuracy": [],
                    "Cross Validation Accuracy": [],
                    "Test Accuracy": []
                }
        
                for trial in range(num_trials):
                    X_train_full, X_test, y_train_full, y_test = train_test_split(
                        X, y.values.ravel(), test_size=test_size, random_state=42 + trial, stratify=y
                    )
        
                    k_fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42 + trial)
                    cross_val_results = cross_val_score(random_forest_model, X_train_full, y_train_full, cv=k_fold, scoring="accuracy")
                    avg_cross_val = np.mean(cross_val_results)
                    trial_results["Cross Validation Accuracy"].append(avg_cross_val)
        
                    random_forest_model.fit(X_train_full, y_train_full)
        
                    y_train_full_pred = random_forest_model.predict(X_train_full)
                    train_accuracy = accuracy_score(y_train_full, y_train_full_pred)
                    trial_results["Train Accuracy"].append(train_accuracy)
        
                    y_test_pred = random_forest_model.predict(X_test)
                    test_accuracy = accuracy_score(y_test, y_test_pred)
                    trial_results["Test Accuracy"].append(test_accuracy)
        
                avg_train = np.mean(trial_results["Train Accuracy"])
                avg_cross_val = np.mean(trial_results["Cross Validation Accuracy"])
                avg_test = np.mean(trial_results["Test Accuracy"])
        

                if avg_cross_val > opt_random_forest_cross_val:
                    opt_random_forest_cross_val = avg_cross_val
                    optimal_max_feat = max_features
        
                print(f"      Avg Train Accuracy: {avg_train:.4f}")
                print(f"      Avg Cross-Validation Accuracy: {avg_cross_val:.4f}")
                print(f"      Avg Test Accuracy: {avg_test:.4f}")
        
                results.append({
                    "Dataset": dataset_name,
                    "Split": split_name,
                    "Classifier": f"Random Forest (max_features={max_features})",
                    "Train Accuracy": avg_train,
                    "Cross Validation Accuracy": avg_cross_val,
                    "Test Accuracy": avg_test
                })
        
            print(f"  Optimal Max Features for Random Forest: max_features={optimal_max_feat}, \n Optimal Cross Validation Accuracy: {opt_random_forest_cross_val:.4f}")

        if "Decision Tree" in classifiers:
            opt_criterion = None
            opt_decision_cross_val = -1
            
            for criterion in ["gini", "entropy"]:
                print(f"    Classifier: Decision Tree (Criterion={criterion})")
                
                dt_model = DecisionTreeClassifier(random_state=42, criterion=criterion)

                trial_results = {
                    "Train Accuracy": [],
                    "Cross Validation Accuracy": [],
                    "Test Accuracy": []
                }

                for trial in range(num_trials):
                    X_train_full, X_test, y_train_full, y_test = train_test_split(
                        X, y.values.ravel(), test_size=test_size, random_state=42 + trial, stratify=y
                    )

                    k_fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42 + trial)
                    cross_val_results = cross_val_score(dt_model, X_train_full, y_train_full, cv=k_fold, scoring="accuracy")
                    avg_cross_val = np.mean(cross_val_results)
                    trial_results["Cross Validation Accuracy"].append(avg_cross_val)

                    dt_model.fit(X_train_full, y_train_full)

                    y_train_full_pred = dt_model.predict(X_train_full)
                    train_accuracy = accuracy_score(y_train_full, y_train_full_pred)
                    trial_results["Train Accuracy"].append(train_accuracy)

                    y_test_pred = dt_model.predict(X_test)
                    test_accuracy = accuracy_score(y_test, y_test_pred)
                    trial_results["Test Accuracy"].append(test_accuracy)

                avg_train = np.mean(trial_results["Train Accuracy"])
                avg_cross_val = np.mean(trial_results["Cross Validation Accuracy"])
                avg_test = np.mean(trial_results["Test Accuracy"])

                if avg_cross_val > opt_decision_cross_val:
                    opt_decision_cross_val = avg_cross_val
                    opt_criterion = criterion

                print(f"      Avg Train Accuracy: {avg_train:.4f}")
                print(f"      Avg Cross-Validation Accuracy: {avg_cross_val:.4f}")
                print(f"      Avg Test Accuracy: {avg_test:.4f}")

                # Store results
                results.append({
                    "Dataset": dataset_name,
                    "Split": split_name,
                    "Classifier": f"Decision Tree (Criterion={criterion})",
                    "Train Accuracy": avg_train,
                    "Cross Validation Accuracy": avg_cross_val,
                    "Test Accuracy": avg_test
                })

            print(f"  Optimal Criterion for Decision Tree: Criterion={opt_criterion}\n Optimal Cross Validation Accuracy: {opt_decision_cross_val:.4f}")


Occupancy Dataset

20/80
Logistic Regression (C=1e-08)
      Avg Train Accuracy: 0.7690
      Avg Cross-Validation Accuracy: 0.7690
      Avg Test Accuracy: 0.7690
Logistic Regression (C=1e-06)
      Avg Train Accuracy: 0.7690
      Avg Cross-Validation Accuracy: 0.7690
      Avg Test Accuracy: 0.7690
Logistic Regression (C=0.0001)
      Avg Train Accuracy: 0.8426
      Avg Cross-Validation Accuracy: 0.8155
      Avg Test Accuracy: 0.8436
Logistic Regression (C=0.01)
      Avg Train Accuracy: 0.9894
      Avg Cross-Validation Accuracy: 0.9894
      Avg Test Accuracy: 0.9886
Logistic Regression (C=1)
      Avg Train Accuracy: 0.9893
      Avg Cross-Validation Accuracy: 0.9893
      Avg Test Accuracy: 0.9883
Logistic Regression (C=100)
      Avg Train Accuracy: 0.9893
      Avg Cross-Validation Accuracy: 0.9893
      Avg Test Accuracy: 0.9884
Logistic Regression (C=10000)
      Avg Train Accuracy: 0.9893
      Avg Cross-Validation Accuracy: 0.9893
      Avg Test Accuracy: 0.9883
  Optim

KeyboardInterrupt: 

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

# Convert results list to DataFrame
results_df = pd.DataFrame(results)

# Create a heatmap for each classifier showing accuracies
classifiers = results_df["Classifier"].unique()

# Loop over each classifier
for clf_name in classifiers:
    print(f"--- Heatmap for {clf_name} ---")
    
    # Filter the results for the current classifier
    clf_results = results_df[results_df["Classifier"] == clf_name]
    
    # Create a pivot table where rows are datasets, columns are splits, and values are accuracies
    pivot_table = clf_results.pivot_table(
        index="Dataset", 
        columns="Split", 
        values="Test Accuracy", 
        aggfunc="mean"  # Taking the mean in case there are multiple entries
    )

    # Plot the heatmap
    plt.figure(figsize=(8, 6))
    sns.heatmap(pivot_table, annot=True, cmap="Blues", cbar=True, fmt=".4f", linewidths=0.5)
    plt.title(f"Test Accuracy Heatmap for {clf_name}")
    plt.ylabel("Dataset")
    plt.xlabel("Split")
    plt.show()


In [None]:
# import seaborn as sns
# import matplotlib.pyplot as plt
# import pandas as pd
# from sklearn.linear_model import LogisticRegression
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
# from sklearn.tree import DecisionTreeClassifier
# from sklearn.metrics import accuracy_score
# import numpy as np

# # 3 splits x 3 classifiers x 3 datasets x 3 trials
# splits = {
#     "20/80": 0.2,  # 20% test, 80% train
#     "50/50": 0.5,  # 50% test, 50% train
#     "80/20": 0.8   # 80% test, 20% train
# }

# classifiers = {
#     "Logistic Regression": None,
#     "Random Forest": None,
#     "Decision Tree": None
# }

# datasets = {
#     "Occupancy Dataset": (X_occupancy, y_occupancy),
#     "Sepsis Dataset": (X_sepsis, y_sepsis),
#     "Coupon Dataset": (X_coupons, y_coupons)
# }

# num_trials = 3

# #results store values for later use when creating heatmap
# results = []

# for dataset_name, (X, y) in datasets.items():
#     print(f"\n--- Processing {dataset_name} ---")
    
#     for split_name, test_size in splits.items():
#         print(f"\n  Split: {split_name}")
        
#         # Logistic Regression Experiment
#         if "Logistic Regression" in classifiers:
#             optimal_log_param = None
#             opt_log_cross_val = -1  #dummy val
            
#             for regularization in [10**-8, 10**-6, 10**-4, 10**-2, 1, 10**2, 10**4]:
#                 print(f"    Classifier: Logistic Regression (C={regularization})")
                
#                 log_model = LogisticRegression(max_iter=2000, solver="lbfgs", C=regularization)

#                 # Accumulate results over trials
#                 trial_results = {
#                     "Train Accuracy": [],
#                     "CV Accuracy": [],
#                     "Test Accuracy": []
#                 }

#                 for trial in range(num_trials):
#                     # Train-test split for the partition
#                     X_train_full, X_test, y_train_full, y_test = train_test_split(
#                         X, y.values.ravel(), test_size=test_size, random_state=42 + trial, stratify=y
#                     )
                    
#                     # StratifiedKFold for 5-fold cross-validation
#                     k_fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42 + trial)
#                     cross_val_results = cross_val_score(log_model, X_train_full, y_train_full, cv=k_fold, scoring="accuracy")
#                     avg_cross_val = np.mean(cross_val_results)
#                     trial_results["CV Accuracy"].append(avg_cross_val)

#                     # Fit the model on the entire training set
#                     log_model.fit(X_train_full, y_train_full)

#                     # Evaluate on the training set
#                     y_train_full_pred = log_model.predict(X_train_full)
#                     train_accuracy = accuracy_score(y_train_full, y_train_full_pred)
#                     trial_results["Train Accuracy"].append(train_accuracy)

#                     # Evaluate on the test set
#                     y_test_pred = log_model.predict(X_test)
#                     test_accuracy = accuracy_score(y_test, y_test_pred)
#                     trial_results["Test Accuracy"].append(test_accuracy)

#                 # Compute average accuracies over trials
#                 avg_train = np.mean(trial_results["Train Accuracy"])
#                 avg_cross_val = np.mean(trial_results["CV Accuracy"])
#                 avg_test = np.mean(trial_results["Test Accuracy"])

#                 # Update the best hyperparameter if this C results in better CV accuracy
#                 if avg_cross_val > opt_log_cross_val:
#                     opt_log_cross_val = avg_cross_val
#                     optimal_log_param = regularization

#                 # Output results
#                 print(f"      Avg Train Accuracy: {avg_train:.4f}")
#                 print(f"      Avg Cross-Validation Accuracy: {avg_cross_val:.4f}")
#                 print(f"      Avg Test Accuracy: {avg_test:.4f}")

#                 # Store results
#                 results.append({
#                     "Dataset": dataset_name,
#                     "Split": split_name,
#                     "Classifier": f"Logistic Regression (C={regularization})",
#                     "Train Accuracy": avg_train,
#                     "CV Accuracy": avg_cross_val,
#                     "Test Accuracy": avg_test
#                 })

#             # After the trials, print the optimal hyperparameter for Logistic Regression
#             print(f"  Optimal Hyperparameter for Logistic Regression: C={optimal_log_param} with CV Accuracy={opt_log_cross_val:.4f}")

#         # Random Forest Experiment
#         if "Random Forest" in classifiers:
#             optimal_max_feat = None
#             opt_random_forest_cross_val = -np.inf  # Initialize with very low value
            
#             # Loop over different max_features values
#             for max_features in [1, 2, 4, 6, 8, 12, 16, 20]:
#                 print(f"    Classifier: Random Forest (max_features={max_features})")
#                 random_forest_model = RandomForestClassifier(n_estimators=1024, max_features=max_features, random_state=42)
        
#                 # Accumulate results over trials
#                 trial_results = {
#                     "Train Accuracy": [],
#                     "CV Accuracy": [],
#                     "Test Accuracy": []
#                 }
        
#                 for trial in range(num_trials):
#                     # Train-test split for the partition
#                     X_train_full, X_test, y_train_full, y_test = train_test_split(
#                         X, y.values.ravel(), test_size=test_size, random_state=42 + trial, stratify=y
#                     )
        
#                     # StratifiedKFold for 5-fold cross-validation
#                     k_fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42 + trial)
#                     cross_val_results = cross_val_score(random_forest_model, X_train_full, y_train_full, cv=k_fold, scoring="accuracy")
#                     avg_cross_val = np.mean(cross_val_results)
#                     trial_results["CV Accuracy"].append(avg_cross_val)
        
#                     # Fit the model on the entire training set
#                     random_forest_model.fit(X_train_full, y_train_full)
        
#                     # Evaluate on the training set
#                     y_train_full_pred = random_forest_model.predict(X_train_full)
#                     train_accuracy = accuracy_score(y_train_full, y_train_full_pred)
#                     trial_results["Train Accuracy"].append(train_accuracy)
        
#                     # Evaluate on the test set
#                     y_test_pred = random_forest_model.predict(X_test)
#                     test_accuracy = accuracy_score(y_test, y_test_pred)
#                     trial_results["Test Accuracy"].append(test_accuracy)
        
#                 # Compute average accuracies over trials
#                 avg_train = np.mean(trial_results["Train Accuracy"])
#                 avg_cross_val = np.mean(trial_results["CV Accuracy"])
#                 avg_test = np.mean(trial_results["Test Accuracy"])
        
#                 # Update the best hyperparameter if this max_features results in better CV accuracy
#                 if avg_cross_val > opt_random_forest_cross_val:
#                     opt_random_forest_cross_val = avg_cross_val
#                     optimal_max_feat = max_features
        
#                 # Output results
#                 print(f"      Avg Train Accuracy: {avg_train:.4f}")
#                 print(f"      Avg Cross-Validation Accuracy: {avg_cross_val:.4f}")
#                 print(f"      Avg Test Accuracy: {avg_test:.4f}")
        
#                 # Store results
#                 results.append({
#                     "Dataset": dataset_name,
#                     "Split": split_name,
#                     "Classifier": f"Random Forest (max_features={max_features})",
#                     "Train Accuracy": avg_train,
#                     "CV Accuracy": avg_cross_val,
#                     "Test Accuracy": avg_test
#                 })
        
#             # After the trials, print the optimal hyperparameter for Random Forest
#             print(f"  Optimal Hyperparameter for Random Forest: max_features={optimal_max_feat} with CV Accuracy={opt_random_forest_cross_val:.4f}")
    
#         # Decision Tree Experiment
#         if "Decision Tree" in classifiers:
#             opt_criterion = None
#             opt_decision_cross_val = -np.inf  # Initialize with very low value
            
#             for criterion in ["gini", "entropy"]:
#                 print(f"    Classifier: Decision Tree (Criterion={criterion})")
                
#                 dt_model = DecisionTreeClassifier(random_state=42, criterion=criterion)

#                 # Accumulate results over trials
#                 trial_results = {
#                     "Train Accuracy": [],
#                     "CV Accuracy": [],
#                     "Test Accuracy": []
#                 }

#                 for trial in range(num_trials):
#                     # Train-test split for the partition
#                     X_train_full, X_test, y_train_full, y_test = train_test_split(
#                         X, y.values.ravel(), test_size=test_size, random_state=42 + trial, stratify=y
#                     )

#                     # StratifiedKFold for 5-fold cross-validation
#                     k_fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42 + trial)
#                     cross_val_results = cross_val_score(dt_model, X_train_full, y_train_full, cv=k_fold, scoring="accuracy")
#                     avg_cross_val = np.mean(cross_val_results)
#                     trial_results["CV Accuracy"].append(avg_cross_val)

#                     # Fit the model on the entire training set
#                     dt_model.fit(X_train_full, y_train_full)

#                     # Evaluate on the training set
#                     y_train_full_pred = dt_model.predict(X_train_full)
#                     train_accuracy = accuracy_score(y_train_full, y_train_full_pred)
#                     trial_results["Train Accuracy"].append(train_accuracy)

#                     # Evaluate on the test set
#                     y_test_pred = dt_model.predict(X_test)
#                     test_accuracy = accuracy_score(y_test, y_test_pred)
#                     trial_results["Test Accuracy"].append(test_accuracy)

#                 # Compute average accuracies over trials
#                 avg_train = np.mean(trial_results["Train Accuracy"])
#                 avg_cross_val = np.mean(trial_results["CV Accuracy"])
#                 avg_test = np.mean(trial_results["Test Accuracy"])

#                 # Update the best hyperparameter if this criterion results in better CV accuracy
#                 if avg_cross_val > opt_decision_cross_val:
#                     opt_decision_cross_val = avg_cross_val
#                     opt_criterion = criterion

#                 # Output results
#                 print(f"      Avg Train Accuracy: {avg_train:.4f}")
#                 print(f"      Avg Cross-Validation Accuracy: {avg_cross_val:.4f}")
#                 print(f"      Avg Test Accuracy: {avg_test:.4f}")

#                 # Store results
#                 results.append({
#                     "Dataset": dataset_name,
#                     "Split": split_name,
#                     "Classifier": f"Decision Tree (Criterion={criterion})",
#                     "Train Accuracy": avg_train,
#                     "CV Accuracy": avg_cross_val,
#                     "Test Accuracy": avg_test
#                 })

#             # After the trials, print the optimal hyperparameter for Decision Tree
#             print(f"  Optimal Hyperparameter for Decision Tree: Criterion={opt_criterion} with CV Accuracy={opt_decision_cross_val:.4f}")

# # Convert results list to DataFrame for easy analysis
# results_df = pd.DataFrame(results)

# # Output final results
# print("\n--- Cross-Validation Results ---")
# print(results_df)


In [16]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
import numpy as np

# 3 splits x 3 classifiers x 3 datasets x 3 trials
splits = {
    "20/80": 0.2,  # 20% test, 80% train
    "50/50": 0.5,  # 50% test, 50% train
    "80/20": 0.8   # 80% test, 20% train
}

#To be later initialized so can tune hyper parameters
classifiers = {
    "Logistic Regression": None,
    "Random Forest": None,
    "Decision Tree": None
}

datasets = {
    "Occupancy Dataset": (X_occupancy, y_occupancy),
    "Sepsis Dataset": (X_sepsis, y_sepsis),
    "Coupon Dataset": (X_coupons, y_coupons)
}

num_trials = 3

#results store values for later use when creating heatmap
results = []

for dataset_name, (X, y) in datasets.items():
    print(f"\n{dataset_name}")
    
    for split_name, test_size in splits.items():
        print(f"\n{split_name}")
        
        if "Logistic Regression" in classifiers:
            optimal_log_param = None
            opt_log_cross_val = -1  # dummy value place holder

            #iterates through all possible regularization parameters
            for regularization in [10**-8, 10**-6, 10**-4, 10**-2, 1, 10**2, 10**4]:
                print(f"Logistic Regression (C={regularization})")
                
                log_model = LogisticRegression(max_iter=2000, solver="lbfgs", C=regularization)

     
                trial_results = {
                    "Train Accuracy": [],
                    "Cross Validation Accuracy": [],
                    "Test Accuracy": []
                }

                #Iterates for 3 trials as indicated
                for trial in range(num_trials):
                    X_train_full, X_test, y_train_full, y_test = train_test_split(
                        X, y.values.ravel(), test_size=test_size, random_state=42 + trial, stratify=y
                    )
                    
                    # 5-fold cross validation
                    k_fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42 + trial)
                    cross_val_results = cross_val_score(log_model, X_train_full, y_train_full, cv=k_fold, scoring="accuracy")
                    avg_cross_val = np.mean(cross_val_results)
                    trial_results["Cross Validation Accuracy"].append(avg_cross_val)

                    #training the model
                    log_model.fit(X_train_full, y_train_full)

                    y_train_full_pred = log_model.predict(X_train_full)
                    train_accuracy = accuracy_score(y_train_full, y_train_full_pred)
                    trial_results["Train Accuracy"].append(train_accuracy)

                    #evaluating tet
                    y_test_pred = log_model.predict(X_test)
                    test_accuracy = accuracy_score(y_test, y_test_pred)
                    trial_results["Test Accuracy"].append(test_accuracy)

                # taking the average over the 3 trials
                avg_train = np.mean(trial_results["Train Accuracy"])
                avg_cross_val = np.mean(trial_results["Cross Validation Accuracy"])
                avg_test = np.mean(trial_results["Test Accuracy"])

                # select optimal parameter 
                if avg_cross_val > opt_log_cross_val:
                    opt_log_cross_val = avg_cross_val
                    optimal_log_param = regularization

                # Output results
                print(f"      Avg Train Accuracy: {avg_train:.4f}")
                print(f"      Avg Cross-Validation Accuracy: {avg_cross_val:.4f}")
                print(f"      Avg Test Accuracy: {avg_test:.4f}")

                results.append({
                    "Dataset": dataset_name,
                    "Split": split_name,
                    "Classifier": f"Logistic Regression (C={regularization})",
                    "Train Accuracy": avg_train,
                    "Cross Validation Accuracy": avg_cross_val,
                    "Test Accuracy": avg_test
                })

            print(f"  Optimal Regularization Parameter for Logistic Regression: C={optimal_log_param}\n Optimal Cross Validation Accuracy{opt_log_cross_val:.4f}")

        # if "Random Forest" in classifiers:
        #     optimal_max_feat = None
        #     opt_random_forest_cross_val = -1  
            
        #     # Iterate over different max_features values
        #     for max_features in [1, 2, 4, 6, 8, 12, 16, 20]:
        #         print(f"    Classifier: Random Forest (max_features={max_features})")
        #         random_forest_model = RandomForestClassifier(n_estimators=1024, max_features=max_features, random_state=42)
        
        #         trial_results = {
        #             "Train Accuracy": [],
        #             "Cross Validation Accuracy": [],
        #             "Test Accuracy": []
        #         }
        
        #         for trial in range(num_trials):
        #             X_train_full, X_test, y_train_full, y_test = train_test_split(
        #                 X, y.values.ravel(), test_size=test_size, random_state=42 + trial, stratify=y
        #             )
        
        #             k_fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42 + trial)
        #             cross_val_results = cross_val_score(random_forest_model, X_train_full, y_train_full, cv=k_fold, scoring="accuracy")
        #             avg_cross_val = np.mean(cross_val_results)
        #             trial_results["Cross Validation Accuracy"].append(avg_cross_val)
        
        #             random_forest_model.fit(X_train_full, y_train_full)
        
        #             y_train_full_pred = random_forest_model.predict(X_train_full)
        #             train_accuracy = accuracy_score(y_train_full, y_train_full_pred)
        #             trial_results["Train Accuracy"].append(train_accuracy)
        
        #             y_test_pred = random_forest_model.predict(X_test)
        #             test_accuracy = accuracy_score(y_test, y_test_pred)
        #             trial_results["Test Accuracy"].append(test_accuracy)
        
        #         avg_train = np.mean(trial_results["Train Accuracy"])
        #         avg_cross_val = np.mean(trial_results["Cross Validation Accuracy"])
        #         avg_test = np.mean(trial_results["Test Accuracy"])
        

        #         if avg_cross_val > opt_random_forest_cross_val:
        #             opt_random_forest_cross_val = avg_cross_val
        #             optimal_max_feat = max_features
        
        #         print(f"      Avg Train Accuracy: {avg_train:.4f}")
        #         print(f"      Avg Cross-Validation Accuracy: {avg_cross_val:.4f}")
        #         print(f"      Avg Test Accuracy: {avg_test:.4f}")
        
        #         results.append({
        #             "Dataset": dataset_name,
        #             "Split": split_name,
        #             "Classifier": f"Random Forest (max_features={max_features})",
        #             "Train Accuracy": avg_train,
        #             "Cross Validation Accuracy": avg_cross_val,
        #             "Test Accuracy": avg_test
        #         })
        
        #     print(f"  Optimal Max Features for Random Forest: max_features={optimal_max_feat}, \n Optimal Cross Validation Accuracy: {opt_random_forest_cross_val:.4f}")

        if "Decision Tree" in classifiers:
            opt_criterion = None
            opt_decision_cross_val = -1
            
            for criterion in ["gini", "entropy"]:
                print(f"    Classifier: Decision Tree (Criterion={criterion})")
                
                dt_model = DecisionTreeClassifier(random_state=42, criterion=criterion)

                trial_results = {
                    "Train Accuracy": [],
                    "Cross Validation Accuracy": [],
                    "Test Accuracy": []
                }

                for trial in range(num_trials):
                    X_train_full, X_test, y_train_full, y_test = train_test_split(
                        X, y.values.ravel(), test_size=test_size, random_state=42 + trial, stratify=y
                    )

                    k_fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42 + trial)
                    cross_val_results = cross_val_score(dt_model, X_train_full, y_train_full, cv=k_fold, scoring="accuracy")
                    avg_cross_val = np.mean(cross_val_results)
                    trial_results["Cross Validation Accuracy"].append(avg_cross_val)

                    dt_model.fit(X_train_full, y_train_full)

                    y_train_full_pred = dt_model.predict(X_train_full)
                    train_accuracy = accuracy_score(y_train_full, y_train_full_pred)
                    trial_results["Train Accuracy"].append(train_accuracy)

                    y_test_pred = dt_model.predict(X_test)
                    test_accuracy = accuracy_score(y_test, y_test_pred)
                    trial_results["Test Accuracy"].append(test_accuracy)

                avg_train = np.mean(trial_results["Train Accuracy"])
                avg_cross_val = np.mean(trial_results["Cross Validation Accuracy"])
                avg_test = np.mean(trial_results["Test Accuracy"])

                if avg_cross_val > opt_decision_cross_val:
                    opt_decision_cross_val = avg_cross_val
                    opt_criterion = criterion

                print(f"      Avg Train Accuracy: {avg_train:.4f}")
                print(f"      Avg Cross-Validation Accuracy: {avg_cross_val:.4f}")
                print(f"      Avg Test Accuracy: {avg_test:.4f}")

                # Store results
                results.append({
                    "Dataset": dataset_name,
                    "Split": split_name,
                    "Classifier": f"Decision Tree (Criterion={criterion})",
                    "Train Accuracy": avg_train,
                    "Cross Validation Accuracy": avg_cross_val,
                    "Test Accuracy": avg_test
                })

            print(f"  Optimal Criterion for Decision Tree: Criterion={opt_criterion}\n Optimal Cross Validation Accuracy: {opt_decision_cross_val:.4f}")


Occupancy Dataset

20/80
Logistic Regression (C=1e-08)
      Avg Train Accuracy: 0.7690
      Avg Cross-Validation Accuracy: 0.7690
      Avg Test Accuracy: 0.7690
Logistic Regression (C=1e-06)
      Avg Train Accuracy: 0.7690
      Avg Cross-Validation Accuracy: 0.7690
      Avg Test Accuracy: 0.7690
Logistic Regression (C=0.0001)
      Avg Train Accuracy: 0.8426
      Avg Cross-Validation Accuracy: 0.8155
      Avg Test Accuracy: 0.8436
Logistic Regression (C=0.01)
      Avg Train Accuracy: 0.9894
      Avg Cross-Validation Accuracy: 0.9894
      Avg Test Accuracy: 0.9886
Logistic Regression (C=1)
      Avg Train Accuracy: 0.9893
      Avg Cross-Validation Accuracy: 0.9893
      Avg Test Accuracy: 0.9883
Logistic Regression (C=100)
      Avg Train Accuracy: 0.9893
      Avg Cross-Validation Accuracy: 0.9893
      Avg Test Accuracy: 0.9884
Logistic Regression (C=10000)
      Avg Train Accuracy: 0.9893
      Avg Cross-Validation Accuracy: 0.9893
      Avg Test Accuracy: 0.9883
  Optim

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
import numpy as np

# 3 splits x 3 classifiers x 3 datasets x 3 trials
splits = {
    "20/80": 0.2,  # 20% test, 80% train
    "50/50": 0.5,  # 50% test, 50% train
    "80/20": 0.8   # 80% test, 20% train
}

#To be later initialized so can tune hyper parameters
classifiers = {
    "Logistic Regression": None,
    "Random Forest": None,
    "Decision Tree": None
}

datasets = {
    "Occupancy Dataset": (X_occupancy, y_occupancy),
    "Sepsis Dataset": (X_sepsis, y_sepsis),
    "Coupon Dataset": (X_coupons, y_coupons)
}

num_trials = 3

#results store values for later use when creating heatmap
results = []

for dataset_name, (X, y) in datasets.items():
    print(f"\n{dataset_name}")
    
    for split_name, test_size in splits.items():
        print(f"\n{split_name}")
        
        # if "Logistic Regression" in classifiers:
        #     optimal_log_param = None
        #     opt_log_cross_val = -1  # dummy value place holder

        #     #iterates through all possible regularization parameters
        #     for regularization in [10**-8, 10**-6, 10**-4, 10**-2, 1, 10**2, 10**4]:
        #         print(f"Logistic Regression (C={regularization})")
                
        #         log_model = LogisticRegression(max_iter=2000, solver="lbfgs", C=regularization)

     
        #         trial_results = {
        #             "Train Accuracy": [],
        #             "Cross Validation Accuracy": [],
        #             "Test Accuracy": []
        #         }

        #         #Iterates for 3 trials as indicated
        #         for trial in range(num_trials):
        #             X_train_full, X_test, y_train_full, y_test = train_test_split(
        #                 X, y.values.ravel(), test_size=test_size, random_state=42 + trial, stratify=y
        #             )
                    
        #             # 5-fold cross validation
        #             k_fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42 + trial)
        #             cross_val_results = cross_val_score(log_model, X_train_full, y_train_full, cv=k_fold, scoring="accuracy")
        #             avg_cross_val = np.mean(cross_val_results)
        #             trial_results["Cross Validation Accuracy"].append(avg_cross_val)

        #             #training the model
        #             log_model.fit(X_train_full, y_train_full)

        #             y_train_full_pred = log_model.predict(X_train_full)
        #             train_accuracy = accuracy_score(y_train_full, y_train_full_pred)
        #             trial_results["Train Accuracy"].append(train_accuracy)

        #             #evaluating tet
        #             y_test_pred = log_model.predict(X_test)
        #             test_accuracy = accuracy_score(y_test, y_test_pred)
        #             trial_results["Test Accuracy"].append(test_accuracy)

        #         # taking the average over the 3 trials
        #         avg_train = np.mean(trial_results["Train Accuracy"])
        #         avg_cross_val = np.mean(trial_results["Cross Validation Accuracy"])
        #         avg_test = np.mean(trial_results["Test Accuracy"])

        #         # select optimal parameter 
        #         if avg_cross_val > opt_log_cross_val:
        #             opt_log_cross_val = avg_cross_val
        #             optimal_log_param = regularization

        #         # Output results
        #         print(f"      Avg Train Accuracy: {avg_train:.4f}")
        #         print(f"      Avg Cross-Validation Accuracy: {avg_cross_val:.4f}")
        #         print(f"      Avg Test Accuracy: {avg_test:.4f}")

        #         results.append({
        #             "Dataset": dataset_name,
        #             "Split": split_name,
        #             "Classifier": f"Logistic Regression (C={regularization})",
        #             "Train Accuracy": avg_train,
        #             "Cross Validation Accuracy": avg_cross_val,
        #             "Test Accuracy": avg_test
        #         })

        #     print(f"  Optimal Regularization Parameter for Logistic Regression: C={optimal_log_param}\n Optimal Cross Validation Accuracy{opt_log_cross_val:.4f}")

        if "Random Forest" in classifiers:
            optimal_max_feat = None
            opt_random_forest_cross_val = -1  
            
            # Iterate over different max_features values
            for max_features in [1, 2, 4, 6, 8, 12, 16, 20]:
                print(f"    Classifier: Random Forest (max_features={max_features})")
                random_forest_model = RandomForestClassifier(n_estimators=1024, max_features=max_features, random_state=42)
        
                trial_results = {
                    "Train Accuracy": [],
                    "Cross Validation Accuracy": [],
                    "Test Accuracy": []
                }
        
                for trial in range(num_trials):
                    X_train_full, X_test, y_train_full, y_test = train_test_split(
                        X, y.values.ravel(), test_size=test_size, random_state=42 + trial, stratify=y
                    )
        
                    k_fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42 + trial)
                    cross_val_results = cross_val_score(random_forest_model, X_train_full, y_train_full, cv=k_fold, scoring="accuracy")
                    avg_cross_val = np.mean(cross_val_results)
                    trial_results["Cross Validation Accuracy"].append(avg_cross_val)
        
                    random_forest_model.fit(X_train_full, y_train_full)
        
                    y_train_full_pred = random_forest_model.predict(X_train_full)
                    train_accuracy = accuracy_score(y_train_full, y_train_full_pred)
                    trial_results["Train Accuracy"].append(train_accuracy)
        
                    y_test_pred = random_forest_model.predict(X_test)
                    test_accuracy = accuracy_score(y_test, y_test_pred)
                    trial_results["Test Accuracy"].append(test_accuracy)
        
                avg_train = np.mean(trial_results["Train Accuracy"])
                avg_cross_val = np.mean(trial_results["Cross Validation Accuracy"])
                avg_test = np.mean(trial_results["Test Accuracy"])
        

                if avg_cross_val > opt_random_forest_cross_val:
                    opt_random_forest_cross_val = avg_cross_val
                    optimal_max_feat = max_features
        
                print(f"      Avg Train Accuracy: {avg_train:.4f}")
                print(f"      Avg Cross-Validation Accuracy: {avg_cross_val:.4f}")
                print(f"      Avg Test Accuracy: {avg_test:.4f}")
        
                results.append({
                    "Dataset": dataset_name,
                    "Split": split_name,
                    "Classifier": f"Random Forest (max_features={max_features})",
                    "Train Accuracy": avg_train,
                    "Cross Validation Accuracy": avg_cross_val,
                    "Test Accuracy": avg_test
                })
        
            print(f"  Optimal Max Features for Random Forest: max_features={optimal_max_feat}, \n Optimal Cross Validation Accuracy: {opt_random_forest_cross_val:.4f}")

        # if "Decision Tree" in classifiers:
        #     opt_criterion = None
        #     opt_decision_cross_val = -1
            
        #     for criterion in ["gini", "entropy"]:
        #         print(f"    Classifier: Decision Tree (Criterion={criterion})")
                
        #         dt_model = DecisionTreeClassifier(random_state=42, criterion=criterion)

        #         trial_results = {
        #             "Train Accuracy": [],
        #             "Cross Validation Accuracy": [],
        #             "Test Accuracy": []
        #         }

        #         for trial in range(num_trials):
        #             X_train_full, X_test, y_train_full, y_test = train_test_split(
        #                 X, y.values.ravel(), test_size=test_size, random_state=42 + trial, stratify=y
        #             )

        #             k_fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42 + trial)
        #             cross_val_results = cross_val_score(dt_model, X_train_full, y_train_full, cv=k_fold, scoring="accuracy")
        #             avg_cross_val = np.mean(cross_val_results)
        #             trial_results["Cross Validation Accuracy"].append(avg_cross_val)

        #             dt_model.fit(X_train_full, y_train_full)

        #             y_train_full_pred = dt_model.predict(X_train_full)
        #             train_accuracy = accuracy_score(y_train_full, y_train_full_pred)
        #             trial_results["Train Accuracy"].append(train_accuracy)

        #             y_test_pred = dt_model.predict(X_test)
        #             test_accuracy = accuracy_score(y_test, y_test_pred)
        #             trial_results["Test Accuracy"].append(test_accuracy)

        #         avg_train = np.mean(trial_results["Train Accuracy"])
        #         avg_cross_val = np.mean(trial_results["Cross Validation Accuracy"])
        #         avg_test = np.mean(trial_results["Test Accuracy"])

        #         if avg_cross_val > opt_decision_cross_val:
        #             opt_decision_cross_val = avg_cross_val
        #             opt_criterion = criterion

        #         print(f"      Avg Train Accuracy: {avg_train:.4f}")
        #         print(f"      Avg Cross-Validation Accuracy: {avg_cross_val:.4f}")
        #         print(f"      Avg Test Accuracy: {avg_test:.4f}")

        #         # Store results
        #         results.append({
        #             "Dataset": dataset_name,
        #             "Split": split_name,
        #             "Classifier": f"Decision Tree (Criterion={criterion})",
        #             "Train Accuracy": avg_train,
        #             "Cross Validation Accuracy": avg_cross_val,
        #             "Test Accuracy": avg_test
        #         })

        #     print(f"  Optimal Criterion for Decision Tree: Criterion={opt_criterion}\n Optimal Cross Validation Accuracy: {opt_decision_cross_val:.4f}")


Occupancy Dataset

20/80
    Classifier: Random Forest (max_features=1)
      Avg Train Accuracy: 1.0000
      Avg Cross-Validation Accuracy: 0.9938
      Avg Test Accuracy: 0.9938
    Classifier: Random Forest (max_features=2)
      Avg Train Accuracy: 1.0000
      Avg Cross-Validation Accuracy: 0.9937
      Avg Test Accuracy: 0.9936
    Classifier: Random Forest (max_features=4)
      Avg Train Accuracy: 1.0000
      Avg Cross-Validation Accuracy: 0.9936
      Avg Test Accuracy: 0.9937
    Classifier: Random Forest (max_features=6)
      Avg Train Accuracy: 1.0000
      Avg Cross-Validation Accuracy: 0.9933
      Avg Test Accuracy: 0.9938
    Classifier: Random Forest (max_features=8)
      Avg Train Accuracy: 1.0000
      Avg Cross-Validation Accuracy: 0.9932
      Avg Test Accuracy: 0.9937
    Classifier: Random Forest (max_features=12)
      Avg Train Accuracy: 1.0000
      Avg Cross-Validation Accuracy: 0.9932
      Avg Test Accuracy: 0.9937
    Classifier: Random Forest (max_fe

In [None]:
hi geenaaaaa. see u at gradddd.