In [None]:
# load train data
# reuse the preprocessing approach from the previous homework
# reuse validation approach from the previous homework. 
# it should be exactly the same because we want to compare the models
     

In [90]:

import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt

%matplotlib inline

In [91]:
#Using the titanic dataset 
train = pd.read_csv('/Users/dariyab/Desktop/projector/ML/Decision Trees /titanic/train.csv')
train.shape
train.head() 


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [92]:
#Compare survival rates between men and women
train[['Sex','Survived']].groupby(['Sex'], as_index=False).mean().sort_values(by='Survived', ascending=False)


Unnamed: 0,Sex,Survived
0,female,0.742038
1,male,0.188908


In [93]:
#Compare survival rates between passanger classes 
train[['Pclass','Survived']].groupby(['Pclass'], as_index = False).mean().sort_values(by = 'Survived', ascending = False)

Unnamed: 0,Pclass,Survived
0,1,0.62963
1,2,0.472826
2,3,0.242363


In [119]:
#Preprocessing Data
#Selecting features
features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare']
X = train[features] #our features
y = train['Survived'] # the column we're trying to predict

In [120]:
#Filling missing values for some feature columns with the median of those columns. Robust to ouliers. 
X['Age'].fillna(X['Age'].median(), inplace = True) 
X['Fare'].fillna(X['Fare'].median(), inplace = True)
X

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare
0,3,male,22.0,1,0,7.2500
1,1,female,38.0,1,0,71.2833
2,3,female,26.0,0,0,7.9250
3,1,female,35.0,1,0,53.1000
4,3,male,35.0,0,0,8.0500
...,...,...,...,...,...,...
886,2,male,27.0,0,0,13.0000
887,1,female,19.0,0,0,30.0000
888,3,female,28.0,1,2,23.4500
889,1,male,26.0,0,0,30.0000


In [121]:
#Encoding the categorical variable of "Sex" into 0s and 1s. #1 means the person was male

X = pd.get_dummies(X, columns =['Sex'], drop_first = True).astype(int)
X

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex_male
0,3,22,1,0,7,1
1,1,38,1,0,71,0
2,3,26,0,0,7,0
3,1,35,1,0,53,0
4,3,35,0,0,8,1
...,...,...,...,...,...,...
886,2,27,0,0,13,1
887,1,19,0,0,30,0
888,3,28,1,2,23,0
889,1,26,0,0,30,1


In [97]:

# define the bagging model (from sklearn)
# define the hyperparameters grid
# define the grid search with cross validation using previously defined validation method
# train the model
# print the best hyperparameters
# print the best score on train and validation data, estimate the generalization error

In [122]:
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.ensemble import BaggingClassifier 
from sklearn.tree import DecisionTreeClassifier 
from sklearn.metrics import accuracy_score 

dt = DecisionTreeClassifier()

#Bagging model with the decision tree 
bagging_model = BaggingClassifier(base_estimator = dt, n_estimators = 25, random_state = 42 )

#Stratified K-fold validation 
skf = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 42) 

#Hyperparameters for tuning the bagging model
parameters = {
    "max_features": [0.7, 0.8, 0.9], 
    "max_samples": [0.7, 0.8, 0.9], 
    "base_estimator__max_depth": [None, 4, 8, 12, 15],
    "base_estimator__min_samples_leaf": [1, 2, 3, 5, 8]
}

#define grid search 

grid_search = GridSearchCV(estimator = bagging_model, 
                           param_grid = parameters, 
                           cv = skf, 
                           scoring = 'accuracy')

#train the model                           
grid_search.fit(X_train, y_train)

#get the best parameters 
best_params = grid_search.best_params_
best_score = grid_search.best_score_ 

print("Best Hyperparameters:", best_params)
print("Best accuracy: ", best_score) 


Best Hyperparameters: {'base_estimator__max_depth': None, 'base_estimator__min_samples_leaf': 3, 'max_features': 0.7, 'max_samples': 0.8}
Best accuracy:  0.846218065407068


In [146]:
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, ClassifierMixin, clone
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score

# Custom Bagging Classifier
class CustomBaggingClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self, base_estimator=None, n_estimators=25, max_samples=0.8, max_features=0.8):
        self.base_estimator = base_estimator if base_estimator is not None else DecisionTreeClassifier()
        self.n_estimators = n_estimators
        self.max_samples = max_samples
        self.max_features = max_features
        self.estimators = []

    def fit(self, X, y):
        n_samples, n_features = X.shape
        max_samples = int(self.max_samples * n_samples)
        max_features = int(self.max_features * n_features)

        # Train the estimator on drawn samples and features
        for i in range(self.n_estimators):
            # Draw samples with replacement
            sample_indices = np.random.choice(n_samples, max_samples, replace=True)
            X_sample = X.iloc[sample_indices]
            y_sample = y.iloc[sample_indices]

            # Draw features without replacement
            feature_indices = np.random.choice(n_features, max_features, replace=False)
            X_sample = X_sample.iloc[:, feature_indices]

            # Clone the base estimator and fit it
            estimator = clone(self.base_estimator)
            estimator.fit(X_sample, y_sample)
            self.estimators.append((estimator, feature_indices))

    def predict(self, X):
        predictions = np.array([estimator.predict(X.iloc[:, feature_indices]) for estimator, feature_indices in self.estimators])
        return np.array([np.bincount(pred).argmax() for pred in predictions.T])  # Majority voting

    def predict_proba(self, X):
        proba_sum = np.zeros((X.shape[0], len(np.unique([estimator[0].classes_ for estimator in self.estimators]))))
        for estimator, feature_indices in self.estimators:
            proba_sum += estimator.predict_proba(X.iloc[:, feature_indices])
        return proba_sum / self.n_estimators

    def get_params(self, deep=True):
        return {"base_estimator": self.base_estimator, "n_estimators": self.n_estimators,
                "max_samples": self.max_samples, "max_features": self.max_features}

    def set_params(self, **params):
        for key, value in params.items():
            setattr(self, key, value)
        return self

In [147]:

# Define Stratified K-Fold
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Initialize the base estimator
base_estimator = DecisionTreeClassifier()

# Hyperparameters to tune
param_grid = {
    'n_estimators': [10, 25],  # Number of trees in the bagging classifier
    'max_depth': [None, 5, 10],  # Maximum depth of the tree
    'min_samples_split': [2, 5]  # Minimum samples required to split an internal node
}
# Best score and hyperparameters for reference
best_score = 0
best_hyperparams = {}
for fold, (train_index, val_index) in enumerate(skf.split(X, y)):
    for n_estimators in param_grid['n_estimators']:
        for max_depth in param_grid['max_depth']:
            for min_samples_split in param_grid['min_samples_split']:
                # Create a new instance of the base estimator with the current hyperparameters
                base_estimator = DecisionTreeClassifier(max_depth=max_depth, min_samples_split=min_samples_split)
                
                # Create the custom bagging classifier with the current hyperparameters
                custom_bagging_model = CustomBaggingClassifier(base_estimator=base_estimator, n_estimators=n_estimators)

                # Fit the model on the current training fold
                custom_bagging_model.fit(X_train_fold, y_train_fold)

                # Make predictions on the validation fold
                y_val_pred = custom_bagging_model.predict(X_val_fold)

                # Calculate accuracy for the current fold
                val_score = accuracy_score(y_val_fold, y_val_pred)

               # print(f"Fold {fold + 1} Accuracy with n_estimators={n_estimators}, max_depth={max_depth}, min_samples_split={min_samples_split}: {val_score}")

                # Update best score and hyperparameters if current score is better
                if val_score > best_score:
                    best_score = val_score
                    best_hyperparams = {'n_estimators': n_estimators, 'max_depth': max_depth, 'min_samples_split': min_samples_split}

print("Best Accuracy:", best_score)
print('Best Hyperparameters:', best_hyperparams)
        


Best Accuracy: 0.8651685393258427
Best Hyperparameters: {'n_estimators': 25, 'max_depth': 5, 'min_samples_split': 5}


In [150]:
#using the scikitlearn random forest 
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Define Stratified K-Fold
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Define a Random Forest Classifier
rf_model = RandomForestClassifier(random_state=42)

# Best score and hyperparameters for reference
best_score = 0
best_hyperparams = {}

# Hyperparameters to tune
param_grid = {
    'n_estimators': [10, 25, 50],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5]
}

# Looping through each of the validation and train sets
for fold, (train_index, val_index) in enumerate(skf.split(X, y)):
    X_train_fold, X_val_fold = X.iloc[train_index], X.iloc[val_index]
    y_train_fold, y_val_fold = y.iloc[train_index], y.iloc[val_index]

    # Loop through the hyperparameters
    for n_estimators in param_grid['n_estimators']:
        for max_depth in param_grid['max_depth']:
            for min_samples_split in param_grid['min_samples_split']:
                # Create a new Random Forest model with current hyperparameters
                rf_model.set_params(n_estimators=n_estimators, max_depth=max_depth, min_samples_split=min_samples_split)

                # Fit the model on the current training fold
                rf_model.fit(X_train_fold, y_train_fold)

                # Make predictions on the validation fold
                y_val_pred = rf_model.predict(X_val_fold)

                # Calculate accuracy for the current fold
                val_score = accuracy_score(y_val_fold, y_val_pred)


                # Update best score and hyperparameters if current score is better
                if val_score > best_score:
                    best_score = val_score
                    best_hyperparams = {'n_estimators': n_estimators, 'max_depth': max_depth, 'min_samples_split': min_samples_split}

# Final evaluation on the entire training data with the best hyperparameters
rf_model.set_params(**best_hyperparams)
rf_model.fit(X, y)

# Print results
print("Best Accuracy on Validation Data:", best_score)
print("Best Hyperparameters:", best_hyperparams)


Best Accuracy on Validation Data: 0.8707865168539326
Best Hyperparameters: {'n_estimators': 50, 'max_depth': 10, 'min_samples_split': 5}


In [152]:
#Random forest custom 
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, ClassifierMixin, clone
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score

# Custom Random Forest Classifier
class CustomRandomForestClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self, base_estimator=None, n_estimators=25, max_samples=0.8, max_features=0.8):
        self.base_estimator = base_estimator if base_estimator is not None else DecisionTreeClassifier()
        self.n_estimators = n_estimators
        self.max_samples = max_samples
        self.max_features = max_features
        self.estimators = []

    def fit(self, X, y):
        n_samples, n_features = X.shape
        max_samples = int(self.max_samples * n_samples)
        max_features = int(self.max_features * n_features)

        # Train the estimator on drawn samples and features
        for _ in range(self.n_estimators):
            # Draw samples with replacement
            sample_indices = np.random.choice(n_samples, max_samples, replace=True)
            X_sample = X.iloc[sample_indices]
            y_sample = y.iloc[sample_indices]

            # Draw features without replacement
            feature_indices = np.random.choice(n_features, max_features, replace=False)
            X_sample = X_sample.iloc[:, feature_indices]

            # Clone the base estimator and fit it
            estimator = clone(self.base_estimator)
            estimator.fit(X_sample, y_sample)
            self.estimators.append((estimator, feature_indices))

    def predict(self, X):
        predictions = np.array([estimator.predict(X.iloc[:, feature_indices]) for estimator, feature_indices in self.estimators])
        return np.array([np.bincount(pred).argmax() for pred in predictions.T])  # Majority voting

    def predict_proba(self, X):
        proba_sum = np.zeros((X.shape[0], len(np.unique([estimator[0].classes_ for estimator in self.estimators]))))
        for estimator, feature_indices in self.estimators:
            proba_sum += estimator.predict_proba(X.iloc[:, feature_indices])
        return proba_sum / self.n_estimators



In [155]:
#Custom random forest model on titanic dataset
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Hyperparameters to tune
param_grid = {
    'n_estimators': [10, 25],  # Number of trees in the random forest
    'max_depth': [None, 5, 10],  # Maximum depth of the tree
    'min_samples_split': [2, 5]  # Minimum samples required to split an internal node
}

# Best score and hyperparameters for reference
best_score = 0
best_hyperparams = {}

# Looping through each of the validation and train sets
for fold, (train_index, val_index) in enumerate(skf.split(X, y)):
    X_train_fold, X_val_fold = X.iloc[train_index], X.iloc[val_index]
    y_train_fold, y_val_fold = y.iloc[train_index], y.iloc[val_index]

    # Loop through the hyperparameters
    for n_estimators in param_grid['n_estimators']:
        for max_depth in param_grid['max_depth']:
            for min_samples_split in param_grid['min_samples_split']:
                # Create a new instance of the base estimator with the current hyperparameters
                base_estimator = DecisionTreeClassifier(max_depth=max_depth, min_samples_split=min_samples_split)
                
                custom_random_forest_model = CustomRandomForestClassifier(base_estimator=base_estimator, n_estimators=n_estimators)

                # Fit the model 
                custom_random_forest_model.fit(X_train_fold, y_train_fold)

                # Make predictions
                y_val_pred = custom_random_forest_model.predict(X_val_fold)

                # Calculate accuracy 
                val_score = accuracy_score(y_val_fold, y_val_pred)

                if val_score > best_score:
                    best_score = val_score
                    best_hyperparams = {'n_estimators': n_estimators, 'max_depth': max_depth, 'min_samples_split': min_samples_split}

# Final evaluation on the entire training data with the best hyperparameters
best_base_estimator = DecisionTreeClassifier(max_depth=best_hyperparams['max_depth'], min_samples_split=best_hyperparams['min_samples_split'])
best_model = CustomRandomForestClassifier(base_estimator=best_base_estimator, n_estimators=best_hyperparams['n_estimators'])
best_model.fit(X, y)



# Print results
print("Best Accuracy on Validation Data:", best_score)
print("Best Hyperparameters:", best_hyperparams)

# Estimate generalization error
generalization_error = 1 - final_val_score
print("Estimated Generalization Error:", generalization_error)

Best Accuracy on Validation Data: 0.8547486033519553
Best Hyperparameters: {'n_estimators': 25, 'max_depth': None, 'min_samples_split': 2}
Estimated Generalization Error: 0.0786516853932584


In [156]:
#test dataset 
test = pd.read_csv('/Users/dariyab/Desktop/projector/ML/Decision Trees /titanic/test.csv')
features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare']
X_test = test[features]
X_test['Age'].fillna(X_test['Age'].median(), inplace = True) 
X_test['Fare'].fillna(X_test['Fare'].median(), inplace = True)
X_test.head()
X_test = pd.get_dummies(X_test, columns = ['Sex'], drop_first = True).astype(int)
#reordering X_test columns so that they match X_train
X_test = X_test[X_train.columns]

In [159]:
#Now let's test our best model on the test dataset: 
# Make predictions on the test data
predictions = best_model.predict(X_test)

# Prepare submission
submission = pd.DataFrame({
    'PassengerId': test['PassengerId'],
    'Survived': predictions
})

# Save the predictions to a CSV file
submission.to_csv('submission_3.csv', index=False)

### After submitting to Kaggle, my score for the best model was unfortunately only 74% on the test dataset. I think it's because I had a hard time using gridsearch CV with my custom scripts and couldn't tune the hyperparameters properly. 