## Boosting

In [1]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt

%matplotlib inline

In [2]:
#Using the titanic dataset 
train = pd.read_csv('/Users/dariyab/Desktop/projector/ML/Decision Trees /titanic/train.csv')
train.shape
train.head() 

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
#Compare survival rates between men and women
train[['Sex','Survived']].groupby(['Sex'], as_index=False).mean().sort_values(by='Survived', ascending=False)


Unnamed: 0,Sex,Survived
0,female,0.742038
1,male,0.188908


In [4]:
#Compare survival rates between passanger classes 
train[['Pclass','Survived']].groupby(['Pclass'], as_index = False).mean().sort_values(by = 'Survived', ascending = False)

Unnamed: 0,Pclass,Survived
0,1,0.62963
1,2,0.472826
2,3,0.242363


In [5]:
#Preprocessing Data
#Selecting features
features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare']
X = train[features] #our features
y = train['Survived'] # the column we're trying to predict

In [6]:
#Filling missing values for some feature columns with the median of those columns. Robust to ouliers. 
X['Age'].fillna(X['Age'].median(), inplace = True) 
X['Fare'].fillna(X['Fare'].median(), inplace = True)
X

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare
0,3,male,22.0,1,0,7.2500
1,1,female,38.0,1,0,71.2833
2,3,female,26.0,0,0,7.9250
3,1,female,35.0,1,0,53.1000
4,3,male,35.0,0,0,8.0500
...,...,...,...,...,...,...
886,2,male,27.0,0,0,13.0000
887,1,female,19.0,0,0,30.0000
888,3,female,28.0,1,2,23.4500
889,1,male,26.0,0,0,30.0000


In [7]:
#Encoding the categorical variable of "Sex" into 0s and 1s. #1 means the person was male

X = pd.get_dummies(X, columns =['Sex'], drop_first = True).astype(int)
X

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex_male
0,3,22,1,0,7,1
1,1,38,1,0,71,0
2,3,26,0,0,7,0
3,1,35,1,0,53,0
4,3,35,0,0,8,1
...,...,...,...,...,...,...
886,2,27,0,0,13,1
887,1,19,0,0,30,0
888,3,28,1,2,23,0
889,1,26,0,0,30,1


## XGBoost

In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold

In [9]:
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

#Hyperparameters

parameters = {
    #default
    "objective": "binary:logistic",
    "eta": 0.1,
    "verbosity": 0,
    "nthread": 4,
    "random_seed": 1,
    "eval_metric": "error",  # 1 - accuracy
    'learning_rate': 0.1,
    'max_depth': 6,
    'subsample': 0.8, 
    'lambda': 1.0, 
    'alpha': 0.0,
    'colsample_bytree': 0.8,

}


xgb_train = xgb.DMatrix(X, y, feature_names=list(X.columns))

In [10]:
# Number of boosting rounds
num_boost_round = 100

# Early stopping after 10 rounds without improvement
early_stopping_rounds = 10

In [11]:
# Perform cross-validation
cv_results = xgb.cv(
    parameters,
    xgb_train,
    num_boost_round=num_boost_round,
    early_stopping_rounds=early_stopping_rounds,
    folds=skf,
    verbose_eval=10,
    metrics = 'error'
)

# Convert error to accuracy and print results
cv_results['train_accuracy'] = 1 - cv_results['train-error-mean']
cv_results['test_accuracy'] = 1 - cv_results['test-error-mean']

# Print the cross-validation results
print(cv_results[['train_accuracy', 'test_accuracy']])


[0]	train-error:0.38384+0.00058	test-error:0.38384+0.00232
[10]	train-error:0.14478+0.00303	test-error:0.17623+0.01383
[20]	train-error:0.11560+0.00668	test-error:0.16388+0.02187
[30]	train-error:0.10831+0.00448	test-error:0.15713+0.01710
[40]	train-error:0.10522+0.00490	test-error:0.15491+0.02278
[41]	train-error:0.10662+0.00592	test-error:0.15491+0.02278
    train_accuracy  test_accuracy
0         0.616162       0.616163
1         0.616162       0.616163
2         0.734293       0.714902
3         0.791527       0.786762
4         0.799381       0.785638
5         0.808079       0.790120
6         0.817620       0.788984
7         0.821547       0.794595
8         0.835855       0.802448
9         0.852410       0.818160
10        0.855218       0.823771
11        0.864757       0.829377
12        0.866442       0.837242
13        0.869529       0.835001
14        0.872335       0.831630
15        0.875141       0.830513
16        0.876542       0.836131
17        0.877385       0.83

In [12]:
optimal_boost_rounds = cv_results.shape[0]
print(f"Optimal number of boosting rounds: {optimal_boost_rounds}")

Optimal number of boosting rounds: 33


### Best performance using xgb boost is 84% accuracy for the test set and the number of boosting rounds is 33.

In [13]:
# Train the final model on the entire training data with the best boosting rounds
final_model = xgb.train(
    parameters,
    xgb_train,
    num_boost_round=optimal_boost_rounds
)



In [14]:
test = pd.read_csv('/Users/dariyab/Desktop/projector/ML/Decision Trees /titanic/test.csv')
features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare']
X_test = test[features]
X_test['Age'].fillna(X_test['Age'].median(), inplace = True) 
X_test['Fare'].fillna(X_test['Fare'].median(), inplace = True)
X_test.head()
X_test = pd.get_dummies(X_test, columns = ['Sex'], drop_first = True).astype(int)
#reordering X_test columns so that they match X
X_test = X_test[X.columns]

In [15]:
# Example: make predictions on a test set or new data
dtest = xgb.DMatrix(X_test)
y_pred_probs = final_model.predict(dtest)
y_pred = [1 if prob >= 0.5 else 0 for prob in y_pred_probs]


# Prepare submission
submission = pd.DataFrame({
    'PassengerId': test['PassengerId'],
    'Survived': y_pred
})

# Save the predictions to a CSV file
submission.to_csv('submission_xgb.csv', index=False)

### For the test set in kaggle, the best performance I got was 77% using xgb boost. 

## Light GBM model 
     

In [16]:
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
import numpy as np


n_rounds = 10000
parameters = {
    #default
    "objective": "binary",
    "learning_rate": 0.01,
    "num_threads": 10,
    "metric": ["binary_logloss", "binary_error"],
    "seed": 42,
    "verbose": -1,

    #regularization
    "colsample_bytree": 0.8,
    "subsample": 0.8,
    "subsample_freq": 1,
}

lgb_train = lgb.Dataset(X, label=y, free_raw_data=False)

result = lgb.cv(parameters, lgb_train, n_rounds, folds=skf, eval_train_metric=True, callbacks=[lgb.early_stopping(50), lgb.log_evaluation(period=100)])


Training until validation scores don't improve for 50 rounds
[100]	cv_agg's train binary_logloss: 0.45257 + 0.00458119	cv_agg's train binary_error: 0.163863 + 0.00527041	cv_agg's valid binary_logloss: 0.477083 + 0.0130645	cv_agg's valid binary_error: 0.189674 + 0.0156404
[200]	cv_agg's train binary_logloss: 0.38195 + 0.00514119	cv_agg's train binary_error: 0.143659 + 0.00598672	cv_agg's valid binary_logloss: 0.427807 + 0.020531	cv_agg's valid binary_error: 0.170611 + 0.0146435
[300]	cv_agg's train binary_logloss: 0.345116 + 0.00524713	cv_agg's train binary_error: 0.131033 + 0.00484394	cv_agg's valid binary_logloss: 0.412951 + 0.0229443	cv_agg's valid binary_error: 0.160505 + 0.0166089
Early stopping, best iteration is:
[272]	cv_agg's train binary_logloss: 0.353562 + 0.00516507	cv_agg's train binary_error: 0.135242 + 0.00363877	cv_agg's valid binary_logloss: 0.415007 + 0.0225944	cv_agg's valid binary_error: 0.160505 + 0.011704


In [17]:
print(result.keys())


dict_keys(['train binary_logloss-mean', 'train binary_logloss-stdv', 'train binary_error-mean', 'train binary_error-stdv', 'valid binary_logloss-mean', 'valid binary_logloss-stdv', 'valid binary_error-mean', 'valid binary_error-stdv'])


In [18]:
print(f"Train error:      {result['train binary_logloss-mean'][-1]:.4f}, std: {result['train binary_logloss-stdv'][-1]:.4f}")
print(f"Validation error: {result['valid binary_logloss-mean'][-1]:.4f}, std: {result['valid binary_logloss-stdv'][-1]:.4f}")

Train error:      0.3536, std: 0.0052
Validation error: 0.4150, std: 0.0226


In [19]:
optimal_boost_rounds = len(result['train binary_logloss-mean'])
print(f"Optimal number of boosting rounds: {optimal_boost_rounds}")


Optimal number of boosting rounds: 272


In [20]:
# Get the best binary error (lowest validation error) for accuracy calculation
if 'valid binary_error-mean' in result:
    best_error = result['valid binary_error-mean'][-1]  # Error at the best iteration
    best_accuracy = 1 - best_error  # Accuracy is 1 - error
    print(f"Best Validation Binary Error: {best_error:.4f}")
    print(f"Best Validation Accuracy: {best_accuracy:.4f}")
else:
    print("Binary error metric is not available in the result.")

Best Validation Binary Error: 0.1605
Best Validation Accuracy: 0.8395


### Best performance using lgb boost is 84% accuracy on the validation sets and the number of boosting rounds is 272. 

In [21]:
final_model = lgb.train(
    parameters,
    lgb_train,
    num_boost_round=optimal_boost_rounds
)

# Predict probabilities on the test set
y_pred = final_model.predict(X_test)

# Convert probabilities to binary labels
y_pred = (y_pred > 0.5).astype(int)  # Convert probabilities to binary labels


In [22]:
# Prepare submission
submission = pd.DataFrame({
    'PassengerId': test['PassengerId'],
    'Survived': y_pred
})

# Save the predictions to a CSV file
submission.to_csv('submission_lgb.csv', index=False)

### For the test set in kaggle, the best performance I got was 76% using lgb boost. 

## Catboost

In [23]:
import catboost as ctb
from catboost import Pool

# Define parameters for CatBoost
parameters = {
    "loss_function": "Logloss",
    "eval_metric": "Accuracy",
    "iterations": 1000,  
    "learning_rate": 0.1,
    "random_seed": 42,
    "od_wait": 30,  # Number of iterations to wait for improvement
    "od_type": "Iter",  # Type of overfitting detector
    "thread_count": 10
}

# Identify categorical features
categorical_features_indices = np.where(X.dtypes != np.float64)[0]

# Prepare the Pool object for CatBoost
ctb_data = ctb.Pool(X, y, cat_features=categorical_features_indices)

# Create Stratified K-Folds
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Perform cross-validation
result = ctb.cv(ctb_data, parameters, folds=skf, seed=42, verbose_eval=100, plot=True)

# Identify the best iteration
best_iteration = result.loc[result["test-Accuracy-mean"].idxmax()]
print("Best Iteration:", best_iteration["iterations"])
print("Best Accuracy:", best_iteration["test-Accuracy-mean"])

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Training on fold [0/5]
0:	learn: 0.8061798	test: 0.7765363	best: 0.7765363 (0)	total: 63.9ms	remaining: 1m 3s

bestTest = 0.8324022346
bestIteration = 24

Training on fold [1/5]
0:	learn: 0.8078541	test: 0.7865169	best: 0.7865169 (0)	total: 2.59ms	remaining: 2.58s

bestTest = 0.8202247191
bestIteration = 47

Training on fold [2/5]
0:	learn: 0.8064516	test: 0.7921348	best: 0.7921348 (0)	total: 2.73ms	remaining: 2.72s

bestTest = 0.808988764
bestIteration = 50

Training on fold [3/5]
0:	learn: 0.7966339	test: 0.8146067	best: 0.8146067 (0)	total: 19.6ms	remaining: 19.6s

bestTest = 0.8146067416
bestIteration = 0

Training on fold [4/5]
0:	learn: 0.7868163	test: 0.8258427	best: 0.8258427 (0)	total: 2.28ms	remaining: 2.28s

bestTest = 0.8539325843
bestIteration = 23

Best Iteration: 27.0
Best Accuracy: 0.8192894356914193


### Best performance using  catboost is 82% mean accuracy and the number of boosting rounds is 27.

In [24]:
final_model = ctb.CatBoostClassifier(
    iterations=best_iteration[0],  # Use the optimal boosting rounds
    learning_rate=parameters['learning_rate'],
    loss_function=parameters['loss_function'],
    eval_metric=parameters['eval_metric'],
    random_seed=parameters['random_seed'],
    od_wait=parameters['od_wait'],
    od_type=parameters['od_type'],
    thread_count=parameters['thread_count']
)

# Fit the model on the full dataset
final_model.fit(ctb_data, verbose_eval=100)

# Make predictions on test data
y_pred = final_model.predict(X_test)
y_pred_test_probs = final_model.predict_proba(X_test)[:, 1]  # Get probabilities for the positive class

# Convert probabilities to binary labels if needed
y_pred = (y_pred_test_probs > 0.5).astype(int)  # Thresholding at 0.5


0:	learn: 0.7878788	total: 2.39ms	remaining: 62.2ms
26:	learn: 0.8069585	total: 37.1ms	remaining: 0us


In [25]:
# Prepare submission
submission = pd.DataFrame({
    'PassengerId': test['PassengerId'],
    'Survived': y_pred
})

# Save the predictions to a CSV file
submission.to_csv('submission_ctb.csv', index=False)

### For the test set in kaggle, the best performance I got was 76% using lgb boost. 

## Compaing XGBoost with RandomForest

In [26]:
#Comparing XGBoost with RandomForest 
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# Initialize Random Forest with basic hyperparameters
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Perform cross-validation to get validation score
rf_scores = cross_val_score(rf_model, X, y, cv=skf, scoring='accuracy')

# Print Random Forest validation accuracy
print(f"Random Forest Validation Accuracy: {rf_scores.mean():.4f}")

Random Forest Validation Accuracy: 0.8070


In [27]:
#Hyperparameters

parameters = {
    #default
    "objective": "binary:logistic",
    "eta": 0.1,
    "verbosity": 0,
    "nthread": 4,
    "random_seed": 1,
    "eval_metric": "error",  # 1 - accuracy
    'learning_rate': 0.1,
    'max_depth': 6,
    'subsample': 0.8, 
    'lambda': 1.0, 
    'alpha': 0.0,
    'colsample_bytree': 0.8,

}


xgb_train = xgb.DMatrix(X, y, feature_names=list(X.columns))

In [28]:
# Number of boosting rounds
num_boost_round = 100

# Early stopping after 10 rounds without improvement
early_stopping_rounds = 10
# Perform cross-validation
cv_results = xgb.cv(
    parameters,
    xgb_train,
    num_boost_round=num_boost_round,
    early_stopping_rounds=early_stopping_rounds,
    folds=skf,
    verbose_eval=10,
    metrics = 'error'
)

# Convert error to accuracy and print results
cv_results['train_accuracy'] = 1 - cv_results['train-error-mean']
cv_results['test_accuracy'] = 1 - cv_results['test-error-mean']

# Print the cross-validation results
print(cv_results[['train_accuracy', 'test_accuracy']])


[0]	train-error:0.38384+0.00058	test-error:0.38384+0.00232
[10]	train-error:0.14478+0.00303	test-error:0.17623+0.01383
[20]	train-error:0.11560+0.00668	test-error:0.16388+0.02187
[30]	train-error:0.10831+0.00448	test-error:0.15713+0.01710
[40]	train-error:0.10522+0.00490	test-error:0.15491+0.02278
[42]	train-error:0.10578+0.00425	test-error:0.15491+0.02278
    train_accuracy  test_accuracy
0         0.616162       0.616163
1         0.616162       0.616163
2         0.734293       0.714902
3         0.791527       0.786762
4         0.799381       0.785638
5         0.808079       0.790120
6         0.817620       0.788984
7         0.821547       0.794595
8         0.835855       0.802448
9         0.852410       0.818160
10        0.855218       0.823771
11        0.864757       0.829377
12        0.866442       0.837242
13        0.869529       0.835001
14        0.872335       0.831630
15        0.875141       0.830513
16        0.876542       0.836131
17        0.877385       0.83

In [29]:
print(f"XGBoost scores: {cv_results['test_accuracy'].mean():.4f}")

XGBoost scores: 0.8110


## Conclusion
#### XGBoost performs slightly better than the random forest and decision tree on the training set. However when I import the predicted scores for the test sets into Kaggle, the models don't perform better than bagging or random forest. They are all around 77% which means that they are not very generalizable