## Boosting

In [2]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt

%matplotlib inline

In [3]:
#Using the titanic dataset 
train = pd.read_csv('/Users/dariyab/Desktop/projector/ML/Decision Trees /titanic/train.csv')
train.shape
train.head() 

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
#Compare survival rates between men and women
train[['Sex','Survived']].groupby(['Sex'], as_index=False).mean().sort_values(by='Survived', ascending=False)


Unnamed: 0,Sex,Survived
0,female,0.742038
1,male,0.188908


In [5]:
#Compare survival rates between passanger classes 
train[['Pclass','Survived']].groupby(['Pclass'], as_index = False).mean().sort_values(by = 'Survived', ascending = False)

Unnamed: 0,Pclass,Survived
0,1,0.62963
1,2,0.472826
2,3,0.242363


In [8]:
#Preprocessing Data
#Selecting features
features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare']
X = train[features] #our features
y = train['Survived'] # the column we're trying to predict

In [9]:
#Filling missing values for some feature columns with the median of those columns. Robust to ouliers. 
X['Age'].fillna(X['Age'].median(), inplace = True) 
X['Fare'].fillna(X['Fare'].median(), inplace = True)
X

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare
0,3,male,22.0,1,0,7.2500
1,1,female,38.0,1,0,71.2833
2,3,female,26.0,0,0,7.9250
3,1,female,35.0,1,0,53.1000
4,3,male,35.0,0,0,8.0500
...,...,...,...,...,...,...
886,2,male,27.0,0,0,13.0000
887,1,female,19.0,0,0,30.0000
888,3,female,28.0,1,2,23.4500
889,1,male,26.0,0,0,30.0000


In [10]:
#Encoding the categorical variable of "Sex" into 0s and 1s. #1 means the person was male

X = pd.get_dummies(X, columns =['Sex'], drop_first = True).astype(int)
X

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex_male
0,3,22,1,0,7,1
1,1,38,1,0,71,0
2,3,26,0,0,7,0
3,1,35,1,0,53,0
4,3,35,0,0,8,1
...,...,...,...,...,...,...
886,2,27,0,0,13,1
887,1,19,0,0,30,0
888,3,28,1,2,23,0
889,1,26,0,0,30,1


## XGBoost

In [11]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold

In [89]:
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

#Hyperparameters

parameters = {
    #default
    "objective": "binary:logistic",
    "eta": 0.1,
    "verbosity": 0,
    "nthread": 4,
    "random_seed": 1,
    "eval_metric": "auc",
    'learning_rate': 0.1,
    'max_depth': 6,
    'subsample': 0.8, 
    'lambda': 1.0, 
    'alpha': 0.0,
    'colsample_bytree': 0.8,

}


xgb_train = xgb.DMatrix(X, y, feature_names=list(X.columns))

In [90]:
# Number of boosting rounds
num_boost_round = 100

# Early stopping after 10 rounds without improvement
early_stopping_rounds = 10

In [51]:
# Perform cross-validation
cv_results = xgb.cv(
    parameters,
    xgb_train,
    num_boost_round=num_boost_round,
    early_stopping_rounds=early_stopping_rounds,
    folds=skf,
    verbose_eval=10
)
cv_results

[0]	train-auc:0.78890+0.01382	test-auc:0.72639+0.02903
[10]	train-auc:0.91515+0.00527	test-auc:0.87025+0.02173
[14]	train-auc:0.92064+0.00554	test-auc:0.86800+0.02242


Unnamed: 0,train-auc-mean,train-auc-std,test-auc-mean,test-auc-std
0,0.788903,0.01382,0.726391,0.029028
1,0.888894,0.008247,0.852892,0.018806
2,0.893658,0.00623,0.862879,0.019556
3,0.895441,0.004924,0.863664,0.01782
4,0.902096,0.006044,0.86421,0.020851
5,0.907625,0.006705,0.870379,0.021898


In [52]:
optimal_boost_rounds = cv_results.shape[0]
print(f"Optimal number of boosting rounds: {optimal_boost_rounds}")

Optimal number of boosting rounds: 6


### Best performance using xgb boost is 92% mean accuracy and the number of boosting rounds is 6.

In [53]:
# Train the final model on the entire training data with the best boosting rounds
final_model = xgb.train(
    parameters,
    xgb_train,
    num_boost_round=optimal_boost_rounds
)



In [54]:
test = pd.read_csv('/Users/dariyab/Desktop/projector/ML/Decision Trees /titanic/test.csv')
features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare']
X_test = test[features]
X_test['Age'].fillna(X_test['Age'].median(), inplace = True) 
X_test['Fare'].fillna(X_test['Fare'].median(), inplace = True)
X_test.head()
X_test = pd.get_dummies(X_test, columns = ['Sex'], drop_first = True).astype(int)
#reordering X_test columns so that they match X
X_test = X_test[X.columns]

In [55]:
# Example: make predictions on a test set or new data
dtest = xgb.DMatrix(X_test)
y_pred_probs = final_model.predict(dtest)
y_pred = [1 if prob >= 0.5 else 0 for prob in y_pred_probs]


# Prepare submission
submission = pd.DataFrame({
    'PassengerId': test['PassengerId'],
    'Survived': y_pred
})

# Save the predictions to a CSV file
submission.to_csv('submission_xgb.csv', index=False)

### For the test set in kaggle, the best performance I got was 77% using xgb boost. 

## Light GBM model 
     

In [95]:
import lightgbm as lgb
n_rounds = 10000
parameters = {
    #default
    "objective": "binary",
    "learning_rate": 0.01,
    "num_threads": 10,
    "metric": "auc",
    "seed": 42,

    #regularization
    "colsample_bytree": 0.8,
    "subsample": 0.8,
    "subsample_freq": 1,
}

lgb_train = lgb.Dataset(X, label=y, free_raw_data=False)

result = lgb.cv(parameters, lgb_train, n_rounds, folds=skf, eval_train_metric=True, callbacks=[lgb.early_stopping(50), lgb.log_evaluation(period=100)])

Training until validation scores don't improve for 50 rounds
[100]	cv_agg's train auc: 0.906616 + 0.00458242	cv_agg's valid auc: 0.868208 + 0.0292248
[200]	cv_agg's train auc: 0.917436 + 0.00394297	cv_agg's valid auc: 0.872474 + 0.0279466
[300]	cv_agg's train auc: 0.92719 + 0.00359202	cv_agg's valid auc: 0.873952 + 0.0248364
Early stopping, best iteration is:
[259]	cv_agg's train auc: 0.923294 + 0.00393396	cv_agg's valid auc: 0.874612 + 0.0256799


In [96]:
print(f"Train auc:      {result['train auc-mean'][-1]:.4f}, std: {result['train auc-stdv'][-1]:.4f}")
print(f"Validation auc: {result['valid auc-mean'][-1]:.4f}, std: {result['valid auc-stdv'][-1]:.4f}")

Train auc:      0.9233, std: 0.0039
Validation auc: 0.8746, std: 0.0257


In [103]:
optimal_boost_rounds = len(result['train auc-mean'])
print(f"Optimal number of boosting rounds: {optimal_boost_rounds}")


Optimal number of boosting rounds: 259


### Best performance using lgb boost is 92% mean accuracy and the number of boosting rounds is 259.

In [108]:
final_model = lgb.train(
    parameters,
    lgb_train,
    num_boost_round=optimal_boost_rounds
)

# Predict probabilities on the test set
y_pred = final_model.predict(X_test)

# Convert probabilities to binary labels
y_pred = (y_pred > 0.5).astype(int)  # Convert probabilities to binary labels


In [109]:
# Prepare submission
submission = pd.DataFrame({
    'PassengerId': test['PassengerId'],
    'Survived': y_pred
})

# Save the predictions to a CSV file
submission.to_csv('submission_lgb.csv', index=False)

### For the test set in kaggle, the best performance I got was 76% using lgb boost. 

## Catboost

In [112]:
import catboost as ctb
from catboost import Pool

parameters = {
    "loss_function": "Logloss",
    "eval_metric": "AUC",
    "iterations": 1000,  
    "learning_rate": 0.01,
    "random_seed": 42,
    "od_wait": 30,
    "od_type": "Iter",
    "thread_count": 10
}
categorical_features_indices = np.where(X.dtypes != np.float64)[0]


In [116]:
ctb_data = ctb.Pool(X,y, cat_features=categorical_features_indices)
result = ctb.cv(ctb_data, parameters, folds=skf, seed=42, verbose_eval=100, plot=True)
result.loc[result["test-AUC-mean"] == result["test-AUC-mean"].max()]
optimal_iterations = result.loc[result["test-AUC-mean"] == result["test-AUC-mean"].max()]["iterations"].values[0]


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Training on fold [0/5]
0:	test: 0.8518445	best: 0.8518445 (0)	total: 121ms	remaining: 2m 1s

bestTest = 0.8566534914
bestIteration = 6

Training on fold [1/5]
0:	test: 0.8420455	best: 0.8420455 (0)	total: 1.89ms	remaining: 1.89s

bestTest = 0.8483957219
bestIteration = 20

Training on fold [2/5]
0:	test: 0.7824866	best: 0.7824866 (0)	total: 2.47ms	remaining: 2.47s

bestTest = 0.8131684492
bestIteration = 47

Training on fold [3/5]
0:	test: 0.8133690	best: 0.8133690 (0)	total: 3.55ms	remaining: 3.55s

bestTest = 0.8237299465
bestIteration = 9

Training on fold [4/5]
0:	test: 0.8546736	best: 0.8546736 (0)	total: 3.54ms	remaining: 3.54s

bestTest = 0.879337854
bestIteration = 9



### Best performance using lgb boost is 87% mean accuracy and the number of boosting rounds is 9.

In [117]:
final_model = ctb.CatBoostClassifier(
    iterations=optimal_iterations,  # Use the optimal boosting rounds
    learning_rate=parameters['learning_rate'],
    loss_function=parameters['loss_function'],
    eval_metric=parameters['eval_metric'],
    random_seed=parameters['random_seed'],
    od_wait=parameters['od_wait'],
    od_type=parameters['od_type'],
    thread_count=parameters['thread_count']
)

# Fit the model on the full dataset
final_model.fit(ctb_data, verbose_eval=100)

# Make predictions on test data
y_pred = final_model.predict(X_test)
y_pred_test_probs = final_model.predict_proba(X_test)[:, 1]  # Get probabilities for the positive class

# Convert probabilities to binary labels if needed
y_pred = (y_pred_test_probs > 0.5).astype(int)  # Thresholding at 0.5


0:	total: 17.6ms	remaining: 300ms
17:	total: 45.5ms	remaining: 0us


In [118]:
# Prepare submission
submission = pd.DataFrame({
    'PassengerId': test['PassengerId'],
    'Survived': y_pred
})

# Save the predictions to a CSV file
submission.to_csv('submission_ctb.csv', index=False)

### For the test set in kaggle, the best performance I got was 76% using lgb boost. 

## Conclusion
#### All three boosting models perform better than the random forest and decision tree on the training set. However when I import the predicted scores for the test sets into Kaggle, the models don't perform better than baggin or random forest. They are all around 77% which means that they are not very generalizable