In [21]:
from sklearn.preprocessing import LabelEncoder


# Apply the mapping to both the training and test data
def map_ordinal_features(df, mappings):
    """Re-code the listed columns of ``df`` through their lookup tables.

    Args:
        df: DataFrame to encode. It is modified in place.
        mappings: ``{column name: {raw value: code}}``. Every key must be a
            column of ``df``.

    Returns:
        The same ``df`` object, with each listed column replaced by the
        result of ``Series.map`` on its lookup table. With a plain ``dict``
        lookup, pandas maps values that have no entry to NaN.
    """
    for column, codes in mappings.items():
        encoded = df[column].map(codes)
        df[column] = encoded
    return df

def frequency_encoding(train_df, test_df, column):
    """Replace a categorical column by each category's relative training frequency.

    The frequency table is computed from ``train_df`` only and then applied
    to both frames, so no information from the test data is used.

    Args:
        train_df: Training DataFrame. ``column`` is overwritten in place.
        test_df: Test DataFrame. ``column`` is overwritten in place.
        column: Name of the categorical column present in both frames.

    Returns:
        ``(train_df, test_df)`` -- the same two objects that were passed in.
        Test categories that never occur in the training data are encoded as
        0.0 (their observed training frequency). Values that were missing
        to begin with stay NaN in both frames.
    """
    # value_counts ignores missing values, so they get no entry in the table.
    freq_map = train_df[column].value_counts(normalize=True).to_dict()

    train_df[column] = train_df[column].map(freq_map)

    test_values = test_df[column]
    test_encoded = test_values.map(freq_map)
    # A present-but-unmapped value is a category the training data never
    # contained; without this step it would silently turn into NaN.
    unseen = test_values.notna() & test_encoded.isna()
    test_df[column] = test_encoded.mask(unseen, 0.0)

    return train_df, test_df

def binary_encoding(df, mappings):
    """Encode two-valued columns of ``df`` via per-column lookup tables.

    Args:
        df: DataFrame to encode. It is modified in place.
        mappings: ``{column name: {raw value: code}}``, e.g.
            ``{'Sex': {'male': 1, 'female': 0}}``.

    Returns:
        The same ``df`` object with every listed column passed through
        ``Series.map`` on its lookup table.
    """
    for feature in mappings:
        df[feature] = df[feature].map(mappings[feature])
    return df

# Titanic Survival Prediction

In [22]:
# Clone the GitHub repository
!git clone https://github.com/geopan2000/TitanicSurvival.git

# Change directory to the cloned repository
%cd TitanicSurvival

Cloning into 'TitanicSurvival'...
remote: Enumerating objects: 15, done.[K
remote: Counting objects: 100% (15/15), done.[K
remote: Compressing objects: 100% (13/13), done.[K
remote: Total 15 (delta 1), reused 0 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (15/15), 38.02 KiB | 9.50 MiB/s, done.
Resolving deltas: 100% (1/1), done.
/content/TitanicSurvival/TitanicSurvival/TitanicSurvival


In [23]:
import pandas as pd

In [24]:
# Load the labelled training set and the unlabelled test set from the
# repository's data/ directory (paths are relative to the working directory).
train_data = pd.read_csv('data/train.csv')
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() only appended a stray "None" line.
train_data.info()
test_data = pd.read_csv('data/test.csv')
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pcl

## Data Pre-Processing

In [25]:
# Impute missing ages with the mean age of the TRAINING passengers, then
# divide by 100 to bring the feature down to roughly the 0-1 range.
#
# The mean is taken once, before any scaling. Previously the test set was
# filled with the mean of the already-scaled training column and then divided
# by 100 again, so every imputed test age ended up 100 times too small.
age_mean = train_data['Age'].mean()

train_data['Age'] = train_data['Age'].fillna(age_mean) / 100
test_data['Age'] = test_data['Age'].fillna(age_mean) / 100

In [26]:
# Apply one-hot encoding on train and test data.
# Training rows without a recorded port of embarkation are dropped first.
train_data = train_data.dropna(subset=['Embarked'])
train_data = pd.get_dummies(train_data, columns=['Embarked'])
test_data = pd.get_dummies(test_data, columns=['Embarked'])

# Impute missing test fares from the training mean, matching how 'Age' is
# imputed: statistics come from the training data only.
test_data['Fare'] = test_data['Fare'].fillna(train_data['Fare'].mean())

# Free-text / high-cardinality columns that are not used as features.
unused_columns = ['Cabin', 'Name', 'Ticket']
train_data = train_data.drop(columns=unused_columns)
test_data = test_data.drop(columns=unused_columns)

# get_dummies is applied to each frame separately, so the dummy columns are
# only identical when both frames contain the same set of ports. Align the
# test frame to the training feature columns (same names, same order): a port
# missing from the test data becomes an all-False column, and a port unknown
# to the training data is dropped.
feature_columns = [col for col in train_data.columns if col != 'Survived']
test_data = test_data.reindex(columns=feature_columns, fill_value=False)


In [27]:
# Encode 'Sex' as an integer flag -- 'male' becomes 1 and 'female' becomes 0 --
# using one shared lookup so the training and test frames are coded the same.
sex_codes = {'male': 1, 'female': 0}
for frame in (train_data, test_data):
    frame['Sex'] = frame['Sex'].map(sex_codes)

In [28]:
# Sanity-check the pre-processed frames: column set, dtypes and null counts.
# info() prints its own report and returns None, so it is not wrapped in
# print() (which only added a stray "None" line after each report).
train_data.info()

test_data.info()


<class 'pandas.core.frame.DataFrame'>
Index: 889 entries, 0 to 890
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  889 non-null    int64  
 1   Survived     889 non-null    int64  
 2   Pclass       889 non-null    int64  
 3   Sex          889 non-null    int64  
 4   Age          889 non-null    float64
 5   SibSp        889 non-null    int64  
 6   Parch        889 non-null    int64  
 7   Fare         889 non-null    float64
 8   Embarked_C   889 non-null    bool   
 9   Embarked_Q   889 non-null    bool   
 10  Embarked_S   889 non-null    bool   
dtypes: bool(3), float64(2), int64(6)
memory usage: 65.1 KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Sex        

In [29]:
# Separate the label from the features: pop() returns the 'Survived' column
# and removes it from train_data in the same step.
target_column = train_data.pop('Survived')

In [30]:
import numpy as np
import pandas as pd
import xgboost as xgb

!pip install scikit-optimize
!pip install shap
from skopt import BayesSearchCV
from skopt.space import Real, Integer

from sklearn.model_selection import KFold
from skopt import BayesSearchCV
from skopt.space import Real, Integer

Collecting scikit-optimize
  Downloading scikit_optimize-0.10.2-py2.py3-none-any.whl.metadata (9.7 kB)
Collecting pyaml>=16.9 (from scikit-optimize)
  Downloading pyaml-24.9.0-py3-none-any.whl.metadata (11 kB)
Downloading scikit_optimize-0.10.2-py2.py3-none-any.whl (107 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/107.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m107.8/107.8 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyaml-24.9.0-py3-none-any.whl (24 kB)
Installing collected packages: pyaml, scikit-optimize
Successfully installed pyaml-24.9.0 scikit-optimize-0.10.2


In [32]:
# Check what GPU is available in Google Colab
!nvidia-smi

# Define parameter search space
search_spaces = {
    'learning_rate': Real(0.001, 0.5, 'uniform'),
    'max_depth': Integer(1, 70),
    'n_estimators': Integer(100, 800),
}

# Initialize the XGBClassifier model with GPU support
model = xgb.XGBClassifier(
    objective='binary:logistic',  # Use for binary classification
    tree_method='gpu_hist',  # Use GPU for training
    verbosity=2
)

kf = KFold(n_splits=10)  # 10-fold cross-validation

# Using 'accuracy' as the scoring metric (can be changed to roc_auc, f1, etc.)
optimizer = BayesSearchCV(
    estimator=model,
    search_spaces=search_spaces,
    n_iter=64,
    cv=kf,
    scoring='accuracy',  # Use classification accuracy as the scoring metric
    verbose=1,
    n_jobs=-1,
    return_train_score=True
)

# Fit the optimizer using X_train and y_train (classification labels, not log-transformed)
optimizer.fit(train_data, target_column)

# Get the best model
best_model = optimizer.best_estimator_

# Get cross-validation results
cv_results = optimizer.cv_results_

# Print best parameters and mean accuracy
mean_accuracy = cv_results['mean_test_score']  # Mean accuracy for each fold

print(f"Best parameters: {optimizer.best_params_}")
print(f"Mean Cross-validation Accuracy: {mean_accuracy.mean():.4f}")
print(f"Cross-validation Accuracy for each iteration: {mean_accuracy}")

print("Optimization complete. Best XGBoost classifier model saved.")


Wed Oct 23 13:08:23 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   58C    P8              12W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    


    E.g. tree_method = "hist", device = "cuda"



Best parameters: OrderedDict([('learning_rate', 0.02141371663804202), ('max_depth', 28), ('n_estimators', 110)])
Mean Cross-validation Accuracy: 0.7877
Cross-validation Accuracy for each iteration: [0.78855975 0.7862998  0.78628703 0.77846016 0.78516343 0.79642492
 0.79304137 0.77056946 0.78854699 0.77506384 0.80879724 0.82228039
 0.81778601 0.61756895 0.61756895 0.61756895 0.80203013 0.78854699
 0.82114402 0.79980848 0.7862998  0.81666241 0.81892237 0.8020429
 0.78628703 0.82000766 0.80542646 0.80430286 0.78965781 0.79530133
 0.79980848 0.79756129 0.79867211 0.79530133 0.79864658 0.81103166
 0.79304137 0.79753575 0.79643769 0.80429009 0.82005873 0.82340398
 0.80655005 0.61756895 0.82226762 0.79867211 0.79191777 0.80656282
 0.81553882 0.80094484 0.78965781 0.80091931 0.79868488 0.8020429
 0.80431563 0.79867211 0.79980848 0.78516343 0.7840526  0.79081971
 0.79193054 0.79979571 0.79306691 0.79193054]
Optimization complete. Best XGBoost classifier model saved.


In [36]:
import pandas as pd

# Score the test passengers with the tuned classifier.
preds = best_model.predict(test_data)

# Build the submission table: one row per passenger id with its predicted
# label, keeping only the first row for any repeated id.
output = test_data[['PassengerId']].assign(Survived=preds)
output = output.drop_duplicates(subset='PassengerId', keep='first')

# Write the table to disk without the index column.
output.to_csv('predictions.csv', index=False)

print("Predictions saved to predictions.csv")



Predictions saved to predictions.csv



    E.g. tree_method = "hist", device = "cuda"

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




In [37]:
# Colab-only step: google.colab is imported here, so this cell is expected to
# run inside Google Colab. It asks the browser to download the
# 'predictions.csv' file written by the prediction cell.
from google.colab import files
files.download('predictions.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>