In [1]:
import pandas as pd 
from sklearn.model_selection import train_test_split,cross_val_score,cross_validate,GridSearchCV
import catboost as cb
from sklearn.impute import KNNImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score ,precision_score
import numpy as np
import os
imputer = KNNImputer(n_neighbors=2, weights="uniform")
SpendingCols = ['RoomService', 'FoodCourt', 'VRDeck', 'ShoppingMall', 'Spa']
categorical_columns = ['familyName','Deck','Side','HomePlanet','Destination','AgeByDecade']


In [2]:
def read_data(data_type):
    spaceship_path = os.path.join('.', f"{data_type}.csv")
    df = pd.read_csv(filepath_or_buffer = spaceship_path, delimiter = ',')
    if data_type == 'train':
        X = df.drop(columns=['Transported'])   
        y = df['Transported']
        y = [int(x) for x in y]
        return X,y
    return df,None


In [3]:
def get_num_data(X) : 
    X[["Deck", "Num", "Side"]] = X["Cabin"].str.split("/", expand=True)
    X[['Age']] = imputer.fit_transform(X[['Age']])
    X[['Num']] = imputer.fit_transform(X[['Num']])
    X['Deck'].fillna('F', inplace=True)
    X['Side'].fillna('P', inplace=True)

    return X

In [4]:
def get_cat_data(X):
    X['familyName'] = X[~pd.isnull(X['Name'])]['Name'].apply(lambda x: x.split(' ')[-1])
    X['HomePlanet'].fillna(X.groupby('familyName')['HomePlanet'].ffill(), inplace=True)
    X['CryoSleep'].astype(bool)
    X['VIP'].astype(bool)
    X['CryoSleep'] = X['CryoSleep'].fillna(False)
    X['VIP'] = X['VIP'].fillna(False)
    X['AgeByDecade'] = pd.cut(x=X['Age'], bins=[0, 20, 29, 49, 79], labels=['Teens', '20s', '30s-40s', 'Elderly'])               

    #if there at least one value of Expenditures Cols not equal 0 else 1
    for index, row in X[X['CryoSleep'].isnull()].iterrows():
        if row['RoomService'] !=0 or row['FoodCourt'] !=0 or row['ShoppingMall']!=0 or row['Spa']!=0 or  row['VRDeck']!=0:
            row['CryoSleep'] = 0
        else:
            1
    for col in SpendingCols:
        X.loc[X['CryoSleep'] == True, col] = X.loc[X['CryoSleep'] == True, col].fillna(0)

    for col in categorical_columns:
        mode_value = X[col].mode()[0]
        X[col] = X[col].fillna(mode_value)
    for col in SpendingCols:
        X[[col]] = imputer.fit_transform(X[[col]])
    X['AllSpending'] = X['RoomService'] + X['ShoppingMall'] + X['FoodCourt'] + X['Spa'] + X['VRDeck']


    X['TotalExpensesofFamily'] = X.groupby('familyName')['AllSpending'].transform('sum')


    return X

In [5]:
def get_cat_col(X):
    columnsToDrop = ['PassengerId','Name','Cabin','Age']
    X.drop(columns=columnsToDrop, inplace=True)
    # Specify the indices of categorical features
    X = X.reset_index(drop=True)
    categorical_indices = [X.columns.get_indexer_for([column])[0] for column in categorical_columns]

    return X,categorical_indices

In [6]:
X,Y = read_data('train')
X = get_num_data(X)
X = get_cat_data(X)
X,categorical_indices = get_cat_col(X)

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.4, random_state=42)

param_grid = {
              'learning_rate':[0.03,0.001,0.01,0.1,0.2,0.3],
              'l2_leaf_reg':[3,1,5,10,100],

 }

model = cb.CatBoostClassifier(cat_features=categorical_indices)

# Perform grid search with cross-validation
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=10)

# Fit the grid search on your training data
grid_search.fit(X_train, y_train)

# Get the best parameters and the best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print("Best Parameters:", best_params)
print("Best Score:", best_score)

0:	learn: 0.6795595	total: 186ms	remaining: 3m 6s
1:	learn: 0.6697426	total: 205ms	remaining: 1m 42s
2:	learn: 0.6576719	total: 245ms	remaining: 1m 21s
3:	learn: 0.6436708	total: 283ms	remaining: 1m 10s
4:	learn: 0.6314943	total: 320ms	remaining: 1m 3s
5:	learn: 0.6190655	total: 356ms	remaining: 58.9s
6:	learn: 0.6096329	total: 396ms	remaining: 56.1s
7:	learn: 0.5994751	total: 434ms	remaining: 53.8s
8:	learn: 0.5902175	total: 482ms	remaining: 53.1s
9:	learn: 0.5828328	total: 519ms	remaining: 51.4s
10:	learn: 0.5749418	total: 557ms	remaining: 50.1s
11:	learn: 0.5673335	total: 590ms	remaining: 48.6s
12:	learn: 0.5613329	total: 638ms	remaining: 48.4s
13:	learn: 0.5543509	total: 676ms	remaining: 47.6s
14:	learn: 0.5489592	total: 714ms	remaining: 46.9s
15:	learn: 0.5421118	total: 750ms	remaining: 46.1s
16:	learn: 0.5368092	total: 783ms	remaining: 45.3s
17:	learn: 0.5325004	total: 817ms	remaining: 44.5s
18:	learn: 0.5276211	total: 851ms	remaining: 44s
19:	learn: 0.5225819	total: 885ms	remain

In [None]:
Xt,Y = read_data('test')
Xt = get_num_data(Xt)
Xt = get_cat_data(Xt)
output = pd.DataFrame({'PassengerId': Xt.PassengerId})
columnsToDrop = ['PassengerId','Name','Cabin','Age']
Xt.drop(columns=columnsToDrop, inplace=True)
# Get the best model from the grid search
best_model = grid_search.best_estimator_

# Make predictions on the test data using the best model
predictions = best_model.predict(Xt)
output['Transported'] = predictions
output = output.replace({0: False, 1: True})

output.to_csv('CatBoot_nas.csv', index=False)

In [None]:
# from catboost import CatBoostClassifier
# from sklearn.model_selection import RandomizedSearchCV
# from scipy.stats import randint as sp_randint

# # Prepare your dataset
# X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# # Define the parameter distributions for randomized search
# param_dist = {
#     'learning_rate': [0.01, 0.03, 0.1],
#     'depth': sp_randint(4, 10),
#     'l2_leaf_reg': sp_randint(1, 10),
#     'iterations': [500, 1000, 1500]
# }

# # Initialize the CatBoost classifier
# model = cb.CatBoostClassifier(cat_features=categorical_indices)

# # Perform randomized search with cross-validation
# random_search = RandomizedSearchCV(estimator=model, param_distributions=param_dist, n_iter=10, cv=5)

# # Fit the randomized search on your training data
# random_search.fit(X_train, y_train)

# # Get the best parameters and the best score
# best_params = random_search.best_params_
# best_score = random_search.best_score_

# print("Best Parameters:", best_params)
# print("Best Score:", best_score)

0:	learn: 0.6797666	total: 180ms	remaining: 2m 59s
1:	learn: 0.6676892	total: 210ms	remaining: 1m 44s
2:	learn: 0.6543302	total: 241ms	remaining: 1m 20s
3:	learn: 0.6426098	total: 274ms	remaining: 1m 8s
4:	learn: 0.6351827	total: 307ms	remaining: 1m 1s
5:	learn: 0.6254387	total: 344ms	remaining: 56.9s
6:	learn: 0.6147073	total: 380ms	remaining: 53.9s
7:	learn: 0.6054435	total: 426ms	remaining: 52.8s
8:	learn: 0.5977650	total: 461ms	remaining: 50.8s
9:	learn: 0.5891506	total: 499ms	remaining: 49.4s
10:	learn: 0.5828113	total: 534ms	remaining: 48s
11:	learn: 0.5751229	total: 570ms	remaining: 46.9s
12:	learn: 0.5689003	total: 604ms	remaining: 45.9s
13:	learn: 0.5623452	total: 640ms	remaining: 45.1s
14:	learn: 0.5563235	total: 677ms	remaining: 44.5s
15:	learn: 0.5502637	total: 719ms	remaining: 44.2s
16:	learn: 0.5446553	total: 759ms	remaining: 43.9s
17:	learn: 0.5399553	total: 800ms	remaining: 43.6s
18:	learn: 0.5357410	total: 837ms	remaining: 43.2s
19:	learn: 0.5322751	total: 879ms	remain

In [None]:
# Xt,Y = read_data('test')
# Xt = get_num_data(Xt)
# Xt = get_cat_data(Xt)
# output = pd.DataFrame({'PassengerId': Xt.PassengerId})
# columnsToDrop = ['PassengerId','Name','Cabin','Age']
# Xt.drop(columns=columnsToDrop, inplace=True)
# # Get the best model from the grid search
# best_model = random_search.best_estimator_

# # Make predictions on the test data using the best model
# predictions = best_model.predict(Xt)
# output['Transported'] = predictions
# output = output.replace({0: False, 1: True})

# output.to_csv('CatBoot_nas_ranbsrch.csv', index=False)