In [99]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing 
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
import missingno as msno
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

In [66]:
# Load the labelled training table from the working directory.
df = pd.read_csv(filepath_or_buffer='train.csv')

In [48]:
# Peek at the first five rows to sanity-check the load.
df.head(n=5)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [69]:
# Inspect column dtypes; the object/float split seen here drives the
# categorical vs. numerical column selection further down.
df.dtypes

PassengerId      object
HomePlanet       object
CryoSleep        object
Cabin            object
Destination      object
Age             float64
VIP              object
RoomService     float64
FoodCourt       float64
ShoppingMall    float64
Spa             float64
VRDeck          float64
Name             object
Transported        bool
dtype: object

In [106]:
# Target: the boolean `Transported` flag.
y = df['Transported']

# Features: everything except the target, the identifier-like columns
# (`Name`, `PassengerId`) and `Cabin`, which is not used by this notebook.
X = df.drop(columns=['Transported', 'Name', 'PassengerId', 'Cabin'])

In [107]:
# Object-dtype features are treated as categorical (imputed + one-hot encoded later).
categorical_columns = X.select_dtypes(include='object').columns

In [108]:
# Show which feature columns were classified as categorical.
categorical_columns

Index(['HomePlanet', 'CryoSleep', 'Destination', 'VIP'], dtype='object')

In [109]:
# Every non-object feature is treated as numerical (mean-imputed later).
numerical_columns = X.select_dtypes(exclude=['object']).columns

In [110]:
# Show which feature columns were classified as numerical.
numerical_columns

Index(['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck'], dtype='object')

In [111]:
# Hold out 20% of the labelled rows for evaluation; the fixed seed keeps
# the split identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [179]:
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline

# Shared preprocessing step reused by every model pipeline below.
# `SimpleImputer` and `OneHotEncoder` come from the notebook's import cell.
#
# `numerical_columns` and `categorical_columns` are the pandas Index objects
# computed from `X` above. `Cabin` is deliberately absent: it is dropped when
# `X` is built, so no transformer may reference it. (An earlier draft passed
# an undefined `transform()` for 'Cabin' here, which made this cell raise and
# left `preprocessor` undefined on a fresh kernel.)
preprocessor = make_column_transformer(
    # Numerical features: fill missing values with the column mean.
    (
        make_pipeline(
            SimpleImputer(strategy='mean'),
        ),
        numerical_columns,
    ),
    # Categorical features: fill missing values with the most frequent
    # category, then one-hot encode. `drop='first'` removes one indicator
    # per feature to avoid redundant columns.
    (
        make_pipeline(
            SimpleImputer(strategy='most_frequent'),
            OneHotEncoder(sparse_output=False, drop='first'),
        ),
        categorical_columns,
    ),
)

In [166]:
# Random-forest baseline: shared preprocessing followed by a seeded
# 100-tree forest.
rf_model = make_pipeline(
    preprocessor,
    RandomForestClassifier(n_estimators=100, random_state=42),
)

In [167]:
# Fit preprocessing and forest together on the training split.
rf_model.fit(X_train, y=y_train)

In [168]:
# Accuracy of the random-forest pipeline on the held-out split.
rf_model.score(X_test, y=y_test)

0.7950075642965204

In [169]:
# Build a submission file from the random-forest pipeline.
# NOTE(review): presumably test.csv carries the same feature columns as
# train.csv (minus `Transported`) — verify against the data.
test = pd.read_csv('test.csv')
sub_test = test.drop(columns=['PassengerId'])

# Cast predictions to plain booleans to match the `Transported` target dtype.
test_pred = rf_model.predict(sub_test).astype(bool)

submission = pd.DataFrame(
    {
        'PassengerId': test['PassengerId'],
        'Transported': test_pred,
    }
)
submission.to_csv('submission.csv', index=False)

In [170]:
from sklearn.neighbors import KNeighborsClassifier

In [171]:
# k-nearest-neighbours baseline (k=3) on top of the shared preprocessing.
# NOTE(review): the preprocessor imputes and one-hot encodes but does not
# scale features, so distance computations mix raw numeric columns with
# 0/1 indicators — consider adding a scaler before the classifier.
knn_model = make_pipeline(
    preprocessor,
    KNeighborsClassifier(n_neighbors=3),
)
knn_model.fit(X_train, y=y_train)

In [172]:
# Accuracy of the k-NN pipeline on the held-out split.
knn_model.score(X_test, y=y_test)

0.7609682299546142

In [173]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient-boosting baseline: shared preprocessing followed by 100 depth-1
# trees (decision stumps) with learning rate 1.0.
#
# Despite the `xgb_` prefix this is scikit-learn's GradientBoostingClassifier,
# not XGBoost; the variable name is kept because later cells refer to it.
xgb_model = make_pipeline(
    preprocessor,
    GradientBoostingClassifier(
        n_estimators=100,
        learning_rate=1.0,
        max_depth=1,
        # Seeded like the random forest and the train/test split so that this
        # fit, and any search that clones this pipeline with subsample < 1,
        # is reproducible.
        random_state=42,
    ),
)

xgb_model.fit(X_train, y_train)


In [174]:
# Accuracy of the gradient-boosting pipeline on the held-out split.
xgb_model.score(X_test, y=y_test)

0.7957639939485628

In [175]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter search space for the gradient-boosting pipeline.
# Each key is `<pipeline step name>__<estimator parameter>`, so every entry
# targets the `gradientboostingclassifier` step.
# 3 values x 5 parameters = 243 candidate combinations.
param_grid = dict(
    gradientboostingclassifier__n_estimators=[100, 200, 300],
    gradientboostingclassifier__learning_rate=[0.01, 0.1, 1.0],
    gradientboostingclassifier__max_depth=[1, 3, 5],
    gradientboostingclassifier__subsample=[0.5, 0.7, 1.0],
    gradientboostingclassifier__min_samples_split=[2, 4, 6],
)


In [176]:
# Exhaustive search over `param_grid` for the gradient-boosting pipeline,
# scored by accuracy with 5-fold cross-validation on all available cores.
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=1,  # print a one-line summary of the number of fits
)


In [177]:
# Run the search on the training split only; the held-out split stays unseen.
grid_search.fit(X_train, y=y_train)


Fitting 5 folds for each of 243 candidates, totalling 1215 fits


In [161]:
# Keep a handle on the best pipeline found by the grid search.
best_model = grid_search.best_estimator_

# Report the winning hyper-parameters and their mean cross-validation accuracy.
print("Best parameters:", grid_search.best_params_)
print("Best cross-validation score: {:.2f}".format(grid_search.best_score_))

Best parameters: {'gradientboostingclassifier__learning_rate': 0.01, 'gradientboostingclassifier__max_depth': 5, 'gradientboostingclassifier__min_samples_split': 6, 'gradientboostingclassifier__n_estimators': 300, 'gradientboostingclassifier__subsample': 0.7}
Best cross-validation score: 0.80


In [162]:
from sklearn.metrics import accuracy_score

# Evaluate the tuned pipeline on the held-out split.
predictions = best_model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=predictions)
print("Test Accuracy:", accuracy)

Test Accuracy: 0.8071104387291982


In [164]:
# Build the final submission from the tuned gradient-boosting pipeline.
# This writes to the same 'submission.csv' path as the random-forest cell,
# so it replaces that earlier file.
test = pd.read_csv('test.csv')
sub_test = test.drop(columns=['PassengerId'])

# Cast predictions to plain booleans to match the `Transported` target dtype.
test_pred = best_model.predict(sub_test).astype(bool)

submission = pd.DataFrame(
    {
        'PassengerId': test['PassengerId'],
        'Transported': test_pred,
    }
)
submission.to_csv('submission.csv', index=False)