In [111]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE


In [112]:
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')



In [113]:
train_data['HomePlanet'].fillna('Unknown', inplace=True)
train_data['CryoSleep'].fillna(False, inplace=True)
train_data['Destination'].fillna('Unknown', inplace=True)
train_data['Age'].fillna(train_data['Age'].median(), inplace=True)
train_data['VIP'].fillna(False, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_data['HomePlanet'].fillna('Unknown', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_data['CryoSleep'].fillna(False, inplace=True)
  train_data['CryoSleep'].fillna(False, inplace=True)
The behavior will change in pandas 3.0. This inplace method will neve

In [114]:
X = train_data.drop(columns=['PassengerId', 'Name', 'Cabin', 'Transported'])
y = train_data['Transported']

In [115]:
if y.nunique() != 2:
    raise ValueError("Target variable 'y' must be binary.")

In [116]:
categorical_cols = X.select_dtypes(include=['object']).columns.tolist()
numerical_cols = X.select_dtypes(exclude=['object']).columns.tolist()

In [117]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('cat', OneHotEncoder(drop='first'), categorical_cols)
    ])

In [118]:
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('smote', SMOTE(random_state=42)),
    ('classifier', GradientBoostingClassifier(random_state=42))
])

In [119]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [120]:
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)

X_train shape: (6954, 10)
y_train shape: (6954,)


In [121]:
param_grid = {
    'classifier__n_estimators': [100, 200],
    'classifier__learning_rate': [0.01, 0.1, 0.2],
    'classifier__max_depth': [3, 5, 7],
    'classifier__min_samples_split': [2, 5],
    'classifier__min_samples_leaf': [1, 2]
}

try:
    grid_search = GridSearchCV(pipeline, param_grid, cv=3, scoring='accuracy', refit=True, verbose=2)
    grid_search.fit(X_train, y_train)
    print("Grid search completed successfully.")
except Exception as e:
    print("Error during GridSearchCV fitting:", e)

Fitting 3 folds for each of 72 candidates, totalling 216 fits
[CV] END classifier__learning_rate=0.01, classifier__max_depth=3, classifier__min_samples_leaf=1, classifier__min_samples_split=2, classifier__n_estimators=100; total time=   0.0s
[CV] END classifier__learning_rate=0.01, classifier__max_depth=3, classifier__min_samples_leaf=1, classifier__min_samples_split=2, classifier__n_estimators=100; total time=   0.0s
[CV] END classifier__learning_rate=0.01, classifier__max_depth=3, classifier__min_samples_leaf=1, classifier__min_samples_split=2, classifier__n_estimators=100; total time=   0.0s
[CV] END classifier__learning_rate=0.01, classifier__max_depth=3, classifier__min_samples_leaf=1, classifier__min_samples_split=2, classifier__n_estimators=200; total time=   0.0s
[CV] END classifier__learning_rate=0.01, classifier__max_depth=3, classifier__min_samples_leaf=1, classifier__min_samples_split=2, classifier__n_estimators=200; total time=   0.0s
[CV] END classifier__learning_rate=0.0

In [122]:
if hasattr(grid_search, 'best_estimator_'):
    best_model = grid_search.best_estimator_
    print("Best model found.")

In [123]:
try:
    y_val_pred = best_model.predict(X_val)
    print("Validation Accuracy:", accuracy_score(y_val, y_val_pred))
    print(classification_report(y_val, y_val_pred))
except ValueError as e:
    print("Error during prediction:", e)

Error during prediction: could not convert string to float: 'Mars'




In [124]:
test_data['HomePlanet'].fillna('Unknown', inplace=True)
test_data['CryoSleep'].fillna(False, inplace=True)
test_data['Destination'].fillna('Unknown', inplace=True)
test_data['Age'].fillna(test_data['Age'].median(), inplace=True)
test_data['VIP'].fillna(False, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_data['HomePlanet'].fillna('Unknown', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_data['CryoSleep'].fillna(False, inplace=True)
  test_data['CryoSleep'].fillna(False, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never w

In [125]:
X_test = test_data.drop(columns=['PassengerId', 'Name', 'Cabin'])
X_test = X_test.reindex(columns=X.columns, fill_value=0)

In [126]:
try:
    test_predictions = best_model.predict(X_test)
    print("Test predictions made successfully.")
except ValueError as e:
    print("Error during test prediction:", e)

Error during test prediction: could not convert string to float: 'Earth'




In [127]:
submission = pd.DataFrame({
    'PassengerId': test_data['PassengerId'],
    'Transported': test_predictions
})

In [128]:
submission.to_csv('submission.csv', index=False)

print("Submission file created successfully!")

Submission file created successfully!
