In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline  

## our Models
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier

## Model evaluation
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import RandomizedSearchCV , GridSearchCV
from sklearn.metrics import confusion_matrix , classification_report
from sklearn.metrics import precision_score , recall_score , f1_score,accuracy_score
from sklearn.metrics import RocCurveDisplay

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

In [16]:
# Read the training and test CSV files from the "Data Files" folder; the paths
# are relative to the notebook's working directory.
train=pd.read_csv("Data Files/train.csv")
test=pd.read_csv("Data Files/test.csv")
# Preview the first rows of the test frame.
test.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
0,0013_01,Earth,True,G/3/S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning
1,0018_01,Earth,False,F/4/S,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,Lerome Peckers
2,0019_01,Europa,True,C/0/S,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,Sabih Unhearfus
3,0021_01,Europa,False,C/1/S,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,Meratz Caltilter
4,0023_01,Earth,False,F/5/S,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,Brence Harperez


In [17]:
len(train)

8693

### Data Cleaning

In [18]:
# Remove the "Name" column from both frames. `drop` returns new frames, so the
# raw `train` / `test` frames stay untouched.
df_train = train.drop(columns=["Name"])
df_test = test.drop(columns=["Name"])
df_train

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,Europa,False,A/98/P,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,False
8689,9278_01,Earth,True,G/1499/S,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,False
8690,9279_01,Earth,False,G/1500/S,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,True
8691,9280_01,Europa,False,E/608/S,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,False


## Check for null values

In [19]:
df_train.isna().sum()

PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Transported       0
dtype: int64

In [20]:
df_test.isna().sum()

PassengerId       0
HomePlanet       87
CryoSleep        93
Cabin           100
Destination      92
Age              91
VIP              93
RoomService      82
FoodCourt       106
ShoppingMall     98
Spa             101
VRDeck           80
dtype: int64

In [21]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(6)
memory usage: 823.6+ KB


In [22]:
# print all numerical columns and categorical columns
def print_numerical_and_categorical_columns(df):
    """Print the names of the numerical and the categorical columns of ``df``.

    Numerical columns are those selected by the ``'number'`` dtype; categorical
    columns are those with ``object`` or ``category`` dtype. Columns of any
    other dtype (for example ``bool``) appear in neither list.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are inspected. It is not modified.

    Returns
    -------
    None
        The two lists are written to standard output only.
    """
    # (heading to print, dtype selector passed to select_dtypes)
    sections = (
        ("Numerical Columns:", ['number']),
        ("\nCategorical Columns:", ['object', 'category']),
    )
    for heading, dtype_selector in sections:
        matching_columns = df.select_dtypes(include=dtype_selector).columns.tolist()
        print(heading)
        print(matching_columns)
    
print_numerical_and_categorical_columns(df_train)

Numerical Columns:
['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

Categorical Columns:
['PassengerId', 'HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'VIP']


In [23]:
print_numerical_and_categorical_columns(df_test)

Numerical Columns:
['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

Categorical Columns:
['PassengerId', 'HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'VIP']


In [24]:
df_train

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,Europa,False,A/98/P,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,False
8689,9278_01,Earth,True,G/1499/S,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,False
8690,9279_01,Earth,False,G/1500/S,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,True
8691,9280_01,Europa,False,E/608/S,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,False


# Data preprocessing

In [25]:
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder

def preprocess_data(df_train, df_test):
    """Clean both frames, scale/encode the features and build a hold-out split.

    Steps:
      1. Split ``Cabin`` ("deck/num/side") into ``Cabin_Deck``, ``Cabin_Num``
         and ``Cabin_Side`` and drop the original column.
      2. Impute missing categorical values with the training-set mode and
         missing float values with the training-set mean, for both frames.
      3. Cast ``Cabin_Num`` to int.
      4. Fit a ``ColumnTransformer`` (standard scaling for numeric features,
         one-hot encoding for categorical ones) on the training features and
         apply that same fitted transformer to the test frame.
      5. Split the transformed training data 80/20 with ``random_state=42``.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Labelled data; must contain ``Transported`` and ``PassengerId`` in
        addition to the feature columns. **Modified in place** (Cabin split,
        imputation, dtype cast).
    df_test : pandas.DataFrame
        Data to predict on, with the same feature columns. **Modified in
        place** in the same way; ``PassengerId`` is kept in the frame.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test, test_prep)`` where ``X_train`` /
        ``X_test`` / ``y_train`` / ``y_test`` are the hold-out split of the
        transformed training data and ``test_prep`` is ``df_test`` transformed
        with the transformer fitted on the training data, so its columns line
        up with ``X_train``.
    """
    cabin_parts = ['Cabin_Deck', 'Cabin_Num', 'Cabin_Side']

    # Separate Cabin into 3 columns and drop the original. The guard keeps the
    # function usable when the cell is re-run on frames that were already
    # split in place by a previous call.
    for frame in (df_train, df_test):
        if 'Cabin' in frame.columns:
            frame[cabin_parts] = frame['Cabin'].str.split('/', expand=True)
            frame.drop('Cabin', axis=1, inplace=True)

    categorical_columns = ['HomePlanet', 'CryoSleep', 'Destination', 'VIP'] + cabin_parts

    # Most common training value per column, used to impute BOTH frames.
    modes = df_train[categorical_columns].mode().iloc[0]
    df_train[categorical_columns] = df_train[categorical_columns].fillna(modes)
    df_test[categorical_columns] = df_test[categorical_columns].fillna(modes)

    numerical_cols = df_train.columns[df_train.dtypes == 'float64']

    # Impute with the training means for both frames: statistics learned from
    # the training data must also be the ones applied to the test data.
    train_means = df_train[numerical_cols].mean()
    df_train[numerical_cols] = df_train[numerical_cols].fillna(train_means)
    df_test[numerical_cols] = df_test[numerical_cols].fillna(train_means)

    # Change Cabin_Num to int
    df_train['Cabin_Num'] = df_train['Cabin_Num'].astype(int)
    df_test['Cabin_Num'] = df_test['Cabin_Num'].astype(int)

    # Separate features and target
    X = df_train.drop(['Transported', 'PassengerId'], axis=1)
    y = df_train['Transported'].astype(int)  # True -> 1, False -> 0

    # Define categorical and numerical features
    categorical_features = ['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'Cabin_Deck', 'Cabin_Side']
    numerical_features = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Cabin_Num']

    # handle_unknown='ignore' encodes a category that only occurs in the test
    # data as all zeros instead of raising during transform.
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numerical_features),
            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
        ])

    # Fit on the training features only, then reuse the fitted transformer for
    # the test frame. Re-fitting on the test data would scale it with different
    # statistics and could yield a different one-hot column layout than the
    # one the model is trained on.
    X_processed = preprocessor.fit_transform(X)
    # Select exactly the columns seen during fit (this leaves out PassengerId).
    test_prep = preprocessor.transform(df_test[X.columns])

    # NOTE(review): the transformer and the imputation values are computed on
    # all labelled rows before this split, so the hold-out rows influence the
    # preprocessing and the hold-out score may be slightly optimistic.
    X_train, X_test, y_train, y_test = train_test_split(X_processed, y, test_size=0.2, random_state=42)

    return X_train, X_test, y_train, y_test, test_prep



X_train, X_test, y_train, y_test, test_prep = preprocess_data(df_train, df_test)


In [26]:
df_train

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,Cabin_Deck,Cabin_Num,Cabin_Side
0,0001_01,Europa,False,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,False,B,0,P
1,0002_01,Earth,False,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,True,F,0,S
2,0003_01,Europa,False,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,False,A,0,S
3,0003_02,Europa,False,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,False,A,0,S
4,0004_01,Earth,False,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,True,F,1,S
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,Europa,False,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,False,A,98,P
8689,9278_01,Earth,True,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,False,G,1499,S
8690,9279_01,Earth,False,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,True,G,1500,S
8691,9280_01,Europa,False,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,False,E,608,S


In [27]:
# # Separate features and target
# X = df_train.drop(['Transported', 'PassengerId'], axis=1)
# y = df_train['Transported'].astype(int)  # Convert boolean 'Transported' to int (True to 1, False to 0)
# # Apply transformations
# X_processed = preprocessor.fit_transform(X)

In [28]:
# # Split the dataset into training and testing sets
# X_train, X_test, y_train, y_test = train_test_split(X_processed, y, test_size=0.2, random_state=42)


In [29]:
len(X_train)

6954

In [30]:
len(X_test)

1739

In [31]:
X_train

array([[-0.05774287, -0.34058987, -0.25281708, ...,  0.        ,
         0.        ,  1.        ],
       [-0.82492266, -0.34058987,  0.46221088, ...,  0.        ,
         1.        ,  0.        ],
       [-0.05774287, -0.34058987, -0.28731404, ...,  0.        ,
         0.        ,  1.        ],
       ...,
       [-0.47620457, -0.10108758, -0.28731404, ...,  0.        ,
         1.        ,  0.        ],
       [ 0.36071884,  0.23391246, -0.28731404, ...,  0.        ,
         1.        ,  0.        ],
       [-0.05774287, -0.32997901,  0.01939532, ...,  0.        ,
         1.        ,  0.        ]])

# Model

In [32]:
!pip install lightgbm



In [33]:

# LightGBM classifier with default settings; only the hyperparameters listed
# in the grid below are varied by the search.
lgb_classifier = LGBMClassifier()
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 10, 20],
)

# Grid search over every combination above with 3-fold cross-validation,
# scored by negative log loss, using all available cores (n_jobs=-1).
grid_search_lgb = GridSearchCV(
    estimator=lgb_classifier,
    param_grid=param_grid,
    scoring='neg_log_loss',
    cv=3,
    n_jobs=-1,
)
grid_search_lgb.fit(X_train, y_train)

# Keep the best estimator found by the search for the prediction cells below.
best_lgb_model = grid_search_lgb.best_estimator_
print("Best Hyperparameters for LightGBM:", grid_search_lgb.best_params_)


[LightGBM] [Info] Number of positive: 3500, number of negative: 3454
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000794 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1648
[LightGBM] [Info] Number of data points in the train set: 6954, number of used features: 26
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.503307 -> initscore=0.013230
[LightGBM] [Info] Start training from score 0.013230
Best Hyperparameters for LightGBM: {'max_depth': None, 'n_estimators': 50}


In [40]:
y_pred_lgb2 = best_lgb_model.predict(X_test)

In [42]:
accuracy_2_lgb = accuracy_score(y_test, y_pred_lgb2)
print("Accuracy with LightGBM:", accuracy_2_lgb)

Accuracy with LightGBM: 0.8062104657849338


In [43]:
y_pred_lgb = best_lgb_model.predict(test_prep) 

In [44]:
# Build the submission table: the passenger ids from the test frame next to
# the model's predictions, with the prediction column cast to boolean.
pred_df = pd.DataFrame({
    'PassengerId': df_test['PassengerId'],
    'Transported': y_pred_lgb,
}).astype({'Transported': bool})

pred_df

Unnamed: 0,PassengerId,Transported
0,0013_01,True
1,0018_01,False
2,0019_01,True
3,0021_01,True
4,0023_01,True
...,...,...
4272,9266_02,True
4273,9269_01,True
4274,9271_01,True
4275,9273_01,True


In [368]:
pred_df.to_csv("sample_submission.csv", index = False)

In [45]:
len(pred_df)

4277