In [251]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb

In [223]:
data=pd.read_csv('train.csv')

In [224]:
data.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [225]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB


# Missing Data

In [226]:
# Assuming you have a pandas DataFrame named 'data'
columns_with_missing_values = data.columns[data.isnull().any()].tolist()
missing_values_count = data[columns_with_missing_values].isnull().sum()

print("Columns with missing values and their counts:")
for column in columns_with_missing_values:
    print(f"{column}: {missing_values_count[column]}")

Columns with missing values and their counts:
HomePlanet: 201
CryoSleep: 217
Cabin: 199
Destination: 182
Age: 179
VIP: 203
RoomService: 181
FoodCourt: 183
ShoppingMall: 208
Spa: 183
VRDeck: 188
Name: 200


# Inital preprocessing

In [227]:
def preprocess_inputs(df):
    df=df.copy()
    # drop unnessecary columns
    df=df.drop(['PassengerId','Name'],axis=1)
    
    X=df.drop(['Transported'],axis=1)
    y=df['Transported']
    
    #train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
    return X_train, X_test, y_train, y_test

In [228]:
X_train, X_test, y_train, y_test=preprocess_inputs(data)

In [229]:
X_train.columns

Index(['HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'Age', 'VIP',
       'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck'],
      dtype='object')

# Identidying data types

In [230]:
#classify features by type
binary_features=['CryoSleep','VIP']
nominal_features=['HomePlanet','Destination','deck','side','num']
numerical_features=['Age','RoomService','FoodCourt','ShoppingMall','Spa','VRDeck']
cabin_features=['Cabin']

# Building Pipeline

In [231]:
from sklearn.base import BaseEstimator, TransformerMixin
import pandas as pd

class CabinSplitTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, columns=['Cabin']):
        self.columns = columns
    
    def fit(self, X, y=None):
        # CabinSplitTransformer doesn't need any fitting, so we just return self
        return self
    
    def transform(self, X):
        # Ensure X is a DataFrame
        X = pd.DataFrame(X)
        new_columns = []  # To collect new DataFrame slices

        for column in self.columns:
            if column in X.columns:
                # Split the column into three new columns
                splits = X[column].str.split('/', expand=True)
                if splits.shape[1] == 3:  # Ensure there are exactly three parts
                    splits.columns = ['Deck', 'Num', 'Side']
                    new_columns.append(splits)
                else:
                    raise ValueError(f"The column {column} does not contain exactly three parts separable by '/'.")
            else:
                raise KeyError(f"The column {column} was not found in the DataFrame.")

            # Drop the original column from X
            X = X.drop(column, axis=1)

        # Concatenate all the new columns with the remaining parts of the original DataFrame
        X = pd.concat([X] + new_columns, axis=1)
        
        return X



In [232]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
import pandas as pd

# Example data
data = pd.DataFrame({'Cabin': ['A/10/right', 'B/5/left', 'C/7/right','D/9/right']})

# Instantiate the transformer
cabin_split_transformer = CabinSplitTransformer()

# Define the pipeline
pipeline = Pipeline([
    ('cabin_processing', cabin_split_transformer),
])

# Fit and transform the data
transformed_data = pipeline.fit_transform(data)

print(transformed_data)



  Deck Num   Side
0    A  10  right
1    B   5   left
2    C   7  right
3    D   9  right


In [233]:
from sklearn.base import BaseEstimator, TransformerMixin
import pandas as pd

class CabinTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        X['Deck'] = X['Cabin'].str.extract(r'([A-Za-z])')
        X['Num'] = X['Cabin'].str.extract(r'(\d+)')
        X['Side'] = X['Cabin'].str[-1]  # Assuming the last character represents the side
        X.drop(columns=['Cabin'], inplace=True)  # Remove the original 'Cabin' column
        return X

# Example usage:
cabin_data = pd.DataFrame({'Cabin': ['C23 C25 C27', 'B55', 'D47', 'E33', 'F21']})
cabin_transformer = CabinTransformer()
cabin_transformed = cabin_transformer.fit_transform(cabin_data)
print(cabin_transformed)


  Deck Num Side
0    C  23    7
1    B  55    5
2    D  47    7
3    E  33    3
4    F  21    1


In [234]:
from sklearn.base import BaseEstimator, TransformerMixin
import pandas as pd

class CabinTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        # Extracting deck, num, and side from the 'cabin' column
        X['deck'] = X['Cabin'].str.extract(r'([a-zA-Z]+)').fillna('Unknown')
        X['num'] = X['Cabin'].str.extract(r'(\d+)').astype(str).fillna('Unknown')
        X['side'] = X['Cabin'].str.extract(r'([^\d]+)').fillna('Unknown')
        
        # Dropping the original 'cabin' column
        X = X.drop(columns=['Cabin'])
        
        return X



In [235]:
#Construct transformers to handle each type of features
binary_transformer=Pipeline(steps=[
    ('nominal_impute',SimpleImputer(strategy='most_frequent')),
    ('ordinal',OrdinalEncoder()) #  no need to impute for binary variables as no missing values
])

nominal_transformer=Pipeline(steps=[
    ('nominal_impute',SimpleImputer(strategy='most_frequent')),
    ('nominal_encode',OneHotEncoder(handle_unknown='ignore'))
])

numerical_transformer=Pipeline(steps=[
    ('numerical',SimpleImputer(strategy='median'))
])





In [236]:
preprocessor=ColumnTransformer(transformers=[
    ('binary',binary_transformer, binary_features),
    ('nominal',nominal_transformer, nominal_features),
    ('numerical',numerical_transformer,numerical_features)
    
],sparse_threshold=0)

In [247]:
# Build the final pipeline
model=Pipeline(steps=[
    ('cabin_transformer', CabinTransformer()),
    ('preprocessor',preprocessor),
    ('scaler',StandardScaler()),
    ('classifier',LogisticRegression())
])

In [248]:
model.fit(X_train,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [249]:
acc=model.score(X_test,y_test)
print('Test Accuracy: {:.2f}%'.format(acc*100))

Test Accuracy: 71.82%


In [243]:
# Build the final pipeline
model=Pipeline(steps=[
    ('cabin_transformer', CabinTransformer()),
    ('preprocessor',preprocessor),
    ('scaler',StandardScaler()),
    ('classifier',RandomForestClassifier())
])

In [244]:
model.fit(X_train,y_train)

In [245]:
acc=model.score(X_test,y_test)
print('Test Accuracy: {:.2f}%'.format(acc*100))

Test Accuracy: 78.30%


In [252]:
# Build the final pipeline
model=Pipeline(steps=[
    ('cabin_transformer', CabinTransformer()),
    ('preprocessor',preprocessor),
    ('scaler',StandardScaler()),
    ('classifier',xgb.XGBClassifier())
])

In [253]:
model.fit(X_train,y_train)

In [254]:
acc=model.score(X_test,y_test)
print('Test Accuracy: {:.2f}%'.format(acc*100))

Test Accuracy: 79.22%
