In [134]:
# Opening invocation printed before the analysis begins.
print("Bismillah Hirrahamaa Nirraheem")

Bismillah Hirrahamaa Nirraheem


In [135]:
# List the entries in Downloads/ whose names contain "spaceship".
# NOTE: `Downloads/` is relative, so this depends on the notebook's working directory.
!ls Downloads/ | grep "spaceship"

spaceship-titanic
spaceship-titanic.zip


In [136]:
from pathlib import Path
from zipfile import ZipFile

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
from missingno import missingno
from sklearn.base import BaseEstimator,TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingClassifier,AdaBoostClassifier,BaggingClassifier,RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score,confusion_matrix,roc_auc_score,roc_curve
from sklearn.model_selection import train_test_split,StratifiedShuffleSplit
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler,MinMaxScaler,OrdinalEncoder

pio.templates['mod'] = go.layout.Template(layout=dict(font=dict(family="Fira Code",size=20)))
pio.templates.default = "plotly_dark+mod"

In [137]:
# Resolve the download folder from the current user's home directory instead of
# hard-coding one machine's absolute path, so the notebook also runs elsewhere.
downloads_dir = Path.home() / "Downloads"

# Unpack the competition archive into a sibling folder of the zip file.
with ZipFile(file=downloads_dir / "spaceship-titanic.zip", mode="r") as file:
    file.extractall(downloads_dir / "spaceship-titanic")

In [138]:
# Locate the extracted competition files under the current user's home
# directory rather than a machine-specific absolute path.
data_dir = Path.home() / "Downloads" / "spaceship-titanic"

train = pd.read_csv(data_dir / "train.csv")
test = pd.read_csv(data_dir / "test.csv")
sample = pd.read_csv(data_dir / "sample_submission.csv")

In [139]:
# Take an independent copy of `train` for inspection; changes made to `total`
# do not affect the raw frame.
total = train.copy()

### Table
<font size=4>

|Sl. No.|Name of Column|Description|
|-----|--------------|-----------|
|1|PassengerId|A unique Id for each passenger. Each Id takes the form gggg_pp where gggg indicates a group the passenger is travelling with and pp is their number within the group. People in a group are often family members, but not always.|
|2|HomePlanet|The planet the passenger departed from, typically their planet of permanent residence|
|3|CryoSleep|Indicates whether the passenger elected to be put into suspended animation for the duration of the voyage. Passengers in cryosleep are confined to their cabins|
|4|Cabin|The cabin number where the passenger is staying. Takes the form deck/num/side, where side can be either P for Port or S for Starboard.|
|5|Destination|The planet the passenger will be debarking to|
|6|Age|The age of the passenger|
|7|VIP|Whether the passenger has paid for special VIP service during the voyage|
|8|RoomService, FoodCourt, ShoppingMall, Spa, VRDeck|Amount the passenger has billed at each of the Spaceship Titanic's many luxury amenities|
|9|Name|The first and last names of the passenger|
|10|Transported| Whether the passenger was transported to another dimension. This is the target, the column you are trying to predict|

</font>

# Preprocessing

## Filling NaNs

In [140]:
# Summarise each column's dtype and non-null count.
total.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB


In [141]:
# Count the missing values in each column.
total.isna().sum()

PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
dtype: int64

As you can see, there are quite a few null values that need to be filled. We can apply several strategies, as we will see below.

In [142]:
def preprocess_1(data: pd.DataFrame) -> pd.DataFrame:
    """Split the composite ``PassengerId`` and ``Cabin`` columns into parts.

    ``PassengerId`` (``gggg_pp``) becomes the float32 columns ``Group`` and
    ``Within``; ``Cabin`` (``deck/num/side``) becomes ``deck``, ``num``
    (float32) and ``side``. The two source columns are dropped.

    Columns are located by name rather than by position, so the result is
    correct regardless of where ``PassengerId`` and ``Cabin`` sit in ``data``.

    Parameters
    ----------
    data : pd.DataFrame
        Frame containing at least ``PassengerId`` and ``Cabin`` columns.
        It is not modified.

    Returns
    -------
    pd.DataFrame
        New frame ordered as ``Group``, ``Within``, then the remaining input
        columns in their original order, with ``deck``, ``num``, ``side``
        taking the place of ``Cabin``.
    """
    df = data.copy()

    # "gggg_pp" -> numeric group id and position within the group.
    df[['Group', 'Within']] = (
        df['PassengerId'].str.split('_', expand=True).astype(np.float32)
    )

    # "deck/num/side" -> three columns; rows with a missing Cabin stay NaN.
    df[['deck', 'num', 'side']] = df['Cabin'].str.split('/', expand=True)
    df['num'] = df['num'].astype(np.float32)

    # Build the output order by name: the derived id columns come first and the
    # cabin parts replace Cabin in place; PassengerId itself is dropped.
    ordered = ['Group', 'Within']
    for col in data.columns:
        if col == 'PassengerId':
            continue
        if col == 'Cabin':
            ordered.extend(['deck', 'num', 'side'])
        else:
            ordered.append(col)

    return df[ordered]

In [143]:
# Apply the same PassengerId/Cabin splitting to both the training and test frames.
total_train = preprocess_1(train)
total_test = preprocess_1(test)

In [144]:
# Preview the first rows of the preprocessed training frame.
total_train.head()

Unnamed: 0,Group,Within,HomePlanet,CryoSleep,deck,num,side,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,1.0,1.0,Europa,False,B,0.0,P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,2.0,1.0,Earth,False,F,0.0,S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,3.0,1.0,Europa,False,A,0.0,S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,3.0,2.0,Europa,False,A,0.0,S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,4.0,1.0,Earth,False,F,1.0,S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True
