In [178]:
import pandas as pd
import numpy as np
import plotly.express as px


## Load Data

In [179]:
# Read the train and test splits from the local dataset folder.
train_raw_df = pd.read_csv(filepath_or_buffer="../dataset/spaceship-titanic/train.csv")
test_raw_df = pd.read_csv(filepath_or_buffer="../dataset/spaceship-titanic/test.csv")

In [180]:
# Work on an independent copy so the raw training frame stays untouched.
train_process_df = train_raw_df.copy(deep=True)

## Data Exploration

In [183]:
def explore_data(column, chart=None, chart_column="Transported"):
    """Print a quick profile of one ``train_raw_df`` column and optionally plot it.

    The profile consists of the column's ``info()``, value counts, number of
    nulls and ``describe()`` summary, each under its own header line.

    Parameters
    ----------
    column : str
        Name of the column in the module-level ``train_raw_df`` to inspect.
    chart : iterable of str, optional
        Chart kinds to draw; any of ``"bar"``, ``"pie"``, ``"hist"`` and
        ``"box"``. Defaults to no charts.
    chart_column : str, default "Transported"
        Column used to color the bars of the ``"bar"`` chart.

    Returns
    -------
    pandas.Series
        The inspected column, i.e. ``train_raw_df[column]``.
    """
    # Use None as the default instead of a shared mutable list.
    chart = () if chart is None else chart
    column_data = train_raw_df[column]

    print("------- Column Info: -------")
    # Series.info() writes its report itself and returns None, so wrapping it
    # in print() would add a stray "None" line to the output.
    column_data.info()

    print("------- Data Counts: -------")
    print(column_data.value_counts())

    print("------- Null Check: -------")
    print(column_data.isnull().sum())

    print("------- Describe -------")
    print(column_data.describe())

    if "bar" in chart:
        print(f"------- Bar Plot {column} vs {chart_column}: -------")
        # NOTE(review): no y is passed, so bar heights are whatever px.bar
        # derives by default; with barmode="group" rows sharing an x value may
        # overlap instead of adding up -- confirm the figure shows counts.
        fig = px.bar(train_raw_df, x=column, color=chart_column, barmode="group")
        fig.show()
    if "pie" in chart:
        print(f"------- Pie Chart {column} Count plot: -------")
        # dropna=False keeps missing values as their own slice.
        counts = column_data.value_counts(dropna=False).to_dict()
        fig = px.pie(values=list(counts.values()), names=list(counts.keys()))
        fig.show()
    if "hist" in chart:
        print(f"------- Hist Chart {column} : -------")
        fig = px.histogram(x=column_data, barmode="group")
        fig.update_layout(bargap=0.1)
        fig.show()
    if "box" in chart:
        # Header added for consistency with the other chart branches.
        print(f"------- Box Plot {column} : -------")
        fig = px.box(train_raw_df, y=column)
        fig.show()

    return column_data

In [171]:
def change_data(column, type_cast=None):
    """Apply an optional dtype cast to one column of ``train_process_df``.

    Parameters
    ----------
    column : str
        Name of the column in the module-level ``train_process_df`` to modify.
    type_cast : dict, optional
        When non-empty, its ``"to"`` entry is passed to ``Series.astype`` and
        the result is written back to ``train_process_df[column]``. When
        omitted or empty, the frame is left unchanged.

    Raises
    ------
    KeyError
        If ``type_cast`` is non-empty but has no ``"to"`` key.
    """
    # None replaces the previous shared ``dict()`` default instance.
    if type_cast:
        train_process_df[column] = train_process_df[column].astype(type_cast["to"])

### Data-Column Exploration

In [7]:
# Display the test frame (the rendered output elides the middle rows).
test_raw_df

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
0,0013_01,Earth,True,G/3/S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning
1,0018_01,Earth,False,F/4/S,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,Lerome Peckers
2,0019_01,Europa,True,C/0/S,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,Sabih Unhearfus
3,0021_01,Europa,False,C/1/S,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,Meratz Caltilter
4,0023_01,Earth,False,F/5/S,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,Brence Harperez
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4272,9266_02,Earth,True,G/1496/S,TRAPPIST-1e,34.0,False,0.0,0.0,0.0,0.0,0.0,Jeron Peter
4273,9269_01,Earth,False,,TRAPPIST-1e,42.0,False,0.0,847.0,17.0,10.0,144.0,Matty Scheron
4274,9271_01,Mars,True,D/296/P,55 Cancri e,,False,0.0,0.0,0.0,0.0,0.0,Jayrin Pore
4275,9273_01,Europa,False,D/297/P,,,False,0.0,2680.0,0.0,0.0,523.0,Kitakan Conale


In [8]:
# Column dtypes and non-null counts of the raw training frame.
train_raw_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB


In [10]:
# Peek at the first five training rows.
train_raw_df.head(n=5)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [14]:
# Class balance of the Transported column.
train_raw_df["Transported"].value_counts()

True     4378
False    4315
Name: Transported, dtype: int64

#### PassengerId

In [185]:

# Profile the PassengerId column and request a box plot of it.
explore_data(column="PassengerId", chart=["box"])

------- Column Info: -------
<class 'pandas.core.series.Series'>
RangeIndex: 8693 entries, 0 to 8692
Series name: PassengerId
Non-Null Count  Dtype 
--------------  ----- 
8693 non-null   object
dtypes: object(1)
memory usage: 68.0+ KB
None
------- Data Counts: -------
0001_01    1
6136_01    1
6141_01    1
6139_06    1
6139_05    1
          ..
3126_01    1
3124_03    1
3124_02    1
3124_01    1
9280_02    1
Name: PassengerId, Length: 8693, dtype: int64
------- Null Check: -------
0
------- Describe -------
count        8693
unique       8693
top       0001_01
freq            1
Name: PassengerId, dtype: object


0       0001_01
1       0002_01
2       0003_01
3       0003_02
4       0004_01
         ...   
8688    9276_01
8689    9278_01
8690    9279_01
8691    9280_01
8692    9280_02
Name: PassengerId, Length: 8693, dtype: object

In [18]:
# Look at ten random ids; the fixed seed makes the sample reproducible on re-run.
train_raw_df["PassengerId"].sample(n=10, random_state=42)

5322    5686_02
4901    5223_03
831     0889_01
5687    6031_01
7017    7462_01
1985    2123_02
8460    9034_04
6972    7403_01
398     0437_01
8176    8736_01
Name: PassengerId, dtype: object

In [21]:
# Split each id on "_" into separate columns.
train_raw_df["PassengerId"].str.split(pat="_", expand=True)

Unnamed: 0,0,1
0,0001,01
1,0002,01
2,0003,01
3,0003,02
4,0004,01
...,...,...
8688,9276,01
8689,9278,01
8690,9279,01
8691,9280,01


In [23]:
# Distinct values of the part before the "_" separator.
train_raw_df["PassengerId"].str.split(pat="_", expand=True)[0].unique()

array(['0001', '0002', '0003', ..., '9278', '9279', '9280'], dtype=object)

In [24]:
# Number of distinct id prefixes (the part before "_").
len(train_raw_df["PassengerId"].str.split(pat="_", expand=True)[0].unique())

6217

#### HomePlanet

In [187]:
# Profile HomePlanet and draw its value-count pie chart.
explore_data(column="HomePlanet", chart=["pie"])

------- Column Info: -------
<class 'pandas.core.series.Series'>
RangeIndex: 8693 entries, 0 to 8692
Series name: HomePlanet
Non-Null Count  Dtype 
--------------  ----- 
8492 non-null   object
dtypes: object(1)
memory usage: 68.0+ KB
None
------- Data Counts: -------
Earth     4602
Europa    2131
Mars      1759
Name: HomePlanet, dtype: int64
------- Null Check: -------
201
------- Describe -------
count      8492
unique        3
top       Earth
freq       4602
Name: HomePlanet, dtype: object
------- Pie Chart HomePlanet Count plot: -------


0       Europa
1        Earth
2       Europa
3       Europa
4        Earth
         ...  
8688    Europa
8689     Earth
8690     Earth
8691    Europa
8692    Europa
Name: HomePlanet, Length: 8693, dtype: object

In [35]:
# Grouped bars of HomePlanet, colored by the Transported flag.
fig = px.bar(
    data_frame=train_raw_df,
    x="HomePlanet",
    color="Transported",
    barmode="group",
)
fig.show()

In [45]:
# Count each HomePlanet value, keeping missing entries as their own bucket.
values = train_raw_df["HomePlanet"].value_counts(dropna=False)

In [51]:
# Convert the counts Series into a plain {value: count} mapping.
values_dict = values.to_dict(into=dict)

In [52]:
# Show the mapping; the NaN key holds the count of missing HomePlanet values.
values_dict

{'Earth': 4602, 'Europa': 2131, 'Mars': 1759, nan: 201}

In [54]:
# Pie chart of the HomePlanet counts: slice labels are the dict keys.
fig = px.pie(
    names=list(values_dict),
    values=list(values_dict.values()),
)
fig.show()

#### CryoSleep

In [56]:
# CryoSleep value counts, including missing values.
train_raw_df["CryoSleep"].value_counts(dropna=False)

False    5439
True     3037
NaN       217
Name: CryoSleep, dtype: int64

In [57]:
# Grouped bars of CryoSleep, colored by the Transported flag.
fig = px.bar(
    data_frame=train_raw_df,
    x="CryoSleep",
    color="Transported",
    barmode="group",
)
fig.show()

#### Cabin

In [60]:
# Cabin value counts, including missing values.
train_raw_df["Cabin"].value_counts(dropna=False)

NaN        199
G/734/S      8
C/137/S      7
B/201/P      7
G/109/P      7
          ... 
G/556/P      1
E/231/S      1
G/545/S      1
G/543/S      1
C/178/S      1
Name: Cabin, Length: 6561, dtype: int64

In [66]:
# Cabin strings are "/"-separated (e.g. "B/0/P"). Split once and reuse the
# result instead of repeating the same split for every new column.
cabin_parts = train_raw_df["Cabin"].str.split("/", expand=True)
train_process_df["deck"] = cabin_parts[0]
train_process_df["Num"] = cabin_parts[1]
train_process_df["Side"] = cabin_parts[2]

In [67]:
# Display the working frame, now including the deck / Num / Side columns.
train_process_df

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,deck,Num,Side
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False,B,0,P
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True,F,0,S
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False,A,0,S
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False,A,0,S
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True,F,1,S
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,Europa,False,A/98/P,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther,False,A,98,P
8689,9278_01,Earth,True,G/1499/S,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley,False,G,1499,S
8690,9279_01,Earth,False,G/1500/S,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,Fayey Connon,True,G,1500,S
8691,9280_01,Europa,False,E/608/S,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre,False,E,608,S


In [77]:
# Cast the cabin number to pandas' nullable integer dtype so that rows with a
# missing Cabin can stay missing.
train_process_df["Num"] = train_process_df["Num"].astype("Int64")

In [78]:
# Column dtypes and non-null counts of the working frame after the Cabin split.
train_process_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 17 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
 14  deck          8494 non-null   object 
 15  Num           8494 non-null   Int64  
 16  Side          8494 non-null   object 
dtypes: Int64(1), bool(1), float64(6), object(9)
memory usage: 1.1+ MB


#### Destination

In [80]:
# Destination value counts, including missing values.
train_raw_df["Destination"].value_counts(dropna=False)

TRAPPIST-1e      5915
55 Cancri e      1800
PSO J318.5-22     796
NaN               182
Name: Destination, dtype: int64

In [81]:
# Grouped bars of Destination, colored by the Transported flag.
fig = px.bar(
    data_frame=train_raw_df,
    x="Destination",
    color="Transported",
    barmode="group",
)
fig.show()

#### Age

In [91]:
# Number of missing Age values.
train_raw_df["Age"].isna().sum()

179

In [85]:
# Summary statistics of Age.
train_raw_df["Age"].describe()

count    8514.000000
mean       28.827930
std        14.489021
min         0.000000
25%        19.000000
50%        27.000000
75%        38.000000
max        79.000000
Name: Age, dtype: float64

In [90]:
# Histogram of Age with a small gap between neighbouring bars.
fig = px.histogram(x=train_raw_df.Age, barmode="group").update_layout(bargap=0.1)
fig.show()

#### VIP (Feature)

In [95]:
# VIP value counts, including missing values.
train_raw_df["VIP"].value_counts(dropna=False)

False    8291
NaN       203
True      199
Name: VIP, dtype: int64

In [96]:
# Grouped bars of VIP, colored by the Transported flag.
fig = px.bar(
    data_frame=train_raw_df,
    x="VIP",
    color="Transported",
    barmode="group",
)
fig.show()

#### Name 

In [98]:
# Number of missing Name values.
train_raw_df["Name"].isna().sum()

200

In [99]:
# How often each full name occurs (missing names are excluded by default).
train_raw_df["Name"].value_counts()

Gollux Reedall        2
Elaney Webstephrey    2
Grake Porki           2
Sus Coolez            2
Apix Wala             2
                     ..
Jamela Griffy         1
Hardy Griffy          1
Salley Mckinn         1
Mall Frasp            1
Propsh Hontichre      1
Name: Name, Length: 8473, dtype: int64

#### RoomService

In [102]:
# Number of missing RoomService values.
train_raw_df["RoomService"].isna().sum()

181

In [103]:
# Frequency of each distinct RoomService amount.
train_raw_df["RoomService"].value_counts()

0.0       5577
1.0        117
2.0         79
3.0         61
4.0         47
          ... 
1612.0       1
2598.0       1
632.0        1
378.0        1
745.0        1
Name: RoomService, Length: 1273, dtype: int64

In [105]:
# Summary statistics of RoomService.
train_raw_df["RoomService"].describe()

count     8512.000000
mean       224.687617
std        666.717663
min          0.000000
25%          0.000000
50%          0.000000
75%         47.000000
max      14327.000000
Name: RoomService, dtype: float64

In [106]:
# Box plot of RoomService to inspect its spread and outliers.
fig = px.box(data_frame=train_raw_df, y="RoomService")
fig.show()

#### FoodCourt 

In [108]:
# Number of missing FoodCourt values.
train_raw_df["FoodCourt"].isna().sum()

183

In [109]:
# Box plot of FoodCourt; the figure is rendered as the cell's output.
px.box(data_frame=train_raw_df, y="FoodCourt")

#### ShoppingMall

In [110]:
# Number of missing ShoppingMall values.
train_raw_df["ShoppingMall"].isna().sum()

208

In [111]:
# Summary statistics of ShoppingMall.
train_raw_df["ShoppingMall"].describe()

count     8485.000000
mean       173.729169
std        604.696458
min          0.000000
25%          0.000000
50%          0.000000
75%         27.000000
max      23492.000000
Name: ShoppingMall, dtype: float64

In [112]:
# Box plot of ShoppingMall; the figure is rendered as the cell's output.
px.box(data_frame=train_raw_df, y="ShoppingMall")

#### Spa

In [115]:
# Number of missing Spa values.
train_raw_df["Spa"].isna().sum()

183

In [116]:
# Summary statistics of Spa.
train_raw_df["Spa"].describe()

count     8510.000000
mean       311.138778
std       1136.705535
min          0.000000
25%          0.000000
50%          0.000000
75%         59.000000
max      22408.000000
Name: Spa, dtype: float64

In [117]:
# Box plot of Spa; the figure is rendered as the cell's output.
px.box(data_frame=train_raw_df, y="Spa")

#### VRDeck

In [118]:
# Number of missing VRDeck values.
train_raw_df["VRDeck"].isna().sum()

188

In [119]:
# Summary statistics of VRDeck.
train_raw_df["VRDeck"].describe()

count     8505.000000
mean       304.854791
std       1145.717189
min          0.000000
25%          0.000000
50%          0.000000
75%         46.000000
max      24133.000000
Name: VRDeck, dtype: float64

In [120]:
# Box plot of VRDeck; the figure is rendered as the cell's output.
px.box(data_frame=train_raw_df, y="VRDeck")