In [1]:
import pandas as pd
pd.set_option('display.max_rows', None)
import numpy as np

# Carga de datos

In [2]:
# Read the training split from a path relative to the notebook's working directory.
train_data = pd.read_csv("../spaceship-titanic/train.csv")
# Print (rows, columns) and preview the first three rows as a quick sanity check.
print(train_data.shape)
train_data.head(3)

(8693, 14)


Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False


In [3]:
# Read the test split from the same relative folder as the training split.
test_data = pd.read_csv("../spaceship-titanic/test.csv")
# Show (rows, columns) of the test split.
test_data.shape

(4277, 13)

In [4]:
# Tag every row with its split of origin so both splits can be transformed
# together and separated again at the end of the notebook.
train_data = train_data.assign(Dataset='train')
test_data = test_data.assign(Dataset='test')

# Stack both splits into a single frame with a fresh 0..n-1 index.
data = pd.concat(objs=(train_data, test_data), ignore_index=True)
data.shape

(12970, 15)

In [5]:
# Column dtypes of the combined train + test frame.
data.dtypes

PassengerId      object
HomePlanet       object
CryoSleep        object
Cabin            object
Destination      object
Age             float64
VIP              object
RoomService     float64
FoodCourt       float64
ShoppingMall    float64
Spa             float64
VRDeck          float64
Name             object
Transported      object
Dataset          object
dtype: object

# ETL Process

## Tamaño del grupo del pasajero

In [6]:
# Split PassengerId on '_' into its first piece (PassengerGroup) and second
# piece (PassengerGroupNum). The vectorised `.str.split(expand=True)` avoids
# building one pd.Series per row, which is what made the row-wise apply slow.
passengersInfo = (
    data['PassengerId']
    .str.split('_', expand=True)
    .rename(columns={0: 'PassengerGroup', 1: 'PassengerGroupNum'})
    [['PassengerGroup', 'PassengerGroupNum']]
)
# Index-aligned merge: attaches the two new string columns to every row.
data = data.merge(passengersInfo, how='inner', left_index=True, right_index=True)

In [7]:
# GroupSize = the largest PassengerGroupNum seen inside each PassengerGroup,
# taken as the max of the string values and then cast to integer.
group_size = (
    passengersInfo
    .groupby('PassengerGroup')[['PassengerGroupNum']]
    .max()
    .rename(columns={'PassengerGroupNum': 'GroupSize'})
    .reset_index()
)
group_size['GroupSize'] = group_size['GroupSize'].astype('int')

# Attach the size to every passenger, then discard the identifier columns.
data = (
    data
    .merge(group_size, how='left', on='PassengerGroup')
    .drop(columns=['PassengerId', 'PassengerGroup', 'PassengerGroupNum'])
)

In [8]:
# Column dtypes after adding GroupSize and dropping the passenger identifier columns.
data.dtypes

HomePlanet       object
CryoSleep        object
Cabin            object
Destination      object
Age             float64
VIP              object
RoomService     float64
FoodCourt       float64
ShoppingMall    float64
Spa             float64
VRDeck          float64
Name             object
Transported      object
Dataset          object
GroupSize         int64
dtype: object

## Nueva categoría para los nulos en `HomePlanet`

In [9]:
# Missing HomePlanet values become their own "Ns/Nc" category.
# Assign the result back explicitly: calling fillna(inplace=True) on the Series
# returned by `data.HomePlanet` is a chained in-place update, which pandas 2.x
# warns about and which does not modify `data` under Copy-on-Write.
data['HomePlanet'] = data['HomePlanet'].fillna("Ns/Nc")

## Nueva categoría para los nulos en `CryoSleep`

In [10]:
# Cast CryoSleep to the nullable string dtype, then label the missing
# entries with the extra "Ns/Nc" category.
cryo_sleep_as_text = data['CryoSleep'].astype('string')
data['CryoSleep'] = cryo_sleep_as_text.fillna('Ns/Nc')

## Cabin deck and cabin side

In [11]:
# Cabin is split on '/': piece 0 becomes CabinDeck and piece 2 becomes
# CabinSide (piece 1 is discarded). Rows without a cabin produce missing
# values, which are mapped to the "Ns/Nc" category.
# The vectorised `.str.split(expand=True)` replaces a row-wise apply that built
# one pd.Series per row.
passengersInfo = (
    data['Cabin']
    .str.split('/', expand=True)
    .rename(columns={0: 'CabinDeck', 2: 'CabinSide'})
    [['CabinDeck', 'CabinSide']]
    .fillna('Ns/Nc')
)
# Index-aligned merge, then drop the raw Cabin column.
data = (
    data
    .merge(passengersInfo, how='inner', left_index=True, right_index=True)
    .drop(columns=['Cabin'])
)

In [12]:
# Column dtypes after replacing Cabin with CabinDeck and CabinSide.
data.dtypes

HomePlanet       object
CryoSleep        string
Destination      object
Age             float64
VIP              object
RoomService     float64
FoodCourt       float64
ShoppingMall    float64
Spa             float64
VRDeck          float64
Name             object
Transported      object
Dataset          object
GroupSize         int64
CabinDeck        object
CabinSide        object
dtype: object

## Nueva categoría para los nulos en `Destination`

In [13]:
# Missing Destination values become their own "Ns/Nc" category.
# Assign the result back explicitly: calling fillna(inplace=True) on the Series
# returned by `data.Destination` is a chained in-place update, which pandas 2.x
# warns about and which does not modify `data` under Copy-on-Write.
data['Destination'] = data['Destination'].fillna("Ns/Nc")

## Nueva categoría para los nulos en `VIP`

In [14]:
# Cast VIP to the nullable string dtype, then label the missing entries
# with the extra "Ns/Nc" category.
vip_as_text = data['VIP'].astype('string')
data['VIP'] = vip_as_text.fillna('Ns/Nc')

## Nivel de gasto total

- Puede que sea suficiente tener una variable binaria que indique si un pasajero ha tenido gastos extra (`Expenses`).
- También se podrían normalizar las variables de gastos extra por separado o binarizarlas.

In [15]:
# Row-wise total of the five spending columns (sum() skips missing values,
# so a missing amount contributes 0 to the total).
expense_columns = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
data['TotalExpenses'] = data[expense_columns].sum(axis=1)

# Cut points come from the 33%, 66% and 95% quantiles of the non-zero totals.
nonzero_totals = data.loc[data['TotalExpenses'] != 0, 'TotalExpenses']
quantiles = nonzero_totals.quantile([0.33, 0.66, 0.95]).to_numpy()

# Final edges: [0, 0.5] isolates the zero spenders, then the three quantiles,
# then +inf as the open upper bound.
bins = np.concatenate(([0, 0.5], quantiles, [np.inf]))

data['TotalExpensesBinarized'] = pd.cut(
    data['TotalExpenses'],
    bins=bins,
    labels=["ZERO", "LOW", "MEDIUM", "HIGH", "TOP"],
    include_lowest=True,
)

# Only the binned level is kept; the raw amounts and the total are discarded.
data = data.drop(columns=expense_columns + ['TotalExpenses'])

## Rango de edad

En primer lugar, imputamos la edad de los pasajeros que no la tienen informada con la mediana de edad de su categoría de `TotalExpensesBinarized`.

In [16]:
# Median age of each expense level, broadcast back to every row with
# transform (no intermediate frame or merge needed). `observed` is passed
# explicitly because grouping by a categorical column without it raises a
# FutureWarning in recent pandas; it does not change the per-row medians.
data['AgeMedianByExpenses'] = (
    data
    .groupby('TotalExpensesBinarized', observed=True)['Age']
    .transform('median')
)
# Fill only the missing ages; known ages are left untouched.
# AgeMedianByExpenses stays in the frame because the next cell drops it.
data['Age'] = data['Age'].fillna(data['AgeMedianByExpenses'])

In [17]:
# Right-closed age intervals: (-1, 15], (15, 30], ..., (75, inf).
age_edges = [-1, 15, 30, 45, 60, 75, np.inf]
age_labels = ["0-15", "16-30", "31-45", "46-60", "61-75", "76-?"]
data['AgeBinarized'] = pd.cut(data['Age'], bins=age_edges, labels=age_labels)

# The numeric age and the helper median column are no longer needed.
data = data.drop(columns=['Age', 'AgeMedianByExpenses'])

## Descartamos la variable `Name`

In [18]:
# Name is not used as a feature, so it is removed from the frame.
data = data.drop(columns=['Name'])

# Conjuntos de train y test

In [19]:
# Undo the concatenation: recover each split through the Dataset marker and
# drop the marker itself from the resulting frames.
data_train = data.loc[data['Dataset'] == 'train'].drop(columns=['Dataset'])
data_test = data.loc[data['Dataset'] == 'test'].drop(columns=['Dataset'])
(data_train.shape, data_test.shape)

((8693, 10), (4277, 10))

In [20]:
# Persist both processed splits as CSV without writing the index column.
# NOTE(review): to_csv does not create missing directories, so `data/` must
# already exist relative to the working directory — confirm before a fresh run.
data_train.to_csv("data/processed_train.csv", index=False)
data_test.to_csv("data/processed_test.csv", index=False)

In [21]:
# Column dtypes of the fully processed combined frame (Dataset marker still present).
data.dtypes

HomePlanet                  object
CryoSleep                   string
Destination                 object
VIP                         string
Transported                 object
Dataset                     object
GroupSize                    int64
CabinDeck                   object
CabinSide                   object
TotalExpensesBinarized    category
AgeBinarized              category
dtype: object

In [22]:
# Preview three random rows; the fixed seed keeps the displayed sample
# identical across Restart & Run All.
data.sample(3, random_state=0)

Unnamed: 0,HomePlanet,CryoSleep,Destination,VIP,Transported,Dataset,GroupSize,CabinDeck,CabinSide,TotalExpensesBinarized,AgeBinarized
9170,Earth,False,TRAPPIST-1e,False,,test,1,G,S,MEDIUM,16-30
9357,Mars,True,TRAPPIST-1e,False,,test,7,F,S,ZERO,16-30
11620,Europa,False,TRAPPIST-1e,False,,test,1,C,P,HIGH,46-60
