In [1]:
import numpy as np
import pandas as pd 

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LogisticRegression


In [2]:
df_train_raw = pd.read_csv('train.csv')
df_test_raw = pd.read_csv('test.csv')

In [3]:
df_train_raw

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,Europa,False,A/98/P,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther,False
8689,9278_01,Earth,True,G/1499/S,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley,False
8690,9279_01,Earth,False,G/1500/S,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,Fayey Connon,True
8691,9280_01,Europa,False,E/608/S,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre,False


In [4]:
df_test_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4277 entries, 0 to 4276
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   4277 non-null   object 
 1   HomePlanet    4190 non-null   object 
 2   CryoSleep     4184 non-null   object 
 3   Cabin         4177 non-null   object 
 4   Destination   4185 non-null   object 
 5   Age           4186 non-null   float64
 6   VIP           4184 non-null   object 
 7   RoomService   4195 non-null   float64
 8   FoodCourt     4171 non-null   float64
 9   ShoppingMall  4179 non-null   float64
 10  Spa           4176 non-null   float64
 11  VRDeck        4197 non-null   float64
 12  Name          4183 non-null   object 
dtypes: float64(6), object(7)
memory usage: 434.5+ KB


In [5]:
values = {'HomePlanet': "no_info",
        'CryoSleep': 0,
        'Cabin': "no_info/no_info/no_info",
        'Destination': "no_info",
        'Age': df_test_raw.Age.mean(), 
        'VIP': 0, 
        'RoomService': 0, 
        'FoodCourt': 0, 
        'ShoppingMall': 0, 
        'Spa': 0, 
        'VRDeck': 0, 
        'Name': "no_info"
        }

columns = ['HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'Age', 'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Name']


df_test_raw[columns] = df_test_raw[columns].fillna(value=values)
df_train_raw[columns] = df_train_raw[columns].fillna(value=values)



In [6]:
df_train_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8693 non-null   object 
 2   CryoSleep     8693 non-null   object 
 3   Cabin         8693 non-null   object 
 4   Destination   8693 non-null   object 
 5   Age           8693 non-null   float64
 6   VIP           8693 non-null   object 
 7   RoomService   8693 non-null   float64
 8   FoodCourt     8693 non-null   float64
 9   ShoppingMall  8693 non-null   float64
 10  Spa           8693 non-null   float64
 11  VRDeck        8693 non-null   float64
 12  Name          8693 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB


In [7]:
df_train_raw.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8693 non-null   object 
 2   CryoSleep     8693 non-null   object 
 3   Cabin         8693 non-null   object 
 4   Destination   8693 non-null   object 
 5   Age           8693 non-null   float64
 6   VIP           8693 non-null   object 
 7   RoomService   8693 non-null   float64
 8   FoodCourt     8693 non-null   float64
 9   ShoppingMall  8693 non-null   float64
 10  Spa           8693 non-null   float64
 11  VRDeck        8693 non-null   float64
 12  Name          8693 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB


In [8]:
df_train_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8693 non-null   object 
 2   CryoSleep     8693 non-null   object 
 3   Cabin         8693 non-null   object 
 4   Destination   8693 non-null   object 
 5   Age           8693 non-null   float64
 6   VIP           8693 non-null   object 
 7   RoomService   8693 non-null   float64
 8   FoodCourt     8693 non-null   float64
 9   ShoppingMall  8693 non-null   float64
 10  Spa           8693 non-null   float64
 11  VRDeck        8693 non-null   float64
 12  Name          8693 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB


In [9]:
print("Destination unique - ", df_train_raw.Destination.unique())
print("HomePlanet unique - ", df_train_raw.HomePlanet.unique(), "count = ", len(df_train_raw.HomePlanet.unique()))
print("Cabin unique - ", df_train_raw.Cabin.unique(), ", count =", len(df_train_raw.Cabin.unique()))

Destination unique -  ['TRAPPIST-1e' 'PSO J318.5-22' '55 Cancri e' 'no_info']
HomePlanet unique -  ['Europa' 'Earth' 'Mars' 'no_info'] count =  4
Cabin unique -  ['B/0/P' 'F/0/S' 'A/0/S' ... 'G/1499/S' 'G/1500/S' 'E/608/S'] , count = 6561


In [10]:
df_train_raw

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,Europa,False,A/98/P,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther,False
8689,9278_01,Earth,True,G/1499/S,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley,False
8690,9279_01,Earth,False,G/1500/S,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,Fayey Connon,True
8691,9280_01,Europa,False,E/608/S,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre,False


### Dummies and data transforming

In [11]:
def data_preprocessing_space(df):

    # Split "Cabin" info 
    cabin_deck_value = df["Cabin"].map(lambda x: x.split('/')[0])
    cabin_side_value = df["Cabin"].map(lambda x: x.split('/')[2])

    df = df.assign(Cabin_deck = cabin_deck_value)
    df = df.assign(Cabin_side = cabin_side_value)

    # Split "PassengerId" info 
    PassengerId_value = df["PassengerId"].map(lambda x: x.split('_')[0])
    Passenger_group_value = df["PassengerId"].map(lambda x: x.split('_')[1])

    df = df.assign(PassengerId = PassengerId_value)
    df = df.assign(Passenger_group = Passenger_group_value)


    # Create Dummies
    df_train_HomePlanet = pd.get_dummies(df['HomePlanet'], drop_first= True)
    df_train_Destination = pd.get_dummies(df['Destination'], drop_first= True)

    df_train_deck = pd.get_dummies(df['Cabin_deck'], drop_first= True)
    df_train_side = pd.get_dummies(df['Cabin_side'], drop_first= True)

    df_Passenger_group = pd.get_dummies(df['Passenger_group'], drop_first= True)


    # add dummies into df
    concat_list = [df, df_train_HomePlanet, df_train_Destination, df_train_deck, df_train_side, df_Passenger_group]
    df = pd.concat(concat_list, axis=1)

    # transform bool values to "0" and "1"
    df['CryoSleep'] = df['CryoSleep'].map({False:0, True:1})
    df['VIP'] = df['VIP'].map({False:0, True:1})


    # drop useless columns
    drop_list = ['PassengerId', 'Name', 'HomePlanet', 'Destination', 'Cabin', 'Cabin_deck', 'Cabin_side', 'Passenger_group']
    df = df.drop(drop_list, axis=1)

    df.columns.values

    # df.columns = ['CryoSleep', 'Age', 'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',
    #    'planet_Europa', 'planet_Mars', 'dest_PSO J318.5-22', 'dest_TRAPPIST-1e', 
    #    'deck_B', 'deck_C', 'deck_D','deck_E', 'deck_F', 'deck_G', 'deck_T', 'side_S', 
    #    'group_02', 'group_03', 'group_04', 'group_05', 'group_06', 'group_07', 'group_08']

   #  column_names_reordered = 

   #  df = df[column_names_reordered]

    return df
    

In [12]:
targets = df_train_raw['Transported'].map({False:0, True:1})
df_train_raw = df_train_raw.drop(['Transported'], axis=1)

In [13]:
df_train = data_preprocessing_space(df_train_raw)
df_train.columns.values

array(['CryoSleep', 'Age', 'VIP', 'RoomService', 'FoodCourt',
       'ShoppingMall', 'Spa', 'VRDeck', 'Europa', 'Mars', 'no_info',
       'PSO J318.5-22', 'TRAPPIST-1e', 'no_info', 'B', 'C', 'D', 'E', 'F',
       'G', 'T', 'no_info', 'S', 'no_info', '02', '03', '04', '05', '06',
       '07', '08'], dtype=object)

In [14]:
df_test = data_preprocessing_space(df_test_raw)

### Scaler

In [15]:
scaler = StandardScaler()

In [16]:
list_for_scale = ['Age', 'RoomService', 'FoodCourt',
       'ShoppingMall', 'Spa', 'VRDeck']

unscaled_inputs = df_train[list_for_scale]
unscaled_inputs_test = df_test[list_for_scale]

unscaled_inputs

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
0,39.0,0.0,0.0,0.0,0.0,0.0
1,24.0,109.0,9.0,25.0,549.0,44.0
2,58.0,43.0,3576.0,0.0,6715.0,49.0
3,33.0,0.0,1283.0,371.0,3329.0,193.0
4,16.0,303.0,70.0,151.0,565.0,2.0
...,...,...,...,...,...,...
8688,41.0,0.0,6819.0,0.0,1643.0,74.0
8689,18.0,0.0,0.0,0.0,0.0,0.0
8690,26.0,0.0,0.0,1872.0,1.0,0.0
8691,32.0,0.0,1049.0,0.0,353.0,3235.0


### Scale df_train

In [17]:
scaler.fit(unscaled_inputs)

scaled_inputs = scaler.transform(unscaled_inputs)

df_train[list_for_scale] = scaled_inputs

df_train

Unnamed: 0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Europa,Mars,...,no_info,S,no_info.1,02,03,04,05,06,07,08
0,0,0.709680,0,-0.333105,-0.281027,-0.283579,-0.270626,-0.263003,1,0,...,0,0,0,0,0,0,0,0,0,0
1,0,-0.336473,0,-0.168073,-0.275387,-0.241771,0.217158,-0.224205,0,0,...,0,1,0,0,0,0,0,0,0,0
2,0,2.034807,1,-0.268001,1.959998,-0.283579,5.695623,-0.219796,1,0,...,0,1,0,0,0,0,0,0,0,0
3,0,0.291219,0,-0.333105,0.523010,0.336851,2.687176,-0.092818,1,0,...,0,1,0,1,0,0,0,0,0,0
4,0,-0.894421,0,0.125652,-0.237159,-0.031059,0.231374,-0.261240,0,0,...,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,0,0.849167,1,-0.333105,3.992336,-0.283579,1.189173,-0.197751,1,0,...,0,0,0,0,0,0,0,0,0,0
8689,1,-0.754934,0,-0.333105,-0.281027,-0.283579,-0.270626,-0.263003,0,0,...,0,1,0,0,0,0,0,0,0,0
8690,0,-0.196986,0,-0.333105,-0.281027,2.846999,-0.269737,-0.263003,0,0,...,0,1,0,0,0,0,0,0,0,0
8691,0,0.221475,0,-0.333105,0.376365,-0.283579,0.043013,2.589576,1,0,...,0,1,0,0,0,0,0,0,0,0


### Scale df_test

In [18]:
scaler.fit(unscaled_inputs_test)

scaled_inputs_test = scaler.transform(unscaled_inputs_test)

df_test[list_for_scale] = scaled_inputs_test

df_test

Unnamed: 0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Europa,Mars,...,no_info,S,no_info.1,02,03,04,05,06,07,08
0,1,-1.182216e-01,0,-0.357339,-0.283840,-0.312173,-0.267841,-0.246712,0,0,...,0,1,0,0,0,0,0,0,0,0
1,0,-6.886014e-01,0,-0.357339,-0.277879,-0.312173,2.287504,-0.246712,0,0,...,0,1,0,0,0,0,0,0,0,0
2,1,1.669682e-01,0,-0.357339,-0.283840,-0.312173,-0.267841,-0.246712,1,0,...,0,1,0,0,0,0,0,0,0,0
3,0,6.660505e-01,0,-0.357339,4.121518,-0.312173,-0.104002,0.226648,1,0,...,0,1,0,0,0,0,0,0,0,0
4,0,-6.173039e-01,0,-0.340723,-0.283840,0.832122,-0.267841,-0.246712,0,0,...,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4272,1,3.808606e-01,0,-0.357339,-0.283840,-0.312173,-0.267841,-0.246712,0,0,...,0,1,0,1,0,0,0,0,0,0
4273,0,9.512404e-01,0,-0.357339,0.277095,-0.281538,-0.258790,-0.130193,0,0,...,1,0,1,0,0,0,0,0,0,0
4274,1,-2.532995e-16,0,-0.357339,-0.283840,-0.312173,-0.267841,-0.246712,0,1,...,0,0,0,0,0,0,0,0,0,0
4275,0,-2.532995e-16,0,-0.357339,1.491019,-0.312173,-0.267841,0.176479,1,0,...,0,0,0,0,0,0,0,0,0,0


In [19]:
df_train.columns.values == df_test.columns.values

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True])

In [20]:
# balancing no needed 

print(f"Persentage of 'Transported' passenger {(targets.sum() / len(df_train) * 100):.2f} %")

Persentage of 'Transported' passenger 50.36 %


In [21]:
df_train

Unnamed: 0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Europa,Mars,...,no_info,S,no_info.1,02,03,04,05,06,07,08
0,0,0.709680,0,-0.333105,-0.281027,-0.283579,-0.270626,-0.263003,1,0,...,0,0,0,0,0,0,0,0,0,0
1,0,-0.336473,0,-0.168073,-0.275387,-0.241771,0.217158,-0.224205,0,0,...,0,1,0,0,0,0,0,0,0,0
2,0,2.034807,1,-0.268001,1.959998,-0.283579,5.695623,-0.219796,1,0,...,0,1,0,0,0,0,0,0,0,0
3,0,0.291219,0,-0.333105,0.523010,0.336851,2.687176,-0.092818,1,0,...,0,1,0,1,0,0,0,0,0,0
4,0,-0.894421,0,0.125652,-0.237159,-0.031059,0.231374,-0.261240,0,0,...,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,0,0.849167,1,-0.333105,3.992336,-0.283579,1.189173,-0.197751,1,0,...,0,0,0,0,0,0,0,0,0,0
8689,1,-0.754934,0,-0.333105,-0.281027,-0.283579,-0.270626,-0.263003,0,0,...,0,1,0,0,0,0,0,0,0,0
8690,0,-0.196986,0,-0.333105,-0.281027,2.846999,-0.269737,-0.263003,0,0,...,0,1,0,0,0,0,0,0,0,0
8691,0,0.221475,0,-0.333105,0.376365,-0.283579,0.043013,2.589576,1,0,...,0,1,0,0,0,0,0,0,0,0


### Checkpoint

In [22]:
df_train_preprocessed = df_train.copy()
df_test_preprocessed = df_test.copy()

In [23]:
df_train_preprocessed.to_csv('Space_titanic_train_preprocessed.csv', index= False)
df_test_preprocessed.to_csv('Space_titanic_test_preprocessed.csv', index= False)

### LogisticRegression

In [24]:
x_train, x_test, y_train, y_test = train_test_split(df_train_preprocessed, targets, train_size=0.8, random_state=20)

In [25]:
print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

(6954, 31) (1739, 31) (6954,) (1739,)


In [26]:
reg = LogisticRegression()

In [27]:
reg.fit(x_train, y_train)

LogisticRegression()

### Accuracy

In [28]:
reg.score(x_train, y_train)

0.7888984756974403

In [29]:
feature_name = x_train.columns.values

summary_table = pd.DataFrame(columns=['feature_name'], data=feature_name)
summary_table['Coefficien'] = np.transpose(reg.coef_)

summary_table['Odds_ratio'] = np.exp(summary_table['Coefficien'])
summary_table.sort_values('Odds_ratio', ascending=False)


Unnamed: 0,feature_name,Coefficien,Odds_ratio
15,C,2.214546,9.157252
8,Europa,1.490638,4.439926
0,CryoSleep,1.337249,3.80855
14,B,0.99353,2.700751
4,FoodCourt,0.723999,2.062665
18,F,0.577363,1.781334
22,S,0.576642,1.78005
9,Mars,0.576016,1.778938
10,no_info,0.430475,1.537987
16,D,0.40974,1.506426


In [30]:
result = reg.predict(df_test)

result

array([1, 0, 1, ..., 1, 1, 1], dtype=int64)

In [31]:
df_predict = df_test_raw['PassengerId']

df_result = pd.DataFrame(df_predict)
df_result['Transported'] = result

df_result['Transported'] = df_result['Transported'].map({1:True, 0:False})


df_result

Unnamed: 0,PassengerId,Transported
0,0013_01,True
1,0018_01,False
2,0019_01,True
3,0021_01,True
4,0023_01,True
...,...,...
4272,9266_02,True
4273,9269_01,False
4274,9271_01,True
4275,9273_01,True


In [32]:
df_result.to_csv('submission.csv', index= False)