In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

In [2]:
from pipelines.data_preparation import CustomDataPreparation
from pipelines.data_preprocessing import CustomDataPreprocessing

In [3]:
# Snake-case labels for the dataset columns. They are assigned positionally below,
# so their order has to match the column order of the CSV file.
new_columns = [
    "passenger_id",
    "home_planet",
    "cryo_sleep",
    "cabin",
    "destination",
    "age",
    "vip",
    "room_service",
    "food_court",
    "shopping_mall",
    "spa",
    "vr_deck",
    "name",
    "transported",
]

# Load the training data and relabel its columns.
data = pd.read_csv("./data/train.csv")
data.columns = new_columns

# Separate the target (y) from the feature frame (X).
y = data["transported"]
X = data.drop(columns=["transported"])

In [4]:
# Preview the first rows of the feature frame (target column already removed).
X.head()

Unnamed: 0,passenger_id,home_planet,cryo_sleep,cabin,destination,age,vip,room_service,food_court,shopping_mall,spa,vr_deck,name
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines


In [5]:
# Hold out 10% of the rows as a test set; the rest is used for training.
# Rows are shuffled with a fixed seed so the split is the same on every run.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    shuffle=True,
    random_state=42,
)

In [6]:
# (rows, columns) of the training features after the split.
x_train.shape

(7823, 13)

<br>
<br>
<br>

In [7]:
# Two project-defined stages chained in order: preparation first, then preprocessing.
pipeline_steps = [
    ("data_preparation", CustomDataPreparation()),
    ("data_preprocessing", CustomDataPreprocessing()),
]
custom_pipeline = Pipeline(steps=pipeline_steps)

In [8]:
# Fit both pipeline stages on the training features and transform them in one pass.
# Only x_train is passed here, so the pipeline is fitted without the held-out rows.
final_data = custom_pipeline.fit_transform(x_train)

In [9]:
# Preview the transformed training features produced by the pipeline.
final_data.head()

Unnamed: 0,age,room_service,food_court,shopping_mall,spa,vr_deck,group,number_in_group,num_in_cabin,total_spending,...,deck_T,side_P,side_S,age_category_child,age_category_middle_adult,age_category_old_adult,age_category_young_adult,cryo_sleep,vip,gender
1432,0.235294,31.435897,0.0,0.045455,0.0,0.0,-0.689033,0.0,-0.144231,0.35343,...,0,0,1,0,1,0,0,0,0,1.0
6858,-0.058824,0.0,12.892086,0.0,11.896552,0.024691,0.574414,0.0,-0.223558,0.602911,...,0,1,0,0,0,0,1,0,0,1.0
4436,-0.176471,0.0,0.0,0.0,0.0,0.0,0.01584,0.0,0.425481,-0.496881,...,0,1,0,0,0,0,1,1,0,1.0
7230,0.352941,0.0,0.0,0.0,7.517241,5.530864,0.678693,0.0,0.115385,-0.039501,...,0,0,1,0,1,0,0,0,0,0.0
2992,-0.352941,28.128205,0.0,3.636364,10.155172,0.0,-0.309097,0.0,-0.36899,0.726958,...,0,1,0,0,0,0,1,0,0,1.0


In [10]:
# (rows, columns) of the transformed training features.
final_data.shape

(7823, 33)

In [11]:
# Sanity check for missing values: per column, does it contain any NaN?
# The value counts then show how many columns fall into each answer.
final_data.isna().any().value_counts()

False    33
dtype: int64

In [17]:
# Encode the training target as 0/1 integers with a vectorised cast rather than a
# per-element Python lambda.
# NOTE(review): assumes y_train holds True/False with no missing values - confirm.
final_y = y_train.astype("int64")

In [18]:
# Preview the integer-encoded training target.
final_y.head()

1432    0
6858    1
4436    0
7230    1
2992    0
Name: transported, dtype: int64

<br>
<br>
<br>

In [14]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score

In [59]:
# Random forest with default hyper-parameters. The seed is pinned (same value as the
# train/test split) so that the fitted model and every score below are reproducible.
forest = RandomForestClassifier(random_state=42)

In [60]:
# Fit the forest on the training features, then score it on those same rows.
# This is accuracy on the data the model was fitted on, so it is an optimistic
# figure and not an estimate of performance on unseen data.
forest.fit(final_data, final_y)
predictions = forest.predict(final_data)

accuracy_score(y_true=final_y, y_pred=predictions)

1.0

In [61]:
# 10-fold cross-validated accuracy on the training set; the mean over the folds is shown.
scores = cross_val_score(
    estimator=forest,
    X=final_data,
    y=final_y,
    scoring="accuracy",
    cv=10,
)
scores.mean()

0.8051902153498416

<br>

In [42]:
# Apply the pipeline to the held-out features. Only transform() is called here,
# so the pipeline is not re-fitted on the test rows.
final_test_data = custom_pipeline.transform(x_test)

In [47]:
# The model was fitted on final_data, so the test frame needs every training column.
# Add whichever ones the transformed test split lacks as all-zero columns, without
# overwriting columns that are already present and without mutating in place.
# In the recorded run the only missing column is deck_T.
# NOTE(review): zero is the right fill for one-hot indicator columns - confirm that
# only indicator columns can be missing.
missing_columns = [col for col in final_data.columns if col not in final_test_data.columns]
final_test_data = final_test_data.assign(**{col: 0 for col in missing_columns})

In [51]:
# Preview the transformed test features.
final_test_data.head()

Unnamed: 0,age,room_service,food_court,shopping_mall,spa,vr_deck,group,number_in_group,num_in_cabin,total_spending,...,deck_T,side_P,side_S,age_category_child,age_category_middle_adult,age_category_old_adult,age_category_young_adult,cryo_sleep,vip,gender
304,-0.470588,10.692308,5.021583,28.818182,0.051724,26.098765,-0.947091,1.0,-0.418269,1.2079,...,0,0,1,0,0,0,1,0,0,1.0
2697,-0.529412,0.102564,13.007194,0.0,0.0,0.024691,-0.385216,0.0,0.058894,0.133056,...,0,0,1,0,0,0,1,0,0,0.0
8424,0.823529,0.0,0.0,0.0,0.0,0.0,0.95831,0.0,1.247596,-0.496881,...,0,0,1,0,1,0,0,1,0,0.0
1672,0.470588,0.0,4.863309,19.818182,0.0,0.0,-0.631614,0.0,-0.144231,0.039501,...,0,1,0,0,1,0,0,0,0,0.0
8458,0.941176,0.0,0.0,0.0,0.0,0.0,0.96623,1.0,-0.147837,-0.496881,...,0,1,0,0,1,0,0,1,0,1.0


In [44]:
# Sanity check for missing values in the test features: per column, does it contain
# any NaN? The value counts then show how many columns fall into each answer.
final_test_data.isna().any().value_counts()

False    32
dtype: int64

In [49]:
# Put the test columns in exactly the order the model was trained on. The order is taken
# from final_data (the frame the forest was fitted on) instead of a hand-copied list of
# names, so it cannot drift if the pipeline's output columns change.
final_test_data = final_test_data[list(final_data.columns)]

In [50]:
# (rows, columns) of the aligned test features; the column count should equal the training one.
final_test_data.shape

(870, 33)

In [52]:
# Encode the test target as 0/1 integers with a vectorised cast rather than a
# per-element Python lambda.
# NOTE(review): assumes y_test holds True/False with no missing values - confirm.
final_test_y = y_test.astype("int64")

In [54]:
# Number of test labels; should equal the number of rows in final_test_data.
final_test_y.shape

(870,)

<br>

In [62]:
# Score the fitted forest on the held-out test set.
predictions = forest.predict(final_test_data)
accuracy_score(y_true=final_test_y, y_pred=predictions)

0.7793103448275862