In [1]:
# general imports
import pandas as pd
from sklearn.pipeline import Pipeline

In [2]:
# custom pipelines
from pipelines.data_preparation import CustomDataPreparation
from pipelines.data_preprocessing import CustomDataPreprocessing

<br>
<br>
<br>

Data collection

In [12]:
# load the training set from disk
data = pd.read_csv("./data/train.csv")

# separate the target column (y) from the remaining feature columns (X)
y = data["Transported"]
X = data.drop(columns=["Transported"])

<br>
<br>
<br>

Data preparation

In [13]:
# chain the two project-specific transformers into one sklearn Pipeline;
# steps are applied in the order they are listed
pipeline_steps = [
    ("data_preparation", CustomDataPreparation()),
    ("data_preprocessing", CustomDataPreprocessing()),
]
custom_pipeline = Pipeline(steps=pipeline_steps)

In [15]:
# fit the pipeline on the training features and transform them in the same call;
# the fitted pipeline is reused later to transform the test set
final_data = custom_pipeline.fit_transform(X)

In [17]:
# shape of the transformed training set as (rows, columns)
final_data.shape

(8693, 33)

In [18]:
# flag every column that still holds at least one missing value, then count
# how many columns are flagged (True) versus clean (False)
columns_with_nulls = final_data.isna().any()
columns_with_nulls.value_counts()

False    33
dtype: int64

In [19]:
# converting (True, False) -> (1, 0)
# A vectorized cast replaces the row-by-row lambda. Unlike the lambda, which
# silently mapped a missing label to 1 (NaN is truthy), the cast fails loudly
# if the target ever contains missing values.
final_y = y.astype(int)

<br>
<br>
<br>

Model training

In [21]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score

In [22]:
# creating random forest model
# random_state is pinned so the randomized tree construction, and therefore the
# fitted model and the accuracy figures computed from it, are reproducible
# when the notebook is re-run from a fresh kernel
forest = RandomForestClassifier(n_estimators=100, max_depth=7, random_state=42)

In [23]:
# fit the forest on the full training set (fit returns the estimator itself)
# and predict on that same data
predictions = forest.fit(final_data, final_y).predict(final_data)

# accuracy measured on the data the model was trained on
accuracy_score(y_true=final_y, y_pred=predictions)

0.8063959507649833

In [24]:
# estimate out-of-sample accuracy with 10-fold cross validation
scores = cross_val_score(
    estimator=forest,
    X=final_data,
    y=final_y,
    cv=10,
    scoring="accuracy",
)

# average accuracy over the folds
scores.mean()

0.7854644656958059

<br>

Saving model

In [25]:
import joblib

In [26]:
# persist the fitted forest to disk in the current working directory;
# the call returns the list of file names it wrote
joblib.dump(value=forest, filename="random_forest.pkl")

['random_forest.pkl']

<br>
<br>

Making predictions on test data

In [27]:
# loading model
# Read the file exactly where joblib.dump wrote it ("random_forest.pkl" in the
# working directory). The previous "./models/random_forest.pkl" path is never
# written by this notebook, so a fresh run would fail or load a stale model.
loaded_model = joblib.load("random_forest.pkl")

In [28]:
# read the test set from disk
test_data = pd.read_csv("./data/test.csv")

# preview the first five rows
test_data.head(n=5)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
0,0013_01,Earth,True,G/3/S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning
1,0018_01,Earth,False,F/4/S,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,Lerome Peckers
2,0019_01,Europa,True,C/0/S,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,Sabih Unhearfus
3,0021_01,Europa,False,C/1/S,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,Meratz Caltilter
4,0023_01,Earth,False,F/5/S,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,Brence Harperez


In [29]:
# keep the PassengerId column aside; it is needed to build the submission file
passenger_id = test_data["PassengerId"]

In [30]:
# transforming test data with the pipeline that was already fitted on the
# training features (transform only; the pipeline is not re-fitted here)
test_data_prepared = custom_pipeline.transform(test_data)

In [32]:
# shape of the transformed test set as (rows, columns); the column count
# should match the transformed training set the model was fitted on
test_data_prepared.shape

(4277, 33)

In [33]:
# flag every test-set column that still holds at least one missing value,
# then count how many columns are flagged (True) versus clean (False)
test_columns_with_nulls = test_data_prepared.isna().any()
test_columns_with_nulls.value_counts()

False    33
dtype: int64

In [35]:
# making predictions
test_predictions = loaded_model.predict(test_data_prepared)

# converting (1, 0) -> (True, False)
# one vectorized comparison instead of a per-row lambda with a redundant
# "True if ... else False" ternary
test_predictions_bool = pd.Series(test_predictions).eq(1)

In [39]:
# assemble the two columns required by the submission format:
# PassengerId and the predicted Transported flag
submission_columns = {
    "PassengerId": passenger_id,
    "Transported": test_predictions_bool,
}
test_df = pd.DataFrame(submission_columns)

In [40]:
# preview the first rows of the submission frame to confirm it has the
# PassengerId and Transported columns
test_df.head()

Unnamed: 0,PassengerId,Transported
0,0013_01,True
1,0018_01,False
2,0019_01,True
3,0021_01,True
4,0023_01,False


In [84]:
# saving dataframe as a csv file
# index=False is the documented way to omit the row index from the file;
# the previous index=None only worked because None is falsy
test_df.to_csv("./data/submission_1.csv", index=False)