In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

In [2]:
# read the training set from disk and preview its first rows
data = pd.read_csv(filepath_or_buffer="./data/train.csv")
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
# dropping name column because it is free text
# dropping cabin column because more than half of its values are null
# reassigning (instead of inplace=True) with errors="ignore" keeps this cell
# safe to re-run: a second run no longer raises KeyError on the missing columns
data = data.drop(columns=["Name", "Cabin"], errors="ignore")

In [4]:
# shape of data as (rows, columns) after dropping Name and Cabin
data.shape

(891, 10)

<br>

### Filling null values

In [5]:
# filling with mode
# mode() returns a Series (several values can tie); its first entry is used,
# and age_mode is kept as a Series because the test pipeline reads age_mode[0]
age_mode = data["Age"].mode()
# assign the filled column back: data.Age.fillna(..., inplace=True) is chained
# assignment on a temporary, which warns in pandas 2.x and leaves `data`
# unchanged once Copy-on-Write is enabled
data["Age"] = data["Age"].fillna(age_mode[0])

In [6]:
# filling with max frequency
# value_counts() sorts by descending count, so the first index label is the
# most common embarkation port
max_freq = data["Embarked"].value_counts().index[0]
# assign the filled column back rather than using chained inplace fillna,
# which warns in pandas 2.x and is a no-op under Copy-on-Write
data["Embarked"] = data["Embarked"].fillna(max_freq)

In [7]:
# summarise non-null counts and dtypes per column to confirm the fills above
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Sex          891 non-null    object 
 4   Age          891 non-null    float64
 5   SibSp        891 non-null    int64  
 6   Parch        891 non-null    int64  
 7   Ticket       891 non-null    object 
 8   Fare         891 non-null    float64
 9   Embarked     891 non-null    object 
dtypes: float64(2), int64(5), object(3)
memory usage: 69.7+ KB


<br>

### Feature selection

In [36]:
# separating features and labels
features = ["Pclass", "Sex", "Age", "Fare"]
# .copy() gives X its own data, so the column renaming and encoding done in
# the following cells do not act on a slice of `data` (SettingWithCopyWarning)
X = data[features].copy()
y = data["Survived"]

In [37]:
# changing column names to standard form
# renaming by name rather than by position: assigning a list to X.columns
# silently mislabels the columns if this cell is re-run after the columns
# have been reordered, whereas a name mapping is a no-op on a second run
X = X.rename(columns={"Pclass": "p_class", "Sex": "sex", "Age": "age", "Fare": "fare"})

In [38]:
# put the feature columns into a fixed order: age, fare, p_class, sex
X = X.loc[:, ["age", "fare", "p_class", "sex"]]

<br>

### Data preprocessing

In [39]:
# converting str to int: female -> 0, anything else -> 1
# - isin(["female", 0]) also recognises already-encoded rows, so re-running
#   this cell no longer turns every value into 1
# - the comparison is vectorised instead of a per-row lambda
# - assign() builds a new frame instead of setting an attribute on X, which
#   avoids SettingWithCopyWarning
X = X.assign(sex=np.where(X["sex"].isin(["female", 0]), 0, 1))

In [12]:
# maps an embarkation code to an integer label
def embarked_to_num(s: str) -> int:
    """Return 0 for 'S', 1 for 'C', and 2 for any other value."""
    for code, port in enumerate(("S", "C")):
        if s == port:
            return code
    # every value other than 'S' or 'C' falls through to the last category
    return 2

In [13]:
# not applied: "Embarked" is not among the selected features, so X has no embarked column
# X.embarked = X.embarked.apply(lambda x: embarked_to_num(x))

In [40]:
# preview the feature matrix after renaming, reordering and encoding
X.head()

Unnamed: 0,age,fare,p_class,sex
0,22.0,7.25,3,1
1,38.0,71.2833,1,0
2,26.0,7.925,3,0
3,35.0,53.1,1,0
4,35.0,8.05,3,1


<br>

### Training the model

In [18]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score

In [41]:
# random forest model
# max_leaf_nodes caps the number of leaves of each tree; random_state fixes the
# bootstrap and feature sampling so the scores reported below are reproducible
forest = RandomForestClassifier(max_leaf_nodes=30, random_state=42)

In [42]:
# training model on every row of X (no hold-out split is made here)
forest.fit(X, y)

In [43]:
# accuracy measured on the same rows the model was fitted on
predictions = forest.predict(X=X)
accuracy_score(y_true=y, y_pred=predictions)

0.8888888888888888

In [44]:
# mean accuracy over 10 cross-validation folds
scores = cross_val_score(estimator=forest, X=X, y=y, scoring="accuracy", cv=10)
print(scores.mean())

0.8474406991260924


In [45]:
# feature importance
# values follow the column order of X as passed to fit: age, fare, p_class, sex
forest.feature_importances_

array([0.17733119, 0.24355881, 0.14652861, 0.43258139])

<br>

### Evaluating on test data

In [75]:
# read the test set from disk
test_data = pd.read_csv(filepath_or_buffer="./data/test.csv")

In [76]:
# transformation pipeline: applies the same preprocessing steps as the training
# data, using fill values computed on the training set

# dropping columns; reassigning with errors="ignore" keeps the cell re-runnable
# (an inplace drop raises KeyError on the second run)
test_data = test_data.drop(columns=["Name", "Cabin"], errors="ignore")
passenger_id = test_data["PassengerId"]

# filling null values
# the filled columns are assigned back instead of calling fillna(inplace=True)
# on test_data.Age / test_data.Fare, which is chained assignment: it warns in
# pandas 2.x and leaves test_data unchanged under Copy-on-Write
test_data["Age"] = test_data["Age"].fillna(age_mode[0])

fare_median = data["Fare"].median()
test_data["Fare"] = test_data["Fare"].fillna(fare_median)

# feature selection
features = ["Pclass", "Sex", "Age", "Fare"]

# renaming by name (not by position) returns a new frame, so X_test is not a
# slice of test_data when it is modified below
X_test = test_data[features].rename(
    columns={"Pclass": "p_class", "Sex": "sex", "Age": "age", "Fare": "fare"}
)

# rearranging columns into the order the model was fitted with
X_test = X_test[["age", "fare", "p_class", "sex"]]

# data preprocessing: female -> 0, anything else -> 1
X_test = X_test.assign(sex=np.where(X_test["sex"] == "female", 0, 1))

In [77]:
# preview the preprocessed test features
X_test.head()

Unnamed: 0,age,fare,p_class,sex
0,34.5,7.8292,3,1
1,47.0,7.0,3,0
2,62.0,9.6875,2,1
3,27.0,8.6625,3,1
4,22.0,12.2875,3,0


In [78]:
# predicted Survived label (0 or 1) for every row of the test features
predictions = forest.predict(X=X_test)
predictions

array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0,
       1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1,
       0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1,
       0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [79]:
# pair each passenger id with its predicted label
answer_df = pd.DataFrame(dict(PassengerId=passenger_id, Survived=predictions))

In [80]:
# saving to csv file
# create the output directory first: to_csv does not create missing parent
# directories and raises OSError when ./data/predictions/ does not exist yet
output_path = Path("./data/predictions/random_forest.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
answer_df.to_csv(output_path, index=False)