In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Read the three input CSV files from the data directory.
# Forward slashes are used because they resolve on Windows, Linux and macOS,
# whereas a backslash is only a path separator on Windows.
DATA_DIR = 'data'

test = pd.read_csv(f'{DATA_DIR}/test.csv')
train = pd.read_csv(f'{DATA_DIR}/train.csv')
gender_submission = pd.read_csv(f'{DATA_DIR}/gender_submission.csv')

In [2]:
# Hold on to the test-set passenger ids before the column is dropped;
# they label the predictions in the submission file.
passenger_id = test.loc[:, 'PassengerId']

In [3]:
# Summarise the raw test frame: column names, dtypes and non-null counts
# (used to spot which columns contain missing values).
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


In [4]:
# Remove the columns that are not used as model features from both frames
unused_columns = ['PassengerId', 'Name', 'Ticket', 'Cabin']
test = test.drop(columns=unused_columns)
train = train.drop(columns=unused_columns)

In [5]:
# Re-check the test frame after dropping columns: remaining columns,
# dtypes and non-null counts.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    418 non-null    int64  
 1   Sex       418 non-null    object 
 2   Age       332 non-null    float64
 3   SibSp     418 non-null    int64  
 4   Parch     418 non-null    int64  
 5   Fare      417 non-null    float64
 6   Embarked  418 non-null    object 
dtypes: float64(2), int64(3), object(2)
memory usage: 23.0+ KB


In [6]:
# Same summary for the training frame, which additionally carries the
# 'Survived' target column.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Sex       891 non-null    object 
 3   Age       714 non-null    float64
 4   SibSp     891 non-null    int64  
 5   Parch     891 non-null    int64  
 6   Fare      891 non-null    float64
 7   Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(2)
memory usage: 55.8+ KB


In [7]:
# Fill missing Age/Fare values with medians computed on the TRAINING data.
# The same training medians are applied to the test frame so that both
# frames are preprocessed identically and no statistic is derived from the
# data the model is later asked to predict.
age_median = train['Age'].median()
fare_median = train['Fare'].median()

train['Age'] = train['Age'].fillna(age_median)
train['Fare'] = train['Fare'].fillna(fare_median)

test['Age'] = test['Age'].fillna(age_median)
test['Fare'] = test['Fare'].fillna(fare_median)

In [8]:
# One-hot encode the categorical columns 'Embarked' and 'Sex'.
# Encoding both columns in one call yields the same column order as two
# consecutive calls (remaining columns, then Embarked_*, then Sex_*).
categorical_columns = ['Embarked', 'Sex']
train = pd.get_dummies(train, columns=categorical_columns)
test = pd.get_dummies(test, columns=categorical_columns)

# The two frames are encoded independently, so a category present in only one
# of them would produce mismatched columns. Align the test frame to the
# training feature columns (everything except the target): dummies missing
# from test are added as all-zero columns, and columns unknown to the
# training data are dropped.
feature_columns = [col for col in train.columns if col != 'Survived']
test = test.reindex(columns=feature_columns, fill_value=0)

In [9]:
# Standardize the 'Age' and 'Fare' columns (zero mean, unit standard deviation).
# The mean and std are taken from the training data and reused for the test
# data, so both frames are on the same scale the model is fitted on.
for column in ('Age', 'Fare'):
    # Capture the statistics before overwriting the training column.
    column_mean = train[column].mean()
    column_std = train[column].std()
    train[column] = (train[column] - column_mean) / column_std
    test[column] = (test[column] - column_mean) / column_std

In [10]:
# Separate the feature matrix from the target column
X = train.drop(columns='Survived')
y = train['Survived']

In [11]:
# Build a random forest with scikit-learn, evaluate it on a held-out
# validation split, and predict the test set.
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Split features and target into training and validation parts.
# The feature splits get their own names (X_train / X_val) so that `train`
# keeps referring to the full training frame and earlier cells stay re-runnable.
X_train, X_val, train_labels, val_labels = train_test_split(X, y, random_state=1)

# Create and fit the model (fixed random_state for reproducible results)
model = RandomForestClassifier(n_estimators=400, max_depth=5, random_state=1)
model.fit(X_train, train_labels)

# Accuracy on the data the model was fitted on
pred_train = model.predict(X_train)
print(accuracy_score(train_labels, pred_train))

# Accuracy on the held-out validation data
pred_val = model.predict(X_val)
print(accuracy_score(val_labels, pred_val))

# Predict the test set, selecting the columns in the order used for fitting
pred_test = model.predict(test[X_train.columns])


0.8667664670658682
0.7802690582959642


In [12]:
# Name of the CSV file to upload; being a relative path, it is written to the
# directory the notebook runs in.
filename = 'titanic-predictions.csv'

# Pair each test passenger id with its predicted label
submission = pd.DataFrame(data={'PassengerId': passenger_id, 'Survived': pred_test})

# Write the file without the DataFrame index column
submission.to_csv(path_or_buf=filename, index=False)