In [111]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
# divide data into train and test sets
from sklearn.model_selection import train_test_split 

In [112]:
# load dataset
test_df = pd.read_csv('test.csv')

In [113]:
# LOOK AT THE DATA AND SEE IF THERE ARE ANY SUMMARY STATISTICS THAT MIGHT GIVE YOU SOME INSIGHTS

In [114]:
# print shape of train and test dataframes
print(train_df.shape) 
print(test_df.shape) 

(891, 14)
(418, 11)


In [115]:
# print first 5 rows of train dataframe
print(train_df.head())

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin  Embarked  has_cabin  is_male  
0      0         A/5 21171   7.2500   NaN         0          1        1  
1      0          PC 17599  71.2833   C85         0          0        0  
2      0  STON/O2. 3101282   7.9250   NaN         0          1        0  
3      0            1138

In [116]:
# print train dataframe info 
print(train_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 14 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          891 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     891 non-null    int64  
 12  has_cabin    891 non-null    int64  
 13  is_male      891 non-null    int32  
dtypes: float64(2), int32(1), int64(7), object(4)
memory usage: 94.1+ KB
None


In [117]:
# select all columns that are objects
categorical_columns = train_df.select_dtypes(include=['object'])
# print number of categorical columns
print(f'There are {len(categorical_columns.columns.tolist())} categorical columns in the dataset:')
# for each column in categorical columns, print column name and number of unique values
for cols in categorical_columns.columns: 
    print(cols,':', len(categorical_columns[cols].unique()),'labels')

There are 4 categorical columns in the dataset:
Name : 891 labels
Sex : 2 labels
Ticket : 681 labels
Cabin : 148 labels


In [118]:
# DATA CLEANING AND FEATURE ENGINEERING

In [119]:
# create booleans for each of the embarkment points
train_df['Embarked'] = train_df['Embarked'].fillna('S')
train_df['Embarked'] = train_df['Embarked'].map({'S': 0, 'C': 1, 'Q': 2})
# print the boolean variable for each of the embarkment points
print(train_df['Embarked'].value_counts())

Series([], Name: Embarked, dtype: int64)


In [120]:
# Create a boolean for is male. 
#not sure if this is how you do it

train_df_sex = pd.get_dummies(train_df['Sex'])
isMale = train_df_sex['male']
isMale

0      1
1      0
2      0
3      0
4      1
      ..
886    1
887    0
888    0
889    1
890    1
Name: male, Length: 891, dtype: uint8

In [121]:
#Create a boolean for whether someone has a cabin
PeopleWithCabin = train_df[train_df['Cabin'].isnull() == False]
PeopleWithoutCabin = train_df[train_df['Cabin'].isnull()]

#still working on how to create the boolean for this 

In [122]:
# check for missing values in the data
print(train_df.isnull().sum())

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked       891
has_cabin        0
is_male          0
dtype: int64


In [123]:
# create boolean variable for each of the embarkment points
# for each value in the Embarked column
for uwu in train_df.Embarked.unique(): 
    # if it is a string (just to be safe lol)
    if type(uwu) == str: 
        # create a new column with the boolean value
        train_df['emb' + uwu] = (train_df.Embarked == uwu) * 1 

In [124]:
# create boolean variable for is male
train_df['is_male'] = (train_df.Sex == 'male') * 1 

In [125]:
# create boolean variable for has cabin
train_df.loc[:, 'has_cabin'] = 0
train_df.loc[train_df.Cabin.isna(), 'has_cabin'] = 1

In [126]:
# fill missing age values as 100 
train_df['Age'].fillna(100)

0       22.0
1       38.0
2       26.0
3       35.0
4       35.0
       ...  
886     27.0
887     19.0
888    100.0
889     26.0
890     32.0
Name: Age, Length: 891, dtype: float64

In [127]:
# divide the data into a training set and a testing set
X = train_df[["Pclass", "Age", "Sex", "SibSp", "Parch", "Embarked"]]
Y = train_df["Survived"]

In [128]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, random_state = 0)