In [1]:
import pandas as pd
import numpy as np
import acquire
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler

### Acquire Data

In [2]:
# Load the Titanic passenger data through the project-local acquire module.
# Per the original note, get_titanic_data() reads from a SQL database; that
# cannot be verified from this notebook alone.
df = acquire.get_titanic_data()

Preview data:

In [3]:
# Preview the first five rows to sanity-check the columns and spot obvious gaps
df.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


In [4]:
# Summary statistics for the numeric columns. A column whose count is below
# the row count (here age) contains missing values.
df.describe()

Unnamed: 0,passenger_id,survived,pclass,age,sibsp,parch,fare,alone
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0,891.0
mean,445.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208,0.602694
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429,0.489615
min,0.0,0.0,1.0,0.42,0.0,0.0,0.0,0.0
25%,222.5,0.0,2.0,20.125,0.0,0.0,7.9104,0.0
50%,445.0,0.0,3.0,28.0,0.0,0.0,14.4542,1.0
75%,667.5,1.0,3.0,38.0,1.0,0.0,31.0,1.0
max,890.0,1.0,3.0,80.0,8.0,6.0,512.3292,1.0


In [5]:
# Column dtypes: object columns are the non-numeric ones that need encoding later
df.dtypes

passenger_id      int64
survived          int64
pclass            int64
sex              object
age             float64
sibsp             int64
parch             int64
fare            float64
embarked         object
class            object
deck             object
embark_town      object
alone             int64
dtype: object

### Prepare Data

- drop columns
- fillna
- split
- impute mean, mode, median: SimpleImputer
- integer encoding: LabelEncoder
- one hot encoding: OneHotEncoder
- scale

In [6]:
# Drop the deck column because the majority of its values are null.
# Rebinding df (instead of inplace=True) together with errors='ignore' keeps
# this cell idempotent: re-running it once deck is already gone is a no-op
# rather than a KeyError.
df = df.drop(columns=['deck'], errors='ignore')

In [7]:
# Normalise missing-value markers to np.nan. Filling NaN with NaN does not
# remove any missing values.
# NOTE(review): presumably this is here so that None values coming back from
# the database become np.nan, which is what the
# SimpleImputer(missing_values=np.nan) calls below look for -- confirm.
df.fillna(np.nan, inplace=True)

In [8]:
# Split data into train (80%) and test (the remainder); random_state pins the
# shuffle so the split is reproducible.
# .copy() gives each split its own data: later cells assign into train/test
# columns, and writing into a selection taken from df can trigger pandas'
# SettingWithCopyWarning (silenced in this notebook by the global warnings filter).
train, test = (part.copy() for part in train_test_split(df, train_size=.8, random_state=123))

In [9]:
# Count the embarked values in train; dropna=False keeps NaN as its own
# bucket so the missing rows are visible.
train.embarked.value_counts(dropna=False)

S      515
C      128
Q       67
NaN      2
Name: embarked, dtype: int64

An imputer fills in missing values. It can replace them in different ways, e.g. with the mean, median, or mode of the column. Here we impute with the mode and with the median, using scikit-learn's SimpleImputer.

In [10]:
# Imputer that replaces NaN with the most frequent value (the mode);
# 'most_frequent' is the strategy hyperparameter that selects this.
imp_mode = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
# Fit on the train embarked column and overwrite that column with the
# imputed values in one step.
train['embarked'] = imp_mode.fit_transform(train[['embarked']])
# Apply the train-fitted imputer to test as well. We never fit on test,
# because test is supposed to stand in for unseen data.
test['embarked'] = imp_mode.transform(test[['embarked']])

In [11]:
# Re-count the train embarked values to confirm the 2 NaNs were imputed as S.
# value_counts() omits NaN by default, so the evidence is the S count rising
# by 2, not the absence of a NaN row.
train.embarked.value_counts()

S    517
C    128
Q     67
Name: embarked, dtype: int64

Create a new imputer using the median strategy and use it to fill in the NaNs in the age column.

In [12]:
# Age distribution in train before imputing; dropna=False surfaces the
# missing ages (148 NaNs) as their own bucket.
train.age.value_counts(dropna=False)

NaN      148
22.00     24
24.00     22
30.00     22
28.00     21
21.00     21
36.00     21
18.00     20
19.00     20
25.00     17
29.00     16
35.00     15
32.00     15
26.00     14
16.00     14
31.00     14
34.00     13
27.00     13
23.00     13
20.00     12
33.00     11
39.00     11
17.00     11
38.00     10
42.00      9
47.00      9
4.00       9
45.00      9
2.00       8
40.00      8
        ... 
58.00      2
45.50      2
59.00      2
10.00      2
6.00       2
71.00      2
32.50      2
55.00      2
13.00      2
64.00      2
70.00      2
46.00      2
0.42       1
74.00      1
55.50      1
0.92       1
0.67       1
14.50      1
80.00      1
34.50      1
23.50      1
12.00      1
63.00      1
53.00      1
36.50      1
11.00      1
65.00      1
66.00      1
70.50      1
0.75       1
Name: age, Length: 85, dtype: int64

In [13]:
# Imputer that replaces NaN with the column median.
imp_median = SimpleImputer(missing_values=np.nan, strategy='median')
# Learn the median age from train only and fill the train NaNs with it.
train['age'] = imp_median.fit_transform(train[['age']])
# Fill the test NaNs with the median learned from train. The imputer is not
# refit on test, so no information leaks from the held-out data.
test['age'] = imp_median.transform(test[['age']])
# Confirm no missing ages remain in train (expected: 0)
train.age.isnull().sum()

0

In [14]:
# After imputing, the former NaNs are counted in the median-age bucket (28.0),
# so that bucket grows by the number of previously missing ages and no NaN
# bucket remains.
train.age.value_counts(dropna=False)

28.00    169
22.00     24
24.00     22
30.00     22
21.00     21
36.00     21
19.00     20
18.00     20
25.00     17
29.00     16
35.00     15
32.00     15
26.00     14
16.00     14
31.00     14
34.00     13
27.00     13
23.00     13
20.00     12
39.00     11
33.00     11
17.00     11
38.00     10
45.00      9
47.00      9
42.00      9
4.00       9
2.00       8
40.00      8
48.00      7
        ... 
6.00       2
45.50      2
30.50      2
61.00      2
71.00      2
58.00      2
46.00      2
70.00      2
13.00      2
55.00      2
32.50      2
64.00      2
0.42       1
74.00      1
55.50      1
0.92       1
0.67       1
14.50      1
80.00      1
34.50      1
23.50      1
12.00      1
63.00      1
53.00      1
36.50      1
11.00      1
65.00      1
66.00      1
70.50      1
0.75       1
Name: age, Length: 84, dtype: int64

### Encoding

Encoding allows us to convert columns of strings into integers. We can then use these integers as features that can be fed into algorithms/models.

1. integer encoding
1. one hot encoding

In [15]:
# Before encoding, embarked holds the string codes S, C and Q
train.embarked.value_counts()

S    517
C    128
Q     67
Name: embarked, dtype: int64

In [16]:
# LabelEncoder maps each distinct string to an integer code.
int_encoder = LabelEncoder()
# Learn the mapping from the train embarked column and overwrite that column
# with the integer codes; int_encoder keeps the mapping for reuse on test.
train['embarked'] = int_encoder.fit_transform(train['embarked'])


In [17]:
# Values are now the integer codes 0, 1 and 2; the counts match the string
# counts from before encoding (C -> 0, Q -> 1, S -> 2).
train.embarked.value_counts()

2    517
0    128
1     67
Name: embarked, dtype: int64

In [18]:
# Copy the encoded train column out of the frame into a standalone NumPy array
embarked_array = np.array(train['embarked'])
# Peek at the first five codes
embarked_array[:5]

array([0, 1, 0, 1, 0])

In [19]:
# The one-hot encoder expects a 2-d array, so turn the flat vector into a
# single column; -1 lets NumPy infer the row count.
embarked_array = embarked_array.reshape(-1, 1)

In [20]:
# Create the one-hot encoder. sparse=False asks for a dense NumPy array back
# instead of a sparse matrix; categories='auto' lets the encoder determine the
# categories from the data it is fit on.
# NOTE(review): newer scikit-learn releases rename `sparse` to `sparse_output`
# -- confirm against the installed version before upgrading.
ohe = OneHotEncoder(sparse=False, categories='auto')

In [21]:
# Learn the categories from the train codes ...
ohe.fit(embarked_array)
# ... then expand each code into its own 0/1 indicator column
embarked_ohe = ohe.transform(embarked_array)
# Preview the resulting one-hot matrix
embarked_ohe

array([[1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       ...,
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 1., 0.]])

In [23]:
# Encode test with the mapping learned from train (transform only, no refit)
test['embarked'] = int_encoder.transform(test['embarked'])

In [24]:
# Column-shaped (n, 1) array of the encoded test values for the one-hot encoder.
# Note: this rebinds embarked_array, which previously held the train codes.
embarked_array = np.array(test['embarked']).reshape(-1, 1)

In [25]:
# One-hot encode the test codes with the encoder already fit on train
# (transform only, so the columns line up with embarked_ohe)
embarked_test_ohe = ohe.transform(embarked_array)

In [26]:
# Preview the first five one-hot rows for test
embarked_test_ohe[0:5]

array([[0., 0., 1.],
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [1., 0., 0.]])