In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

# ignore warnings
import warnings
warnings.filterwarnings("ignore")

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler

from acquire import get_titanic_data

df = get_titanic_data()

In [2]:
df.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


In [3]:
df.tail(3)

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
888,888,0,3,female,,1,2,23.45,S,Third,,Southampton,0
889,889,1,1,male,26.0,0,0,30.0,C,First,C,Cherbourg,1
890,890,0,3,male,32.0,0,0,7.75,Q,Third,,Queenstown,1


In [4]:
df.shape

(891, 13)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 13 columns):
passenger_id    891 non-null int64
survived        891 non-null int64
pclass          891 non-null int64
sex             891 non-null object
age             714 non-null float64
sibsp           891 non-null int64
parch           891 non-null int64
fare            891 non-null float64
embarked        889 non-null object
class           891 non-null object
deck            203 non-null object
embark_town     889 non-null object
alone           891 non-null int64
dtypes: float64(2), int64(6), object(5)
memory usage: 90.6+ KB


In [6]:
df.describe()

Unnamed: 0,passenger_id,survived,pclass,age,sibsp,parch,fare,alone
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0,891.0
mean,445.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208,0.602694
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429,0.489615
min,0.0,0.0,1.0,0.42,0.0,0.0,0.0,0.0
25%,222.5,0.0,2.0,20.125,0.0,0.0,7.9104,0.0
50%,445.0,0.0,3.0,28.0,0.0,0.0,14.4542,1.0
75%,667.5,1.0,3.0,38.0,1.0,0.0,31.0,1.0
max,890.0,1.0,3.0,80.0,8.0,6.0,512.3292,1.0


In [7]:
df.sex.value_counts()

male      577
female    314
Name: sex, dtype: int64

In [8]:
df.fare.value_counts(bins=10, sort=False)

(-0.513, 51.233]      732
(51.233, 102.466]     106
(102.466, 153.699]     31
(153.699, 204.932]      2
(204.932, 256.165]     11
(256.165, 307.398]      6
(307.398, 358.63]       0
(358.63, 409.863]       0
(409.863, 461.096]      0
(461.096, 512.329]      3
Name: fare, dtype: int64

In [9]:
df.embark_town.value_counts(dropna=False)

Southampton    644
Cherbourg      168
Queenstown      77
NaN              2
Name: embark_town, dtype: int64

In [10]:
df.embarked

0      S
1      C
2      S
3      S
4      S
5      Q
6      S
7      S
8      S
9      C
10     S
11     S
12     S
13     S
14     S
15     S
16     Q
17     S
18     S
19     C
20     S
21     S
22     Q
23     S
24     S
25     S
26     C
27     S
28     Q
29     S
      ..
861    S
862    S
863    S
864    S
865    S
866    C
867    S
868    S
869    S
870    S
871    S
872    S
873    S
874    C
875    C
876    S
877    S
878    S
879    C
880    S
881    S
882    S
883    S
884    S
885    Q
886    S
887    S
888    S
889    C
890    Q
Name: embarked, Length: 891, dtype: object

In [11]:
df.embark_town.fillna('Other', inplace=True)

In [12]:
df.embark_town.value_counts()

Southampton    644
Cherbourg      168
Queenstown      77
Other            2
Name: embark_town, dtype: int64

In [13]:
df.embarked.value_counts(dropna=False)

S      644
C      168
Q       77
NaN      2
Name: embarked, dtype: int64

In [14]:
df.drop(columns=['deck'], inplace=True)

In [17]:
df.embarked.head(10)

0    S
1    C
2    S
3    S
4    S
5    Q
6    S
7    S
8    S
9    C
Name: embarked, dtype: object

In [15]:
df.embarked.fillna('O', inplace=True)

In [16]:
df.embarked.value_counts()

S    644
C    168
Q     77
O      2
Name: embarked, dtype: int64

In [19]:
encoder = LabelEncoder()

df.embarked.fillna('Unknown', inplace=True)

encoder.fit(df.embarked)
df.embarked = encoder.transform(df.embarked)

df.embarked.head(10)

0    3
1    0
2    3
3    3
4    3
5    2
6    3
7    3
8    3
9    0
Name: embarked, dtype: int64

In [20]:
df.describe()

Unnamed: 0,passenger_id,survived,pclass,age,sibsp,parch,fare,embarked,alone
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0,891.0,891.0
mean,445.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208,2.343434,0.602694
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429,1.167398,0.489615
min,0.0,0.0,1.0,0.42,0.0,0.0,0.0,0.0,0.0
25%,222.5,0.0,2.0,20.125,0.0,0.0,7.9104,2.0,0.0
50%,445.0,0.0,3.0,28.0,0.0,0.0,14.4542,3.0,1.0
75%,667.5,1.0,3.0,38.0,1.0,0.0,31.0,3.0,1.0
max,890.0,1.0,3.0,80.0,8.0,6.0,512.3292,3.0,1.0


In [22]:
from sklearn_model_selection import train_test_split
train, test - train_test_split(df)

#find the parameters for the scaling from the traiing data set, then apply to the best data set

fare_min = train.Fare.min()
fare_max - train.Fare.max()

# scale the training data
train.Fare = (train.Fare - fare_min) / (fare_max - fare_min)

# scale the test data
test.Fare = (test.Fare - fare_min) / (fare_max - fare_min)

ModuleNotFoundError: No module named 'sklearn_model_selection'

In [23]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
scaler.fit(train[['fare']])
train.fare = scaler.transform(train[['fare']])


NameError: name 'train' is not defined