In [1]:
import pandas as pd
import numpy as np
import acquire
import util
import pandas_profiling
import prepare

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler

import warnings
warnings.filterwarnings('ignore')


In [2]:
df = acquire.get_titanic_data()

In [3]:
#pandas_profiling.ProfileReport(df)

Drop Columns:

    -I'm going to drop "deck" because it only has 203 columns out of 891

In [4]:
df.drop(columns=["deck"], inplace=True)

Fillna:

    -Do a consistent fill of missing values

In [5]:
df.fillna(np.nan, inplace=True)

Split:

    -Split the data into train and test sets

In [6]:
train, test = train_test_split(df, train_size=.8, random_state=123)

Impute:

    -fill the NANs with the mode value in embarked

In [7]:
imp_mode = SimpleImputer(missing_values=np.nan, strategy="most_frequent")

    -fit and transform train 
    
    -transform test

In [8]:
imp_mode.fit(train[["embarked"]])
train["embarked"] = imp_mode.transform(train[["embarked"]])
test["embarked"] = imp_mode.transform(test[["embarked"]])

In [9]:
train.embarked.value_counts()
test.embarked.value_counts()

S    129
C     40
Q     10
Name: embarked, dtype: int64

    -fill the NANs with the median value in age

In [10]:
imp_median = SimpleImputer(missing_values=np.nan, strategy="median")

In [11]:
train["age"] = imp_median.fit_transform(train[["age"]])

    -check to make sure there are no more NULL values in the age column


In [12]:
train.age.isnull().sum()

0

Encoding:

    -Integer Encoding
    
    -One hot encoder

    -create encoder

In [13]:
int_encoder = LabelEncoder()

    -fit to train set

In [14]:
int_encoder.fit(train.embarked)

LabelEncoder()

    -transform the "embarked" column data from letters to integers

In [15]:
train.embarked = int_encoder.transform(train.embarked)

    -check the counts of train

In [16]:
train.embarked.value_counts()

2    517
0    128
1     67
Name: embarked, dtype: int64

    -transform "embarked" into a np array

In [17]:
embarked_array = np.array(train.embarked)
embarked_array[0:5]

array([0, 1, 0, 1, 0])

    -transform "embarked" into a 2D array to feed to OneHot

In [18]:
embarked_array = embarked_array.reshape(len(embarked_array),1)

    -create OneHotEncoder

In [19]:
ohe = OneHotEncoder(sparse=False, categories="auto")

    -fit and transform to embarked_array

In [20]:
embarked_ohe = ohe.fit_transform(embarked_array)
embarked_ohe

array([[1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       ...,
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 1., 0.]])

In [21]:

test.embarked = int_encoder.transform(test.embarked)

    -transform test.embarked

In [22]:
embarked_array = np.array(test.embarked).reshape(len(test.embarked), 1)

    -transform embarked_array with OneHot

In [23]:
embarked_test_ohe = ohe.transform(embarked_array)

In [24]:
embarked_test_ohe[0:5]

array([[0., 0., 1.],
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [1., 0., 0.]])

In [25]:
df = acquire.get_iris_data()

In [26]:
prepare.prep_iris(df)
df.head()


Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [27]:
df = acquire.get_titanic_data()

In [28]:
prepare.prep_titanic(df)

(     passenger_id  survived  pclass     sex       age  sibsp  parch      fare  \
 0               0         0       3    male  0.271174      1      0  0.014151   
 1               1         1       1  female  0.472229      1      0  0.139136   
 2               2         1       3  female  0.321438      0      0  0.015469   
 3               3         1       1  female  0.434531      1      0  0.103644   
 4               4         0       3    male  0.434531      0      0  0.015713   
 5               5         0       3    male       NaN      0      0  0.016510   
 6               6         0       1    male  0.673285      0      0  0.101229   
 7               7         0       3    male  0.019854      3      1  0.041136   
 8               8         1       3  female  0.334004      0      2  0.021731   
 9               9         1       2  female  0.170646      1      0  0.058694   
 10             10         1       3  female  0.044986      1      1  0.032596   
 11             

In [29]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
passenger_id    891 non-null int64
survived        891 non-null int64
pclass          891 non-null int64
sex             891 non-null object
age             714 non-null float64
sibsp           891 non-null int64
parch           891 non-null int64
fare            891 non-null float64
embarked        889 non-null object
class           891 non-null object
embark_town     889 non-null object
alone           891 non-null int64
dtypes: float64(2), int64(6), object(4)
memory usage: 83.6+ KB


In [30]:
df.age.value_counts(dropna=False)

NaN      177
24.00     30
22.00     27
18.00     26
28.00     25
19.00     25
30.00     25
21.00     24
25.00     23
36.00     22
29.00     20
32.00     18
26.00     18
35.00     18
27.00     18
16.00     17
31.00     17
34.00     15
23.00     15
33.00     15
20.00     15
39.00     14
17.00     13
42.00     13
40.00     13
45.00     12
38.00     11
50.00     10
2.00      10
4.00      10
        ... 
28.50      2
63.00      2
0.83       2
30.50      2
70.00      2
57.00      2
0.75       2
13.00      2
59.00      2
10.00      2
64.00      2
40.50      2
45.50      2
32.50      2
20.50      1
24.50      1
0.67       1
70.50      1
0.92       1
74.00      1
34.50      1
14.50      1
80.00      1
12.00      1
53.00      1
36.50      1
55.50      1
66.00      1
23.50      1
0.42       1
Name: age, Length: 89, dtype: int64