In [1]:
import acquire

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [2]:
# Use the function defined in acquire.py to load the iris data.
df = acquire.get_iris_data()

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 7 columns):
species_name      150 non-null object
measurement_id    150 non-null int64
sepal_length      150 non-null float64
sepal_width       150 non-null float64
petal_length      150 non-null float64
petal_width       150 non-null float64
species_id        150 non-null int64
dtypes: float64(4), int64(2), object(1)
memory usage: 8.3+ KB


In [4]:
# Drop the species_id and measurement_id columns.
df = df.drop(['species_id', 'measurement_id'], axis=1)
df.head()

Unnamed: 0,species_name,sepal_length,sepal_width,petal_length,petal_width
0,setosa,5.1,3.5,1.4,0.2
1,setosa,4.9,3.0,1.4,0.2
2,setosa,4.7,3.2,1.3,0.2
3,setosa,4.6,3.1,1.5,0.2
4,setosa,5.0,3.6,1.4,0.2


In [5]:
# Rename the species_name column to just species.
df.rename(columns={'species_name': 'species'}, inplace=True)
df.head()

Unnamed: 0,species,sepal_length,sepal_width,petal_length,petal_width
0,setosa,5.1,3.5,1.4,0.2
1,setosa,4.9,3.0,1.4,0.2
2,setosa,4.7,3.2,1.3,0.2
3,setosa,4.6,3.1,1.5,0.2
4,setosa,5.0,3.6,1.4,0.2


In [6]:
# Encode the species name using a sklearn label encoder.
encoder = LabelEncoder()
encoder.fit(df.species)
df.species = encoder.transform(df.species)
df.species

0      0
1      0
2      0
3      0
4      0
5      0
6      0
7      0
8      0
9      0
10     0
11     0
12     0
13     0
14     0
15     0
16     0
17     0
18     0
19     0
20     0
21     0
22     0
23     0
24     0
25     0
26     0
27     0
28     0
29     0
      ..
120    2
121    2
122    2
123    2
124    2
125    2
126    2
127    2
128    2
129    2
130    2
131    2
132    2
133    2
134    2
135    2
136    2
137    2
138    2
139    2
140    2
141    2
142    2
143    2
144    2
145    2
146    2
147    2
148    2
149    2
Name: species, Length: 150, dtype: int64

In [7]:
# Create a function named prep_iris that accepts the untransformed iris data, and
# returns the data with the transformations above applied.
def prep_iris(df):
    df = acquire.get_iris_data()
    df = df.drop(['species_id', 'measurement_id'], axis=1)
    df = df.rename(columns={'species_name': 'species'})
    encoder = LabelEncoder()
    encoder.fit(df.species)
    df.species = encoder.transform(df.species)
    return df
    

In [8]:
print(prep_iris(df))

     species  sepal_length  sepal_width  petal_length  petal_width
0          0           5.1          3.5           1.4          0.2
1          0           4.9          3.0           1.4          0.2
2          0           4.7          3.2           1.3          0.2
3          0           4.6          3.1           1.5          0.2
4          0           5.0          3.6           1.4          0.2
5          0           5.4          3.9           1.7          0.4
6          0           4.6          3.4           1.4          0.3
7          0           5.0          3.4           1.5          0.2
8          0           4.4          2.9           1.4          0.2
9          0           4.9          3.1           1.5          0.1
10         0           5.4          3.7           1.5          0.2
11         0           4.8          3.4           1.6          0.2
12         0           4.8          3.0           1.4          0.1
13         0           4.3          3.0           1.1         

In [9]:
# Use the function you defined in acquire.py to load the titanic data set.
df = acquire.get_titanic_data()

In [10]:
# Handle the missing values in the embark_town and embarked columns.
df.embark_town.value_counts(dropna=False)
df.embark_town.fillna('Other', inplace=True)
df.embark_town.value_counts(dropna=False)
df.embarked.value_counts(dropna=False)
df.embarked.fillna('Other', inplace=True)
df.embarked.value_counts(dropna=False)

S        644
C        168
Q         77
Other      2
Name: embarked, dtype: int64

In [11]:
# Remove the deck column.
df.drop(columns=['deck'], inplace=True)

In [12]:
# Use a label encoder to transform the embarked column.
df.info()
encoder = LabelEncoder()
encoder.fit(df.embarked)
df.embarked = encoder.transform(df.embarked)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
passenger_id    891 non-null int64
survived        891 non-null int64
pclass          891 non-null int64
sex             891 non-null object
age             714 non-null float64
sibsp           891 non-null int64
parch           891 non-null int64
fare            891 non-null float64
embarked        891 non-null object
class           891 non-null object
embark_town     891 non-null object
alone           891 non-null int64
dtypes: float64(2), int64(6), object(4)
memory usage: 83.6+ KB


In [13]:
# Scale the age and fare columns using a min max scaler.
# Why might this be beneficial? When might you not want to do this?
train, test = train_test_split(df, test_size=.2, random_state=123,)

scaler = MinMaxScaler()
scaler.fit(train[['age', 'fare']])
train.age = scaler.transform(train[['age', 'fare']])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


In [14]:
# Create a function named prep_titanic that accepts the untransformed titanic data, and
# returns the data with the transformations above applied.
def prep_titanic(df):
    df.embark_town.value_counts(dropna=False)
    df.embark_town.fillna('Other', inplace=True)
    df.embark_town.value_counts(dropna=False)
    df.embarked.value_counts(dropna=False)
    df.embarked.fillna('Other', inplace=True)
    df.embarked.value_counts(dropna=False)
    df.drop(columns=['deck'], inplace=True)
    df.info()
    encoder = LabelEncoder()
    encoder.fit(df.embarked)
    df.embarked = encoder.transform(df.embarked)
    return df

In [15]:
df

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,embark_town,alone
0,0,0,3,male,22.0,1,0,7.2500,3,Third,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,0,First,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.9250,3,Third,Southampton,1
3,3,1,1,female,35.0,1,0,53.1000,3,First,Southampton,0
4,4,0,3,male,35.0,0,0,8.0500,3,Third,Southampton,1
5,5,0,3,male,,0,0,8.4583,2,Third,Queenstown,1
6,6,0,1,male,54.0,0,0,51.8625,3,First,Southampton,1
7,7,0,3,male,2.0,3,1,21.0750,3,Third,Southampton,0
8,8,1,3,female,27.0,0,2,11.1333,3,Third,Southampton,0
9,9,1,2,female,14.0,1,0,30.0708,0,Second,Cherbourg,0


## Inverse Transform

In [16]:
df.embarked
encoder2 = LabelEncoder()
encoder2.fit(df.embarked)
df.embarked = encoder.inverse_transform(df.embarked)

In [17]:
df.embarked

0      S
1      C
2      S
3      S
4      S
5      Q
6      S
7      S
8      S
9      C
10     S
11     S
12     S
13     S
14     S
15     S
16     Q
17     S
18     S
19     C
20     S
21     S
22     Q
23     S
24     S
25     S
26     C
27     S
28     Q
29     S
      ..
861    S
862    S
863    S
864    S
865    S
866    C
867    S
868    S
869    S
870    S
871    S
872    S
873    S
874    C
875    C
876    S
877    S
878    S
879    C
880    S
881    S
882    S
883    S
884    S
885    Q
886    S
887    S
888    S
889    C
890    Q
Name: embarked, Length: 891, dtype: object