# Exercise 1-03: Applying Feature Engineering to Text Data

In [9]:
# In this exercise, we will be converting the text features of the tips dataset into numerical data.

In [10]:
# Bring in pandas, seaborn and scikit-learn's LabelEncoder class, then fetch
# seaborn's example "tips" dataset and preview its first five rows.
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import LabelEncoder

tips = sns.load_dataset("tips")
tips.head(n=5)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [11]:
# Convert each text feature of `tips` into integer codes with LabelEncoder.
# fit_transform() learns the distinct labels of a column and returns one
# integer per row; each column is cast to str first and then overwritten in
# place with its codes.
#
# A separate encoder is fitted for every column and kept in `encoders`, so a
# code can later be mapped back to its label with
# encoders[column].inverse_transform(...). Refitting one shared encoder would
# retain only the labels of the last column it saw.
#
# NOTE(review): scikit-learn documents LabelEncoder as a target (y) encoder;
# the integer codes carry no meaningful order for these features -- confirm
# that this is acceptable for whatever consumes the encoded frame.
categorical_columns = ["sex", "smoker", "day", "time"]
encoders = {}
for column in categorical_columns:
    # `enc` ends up bound to the encoder of the last column ("time").
    enc = LabelEncoder()
    tips[column] = enc.fit_transform(tips[column].astype('str'))
    encoders[column] = enc

In [12]:
# Show the first five rows of tips to check that the text columns now hold
# integer codes.
tips.head(n=5)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,0,0,2,0,2
1,10.34,1.66,1,0,2,0,3
2,21.01,3.5,1,0,2,0,3
3,23.68,3.31,1,0,2,0,2
4,24.59,3.61,0,0,2,0,4


# Exercise 1-04: Normalizing and Standardizing Data

In [14]:
# Min-max normalization of the whole tips dataset:
#     (value - column minimum) / (column maximum - column minimum)
# Every column is rescaled independently, so its smallest value maps to 0 and
# its largest to 1. The result is stored in tips_normalized and the first
# 10 rows are displayed.
column_min = tips.min()
column_range = tips.max() - column_min
tips_normalized = (tips - column_min) / column_range
tips_normalized.head(n=10)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,0.291579,0.001111,0.0,0.0,0.666667,0.0,0.2
1,0.152283,0.073333,1.0,0.0,0.666667,0.0,0.4
2,0.375786,0.277778,1.0,0.0,0.666667,0.0,0.4
3,0.431713,0.256667,1.0,0.0,0.666667,0.0,0.2
4,0.450775,0.29,0.0,0.0,0.666667,0.0,0.6
5,0.465438,0.412222,1.0,0.0,0.666667,0.0,0.6
6,0.119397,0.111111,1.0,0.0,0.666667,0.0,0.2
7,0.498743,0.235556,1.0,0.0,0.666667,0.0,0.6
8,0.250733,0.106667,1.0,0.0,0.666667,0.0,0.2
9,0.245287,0.247778,1.0,0.0,0.666667,0.0,0.2


In [15]:
# Standardization (z-score) of the whole tips dataset:
#     (value - column mean) / column standard deviation
# Every column is shifted and scaled independently so that it ends up with
# mean 0 and standard deviation 1; unlike normalization, the values are not
# confined to the 0-1 range. Only location and scale change -- the shape of
# each column's distribution stays the same.
# pandas' std() computes the sample standard deviation (ddof=1) by default.
# The result is stored in tips_standardized and the first 10 rows are shown.
column_mean = tips.mean()
column_std = tips.std()
tips_standardized = (tips - column_mean) / column_std
tips_standardized.head(n=10)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,-0.314066,-1.436993,-1.340598,-0.783179,0.278585,-0.620307,-0.598961
1,-1.061054,-0.967217,0.742879,-0.783179,0.278585,-0.620307,0.452453
2,0.137497,0.36261,0.742879,-0.783179,0.278585,-0.620307,0.452453
3,0.437416,0.225291,0.742879,-0.783179,0.278585,-0.620307,-0.598961
4,0.539635,0.442111,-1.340598,-0.783179,0.278585,-0.620307,1.503867
5,0.618266,1.237116,0.742879,-0.783179,0.278585,-0.620307,1.503867
6,-1.237411,-0.721488,0.742879,-0.783179,0.278585,-0.620307,-0.598961
7,0.796869,0.087972,0.742879,-0.783179,0.278585,-0.620307,1.503867
8,-0.533108,-0.750398,0.742879,-0.783179,0.278585,-0.620307,-0.598961
9,-0.562313,0.167472,0.742879,-0.783179,0.278585,-0.620307,-0.598961
