In [1]:
# Silence every warning for the rest of the session so cell output stays clean.
# NOTE(review): no category or module is given, so this is a blanket filter --
# deprecation and convergence warnings from the libraries below are hidden too.
import warnings
warnings.filterwarnings("ignore")

# wrangle data
import pandas as pd
import numpy as np

# visualizing
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

# preparing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# modeling



# Default float rendering for pandas output: thousands separators, two decimals,
# right-aligned in a 20-character field.
pd.set_option("display.float_format", "{:20,.2f}".format)

# my modules
import wrangle_mall as wm
import summarize
import prepare
import env

### Acquire df

In [2]:
# Acquire the customer DataFrame through the project-local wrangle_mall module (imported as wm).
# NOTE(review): where the helper reads the data from is not visible in this notebook.
df = wm.get_mallcustomer_data()

In [3]:
# Print the helper's overview of the freshly acquired frame; the output below
# reports its shape, .info() and descriptive statistics.
wm.df_summary(df)

--- Shape: (200, 4)
--- Info
<class 'pandas.core.frame.DataFrame'>
Int64Index: 200 entries, 1 to 200
Data columns (total 4 columns):
gender            200 non-null object
age               200 non-null int64
annual_income     200 non-null int64
spending_score    200 non-null int64
dtypes: int64(3), object(1)
memory usage: 7.8+ KB
--- Descriptions
        gender                  age        annual_income       spending_score
count      200               200.00               200.00               200.00
unique       2                  nan                  nan                  nan
top     Female                  nan                  nan                  nan
freq       112                  nan                  nan                  nan
mean       NaN                38.85                60.56                50.20
std        NaN                13.97                26.26                25.82
min        NaN                18.00                15.00                 1.00
25%        NaN             

### Prepare df

- Handle outliers


    - First, I will treat values above `Q3 + 1.5*IQR` or below `Q1 - 1.5*IQR` as outliers and remove those rows.

In [4]:
# Numeric columns to screen for outliers, and the IQR multiplier used for the fences.
columns = ["age", "annual_income", "spending_score"]
iqr_multiplier = 1.5

# NOTE(review): df is rebound to the filtered result, so the unfiltered frame is no
# longer reachable and re-running this cell filters already-filtered data.
df = wm.remove_outliers_iqr(df, columns, iqr_multiplier)

- The outlier-handling function removed two rows, both flagged as high outliers in the `annual_income` column.

In [5]:
# Summarize again to confirm the effect of outlier removal on row count and statistics.
wm.df_summary(df)

--- Shape: (198, 4)
--- Info
<class 'pandas.core.frame.DataFrame'>
Int64Index: 198 entries, 1 to 198
Data columns (total 4 columns):
gender            198 non-null object
age               198 non-null int64
annual_income     198 non-null int64
spending_score    198 non-null int64
dtypes: int64(3), object(1)
memory usage: 7.7+ KB
--- Descriptions
        gender                  age        annual_income       spending_score
count      198               198.00               198.00               198.00
unique       2                  nan                  nan                  nan
top     Female                  nan                  nan                  nan
freq       112                  nan                  nan                  nan
mean       NaN                38.93                59.79                50.20
std        NaN                14.02                25.24                25.75
min        NaN                18.00                15.00                 1.00
25%        NaN             

- Split df

In [6]:
# Hold out 30% of the rows as the test split; the fixed random_state keeps the
# shuffle reproducible across runs.
train, test = train_test_split(
    df,
    test_size=0.30,
    random_state=123,
)

- Scale df

In [7]:
# Numeric columns handed to the scaling helper. Per the displayed output, the
# originals are kept and a "<name>_scaled" column is added for each one.
column_list = [
    "age",
    "annual_income",
    "spending_score",
]

# NOTE(review): presumably the scaler is fit on train only and then applied to
# test -- confirm inside wm.scale_minmax.
train_scaled, test_scaled, scaler = wm.scale_minmax(
    train,
    test,
    column_list,
)
train_scaled.head()

Unnamed: 0_level_0,gender,age,annual_income,spending_score,age_scaled,annual_income_scaled,spending_score_scaled
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
9,Male,64,19,3,0.88,0.04,0.02
153,Female,44,78,20,0.5,0.57,0.19
86,Male,48,54,46,0.58,0.35,0.46
150,Male,34,78,90,0.31,0.57,0.91
177,Male,58,88,15,0.77,0.66,0.14


- Encode df

In [8]:
# One-hot encode "gender" on both splits. Per the displayed output, the helper
# appends Female and Male indicator columns and leaves the original column in place.
train_enscaled, test_enscaled = wm.one_hot_encode(
    train_scaled,
    test_scaled,
    col_name="gender",
)
train_enscaled.head()

Unnamed: 0_level_0,gender,age,annual_income,spending_score,age_scaled,annual_income_scaled,spending_score_scaled,Female,Male
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
9,Male,64,19,3,0.88,0.04,0.02,0.0,1.0
153,Female,44,78,20,0.5,0.57,0.19,1.0,0.0
86,Male,48,54,46,0.58,0.35,0.46,0.0,1.0
150,Male,34,78,90,0.31,0.57,0.91,0.0,1.0
177,Male,58,88,15,0.77,0.66,0.14,0.0,1.0


- Drop the original, unencoded `gender` column and keep the encoded `Female` and `Male` columns.

In [9]:
# Remove the raw string column from BOTH splits so train and test keep identical
# columns; previously only the train split was cleaned.
# Reassigning (rather than inplace=True) with errors="ignore" makes the cell safe
# to re-run: a second run finds no "gender" column and is a no-op, not a KeyError.
train_enscaled = train_enscaled.drop(columns="gender", errors="ignore")
test_enscaled = test_enscaled.drop(columns="gender", errors="ignore")
train_enscaled.head()

Unnamed: 0_level_0,age,annual_income,spending_score,age_scaled,annual_income_scaled,spending_score_scaled,Female,Male
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
9,64,19,3,0.88,0.04,0.02,0.0,1.0
153,44,78,20,0.5,0.57,0.19,1.0,0.0
86,48,54,46,0.58,0.35,0.46,0.0,1.0
150,34,78,90,0.31,0.57,0.91,0.0,1.0
177,58,88,15,0.77,0.66,0.14,0.0,1.0


- Handle Missing Values


    - This step is not necessary because there are no missing values in the df.