In [13]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, QuantileTransformer, PowerTransformer, RobustScaler, MinMaxScaler

from wrangle import wrangle_telco
import split_scale


# Read in Telco df using wrangle_telco() function

In [30]:
df = wrangle_telco()

In [31]:
df.head(1)

Unnamed: 0,customer_id,monthly_charges,tenure,total_charges
0,0013-SMEOE,109.7,71,7904.25


In [32]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1695 entries, 0 to 1694
Data columns (total 4 columns):
customer_id        1695 non-null object
monthly_charges    1695 non-null float64
tenure             1695 non-null int64
total_charges      1695 non-null float64
dtypes: float64(2), int64(1), object(1)
memory usage: 53.1+ KB


# Split and Scale data using Standard Scaler Function

In [17]:
def split_my_data(df, train_pct=0.70, seed=123):
    """Split ``df`` into a train part and a test part.

    Parameters
    ----------
    df : DataFrame to split.
    train_pct : passed to ``train_test_split`` as ``train_size``.
    seed : passed to ``train_test_split`` as ``random_state`` so the split
        is repeatable.

    Returns
    -------
    (train, test) as produced by ``train_test_split``.
    """
    train_part, test_part = train_test_split(
        df,
        train_size=train_pct,
        random_state=seed,
    )
    return train_part, test_part

In [18]:
train, test = split_my_data(df)

In [19]:
train.head()

Unnamed: 0,customer_id,monthly_charges,tenure,total_charges
1469,8661-BOYNW,84.4,72,6096.45
163,0960-HUWBM,104.1,65,6700.05
392,2346-LOCWC,20.5,58,1191.4
1546,9114-DPSIA,81.0,72,5750.0
797,4891-NLUBA,61.45,61,3751.15


In [20]:
test.head()

Unnamed: 0,customer_id,monthly_charges,tenure,total_charges
252,1494-EJZDW,20.15,10,220.8
632,3795-GWTRD,75.55,63,4707.85
472,2900-PHPLN,19.55,70,1462.05
1029,6211-WWLTF,99.7,63,6330.4
910,5494-WOZRZ,82.0,71,5999.85


In [21]:
print(train.shape)
print(test.shape)

(1186, 4)
(509, 4)


## Our train test split function works fine. Now we can use it in our standard_scaler function

- I wanted to make a function where I could feed in the telco df even though it has a column that is not numeric and use a Standard Scaler on the numeric columns.

- I wanted the function to return the scaler, train_scaled, and test_scaled with customer_id still intact.

- I moved the customer_id column to the index, so I can just reset the index after I scale the numeric values.

- The Standard Scaler returns arrays, so I had to convert those arrays back to dataframes.

- I had to reset and rename the index to be a customer_id column again.

In [24]:
def telco_standard_scaler(df):
    """Split the telco data and standard-scale every column except customer_id.

    ``customer_id`` is moved to the index so that only the remaining columns
    are handed to the scaler, and it is restored as a regular column on both
    outputs. The scaler is fit on the train split only and then applied to
    both splits.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``customer_id`` column. The frame is not modified.

    Returns
    -------
    scaler : StandardScaler
        The scaler fitted on the train split.
    train_scaled, test_scaled : pandas.DataFrame
        Scaled splits with ``customer_id`` as the first column and a fresh
        default row index.

    Raises
    ------
    KeyError
        If ``df`` has no ``customer_id`` column.
    """
    # Non-inplace set_index returns a new frame, so the caller's df keeps its
    # customer_id column and the function can safely be called more than once.
    indexed = df.set_index('customer_id')
    train, test = split_my_data(indexed)
    scaler = StandardScaler().fit(train)

    def _scaled_with_id(part):
        # transform() returns a bare array; rebuild a DataFrame carrying the
        # split's own column labels and customer ids.
        scaled = pd.DataFrame(scaler.transform(part),
                              columns=part.columns,
                              index=part.index)
        # Name the index explicitly so reset_index() yields a 'customer_id'
        # column rather than a generic 'index' column.
        return scaled.rename_axis('customer_id').reset_index()

    train_scaled = _scaled_with_id(train)
    test_scaled = _scaled_with_id(test)
    return scaler, train_scaled, test_scaled

In [25]:
# test the function

scaler, train_scaled, test_scaled = split_scale.telco_standard_scaler(df)

In [26]:
# validate that it can return the scaler if I need to revert to unscaled

scaler

StandardScaler(copy=True, with_mean=True, with_std=True)

In [27]:
# validate train_scaled

train_scaled

Unnamed: 0,customer_id,monthly_charges,tenure,total_charges
0,8661-BOYNW,0.672591,0.838946,0.910993
1,0960-HUWBM,1.237494,0.441745,1.144459
2,2346-LOCWC,-1.159761,0.044543,-0.986222
3,9114-DPSIA,0.575095,0.838946,0.776991
4,4891-NLUBA,0.014492,0.214772,0.003859
...,...,...,...,...
1181,6701-YVNQG,0.795894,0.838946,0.990382
1182,7996-MHXLW,-1.026421,0.498488,-0.795845
1183,8242-PDSGJ,-0.624967,0.725460,-0.405518
1184,8200-LGKSR,0.638180,0.782203,0.922462


In [28]:
# validate test_scaled

test_scaled

Unnamed: 0,customer_id,monthly_charges,tenure,total_charges
0,1494-EJZDW,-1.169798,-2.679126,-1.361639
1,3795-GWTRD,0.418814,0.328258,0.373899
2,2900-PHPLN,-1.187003,0.725460,-0.881538
3,6211-WWLTF,1.111323,0.328258,1.001483
4,5494-WOZRZ,0.603770,0.782203,0.873630
...,...,...,...,...
504,4701-LKOZD,0.826004,0.725460,1.015716
505,9488-FVZCC,0.532082,0.838946,0.742837
506,5727-MYATE,0.856113,0.838946,1.027475
507,2253-KPMNB,-0.421372,0.668717,-0.239721


### Pretty, pretty scaled data...

# Validate my telco_standard_scaler() function in validation notebook

# If you only had numeric columns, this function works.

In [37]:
df = wrangle_telco()

In [38]:
numerics = df.drop(columns='customer_id')
numerics.head()

Unnamed: 0,monthly_charges,tenure,total_charges
0,109.7,71,7904.25
1,84.65,63,5377.8
2,90.45,65,5957.9
3,45.2,54,2460.55
4,116.8,72,8456.75


In [39]:
def standard_scaler(df):
    """Split a DataFrame and apply a StandardScaler to both splits.

    The data is split with split_my_data(); the scaler is fit on the train
    split only and then used to transform train and test.

    Returns the fitted scaler followed by the scaled train and test
    DataFrames, which keep the column names and index values of the
    corresponding split.
    """
    train, test = split_my_data(df)
    scaler = StandardScaler()
    scaler.fit(train)

    def _to_frame(part):
        # transform() hands back an array, so rebuild a DataFrame using the
        # split's own column labels and row labels.
        frame = pd.DataFrame(scaler.transform(part),
                             columns=part.columns.values)
        return frame.set_index([part.index.values])

    train_scaled = _to_frame(train)
    test_scaled = _to_frame(test)
    return scaler, train_scaled, test_scaled

In [40]:
scaler, train_scaled, test_scaled = standard_scaler(numerics)

In [41]:
scaler

StandardScaler(copy=True, with_mean=True, with_std=True)

In [42]:
train_scaled

Unnamed: 0,monthly_charges,tenure,total_charges
1469,0.672591,0.838946,0.910993
163,1.237494,0.441745,1.144459
392,-1.159761,0.044543,-0.986222
1546,0.575095,0.838946,0.776991
797,0.014492,0.214772,0.003859
...,...,...,...
1122,0.795894,0.838946,0.990382
1346,-1.026421,0.498488,-0.795845
1406,-0.624967,0.725460,-0.405518
1389,0.638180,0.782203,0.922462


In [43]:
test_scaled

Unnamed: 0,monthly_charges,tenure,total_charges
252,-1.169798,-2.679126,-1.361639
632,0.418814,0.328258,0.373899
472,-1.187003,0.725460,-0.881538
1029,1.111323,0.328258,1.001483
910,0.603770,0.782203,0.873630
...,...,...,...
759,0.826004,0.725460,1.015716
1603,0.532082,0.838946,0.742837
944,0.856113,0.838946,1.027475
375,-0.421372,0.668717,-0.239721


## Validate that my standard_scaler() function works in validation notebook

### Could I reattach my scaled numeric df to my original df?

In [59]:
train_all = train.merge(train_scaled, how='inner', on=None, left_index=True, right_index=True)
train_all.columns

Index(['customer_id', 'monthly_charges_x', 'tenure_x', 'total_charges_x',
       'monthly_charges_y', 'tenure_y', 'total_charges_y'],
      dtype='object')

In [60]:
train_all.columns = ['customer_id', 'monthly_charges', 'tenure', 'total_charges',
       'monthly_charges_scaled', 'tenure_scaled', 'total_charges_scaled']

In [61]:
train_all.head()

Unnamed: 0,customer_id,monthly_charges,tenure,total_charges,monthly_charges_scaled,tenure_scaled,total_charges_scaled
1469,8661-BOYNW,84.4,72,6096.45,0.672591,0.838946,0.910993
163,0960-HUWBM,104.1,65,6700.05,1.237494,0.441745,1.144459
392,2346-LOCWC,20.5,58,1191.4,-1.159761,0.044543,-0.986222
1546,9114-DPSIA,81.0,72,5750.0,0.575095,0.838946,0.776991
797,4891-NLUBA,61.45,61,3751.15,0.014492,0.214772,0.003859


### Looks like I can by merging my train and train_scaled on their shared index.

# Yes!! Now let's do the inverse!

In [51]:
def scale_inverse(scaler, train_scaled, test_scaled):
    """Undo a scaling step on the train and test DataFrames.

    Parameters
    ----------
    scaler : object exposing ``inverse_transform``; it is applied to each
        DataFrame in turn (train first, then test).
    train_scaled, test_scaled : scaled DataFrames to convert back.

    Returns
    -------
    (train_unscaled, test_unscaled) : DataFrames holding the inverse
    transformed values, with the column names and index values of the
    corresponding input.
    """
    def _restore(scaled_frame):
        # inverse_transform's result is wrapped back into a DataFrame that
        # reuses the input's column labels and row labels.
        values = scaler.inverse_transform(scaled_frame)
        restored = pd.DataFrame(values, columns=scaled_frame.columns.values)
        return restored.set_index([scaled_frame.index.values])

    train_unscaled = _restore(train_scaled)
    test_unscaled = _restore(test_scaled)
    return train_unscaled, test_unscaled

In [52]:
train_unscaled, test_unscaled = scale_inverse(scaler, train_scaled, test_scaled)

In [53]:
# Validate our DFs are unscaled

train_unscaled.head()

Unnamed: 0,monthly_charges,tenure,total_charges
1469,84.4,72.0,6096.45
163,104.1,65.0,6700.05
392,20.5,58.0,1191.4
1546,81.0,72.0,5750.0
797,61.45,61.0,3751.15


In [54]:
test_unscaled.head()

Unnamed: 0,monthly_charges,tenure,total_charges
252,20.15,10.0,220.8
632,75.55,63.0,4707.85
472,19.55,70.0,1462.05
1029,99.7,63.0,6330.4
910,82.0,71.0,5999.85


## Validate that my scale_inverse() function works in the validation notebook

# Split and Scale df Using uniform_scaler() function

- Now that we have seen one way to keep our non-numeric column in the df, I'm going to write the rest of my functions to work only on numeric DataFrames.

- If I want to use the functions on my telco df with customer_id in it, I will set the non-numeric column to be the index, do all of my splitting and scaling, and then when I need to, I can reset the index.

In [65]:
df.set_index('customer_id', inplace=True)

In [66]:
df.head()

Unnamed: 0_level_0,monthly_charges,tenure,total_charges
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0013-SMEOE,109.7,71,7904.25
0014-BMAQU,84.65,63,5377.8
0016-QLJIS,90.45,65,5957.9
0017-DINOC,45.2,54,2460.55
0017-IUDMW,116.8,72,8456.75


In [68]:
def uniform_scaler(df):
    """Split a DataFrame and apply a uniform QuantileTransformer to both splits.

    The data is split with split_my_data(); a QuantileTransformer
    (n_quantiles=100, output_distribution='uniform', random_state=123) is
    fit on the train split only and then used to transform train and test.

    Returns the fitted scaler followed by the scaled train and test
    DataFrames, which keep the column names and index values of the
    corresponding split.
    """
    train, test = split_my_data(df)

    scaler = QuantileTransformer(
        n_quantiles=100,
        output_distribution='uniform',
        random_state=123,
        copy=True,
    )
    scaler.fit(train)

    # Rebuild DataFrames from the transformed values, reusing each split's
    # own column labels and row labels.
    train_values = scaler.transform(train)
    train_scaled = pd.DataFrame(train_values, columns=train.columns.values)
    train_scaled = train_scaled.set_index([train.index.values])

    test_values = scaler.transform(test)
    test_scaled = pd.DataFrame(test_values, columns=test.columns.values)
    test_scaled = test_scaled.set_index([test.index.values])

    return scaler, train_scaled, test_scaled

In [69]:
scaler, train_scaled, test_scaled = uniform_scaler(df)

In [70]:
scaler

QuantileTransformer(copy=True, ignore_implicit_zeros=False, n_quantiles=100,
                    output_distribution='uniform', random_state=123,
                    subsample=100000)

In [71]:
train_scaled.head()

Unnamed: 0,monthly_charges,tenure,total_charges
8661-BOYNW,0.669192,1.0,0.752846
0960-HUWBM,0.83937,0.510101,0.828438
2346-LOCWC,0.212121,0.373737,0.221889
9114-DPSIA,0.641389,1.0,0.712854
4891-NLUBA,0.479454,0.424242,0.507107


In [73]:
test_scaled.head()

Unnamed: 0,monthly_charges,tenure,total_charges
1494-EJZDW,0.171717,0.025253,0.025422
3795-GWTRD,0.580226,0.469697,0.60769
2900-PHPLN,0.060606,0.691919,0.313386
6211-WWLTF,0.810763,0.469697,0.786201
5494-WOZRZ,0.652346,0.757576,0.740894


## Validate my uniform_scaler() function in my Validation notebook

# Split and Scale my df using the gaussian_scaler() function

In [74]:
df.head(1)

Unnamed: 0_level_0,monthly_charges,tenure,total_charges
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0013-SMEOE,109.7,71,7904.25


In [75]:
def gaussian_scaler(df):
    """Split a DataFrame and apply a Yeo-Johnson PowerTransformer to both splits.

    The data is split with split_my_data(); a PowerTransformer
    (method='yeo-johnson', standardize=False) is fit on the train split only
    and then used to transform train and test.

    Returns the fitted scaler followed by the scaled train and test
    DataFrames, which keep the column names and index values of the
    corresponding split.
    """
    train, test = split_my_data(df)

    scaler = PowerTransformer(method='yeo-johnson',
                              standardize=False,
                              copy=True)
    scaler.fit(train)

    # Same rebuild for both splits (train first, then test): wrap the
    # transformed values in a DataFrame with the split's columns and index.
    train_scaled, test_scaled = [
        pd.DataFrame(scaler.transform(part), columns=part.columns.values)
        .set_index([part.index.values])
        for part in (train, test)
    ]
    return scaler, train_scaled, test_scaled

In [77]:
scaler, train_scaled, test_scaled = gaussian_scaler(df)

In [78]:
scaler

PowerTransformer(copy=True, method='yeo-johnson', standardize=False)

In [79]:
train_scaled.head()

Unnamed: 0,monthly_charges,tenure,total_charges
8661-BOYNW,11.717472,9950.70254,120.161288
0960-HUWBM,12.902125,7856.438822,125.628091
2346-LOCWC,5.840528,6040.584127,55.281773
9114-DPSIA,11.496409,9950.70254,116.891462
4891-NLUBA,10.099975,6785.375569,95.525468


In [80]:
test_scaled.head()

Unnamed: 0,monthly_charges,tenure,total_charges
1494-EJZDW,5.7876,117.379965,24.196746
3795-GWTRD,11.129928,7309.661082,106.361697
2900-PHPLN,5.695594,9323.346607,60.990392
6211-WWLTF,12.650286,7309.661082,122.313217
5494-WOZRZ,11.562005,9634.096705,119.259823


## Validate my gaussian_scaler() function in my Validation notebook

# Split and Scale my df using the min_max_scaler() function

In [81]:
df.head(1)

Unnamed: 0_level_0,monthly_charges,tenure,total_charges
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0013-SMEOE,109.7,71,7904.25


In [82]:
def min_max_scaler(df):
    """Split a DataFrame and apply a MinMaxScaler to both splits.

    The data is split with split_my_data(); a MinMaxScaler with
    feature_range=(0, 1) is fit on the train split only and then used to
    transform train and test.

    Returns the fitted scaler followed by the scaled train and test
    DataFrames, which keep the column names and index values of the
    corresponding split.
    """
    train, test = split_my_data(df)

    scaler = MinMaxScaler(copy=True, feature_range=(0,1))
    scaler.fit(train)

    def _rescale(split):
        # Wrap the transformed values in a DataFrame labelled like the split.
        scaled_values = scaler.transform(split)
        labelled = pd.DataFrame(scaled_values, columns=split.columns.values)
        return labelled.set_index([split.index.values])

    train_scaled = _rescale(train)
    test_scaled = _rescale(test)
    return scaler, train_scaled, test_scaled

In [83]:
scaler, train_scaled, test_scaled = min_max_scaler(df)

In [84]:
scaler

MinMaxScaler(copy=True, feature_range=(0, 1))

In [85]:
train_scaled.head()

Unnamed: 0,monthly_charges,tenure,total_charges
8661-BOYNW,0.657698,1.0,0.702286
0960-HUWBM,0.854011,0.901408,0.772045
2346-LOCWC,0.020927,0.802817,0.135399
9114-DPSIA,0.623817,1.0,0.662246
4891-NLUBA,0.428999,0.84507,0.431235


In [86]:
test_scaled.head()

Unnamed: 0,monthly_charges,tenure,total_charges
1494-EJZDW,0.017439,0.126761,0.023224
3795-GWTRD,0.569507,0.873239,0.541802
2900-PHPLN,0.01146,0.971831,0.166678
6211-WWLTF,0.810164,0.873239,0.729324
5494-WOZRZ,0.633782,0.985915,0.691122


## Validate my min_max_scaler() function in my Validation notebook

# Split and Scale my df using the iqr_robust_scaler() function

In [89]:
df.head(1)

Unnamed: 0_level_0,monthly_charges,tenure,total_charges
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0013-SMEOE,109.7,71,7904.25


In [87]:
def iqr_robust_scaler(df):
    """Split a DataFrame and apply a RobustScaler to both splits.

    The data is split with split_my_data(); a RobustScaler with
    quantile_range=(25.0, 75.0), centering and scaling enabled, is fit on
    the train split only and then used to transform train and test.

    Returns the fitted scaler followed by the scaled train and test
    DataFrames, which keep the column names and index values of the
    corresponding split.
    """
    train, test = split_my_data(df)

    scaler = RobustScaler(
        quantile_range=(25.0,75.0),
        copy=True,
        with_centering=True,
        with_scaling=True,
    )
    scaler.fit(train)

    # Rebuild each split as a DataFrame from the transformed values, reusing
    # the split's own column labels and row labels.
    train_array = scaler.transform(train)
    train_scaled = pd.DataFrame(train_array, columns=train.columns.values)
    train_scaled = train_scaled.set_index([train.index.values])

    test_array = scaler.transform(test)
    test_scaled = pd.DataFrame(test_array, columns=test.columns.values)
    test_scaled = test_scaled.set_index([test.index.values])

    return scaler, train_scaled, test_scaled

In [90]:
scaler, train_scaled, test_scaled = iqr_robust_scaler(df)

In [91]:
scaler

RobustScaler(copy=True, quantile_range=(25.0, 75.0), with_centering=True,
             with_scaling=True)

In [92]:
train_scaled.head()

Unnamed: 0,monthly_charges,tenure,total_charges
8661-BOYNW,0.298318,0.363636,0.513837
0960-HUWBM,0.592897,0.045455,0.639483
2346-LOCWC,-0.657196,-0.272727,-0.507197
9114-DPSIA,0.247477,0.363636,0.44172
4891-NLUBA,-0.04486,-0.136364,0.02564


In [93]:
test_scaled.head()

Unnamed: 0,monthly_charges,tenure,total_charges
1494-EJZDW,-0.66243,-2.454545,-0.709237
3795-GWTRD,0.165981,-0.045455,0.224787
2900-PHPLN,-0.671402,0.272727,-0.450859
6211-WWLTF,0.527103,-0.045455,0.562536
5494-WOZRZ,0.26243,0.318182,0.493729


## Validate my iqr_robust_scaler() function in my Validation notebook

# Boom! Ready for Data Science!