In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, QuantileTransformer, PowerTransformer, RobustScaler, MinMaxScaler

from wrangle import wrangle_telco
import split_scale

# Read in Telco df using wrangle_telco() function

In [2]:
df = wrangle_telco()

In [3]:
df.head(1)

Unnamed: 0,customer_id,monthly_charges,tenure,total_charges
0,0013-SMEOE,109.7,71,7904.25


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1695 entries, 0 to 1694
Data columns (total 4 columns):
customer_id        1695 non-null object
monthly_charges    1695 non-null float64
tenure             1695 non-null int64
total_charges      1695 non-null float64
dtypes: float64(2), int64(1), object(1)
memory usage: 53.1+ KB


# Split and Scale data using Standard Scaler Function

In [None]:
def split_my_data(df, train_pct=0.70, seed=123):
    """
    Split a dataframe into a train part and a test part.

    train_pct is forwarded to train_test_split as train_size and seed as
    random_state, so the same seed always produces the same split.

    Returns the tuple (train, test).
    """
    pieces = train_test_split(df, train_size=train_pct, random_state=seed)
    train_part, test_part = pieces
    return train_part, test_part

In [None]:
# split_my_data() has only been defined so far; call it so that `train` and
# `test` actually exist in the notebook namespace before inspecting them
train, test = split_my_data(df)
train.shape

In [None]:
test.shape

## Our train/test split function works fine. Now we can use it in our telco_standard_scaler function

- I wanted to make a function where I could feed in the telco df even though it has a column that is not numeric and use a Standard Scaler on the numeric columns.

- I wanted the function to return the scaler, train_scaled, and test_scaled with customer_id still intact.

- I moved the customer_id column to the index, so I can just reset the index after I scale the numeric values.

- The Standard Scaler's transform step returns NumPy arrays, so I had to convert those arrays back to dataframes.

- I had to reset and rename the index to be a customer_id column again.

In [None]:
def telco_standard_scaler(df):
    """
    Split the telco data into train and test sets and standard-scale every
    column except customer_id, which stays attached to each row.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'customer_id' column. Every other column is handed to
        StandardScaler, so it has to be numeric.

    Returns
    -------
    scaler : StandardScaler
        Fitted on the train split only; keep it to transform new data or to
        revert scaled values with inverse_transform.
    train_scaled, test_scaled : pandas.DataFrame
        Scaled splits with 'customer_id' as the first column and a fresh
        0..n-1 index.

    Raises
    ------
    KeyError
        If df has no 'customer_id' column.

    The input frame is not modified, so the function can be called more than
    once on the same df.
    """
    # set_index without inplace returns a new frame; the caller's df keeps its
    # customer_id column instead of silently losing it to the index
    indexed = df.set_index('customer_id')
    train, test = split_my_data(indexed)
    scaler = StandardScaler().fit(train)

    def _scaled_frame(part):
        # transform() hands back a bare array: rebuild a frame with the
        # original labels, then move customer_id from the index to a column
        scaled = pd.DataFrame(scaler.transform(part),
                              columns=part.columns,
                              index=part.index)
        return scaled.reset_index()

    return scaler, _scaled_frame(train), _scaled_frame(test)

In [None]:
# test the function

# telco_standard_scaler() is the function defined above that copes with the
# non-numeric customer_id column; pass a copy so `df` still has its
# customer_id column for the cells further down
scaler, train_scaled, test_scaled = telco_standard_scaler(df.copy())

In [None]:
# validate that it can return the scaler if I need to revert to unscaled

scaler

In [None]:
# validate train_scaled

# preview the first rows rather than rendering the whole frame
train_scaled.head()

In [None]:
# validate test_scaled

# preview the first rows rather than rendering the whole frame
test_scaled.head()

# Make sure I can call my telco_standard_scaler() function from the split_scale module

In [None]:
scaler, train_scaled, test_scaled = split_scale.telco_standard_scaler(df)

In [None]:
scaler

In [None]:
# preview the first rows rather than rendering the whole frame
train_scaled.head()

In [None]:
# preview the first rows rather than rendering the whole frame
test_scaled.head()

# Pretty, pretty scaled data...

# If you only have numeric columns, the simpler standard_scaler() function below works.

In [None]:
df = wrangle_telco()

In [5]:
# drop the identifier so that only the numeric columns remain
numerics = df.drop('customer_id', axis=1)
numerics.head()

Unnamed: 0,monthly_charges,tenure,total_charges
0,109.7,71,7904.25
1,84.65,63,5377.8
2,90.45,65,5957.9
3,45.2,54,2460.55
4,116.8,72,8456.75


In [None]:
def standard_scaler(df):
    """
    Split an all-numeric dataframe into train and test sets and apply a
    StandardScaler that is fitted on the train set only.

    Returns (scaler, train_scaled, test_scaled). Both scaled results are
    dataframes that carry the column labels and index values of the split
    they were computed from.
    """
    train, test = split_my_data(df)

    scaler = StandardScaler()
    scaler.fit(train)

    # transform() yields plain arrays, so wrap each one back into a frame
    # labelled like the split it came from
    train_scaled, test_scaled = [
        pd.DataFrame(scaler.transform(part),
                     index=part.index.values,
                     columns=part.columns.values)
        for part in (train, test)
    ]
    return scaler, train_scaled, test_scaled

In [None]:
scaler, train_scaled, test_scaled = standard_scaler(numerics)

In [None]:
scaler

In [None]:
# preview the first rows rather than rendering the whole frame
train_scaled.head()

In [None]:
# preview the first rows rather than rendering the whole frame
test_scaled.head()

## Validate that my standard_scaler() function works from split_scale.py module

In [7]:
scaler, train_scaled, test_scaled = split_scale.standard_scaler(numerics)

In [8]:
scaler

StandardScaler(copy=True, with_mean=True, with_std=True)

In [10]:
train_scaled.head()

Unnamed: 0,monthly_charges,tenure,total_charges
1469,0.672591,0.838946,0.910993
163,1.237494,0.441745,1.144459
392,-1.159761,0.044543,-0.986222
1546,0.575095,0.838946,0.776991
797,0.014492,0.214772,0.003859


In [11]:
test_scaled.head()

Unnamed: 0,monthly_charges,tenure,total_charges
252,-1.169798,-2.679126,-1.361639
632,0.418814,0.328258,0.373899
472,-1.187003,0.72546,-0.881538
1029,1.111323,0.328258,1.001483
910,0.60377,0.782203,0.87363


# Yes!! The rest should go way faster!