### Exercises

Our scenario continues:

As a customer analyst, I want to know who has spent the most money with us over their lifetime. I have monthly charges and tenure, so I think I will be able to use those two attributes as features to estimate total_charges. I need to do this within an average of $5.00 per customer.

Create split_scale.py that will contain the functions that follow. Each scaler function should create the object, fit and transform both train and test. They should return the scaler, train dataframe scaled, test dataframe scaled. Be sure your indices represent the original indices from train/test, as those represent the indices from the original dataframe. Be sure to set a random state where applicable for reproducibility!

        split_my_data(X, y, train_pct)

        standard_scaler()

        scale_inverse()

        uniform_scaler()

        gaussian_scaler()

        min_max_scaler()

        iqr_robust_scaler()

In [1]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from pydataset import data
import wrangle
import util

#### Class Examples and Explanation

X: independent variables   uppercase X means it's a df, a set of variables

y: target                  lowercase y means there is one target variable

You can pass an array, you can pass a dataframe, you can pass multiple dataframes

In [2]:
from sklearn.model_selection import train_test_split

y = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
y

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [3]:
train_test_split(y, random_state=123)

[array([5, 8, 3, 1, 6, 9, 2]), array([4, 0, 7])]

In [4]:
# using mpg data
# just two dataframes with all info together

### train_df, test_df = train_test_split(df)

# breaking the two dataframes, train and test, into four

### x_train = train_df.drop(columns="hwy")
### y_train = train_df.hwy
### x_test = test_df.drop(columns="hwy")
### y_test = test_df.hwy

#### Split My Data

In [5]:
telco = wrangle.wrangle_telco()
telco.head()

Unnamed: 0,customer_id,monthly_charges,total_charges,tenure
0,0013-SMEOE,109.7,7904.25,71
1,0014-BMAQU,84.65,5377.8,63
2,0016-QLJIS,90.45,5957.9,65
3,0017-DINOC,45.2,2460.55,54
4,0017-IUDMW,116.8,8456.75,72


In [6]:
telco.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1685 entries, 0 to 1694
Data columns (total 4 columns):
customer_id        1685 non-null object
monthly_charges    1685 non-null float64
total_charges      1685 non-null float64
tenure             1685 non-null int64
dtypes: float64(2), int64(1), object(1)
memory usage: 65.8+ KB


In [7]:
telco.describe()

Unnamed: 0,monthly_charges,total_charges,tenure
count,1685.0,1685.0,1685.0
mean,60.872374,3728.933947,57.07181
std,34.71221,2571.252806,17.72913
min,18.4,20.35,1.0
25%,24.05,1278.8,48.0
50%,64.45,3623.95,64.0
75%,90.55,5999.85,71.0
max,118.75,8672.45,72.0


In [8]:
# df[[]] = 1D dataframe
telco_y = telco[["total_charges"]]
telco_y.head()

# x = df.drop(columns="total_charges")
# y = df.total_charges

Unnamed: 0,total_charges
0,7904.25
1,5377.8
2,5957.9
3,2460.55
4,8456.75


In [9]:
telco_X = telco[["monthly_charges", "tenure"]]
telco_X.head()

Unnamed: 0,monthly_charges,tenure
0,109.7,71
1,84.65,63
2,90.45,65
3,45.2,54
4,116.8,72


In [10]:
train_X, test_X, train_y, test_y = train_test_split(telco_X, telco_y, train_size=.80, random_state=123)


In [11]:
# split the data into four sets for x and y, train and test
from sklearn.model_selection import train_test_split

def split_my_data(X, y, train_pct):
    train_x, test_x, train_y, test_y = train_test_split(telco_x, telco_y, train_size=.80, random_state=123)
    return pd.DataFrame(train_X), pd.DataFrame(test_X), pd.DataFrame(train_y), pd.DataFrame(test_y)

type(train_X)


pandas.core.frame.DataFrame

#### Scale My Data

In [12]:
from sklearn.preprocessing import StandardScaler, QuantileTransformer, PowerTransformer, RobustScaler, MinMaxScaler


In [21]:
# Create scaler object function
def standard_scaler(train):
    scaler_object = StandardScaler(copy=True, with_mean=True, with_std=True).fit(train)
    return scaler_object

In [66]:
# Create Uniform scaler object
def uniform_scaler(train):
    scaler_object = QuantileTransformer(n_quantiles=100, output_distribution='uniform', random_state=123, copy=True).fit(train).fit(train)
    return scaler_object

In [69]:
# function to transform
def apply_scaler(df, scaler_obj):
    scaled_df = pd.DataFrame(scaler_obj.transform(df))
    return scaled_df

In [70]:

train_X_scaled = apply_scaler(train_X, scaler_obj_X)

In [71]:
# scale_inverse()
# train_unscaled = pd.DataFrame(scaler.inverse_transform(train_scaled), columns=train_scaled.columns.values).set_index([train.index.values])

def scale_inverse(scaled, scaler_obj):
    unscaled_df = pd.DataFrame(scaler_obj.inverse_transform(scaled), columns=scaled.columns.values).set_index([scaled.index.values])
    return unscaled_df

In [43]:
scale_inverse(train_X_scaled, scaler_obj_X)

Unnamed: 0,0,1
0,75.50,70.0
1,20.30,55.0
2,109.05,65.0
3,98.30,70.0
4,116.25,71.0
5,111.75,68.0
6,57.50,57.0
7,39.70,50.0
8,107.60,62.0
9,70.80,68.0


In [44]:
# gaussian_scaler()
def gaussian_scaler(train):
    scaler = PowerTransformer(method='yeo-johnson', standardize=False, copy=True).fit(train)
    return scaler

In [45]:
gaussian_scaler(train_X)

PowerTransformer(copy=True, method='yeo-johnson', standardize=False)

In [48]:
# min_max_scaler()
def min_max_scaler(train):
    scaler = MinMaxScaler(copy=True, feature_range=(0,1)).fit(train)
    return scaler

In [49]:
min_max_scaler(train_X)

MinMaxScaler(copy=True, feature_range=(0, 1))

In [None]:
# iqr_robust_scaler()

In [50]:
def iqr_robust_scaler(train):
    scaler = RobustScaler(quantile_range=(25.0,75.0), copy=True, with_centering=True, with_scaling=True).fit(train)
    return scaler

In [51]:
iqr_robust_scaler(train_X)

RobustScaler(copy=True, quantile_range=(25.0, 75.0), with_centering=True,
             with_scaling=True)

In [68]:
scaler_X = standard_scaler(train_X)
scaler_y = standard_scaler(train_y)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [61]:
scaled_train_X = apply_scaler(train_X, scaler_X)
scaled_train_X

Unnamed: 0,0,1
0,0.419607,0.729412
1,-1.169158,-0.130571
2,1.385242,0.442751
3,1.075836,0.729412
4,1.592472,0.786745
5,1.462954,0.614748
6,-0.098468,-0.015907
7,-0.610787,-0.417232
8,1.343508,0.270754
9,0.284332,0.614748


In [62]:
unscaled_train_X = scale_inverse(scaled_train_X, scaler_X)
unscaled_train_X

Unnamed: 0,0,1
0,75.50,70.0
1,20.30,55.0
2,109.05,65.0
3,98.30,70.0
4,116.25,71.0
5,111.75,68.0
6,57.50,57.0
7,39.70,50.0
8,107.60,62.0
9,70.80,68.0


In [67]:
uni_scaler_X = uniform_scaler(train_X)
uni_scaler_X

QuantileTransformer(copy=True, ignore_implicit_zeros=False, n_quantiles=100,
                    output_distribution='uniform', random_state=123,
                    subsample=100000)

In [72]:
gaussian_scaler(train_X)

PowerTransformer(copy=True, method='yeo-johnson', standardize=False)

In [73]:
min_max_scaler(train_X)

MinMaxScaler(copy=True, feature_range=(0, 1))

In [74]:
iqr_robust_scaler(train_X)

RobustScaler(copy=True, quantile_range=(25.0, 75.0), with_centering=True,
             with_scaling=True)