In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, QuantileTransformer, PowerTransformer, RobustScaler, MinMaxScaler

from wrangle import wrangle_telco
import split_scale

In [2]:
df = wrangle_telco()
df.head(1)

Unnamed: 0,customer_id,monthly_charges,tenure,total_charges
0,0013-SMEOE,109.7,71,7904.25


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1695 entries, 0 to 1694
Data columns (total 4 columns):
customer_id        1695 non-null object
monthly_charges    1695 non-null float64
tenure             1695 non-null int64
total_charges      1695 non-null float64
dtypes: float64(2), int64(1), object(1)
memory usage: 53.1+ KB


# Validate split_my_data()

In [4]:
train, test = split_scale.split_my_data(df)

# Validate the split with explicit invariants instead of eyeballing shapes:
# every row of df must land in exactly one partition.
assert len(train) + len(test) == len(df), "train/test sizes do not add up to len(df)"
assert train.index.intersection(test.index).empty, "train and test share index labels"

In [5]:
train.head(1)

Unnamed: 0,customer_id,monthly_charges,tenure,total_charges
1469,8661-BOYNW,84.4,72,6096.45


In [6]:
train.shape

(1186, 4)

In [7]:
test.head(1)

Unnamed: 0,customer_id,monthly_charges,tenure,total_charges
252,1494-EJZDW,20.15,10,220.8


In [8]:
test.shape

(509, 4)

# Validate telco_standard_scaler()

In [9]:
scaler, train_scaled, test_scaled = split_scale.telco_standard_scaler(df)

In [10]:
scaler

StandardScaler(copy=True, with_mean=True, with_std=True)

In [11]:
train_scaled.head()

Unnamed: 0,customer_id,monthly_charges,tenure,total_charges
0,8661-BOYNW,0.672591,0.838946,0.910993
1,0960-HUWBM,1.237494,0.441745,1.144459
2,2346-LOCWC,-1.159761,0.044543,-0.986222
3,9114-DPSIA,0.575095,0.838946,0.776991
4,4891-NLUBA,0.014492,0.214772,0.003859


In [12]:
test_scaled.head()

Unnamed: 0,customer_id,monthly_charges,tenure,total_charges
0,1494-EJZDW,-1.169798,-2.679126,-1.361639
1,3795-GWTRD,0.418814,0.328258,0.373899
2,2900-PHPLN,-1.187003,0.72546,-0.881538
3,6211-WWLTF,1.111323,0.328258,1.001483
4,5494-WOZRZ,0.60377,0.782203,0.87363


# Validate my standard_scaler() for numerics only

In [13]:
# Start again from a freshly wrangled frame and keep everything except the
# customer_id column, so only the numeric columns are passed to the scaler.
df = wrangle_telco()
numerics = df.drop('customer_id', axis=1)
numerics.head()

Unnamed: 0,monthly_charges,tenure,total_charges
0,109.7,71,7904.25
1,84.65,63,5377.8
2,90.45,65,5957.9
3,45.2,54,2460.55
4,116.8,72,8456.75


In [14]:
scaler, train_scaled, test_scaled = split_scale.standard_scaler(numerics)

In [15]:
scaler

StandardScaler(copy=True, with_mean=True, with_std=True)

In [16]:
train_scaled.head()

Unnamed: 0,monthly_charges,tenure,total_charges
1469,0.672591,0.838946,0.910993
163,1.237494,0.441745,1.144459
392,-1.159761,0.044543,-0.986222
1546,0.575095,0.838946,0.776991
797,0.014492,0.214772,0.003859


In [17]:
test_scaled.head()

Unnamed: 0,monthly_charges,tenure,total_charges
252,-1.169798,-2.679126,-1.361639
632,0.418814,0.328258,0.373899
472,-1.187003,0.72546,-0.881538
1029,1.111323,0.328258,1.001483
910,0.60377,0.782203,0.87363


# Validate my scale_inverse() function

In [18]:
train_unscaled, test_unscaled = split_scale.scale_inverse(scaler, train_scaled, test_scaled)

# Round-trip check: undoing the scaling must reproduce the original rows of
# `numerics` (matched by index label) up to floating-point error.
np.testing.assert_allclose(train_unscaled.values, numerics.loc[train_unscaled.index].values)
np.testing.assert_allclose(test_unscaled.values, numerics.loc[test_unscaled.index].values)

In [19]:
train_unscaled.head()

Unnamed: 0,monthly_charges,tenure,total_charges
1469,84.4,72.0,6096.45
163,104.1,65.0,6700.05
392,20.5,58.0,1191.4
1546,81.0,72.0,5750.0
797,61.45,61.0,3751.15


In [20]:
test_unscaled.head()

Unnamed: 0,monthly_charges,tenure,total_charges
252,20.15,10.0,220.8
632,75.55,63.0,4707.85
472,19.55,70.0,1462.05
1029,99.7,63.0,6330.4
910,82.0,71.0,5999.85


# Validate my telco_scale_inverse() function

- Use this version when I want the returned DataFrames to keep the non-numeric `customer_id` column.

In [21]:
df = wrangle_telco()

In [22]:
scaler, train_scaled, test_scaled = split_scale.telco_standard_scaler(df)

In [23]:
train_unscaled, test_unscaled = split_scale.telco_scale_inverse(scaler, train_scaled, test_scaled)

In [24]:
train_unscaled.head()

Unnamed: 0,customer_id,monthly_charges,tenure,total_charges
0,8661-BOYNW,84.4,72.0,6096.45
1,0960-HUWBM,104.1,65.0,6700.05
2,2346-LOCWC,20.5,58.0,1191.4
3,9114-DPSIA,81.0,72.0,5750.0
4,4891-NLUBA,61.45,61.0,3751.15


In [25]:
test_unscaled.head()

Unnamed: 0,customer_id,monthly_charges,tenure,total_charges
0,1494-EJZDW,20.15,10.0,220.8
1,3795-GWTRD,75.55,63.0,4707.85
2,2900-PHPLN,19.55,70.0,1462.05
3,6211-WWLTF,99.7,63.0,6330.4
4,5494-WOZRZ,82.0,71.0,5999.85


# Validate my uniform_scaler() function

In [26]:
df = wrangle_telco()

In [27]:
# Move customer_id into the index so only numeric columns remain as data.
# Reassign instead of mutating in place, and guard on the column so that
# re-running this cell is a no-op rather than a KeyError.
if 'customer_id' in df.columns:
    df = df.set_index('customer_id')

In [28]:
scaler, train_scaled, test_scaled = split_scale.uniform_scaler(df)

In [29]:
scaler

QuantileTransformer(copy=True, ignore_implicit_zeros=False, n_quantiles=100,
                    output_distribution='uniform', random_state=123,
                    subsample=100000)

In [30]:
train_scaled.head()

Unnamed: 0,monthly_charges,tenure,total_charges
8661-BOYNW,0.669192,1.0,0.752846
0960-HUWBM,0.83937,0.510101,0.828438
2346-LOCWC,0.212121,0.373737,0.221889
9114-DPSIA,0.641389,1.0,0.712854
4891-NLUBA,0.479454,0.424242,0.507107


In [31]:
test_scaled.head()

Unnamed: 0,monthly_charges,tenure,total_charges
1494-EJZDW,0.171717,0.025253,0.025422
3795-GWTRD,0.580226,0.469697,0.60769
2900-PHPLN,0.060606,0.691919,0.313386
6211-WWLTF,0.810763,0.469697,0.786201
5494-WOZRZ,0.652346,0.757576,0.740894


### I can peek at the distributions of train_scaled and test_scaled if I want.

In [32]:
train_scaled.describe()

Unnamed: 0,monthly_charges,tenure,total_charges
count,1186.0,1186.0,1186.0
mean,0.500018,0.519964,0.500005
std,0.288686,0.317644,0.28895
min,0.0,0.0,0.0
25%,0.251865,0.247475,0.249235
50%,0.499481,0.489899,0.497496
75%,0.748782,0.757576,0.750382
max,1.0,1.0,1.0


In [33]:
test_scaled.describe()

Unnamed: 0,monthly_charges,tenure,total_charges
count,509.0,509.0,509.0
mean,0.500241,0.507496,0.485657
std,0.276997,0.328644,0.290554
min,0.005772,0.0,5e-05
25%,0.266667,0.222222,0.241079
50%,0.490842,0.489899,0.484534
75%,0.729382,0.757576,0.727521
max,0.999311,1.0,0.99539


# Validate my gaussian_scaler() function

In [34]:
df.head(1)

Unnamed: 0_level_0,monthly_charges,tenure,total_charges
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0013-SMEOE,109.7,71,7904.25


In [35]:
# Call split_scale.gaussian_scaler on the customer_id-indexed df; it returns the
# scaler object plus the transformed train and test frames.
# NOTE(review): the scaler repr displayed below is a Yeo-Johnson PowerTransformer
# with standardize=False, and the transformed tenure values shown below reach
# ~9950 — confirm that skipping standardization is intended for a "scaler".
scaler, train_scaled, test_scaled = split_scale.gaussian_scaler(df)

In [36]:
scaler

PowerTransformer(copy=True, method='yeo-johnson', standardize=False)

In [37]:
train_scaled.head()

Unnamed: 0,monthly_charges,tenure,total_charges
8661-BOYNW,11.717472,9950.70254,120.161288
0960-HUWBM,12.902125,7856.438822,125.628091
2346-LOCWC,5.840528,6040.584127,55.281773
9114-DPSIA,11.496409,9950.70254,116.891462
4891-NLUBA,10.099975,6785.375569,95.525468


In [38]:
test_scaled.head()

Unnamed: 0,monthly_charges,tenure,total_charges
1494-EJZDW,5.7876,117.379965,24.196746
3795-GWTRD,11.129928,7309.661082,106.361697
2900-PHPLN,5.695594,9323.346607,60.990392
6211-WWLTF,12.650286,7309.661082,122.313217
5494-WOZRZ,11.562005,9634.096705,119.259823


# Validate my min_max_scaler() function

In [39]:
scaler, train_scaled, test_scaled = split_scale.min_max_scaler(df)

In [40]:
scaler

MinMaxScaler(copy=True, feature_range=(0, 1))

In [41]:
train_scaled.head()

Unnamed: 0,monthly_charges,tenure,total_charges
8661-BOYNW,0.657698,1.0,0.702286
0960-HUWBM,0.854011,0.901408,0.772045
2346-LOCWC,0.020927,0.802817,0.135399
9114-DPSIA,0.623817,1.0,0.662246
4891-NLUBA,0.428999,0.84507,0.431235


In [42]:
test_scaled.head()

Unnamed: 0,monthly_charges,tenure,total_charges
1494-EJZDW,0.017439,0.126761,0.023224
3795-GWTRD,0.569507,0.873239,0.541802
2900-PHPLN,0.01146,0.971831,0.166678
6211-WWLTF,0.810164,0.873239,0.729324
5494-WOZRZ,0.633782,0.985915,0.691122


# Validate my iqr_robust_scaler() function

In [44]:
scaler, train_scaled, test_scaled = split_scale.iqr_robust_scaler(df)

In [45]:
scaler

RobustScaler(copy=True, quantile_range=(25.0, 75.0), with_centering=True,
             with_scaling=True)

In [46]:
train_scaled.head()

Unnamed: 0,monthly_charges,tenure,total_charges
8661-BOYNW,0.298318,0.363636,0.513837
0960-HUWBM,0.592897,0.045455,0.639483
2346-LOCWC,-0.657196,-0.272727,-0.507197
9114-DPSIA,0.247477,0.363636,0.44172
4891-NLUBA,-0.04486,-0.136364,0.02564


In [47]:
test_scaled.head()

Unnamed: 0,monthly_charges,tenure,total_charges
1494-EJZDW,-0.66243,-2.454545,-0.709237
3795-GWTRD,0.165981,-0.045455,0.224787
2900-PHPLN,-0.671402,0.272727,-0.450859
6211-WWLTF,0.527103,-0.045455,0.562536
5494-WOZRZ,0.26243,0.318182,0.493729
