Create split_scale.py to hold the functions that follow. Each scaler function should create the scaler object, fit it on train only, and then use it to transform both train and test. Each function should return the scaler, the scaled train dataframe, and the scaled test dataframe. Make sure the scaled dataframes keep the original indices from train/test, since those are the row indices of the original dataframe. Be sure to set a random state where applicable for reproducibility!

In [1]:
from wrangle import wrangle_telco
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, QuantileTransformer, PowerTransformer, RobustScaler, MinMaxScaler

In [2]:
# Load the customer data via wrangle_telco(), imported from the local
# wrangle module.
customers = wrangle_telco()

In [3]:
customers.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1685 entries, 0 to 1694
Data columns (total 4 columns):
customer_id        1685 non-null object
monthly_charges    1685 non-null float64
tenure             1685 non-null int64
total_charges      1685 non-null float64
dtypes: float64(2), int64(1), object(1)
memory usage: 65.8+ KB


In [4]:
# Features are tenure and monthly charges; the target is total charges.
# Selecting with a list of column names keeps both X and y as DataFrames.
feature_cols = ["tenure", "monthly_charges"]
target_cols = ["total_charges"]
X = customers[feature_cols]
y = customers[target_cols]

# Hold out 20% of the rows for testing; the fixed random_state makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=.2, random_state=123
)

# Report the shape of each of the four splits.
for label, frame in (
    ('X_train:', X_train),
    ('X_test:', X_test),
    ('y_train:', y_train),
    ('y_test:', y_test),
):
    print(label, frame.shape)

X_train: (1348, 2)
X_test: (337, 2)
y_train: (1348, 1)
y_test: (337, 1)


In [5]:
X_train

Unnamed: 0,tenure,monthly_charges
119,70,75.50
1424,55,20.30
385,65,109.05
1140,70,98.30
1504,71,116.25
...,...,...
1131,65,19.85
1356,63,19.55
1416,7,19.85
1399,35,69.15


Now we transform the data

**Standard_scaler**

In [6]:
# Fit on the training features only, so the scaling parameters are learned
# without looking at the test set. StandardScaler ignores any target
# argument passed to fit, so none is passed.
scaler = StandardScaler().fit(X_train)

# Wrap the transformed arrays back into DataFrames, carrying over the column
# labels and the original row indices of the unscaled splits directly.
X_train_scaled = pd.DataFrame(
    scaler.transform(X_train), columns=X_train.columns, index=X_train.index
)

X_test_scaled = pd.DataFrame(
    scaler.transform(X_test), columns=X_test.columns, index=X_test.index
)

In [7]:
X_train

Unnamed: 0,tenure,monthly_charges
119,70,75.50
1424,55,20.30
385,65,109.05
1140,70,98.30
1504,71,116.25
...,...,...
1131,65,19.85
1356,63,19.55
1416,7,19.85
1399,35,69.15


In [8]:
import split_scale

In [9]:
# Split X and y with the split_scale helper. Only the first two returned
# values are kept (as train/test); the last two are discarded.
# NOTE(review): .8 is presumably the train proportion — confirm in split_scale.
train, test, _, _ = split_scale.split_my_data(X, y, .8)

In [10]:
test

Unnamed: 0,tenure,monthly_charges
305,72,20.50
452,67,111.30
917,63,109.20
1421,64,19.45
1557,24,24.10
...,...,...
1642,72,80.80
460,23,25.10
1170,72,25.20
1083,38,81.00


In [11]:
# Call the split_scale standard_scaler helper on the train/test splits; its
# three return values are bound as the scaler, scaled train and scaled test.
scaler, train_scaled, test_scaled = split_scale.standard_scaler(train, test)

In [15]:
train

Unnamed: 0,tenure,monthly_charges
119,70,75.50
1424,55,20.30
385,65,109.05
1140,70,98.30
1504,71,116.25
...,...,...
1131,65,19.85
1356,63,19.55
1416,7,19.85
1399,35,69.15


In [25]:
# Call the split_scale gaussian_scaler helper on the train/test splits; its
# three return values are bound as the scaler, scaled train and scaled test.
# The scaler displayed in the following cell is a yeo-johnson PowerTransformer.
scaler, train_scaled, test_scaled = split_scale.gaussian_scaler(train, test)

In [26]:
scaler

PowerTransformer(copy=True, method='yeo-johnson', standardize=True)

In [20]:
# Bind the scaled outputs to their own names. Reusing `train`/`test` here
# would replace the unscaled splits, so any scaler cell run afterwards would
# be fed already-scaled data instead of the original values.
scaler, train_scaled, test_scaled = split_scale.iqr_robust_scaler(train, test)

In [22]:
scaler

RobustScaler(copy=True, quantile_range=(25.0, 75.0), with_centering=True,
             with_scaling=True)