In [None]:
#DS imports
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pydataset
import seaborn as sns
from scipy import stats


#Modeling and scaling
import sklearn
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler, QuantileTransformer
from sklearn.model_selection import train_test_split

#My files
import env
import wrangle as w
import model as m

np.random.seed(123)

## Exercises: Getting Data

In [None]:
# Get Zillow data
# The loader lives in the project-local `wrangle` module (imported as `w`).
# NOTE(review): where the data comes from (SQL, cached CSV, ...) is not visible here.
df =w.get_zillow_data()

In [None]:
# Preview the first rows of the freshly loaded frame.
df.head()

In [None]:
# Clean Zillow data
# NOTE(review): the cleaning rules are defined in wrangle.clean_zillow, outside this notebook.
df = w.clean_zillow(df)

In [None]:
# Inspect dtypes and non-null counts after cleaning.
df.info()

In [None]:
df.head()

In [None]:
# Drop null values for zillow
# Rebinds `df`, so the pre-dropna frame is no longer reachable after this cell.
df = df.dropna()

In [None]:
# Re-inspect the frame after the null rows were dropped.
df.head()

In [None]:
# After dropna() every column's non-null count should match the row count.
df.info()

In [None]:
# Summary statistics, useful for judging the spread of each column before scaling.
df.describe()

### 1. Apply the scalers we talked about in this lesson to your data and visualize the results for the unscaled and scaled distributions.

In [None]:
# Split the cleaned frame into three parts using the project-local helper.
# NOTE(review): split proportions and any seeding are defined in wrangle, not shown here.
train, validate, test = w.train_validate_test_split(df)

In [None]:
train.head()

In [None]:
# Validate my split

# Print the (rows, columns) shape of each split.
print(f'train -> {train.shape}')
print(f'validate -> {validate.shape}')
print(f'test -> {test.shape}')

In [None]:
train.head()

In [None]:
# Prepare for scaling by limiting to three features and target
# NOTE(review): presumably tax_value is the target and the other three are features -- confirm.
columns_to_scale = ['bedrooms', 'bathrooms','area','tax_value']

In [None]:
# Make one independent copy of train per scaling method, so each scaler
# writes into its own frame and `train` itself stays unscaled.
train_scaled_minmax = train.copy()
train_scaled_standard = train.copy()
train_scaled_robust = train.copy()

### Looking at this dataset, we want to scale several columns: bedrooms, bathrooms, area, tax_value

### Min-Max Scaler

In [None]:
#Using min max scaler
scaler = MinMaxScaler()
train_scaled_minmax[columns_to_scale] = scaler.fit_transform(train[columns_to_scale])

In [None]:
train_scaled_minmax

In [None]:
#Visualize
plt.figure(figsize=(13, 6))
plt.subplot(121)
plt.hist(train[columns_to_scale], ec='black')
plt.title('Original')
plt.subplot(122)
plt.hist(train_scaled_minmax[columns_to_scale], ec='black')
plt.title('Scaled')

In [None]:
print(train.bedrooms.head(2))
print(train_scaled_minmax.bedrooms.head(2))

### Standard Scaler

In [None]:
#Standard Scaler
scaler = StandardScaler()
# Fit and transform on training data
train_scaled_standard[columns_to_scale] = scaler.fit_transform(train[columns_to_scale])

In [None]:
#Visualize
plt.figure(figsize=(13, 6))
plt.subplot(121)
plt.hist(train[columns_to_scale], ec='black')
plt.title('Original')
plt.subplot(122)
plt.hist(train_scaled_standard[columns_to_scale], ec='black')
plt.title('Scaled')

In [None]:
print(train.bedrooms.head(2))
print(train_scaled_standard.bedrooms.head(2))

### Robust Scaler

In [None]:
#Robust Scaler
scaler = RobustScaler()
# Fit and transform on training data
train_scaled_robust[columns_to_scale] = scaler.fit_transform(train[columns_to_scale])

In [None]:
#Visualize this
plt.figure(figsize=(13, 6))
plt.subplot(121)
plt.hist(train[columns_to_scale], ec='black')
plt.title('Original')
plt.subplot(122)
plt.hist(train_scaled_robust[columns_to_scale], ec='black')
plt.title('Scaled')

In [None]:
print(train.bedrooms.head(2))
print(train_scaled_robust.bedrooms.head(2))

### 2. Apply the .inverse_transform method to your scaled data. Is the resulting dataset the exact same as the original data?

### Yes, the inverse-transformed values match the original data (up to floating-point rounding)

In [None]:
#Using min max scaler
scaler = MinMaxScaler()
original_data = train[['bedrooms']]
scaled_data = scaler.fit_transform(original_data)

In [None]:
original_data[:5]

In [None]:
scaled_data[:5]

In [None]:
scaler.inverse_transform(scaled_data)[:5]

### 3. Read the documentation for sklearn's QuantileTransformer. Use normal for the output_distribution and apply this scaler to your data. Visualize the result of your data scaling.

In [None]:
qt = QuantileTransformer(n_quantiles=10, random_state=0, output_distribution='normal')

In [None]:
qt = qt.fit_transform(train[columns_to_scale])

In [None]:
#Visualize this
plt.figure(figsize=(13, 6))
plt.hist(qt, ec='black')
plt.title('Quantile Transformed')

### 4. Use the QuantileTransformer, but omit the output_distribution argument. Visualize your results. What do you notice?

In [None]:
qt_no_output = QuantileTransformer(n_quantiles=10, random_state=0)

In [None]:
qt_no_output = qt_no_output.fit_transform(train[columns_to_scale])

In [None]:
#Visualize this
plt.figure(figsize=(13, 6))
plt.hist(qt_no_output, ec='black')
plt.title('Quantile Transformed no output distribution')

### 5. Based on the work you've done, choose a scaling method for your dataset. Write a function within your prepare.py that accepts as input the train, validate, and test data splits, and returns the scaled versions of each. Be sure to only learn the parameters for scaling from your training data!

In [None]:
# Reminder of the unscaled training columns before writing the scaling function.
train.head()

In [None]:
# Prepare for modeling: scale the train / validate / test splits with one
# scaler that learns its parameters from train only.

def scale_data(train,
               validate,
               test,
               columns_to_scale=['bedrooms', 'bathrooms', 'area','tax_value'],
               return_scaler=False):
    '''Scale selected columns of the three splits with a QuantileTransformer.

    The transformer (normal output distribution, random_state=123) is fit on
    `train` only and then applied to all three splits, so nothing is learned
    from validate or test. The input frames are not modified; scaled copies
    are returned, and columns not listed in `columns_to_scale` are carried
    over unchanged.

    Parameters
    ----------
    train, validate, test : pandas.DataFrame
        The data splits. Each must contain every column in `columns_to_scale`.
    columns_to_scale : str or iterable of str
        Column name(s) to scale. A list, tuple, Index or a single string is
        accepted.
    return_scaler : bool
        When True, the fitted scaler object is returned as the first element.

    Returns
    -------
    (train_scaled, validate_scaled, test_scaled), or
    (scaler, train_scaled, validate_scaled, test_scaled) when `return_scaler`
    is True.
    '''
    # Normalize the column selection to a fresh list: a bare string becomes a
    # one-element list, and tuples / Index objects work like lists. This also
    # means the (shared) default list is only ever read, never handed out.
    if isinstance(columns_to_scale, str):
        columns = [columns_to_scale]
    else:
        columns = list(columns_to_scale)

    # Learn the quantiles from the training split only.
    scaler = QuantileTransformer(random_state=123, output_distribution='normal')
    scaler.fit(train[columns])

    def _scale_split(split):
        # Return a copy of `split` with the selected columns replaced by their scaled values.
        scaled = split.copy()
        scaled[columns] = scaler.transform(split[columns])
        return scaled

    train_scaled = _scale_split(train)
    validate_scaled = _scale_split(validate)
    test_scaled = _scale_split(test)

    if return_scaler:
        return scaler, train_scaled, validate_scaled, test_scaled
    return train_scaled, validate_scaled, test_scaled

In [None]:
# Scale all three splits; with return_scaler=True the fitted scaler comes back
# as the first element of the returned tuple.
scaler, train_scaled, validate_scaled, test_scaled = scale_data(train, validate, test, return_scaler=True)

In [None]:
# Spot-check the scaled training rows.
train_scaled.head(2)

In [None]:
# Spot-check the scaled test rows (transformed with the scaler that was fit on train).
test_scaled.head(2)

In [None]:
# scale_data works on copies, so `train` itself is still the unscaled frame.
train.info()

In [None]:
# Range of the scaled bedrooms column in the training split.
print(f' Min value scaled Bedrooms is : {train_scaled.bedrooms.min()}')
print(f' Max value scaled Bedrooms is: {train_scaled.bedrooms.max()}')

In [None]:
## Use this function to visualize scalers

def visualize_scaler(scaler, df, columns_to_scale, bins=10):
    '''Plot before/after histograms for each column scaled by `scaler`.

    One row of two histograms is drawn per column: the raw values on the
    left and the scaled values on the right.

    Parameters
    ----------
    scaler : object with a `fit_transform` method
        The scaler to demonstrate. It is (re)fit on `df[columns_to_scale]`,
        so the object passed in is left in a fitted state.
    df : pandas.DataFrame
        Source data; it is copied and not modified.
    columns_to_scale : list of str
        Columns to scale and plot. A single-element list is supported.
    bins : int
        Number of histogram bins used for every panel.

    Returns
    -------
    None. The figure is left as the current matplotlib figure.
    '''
    # squeeze=False keeps `axs` two-dimensional even when there is only one
    # column; otherwise a single row comes back 1-D and the (ax1, ax2)
    # unpacking below would fail.
    fig, axs = plt.subplots(len(columns_to_scale), 2, figsize=(16,9), squeeze=False)
    df_scaled = df.copy()
    df_scaled[columns_to_scale] = scaler.fit_transform(df[columns_to_scale])
    for (ax1, ax2), col in zip(axs, columns_to_scale):
        ax1.hist(df[col], bins=bins)
        ax1.set(title=f'{col} before scaling', xlabel=col, ylabel='count')
        ax2.hist(df_scaled[col], bins=bins)
        ax2.set(title=f'{col} after scaling with {scaler.__class__.__name__}', xlabel=col, ylabel='count')
    fig.tight_layout()

In [None]:
# MinMaxScaler Applied
visualize_scaler(scaler=MinMaxScaler(), 
                 df=train, 
                 columns_to_scale=columns_to_scale, 
                 bins=50)

In [None]:
# QuantileTransformer Applied
visualize_scaler(scaler=QuantileTransformer(output_distribution='normal'), 
                 df=train,
                 columns_to_scale=columns_to_scale, 
                 bins=50)