In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats

from sklearn.linear_model import LinearRegression, LassoLars, TweedieRegressor
from sklearn.metrics import mean_squared_error, r2_score, explained_variance_score
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler, QuantileTransformer, PolynomialFeatures

import wrangle
import prepare


import warnings
warnings.filterwarnings("ignore")

%matplotlib inline



In [6]:
# acquire data and split data using a function from wrangle module
train, validate, test = wrangle.wrangle_zillow()
# get rows and columns
train.shape, validate.shape, test.shape 

((26227, 8), (11241, 8), (9367, 8))

In [8]:
# peak into data
train.head()

Unnamed: 0,bathroom,bedroom,sqft,tax_amount,county,Los Angeles,Orange,Ventura
24005,3.0,4.0,2815.0,349000.0,Los Angeles,1,0,0
10859,2.0,2.0,1973.0,793000.0,Los Angeles,1,0,0
40615,2.0,3.0,1517.0,127619.0,Los Angeles,1,0,0
44400,2.0,3.0,1316.0,488770.0,Los Angeles,1,0,0
47188,2.0,2.0,1416.0,305739.0,Orange,0,1,0


In [9]:
# scale data
train_scaled, validate_scaled, test_scaled= prepare.scale_data(train, validate, test, 
                                                       columns_to_scale=['bedroom', 'bathroom', 'sqft','Los Angeles','Orange', 'Ventura' ],
                                                       return_scaler=False)

In [11]:
#. peak into scaled data
train_scaled.head()

Unnamed: 0,bathroom,bedroom,sqft,tax_amount,county,Los Angeles,Orange,Ventura
24005,0.666667,0.75,0.698418,349000.0,Los Angeles,1.0,0.0,0.0
10859,0.333333,0.25,0.420897,793000.0,Los Angeles,1.0,0.0,0.0
40615,0.333333,0.5,0.2706,127619.0,Los Angeles,1.0,0.0,0.0
44400,0.333333,0.5,0.204351,488770.0,Los Angeles,1.0,0.0,0.0
47188,0.333333,0.25,0.23731,305739.0,Orange,0.0,1.0,0.0


In [13]:
#create a function to isolate the target variable
def X_y_split(train, validate, test, target):
    '''
    takes in a dataframe and a target variable
    returns the X_train, y_train, X_validate, y_validate, X_test, y_test
    '''  


    X_train = train.drop(columns= ['county', 'tax_amount'])
    y_train = train[[target]]

    X_validate = validate.drop(columns= ['county', 'tax_amount'])
    y_validate = validate[[target]]

    X_test = test.drop(columns= ['county', 'tax_amount'])
    y_test = test[[target]]
        
    return X_train, y_train, X_validate, y_validate, X_test, y_test


In [14]:
# create attributes

X_train, y_train, X_validate, y_validate, X_test, y_test = X_y_split(train_scaled, validate_scaled, test_scaled, 'tax_amount')

In [15]:
def get_rmse_mean(train,validate, target):
    train['baseline_mean'] = train[target].mean()
    validate['baseline_mean'] = train[target].mean()
    train_RMSE_mean  = mean_squared_error(train[target], train['baseline_mean'], squared=False )
    validae_RMSE_mean = mean_squared_error(validate[target], validate['baseline_mean'], squared=False )
    print('RMSE using mean')
    print(F'train_RMSE: {train_RMSE_mean}')
    print(f'validate RMSE: {validae_RMSE_mean}')
    return train_RMSE_mean, validae_RMSE_mean

In [43]:
train_rmse_mean, validate_rmse_mean  = get_rmse_mean(train, validate,'tax_amount')

RMSE using mean
train_RMSE: 282783.9723515096
validate RMSE: 276920.1915397434


In [44]:

def get_rmse_median(train,validate, target):
    train['baseline_median'] = train[target].median()
    validate['baseline_median'] = validate[target].median()
    train_RMSE_median  = mean_squared_error(train[target], train['baseline_median'], squared=False )
    validae_RMSE_median = mean_squared_error(validate[target], validate['baseline_median'], squared=False )
    print('RMSE using median')
    print(F'train_RMSE: {train_RMSE_median}')
    print(f'validate RMSE: {validae_RMSE_median}')
    return train_RMSE_median, validae_RMSE_median

In [45]:
train_rmse_median, validate_rmse_median  = get_rmse_median(train, validate,'tax_amount')

RMSE using median
train_RMSE: 287882.6226479039
validate RMSE: 281169.87292888097


In [46]:

def get_linear_regression(X_train,y_train,X_validate,y_validate, target):
    # create the model object
    lm = LinearRegression()  
    # Fit the model
    lm.fit(X_train, y_train[target])   
    # Predict train
    y_train['prediction_OLS'] = lm.predict(X_train)
    # predict validate 
    y_validate['prediction_OLS'] = lm.predict(X_validate)  
    # evaluate train RMSE
    rmse_train = round (mean_squared_error(y_train[target], y_train['prediction_OLS'],squared=False ), 2)
    # evaluate validate rmse
    rmse_validate = round (mean_squared_error(y_validate[target], y_validate['prediction_OLS'],squared=False), 2)
    
    return rmse_train, rmse_validate


In [47]:
rmse_lm_train, rmse_lm_validate = get_linear_regression(X_train,y_train,X_validate,y_validate, 'tax_amount')
rmse_lm_train, rmse_lm_validate

(245858.1, 241315.04)

In [48]:
def get_lassoLars(X_train,y_train,X_validate,y_validate, target, alpha):
    # create the model object
    lars = LassoLars(alpha)  
    # Fit the model
    lars.fit(X_train, y_train[target])   
    # Predict train
    y_train['prediction_lassoLars'] = lars.predict(X_train)
    # predict validate 
    y_validate['prediction_lassoLars'] = lars.predict(X_validate)  
    # evaluate train RMSE
    rmse_train = round (mean_squared_error(y_train[target], y_train['prediction_lassoLars'],squared=False ), 2)
    # evaluate validate rmse
    rmse_validate = round (mean_squared_error(y_validate[target], y_validate['prediction_lassoLars'],squared=False), 2)
    
    return rmse_train, rmse_validate
    

In [49]:
lassoLars_alpha = []
lassoLars_rmse_train = []
lassoLars_rmse_validate = []
for i in range(0,11): 
    rmse_train, rmse_validate = get_lassoLars(X_train,y_train,X_validate,y_validate, 'tax_amount', alpha=i)
    lassoLars_alpha.append(i)
    lassoLars_rmse_train.append(rmse_train)
    lassoLars_rmse_validate.append(rmse_validate)
    
lassoLars_df = pd.DataFrame({'alpha': lassoLars_alpha,
                            'rmse_train': lassoLars_rmse_train,
                            'rmse_validate': lassoLars_rmse_validate})
    
lassoLars_df['rmse_differnce'] =  lassoLars_df['rmse_train'] - lassoLars_df['rmse_validate']
lassoLars_df

Unnamed: 0,alpha,rmse_train,rmse_validate,rmse_differnce
0,0,245849.76,241300.19,4549.57
1,1,245849.76,241300.05,4549.71
2,2,245849.76,241299.91,4549.85
3,3,245849.77,241299.77,4550.0
4,4,245849.77,241299.63,4550.14
5,5,245849.77,241299.49,4550.28
6,6,245849.77,241299.35,4550.42
7,7,245849.78,241299.22,4550.56
8,8,245849.78,241299.08,4550.7
9,9,245849.79,241298.94,4550.85


In [50]:
rmse_lars_train, rmse_lars_validate = get_lassoLars(X_train,y_train,X_validate,y_validate, 'tax_amount', 1)
rmse_lars_train, rmse_lars_validate

(245849.76, 241300.05)

In [51]:
def get_tweedie(X_train,y_train,X_validate,y_validate, target, power, alpha):
    # create the model object
    glm = TweedieRegressor(power=power, alpha=alpha)
    # Fit the model
    glm.fit(X_train, y_train[target])   
    # Predict train
    y_train['prediction_GLM'] = glm.predict(X_train)
    # predict validate 
    y_validate['prediction_GLM'] = glm.predict(X_validate)  
    # evaluate train RMSE
    rmse_train = round (mean_squared_error(y_train[target], y_train['prediction_GLM'],squared=False ), 2)
    # evaluate validate rmse
    rmse_validate = round (mean_squared_error(y_validate[target], y_validate['prediction_GLM'],squared=False), 2)
    
    return rmse_train, rmse_validate
    

In [52]:
tweedie_alpha = []
tweedie_rmse_train = []
tweedie_rmse_validate = []
for i in range(0,11): 
    rmse_train, rmse_validate = get_tweedie(X_train,y_train,X_validate,y_validate, 'tax_amount', power = 0, alpha=i)
    tweedie_alpha.append(i)
    tweedie_rmse_train.append(rmse_train)
    tweedie_rmse_validate.append(rmse_validate)
    
tweedie_df = pd.DataFrame({'alpha': tweedie_alpha,
                            'rmse_train': tweedie_rmse_train,
                            'rmse_validate': tweedie_rmse_validate})
    
tweedie_df['rmse_differnce'] =  tweedie_df['rmse_train'] - tweedie_df['rmse_validate']
tweedie_df

Unnamed: 0,alpha,rmse_train,rmse_validate,rmse_differnce
0,0,245849.76,241300.19,4549.57
1,1,276293.73,270618.83,5674.9
2,2,279099.6,273351.12,5748.48
3,3,280206.79,274425.71,5781.08
4,4,280801.45,275002.1,5799.35
5,5,281172.89,275361.87,5811.02
6,6,281427.02,275607.91,5819.11
7,7,281611.87,275786.81,5825.06
8,8,281752.37,275922.76,5829.61
9,9,281862.78,276029.58,5833.2


In [53]:
rmse_glm_train, rmse_glm_validate = get_tweedie(X_train,y_train,X_validate,y_validate, 'tax_amount', 1, 0)
rmse_glm_train, rmse_glm_validate

(245395.21, 240971.32)

In [54]:
def get_polynomial(X_train,y_train,X_validate,y_validate, target, degree):
    
    # make the polynomial features to get a new set of features
    pf = PolynomialFeatures(degree)

    # fit and transform X_train_scaled
    X_train_degree2 = pf.fit_transform(X_train)

    # transform X_validate_scaled 
    X_validate_degree2 = pf.transform(X_validate)
    
    # create the model object
    lm2 = LinearRegression()
    # Fit the model
    lm2.fit(X_train_degree2, y_train[target])   
    # Predict train
    y_train['prediction_polynomial'] = lm2.predict(X_train_degree2)
    # predict validate 
    y_validate['prediction_polynomial'] = lm2.predict(X_validate_degree2)  
    # evaluate train RMSE
    rmse_train = round (mean_squared_error(y_train[target], y_train['prediction_polynomial'],squared=False ), 2)
    # evaluate validate rmse
    rmse_validate = round (mean_squared_error(y_validate[target], y_validate['prediction_polynomial'],squared=False), 2)
    
    return rmse_train, rmse_validate

In [55]:
polynomial_degree = []
polynomial_rmse_train = []
polynomial_rmse_validate = []
for i in range(1,5): 
    rmse_train, rmse_validate = get_polynomial(X_train,y_train,X_validate,y_validate, 'tax_amount', degree=i)
    polynomial_degree.append(i)
    polynomial_rmse_train.append(rmse_train)
    polynomial_rmse_validate.append(rmse_validate)
    
polynomial_df = pd.DataFrame({'degree': polynomial_degree,
                            'rmse_train': polynomial_rmse_train,
                            'rmse_validate': polynomial_rmse_validate})
    
polynomial_df['rmse_differnce'] =  polynomial_df['rmse_train'] - polynomial_df['rmse_validate']
polynomial_df

Unnamed: 0,degree,rmse_train,rmse_validate,rmse_differnce
0,1,245852.46,241306.56,4545.9
1,2,244804.26,240194.08,4610.18
2,3,244184.8,239871.32,4313.48
3,4,243940.2,240000.67,3939.53


In [56]:
rmse_poly_train, rmse_poly_validate= get_polynomial(X_train,y_train,X_validate,y_validate,'tax_amount', 3)
rmse_poly_train, rmse_poly_validate

(244184.8, 239871.32)

In [57]:
rmse_glm_train, rmse_glm_validate = get_tweedie(X_train,y_train,X_validate,y_validate,'tax_amount', 1, 0)
rmse_glm_train, rmse_glm_validate

(245395.21, 240971.32)

In [58]:
rmse_lars_train, rmse_lars_validate = get_lassoLars(X_train,y_train,X_validate,y_validate, 'tax_amount', 1)
rmse_lars_train, rmse_lars_validate

(245849.76, 241300.05)

In [59]:
rmse_lm_train, rmse_lm_validate = get_linear_regression(X_train,y_train,X_validate,y_validate, 'tax_amount')
rmse_lm_train, rmse_lm_validate

(245858.1, 241315.04)

In [60]:
metric_df = pd.DataFrame({'model': ['baseline', 'LinearRegreesion', 'LassoLars(alpha=1)', 'TweedieRegreesor(power=0, alpha=0)','Polynomial Regression(degree=3)'],
                         'train_RMSE': [train_rmse_mean, rmse_lm_train, rmse_lars_train, rmse_glm_train, rmse_poly_train],
                         'validate_RMSE': [validate_rmse_mean, rmse_lm_validate, rmse_lars_validate, rmse_glm_validate, rmse_poly_validate]})
metric_df['difference'] = metric_df['train_RMSE'] - metric_df['validate_RMSE']
metric_df

Unnamed: 0,model,train_RMSE,validate_RMSE,difference
0,baseline,282783.972352,276920.19154,5863.780812
1,LinearRegreesion,245858.1,241315.04,4543.06
2,LassoLars(alpha=1),245849.76,241300.05,4549.71
3,"TweedieRegreesor(power=0, alpha=0)",245395.21,240971.32,4423.89
4,Polynomial Regression(degree=3),244184.8,239871.32,4313.48


In [61]:
def get_polynomial_test(X_train,y_train,X_test,y_test, target, degree):
    
    # make the polynomial features to get a new set of features
    pf = PolynomialFeatures(degree)

    # fit and transform X_train_scaled
    X_train_degree = pf.fit_transform(X_train)

    # transform X_validate_scaled 
    X_test_degree = pf.transform(X_test)
    
    # create the model object
    lm = LinearRegression()
    # Fit the model
    lm.fit(X_train_degree, y_train[target])   

    # predict test 
    y_test['prediction_polynomial_test'] = lm.predict(X_test_degree)  
    # evaluate test rmse
    rmse_test = round (mean_squared_error(y_test[target], y_test['prediction_polynomial_test'],squared=False), 2)
    r2 = explained_variance_score(y_test[target], y_test['prediction_polynomial_test'])
    print('Using polynomial Regresion on test')
    print(f'RMSE : { rmse_test}')
    print(f'r2 : {r2}')
    return  rmse_test

In [62]:
get_polynomial_test(X_train,y_train,X_test,y_test, 'tax_amount', 3)


Using polynomial Regresion on test
RMSE : 243328.52
r2 : 0.25385988399720893


243328.52

In [63]:
y_test

Unnamed: 0,tax_amount,prediction_polynomial_test
14082,362126.0,454528.0
26213,105352.0,263808.0
18180,93148.0,324032.0
39421,616000.0,409408.0
44544,367771.0,290816.0
...,...,...
40800,585788.0,604992.0
41866,441655.0,657600.0
49496,784529.0,914368.0
12310,289169.0,449728.0
