In [1]:
import numpy as np
import pandas as pd
from scipy import stats
from sklearn.linear_model import Ridge, Lasso, ElasticNet, LinearRegression
import plotly.express as px
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.model_selection import cross_val_score
import time
import re
pd.set_option('display.max_rows', 500)

In [2]:
import numpy as np
import pandas as pd
import plotly.express as px
from scipy import stats
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm as anova

def EDA_plots(df, features = None, targets = ['SalePrice']):
    '''
    EDA_plots: a function to automate basic preliminary EDA on [features] vs [targets]
    
    args:
        df: a dataframe
        features: a list of column names to run the basic EDA functions on. If nothing is passed, all features will be used.
        targets: a list of column names to use as targets in the basic EDA functions. If nothing is passed, ['SalePrice'] 
                 will be used.
        
    output:
        - Prints scatterplots and boxplots of [features] vs [targets]
        - Prints histograms of [features]
        - Uses crude method of datatype == str to decide whether to treat a feature as categorical or continuous
            - If the data in a column is of string type, dummifies that column and runs multiple linear regressions 
              on the dummies vs [targets]
            - If you would rather do simple anovas than the dummified multiple regressions, there is commented-out
              code which you can uncomment to do anovas instead
            - If the data in a column is not string type, runs simple linear regressions on that column vs [targets] 
              and prints the correlation and R^2 values
    '''
    # default features is None, in which case the function will use all features due to the first 2 lines below
    if features is None:
        features = df.columns
    
    
    for feature in features:
        for target in targets:
            if feature != target and feature != 'PID': # ignore unique identifier
                print('feature: ',feature) # print feature name
                # scatterplot
                scatter = px.scatter(x = df[feature], y = df[target])
                scatter.update_layout(
                    title={
                        'text': f'Scatterplot, {feature} vs {target}',
                        'y':0.95,
                        'x':0.5,
                        'xanchor': 'center',
                        'yanchor': 'top'},
                    xaxis_title = f'{feature}',
                    yaxis_title = f'{target}'
                )
                scatter.show()
                # histogram
                hist = px.histogram(x = df[feature])
                hist.update_layout(
                    title={
                        'text': f'Distribution of {feature}',
                        'y':0.95,
                        'x':0.5,
                        'xanchor': 'center',
                        'yanchor': 'top'},
                    xaxis_title = f'{feature}',
                    yaxis_title = 'Frequency'
                )
                hist.show()
                # boxplot
                box = px.box(x = df[feature], y = df[target])
                box.update_layout(
                    title={
                        'text': f'Boxplot, {feature} vs {target}',
                        'y':0.95,
                        'x':0.5,
                        'xanchor': 'center',
                        'yanchor': 'top'},
                    xaxis_title = f'{feature}',
                    yaxis_title = f'{target}'
                )
                box.show()
                # the dataset does not have unique indices, this fixes that
                temp = df[df[feature].isna() == False].reset_index(drop = True)
                
                if type(temp.loc[0, feature]) != str: # continuous
                    corr = temp[feature].corr(temp[target])
                    print(f'Correlation between {feature} and {target} is {corr}')
                    linreg = stats.linregress(temp[feature], temp[target] )
                    print(linreg)
                    print('r^2 = ',linreg.rvalue**2)
                if type(temp.loc[0, f'{feature}']) == str: # categorical
                    fit = ols(f'{target} ~ C({feature})', data=temp).fit()
                    print(fit.summary()) # comment this out and uncomment the below lines to get simpler anova
                    # anova_table = anova(fit, typ=2)
                    # print(anova_table)
            print()

In [20]:
df = pd.read_csv('./../data/ames_housing_price_data_v4.csv')
radial = pd.read_csv('./../data/house_coordinates_1.0.csv', index_col = 0)

for col in radial.columns:
    prefix = str(col)[0:4]
    if re.search('^\d\d\d\d_', str(col)):
        radial.rename(columns = {col: col[5:]}, inplace = True)
        
df = pd.merge(radial, df, how = 'right', on = 'PID')
df = df[(df['PID'] != 902207130) & (df['PID'] != 908154205)]
df = df[(df['SaleCondition'] == 'Normal') | (df['SaleCondition'] == 'Partial')]
df.reset_index(drop=True, inplace = True)

price = df['SalePrice']
price_log = df['SalePrice_log']

In [21]:
rad_drops = ['Address', 'Coords4', 'latitude', 'longitude']

In [22]:
radial

Unnamed: 0_level_0,Address,Coords4,latitude,longitude,police,fire_station,post_box,post_office,library,town_hall,...,retail,orchard,vineyard,scrub,grass,farmyard,farmland,water,reservoir,wetland
PID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
909176150,"436 HAYWARD AVE, Ames, Iowa, USA","436 Hayward Ave, Ames, IA 50014, USA",42.017780,-93.651452,1,1,0,2,1,0,...,0,0,0,0,753,0,9,11,0,1
905476230,"3416 WEST ST, Ames, Iowa, USA","3416 West St, Ames, IA 50014, USA",42.024697,-93.664186,1,1,0,2,1,0,...,0,0,0,0,540,0,8,8,0,1
911128020,"320 S 2ND ST, Ames, Iowa, USA","320 S 2nd St, Ames, IA 50010, USA",42.021389,-93.614855,1,2,1,1,1,0,...,2,0,0,3,131,0,5,13,10,0
535377150,"1524 DOUGLAS AVE, Ames, Iowa, USA","1524 Douglas Ave, Ames, IA 50010, USA",42.038070,-93.612065,1,2,2,2,1,0,...,0,0,0,1,52,1,1,5,12,0
534177230,"2304 FILLMORE AVE, Ames, Iowa, USA","2304 Fillmore Ave, Ames, IA 50010, USA",42.044900,-93.631893,0,0,1,2,0,0,...,0,0,0,0,18,0,0,10,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
903205040,"1021 RIDGEWOOD AVE, Ames, Iowa, USA","1021 Ridgewood Ave, Ames, IA 50010, USA",42.031937,-93.626510,1,2,1,1,1,0,...,1,0,0,0,196,0,0,3,0,0
905402060,"3619 MARY CIR, Ames, Iowa, USA","3619 Mary Cir, Ames, IA 50014, USA",42.027798,-93.666899,1,1,0,0,1,0,...,0,0,0,1,473,0,8,5,0,1
909275030,"2140 SUNSET DR 2142, Ames, Iowa, USA","2140 Sunset Dr, Ames, IA 50014, USA",42.019944,-93.643206,1,1,0,2,1,0,...,0,0,0,0,772,0,4,11,0,1
907192040,"5319 CLEMENS BLVD, Ames, Iowa, USA","5319 Clemens Blvd, Ames, IA 50014, USA",42.016826,-93.690382,0,0,0,0,0,0,...,0,0,0,3,28,13,14,14,1,1


In [6]:
droplist = ['SaleType', 'SaleCondition', 'Garage_age_years', 'Remod_age_years', 'MoSold', 'Utilities', 'MiscFeature']
alwaysdrop = ['PID', 'SalePrice', 'SalePrice_log', 'sold_datetime', '2ndFlrSF_log', 'GrLivArea', '1stFlrSF', 'LotArea', 'LotFrontage_log']
df2 = df.drop((alwaysdrop + droplist + rad_drops), axis = 1)

In [7]:
to_dummify = [
    'Street_paved',
    'Alley',
    'LandContour',
    'Utilities',
    'LandSlope',
    'Neighborhood',
    'BldgType',
    'OverallQual',
    'OverallCond',
    'RoofStyle',
    'RoofMatl',
    'MasVnrType',
    'ExterQual',
    'ExterCond',
    'Foundation',
    'CentralAir',
    'KitchenQual',
    'FireplaceQu',
    'GarageFinish',
    'GarageQual',
    'GarageCond',
    'PavedDrive',
    'PoolQC',
    'Fence',
    'MiscFeature',
    'MoSold',
    'HeatingQC_ord',
    'LotShape_com',
    'MSZoning_com',
    'Heating_com',
    'Electrical_com',
    'LotConfig_com',
    'BsmtCond_ord',
    'BsmtQual_ord',
    'BsmtExposure_ord',
    'GarageType_com',
    'number_floors',
    'attic',
    'PUD',
    'Functional_ord',
    'Garage_age_bin',
    'Remod_age_bin',
    'SaleType',
    'SaleCondition' 
]

In [8]:
for item in droplist:
    if item in to_dummify:
        to_dummify.remove(item)

In [9]:
df3 = pd.get_dummies(df2, columns = to_dummify, drop_first=True)

In [10]:
kfold = KFold(n_splits=10, shuffle = True, random_state = 1)

In [11]:
params_log = {'alpha' : [1e-6, 1e-5, 1e-4]
         }

In [12]:
lasso3 = Lasso(normalize = True, max_iter = 1000, tol = 0.001)
lasso_tuner3 = GridSearchCV(lasso3, params_log, cv=kfold, return_train_score = True)
lasso_tuner3.fit(df3, price_log)

GridSearchCV(cv=KFold(n_splits=10, random_state=1, shuffle=True),
             estimator=Lasso(normalize=True, tol=0.001),
             param_grid={'alpha': [1e-06, 1e-05, 0.0001]},
             return_train_score=True)

In [13]:
lasso_tuner3.cv_results_['mean_test_score']

array([0.94725832, 0.94742872, 0.92187333])

In [14]:
lasso_tuner3.cv_results_['mean_train_score']

array([0.96365374, 0.95764765, 0.92836701])

In [15]:
print(lasso_tuner3.best_params_)
print(max(lasso_tuner3.cv_results_['mean_test_score']))

{'alpha': 1e-05}
0.9474287186752373


In [16]:
feat_imp_rad = pd.Series(data = lasso_tuner3.best_estimator_.coef_, index = df3.columns)
feat_imp_rad = feat_imp_rad.sort_values(ascending = False)
ignored_rad = feat_imp_rad[feat_imp_rad == 0]
feat_imp_rad = feat_imp_rad[feat_imp_rad != 0]
print(len(feat_imp_rad))
print(feat_imp_rad)
print(len(ignored_rad))
print(ignored_rad)

164
GrLivArea_log                                                 0.263109
Neighborhood_GrnHill                                          0.186720
1stFlrSF_log                                                  0.134472
OverallQual_10                                                0.088643
LotArea_log                                                   0.084640
OverallQual_9                                                 0.073535
GarageQual_5                                                  0.056223
Neighborhood_Crawfor                                          0.043882
PoolQC_5                                                      0.041758
OverallQual_8                                                 0.040931
OverallCond_9                                                 0.038933
Neighborhood_StoneBr                                          0.034169
OverallCond_8                                                 0.029129
Functional_ord_7                                              0.026212
Ne

In [17]:
grouped_cols = [
    '__public',
    '__health',
    '__leisure',
    '__catering',
    '__accommodation_indoor',
    '__accommodation_outdoor',
    '__shopping',
    '__money',
    '__tourism_destination',
    '__miscellaneous',
    '__worship',
    '__christian_all',
    '__natural',
    '__traffic',
    '__traffic_fuel_and_parking',
    '__traffic_water',
    '__transport_air',
    '__landuse_leisure',
    '__waterway'
]

In [18]:
EDA_plots(radial, features = radial.columns[1:40], targets = ['SalePrice_log'])

feature:  Coords4


KeyError: 'SalePrice_log'

In [None]:
#EDA_plots(radial, features = radial.columns[40:80])

In [None]:
#EDA_plots(radial, features = radial.columns[80:120])

pos: farmland, farmyard, water

In [None]:
#EDA_plots(radial, features = radial.columns[120:151])