# 0.0. Imports

In [1]:
import pandas                      as pd
import numpy                       as np
import seaborn                     as sns
import inflection
import math
import datetime

from boruta                        import BorutaPy
from sklearn.ensemble              import RandomForestRegressor
from sklearn.linear_model          import LinearRegression, Lasso
from sklearn.preprocessing         import RobustScaler, MinMaxScaler, LabelEncoder
from scipy.stats                   import chi2_contingency
from scipy                         import stats  as ss
from matplotlib                    import pyplot as plt
from IPython.display               import Image
from matplotlib                    import gridspec
from IPython.core.display          import HTML
from IPython.core.interactiveshell import InteractiveShell

# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

## 0.1. Helper functions

In [2]:
def jupyter_settings():
    %matplotlib inline
    %pylab inline
    
    plt.style.use( 'bmh' )
    plt.rcParams['figure.figsize'] = [35, 12]
    plt.rcParams['font.size'] = 24
    
    display( HTML( '<style>.container { width:100% !important; }</style>') )
    pd.options.display.max_columns = None
    pd.options.display.max_rows = None
    pd.set_option( 'display.expand_frame_repr', False )
    
    sns.set()

In [3]:
# Apply the notebook-wide plotting and display settings defined by jupyter_settings.
jupyter_settings()

Populating the interactive namespace from numpy and matplotlib


## 0.2. Loading Data

In [4]:
# Read the raw dataset from disk; low_memory=False parses the file in a single
# pass instead of in chunks.
df_raw = pd.read_csv(
    'dataset/kc_house_data.csv',
    low_memory=False,
)

# 1.0. Data Preparation

In [5]:
# Work on an independent (deep) copy so df_raw itself is never modified.
df1 = df_raw.copy(deep=True)

In [6]:
# Preview the first five rows of the working frame.
df1.head(n=5)

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900.0,3,1.0,1180,5650,1.0,0,0,3,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,20141209T000000,538000.0,3,2.25,2570,7242,2.0,0,0,3,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
2,5631500400,20150225T000000,180000.0,2,1.0,770,10000,1.0,0,0,3,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,2487200875,20141209T000000,604000.0,4,3.0,1960,5000,1.0,0,0,5,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,1954400510,20150218T000000,510000.0,3,2.0,1680,8080,1.0,0,0,3,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503


## 1.1. Data Dimension

In [7]:
# Report the dataset dimensions: row count first, then column count.
print('Number of rows: {}'.format(len(df1.index)))
print('Number of Cols: {}'.format(len(df1.columns)))

Number of rows: 21613
Number of Cols: 21


## 1.3. Data types

In [8]:
# Parse the 'date' column into datetime values and store it back on df1.
df1['date'] = df1['date'].pipe(pd.to_datetime)

In [9]:
# Show the dtype of every column of df1.
df1.dtypes

id                        int64
date             datetime64[ns]
price                   float64
bedrooms                  int64
bathrooms               float64
sqft_living               int64
sqft_lot                  int64
floors                  float64
waterfront                int64
view                      int64
condition                 int64
grade                     int64
sqft_above                int64
sqft_basement             int64
yr_built                  int64
yr_renovated              int64
zipcode                   int64
lat                     float64
long                    float64
sqft_living15             int64
sqft_lot15                int64
dtype: object

## 1.4. Number of NA

In [10]:
# Count the missing values in each column.
df1.isnull().sum(axis=0)

id               0
date             0
price            0
bedrooms         0
bathrooms        0
sqft_living      0
sqft_lot         0
floors           0
waterfront       0
view             0
condition        0
grade            0
sqft_above       0
sqft_basement    0
yr_built         0
yr_renovated     0
zipcode          0
lat              0
long             0
sqft_living15    0
sqft_lot15       0
dtype: int64

## 1.5. Descriptive Statistics

In [11]:
# Separate numeric and categorical variables by dtype; datetime columns are
# left out of both subsets.
num_attributes = df1.select_dtypes(include=['int64', 'float64'])
cat_attributes = df1.select_dtypes(exclude=['int64', 'float64', 'datetime64[ns]'])

print('Num of Categorical : {}'.format(len(cat_attributes.columns)))
print('Num of Numerical: {}'.format(len(num_attributes.columns)))

Num of Categorical : 0
Num of Numerical: 20


In [19]:
# Build a one-row-per-attribute summary table of the numeric columns.
#
# Every statistic is computed with an explicit DataFrame method, so the cell
# does not depend on bare names (such as `std`) being star-imported into the
# namespace by `%pylab`.

# Central tendency - mean, median
ct1 = num_attributes.mean().to_frame().T
ct2 = num_attributes.median().to_frame().T

# Dispersion - std deviation, min, max, range, skew, kurtosis
# ddof=0 gives the population standard deviation (numpy's default), which is
# why it differs slightly from the sample std (ddof=1) shown by describe().
d1 = num_attributes.std(ddof=0).to_frame().T
d2 = num_attributes.min().to_frame().T
d3 = num_attributes.max().to_frame().T
d4 = (num_attributes.max() - num_attributes.min()).to_frame().T
d5 = num_attributes.skew().to_frame().T
d6 = num_attributes.kurtosis().to_frame().T

# Stack the one-row frames, then transpose so each attribute becomes a row;
# the concat order must match the column labels assigned below.
m = pd.concat([ d2, d3, d4, ct1, ct2, d1, d5, d6]).T.reset_index()
m.columns = [ 'attributes', 'min', 'max', 'range', 'mean', 'median', 'std', 'skew', 'kurtosis']
m

Unnamed: 0,attributes,min,max,range,mean,median,std,skew,kurtosis
0,id,1000102.0,9900000000.0,9899000000.0,4580302000.0,3904930000.0,2876499000.0,0.243329,-1.260542
1,price,75000.0,7700000.0,7625000.0,540088.1,450000.0,367118.7,4.024069,34.58554
2,bedrooms,0.0,33.0,33.0,3.370842,3.0,0.9300403,1.9743,49.063653
3,bathrooms,0.0,8.0,8.0,2.114757,2.25,0.7701453,0.511108,1.279902
4,sqft_living,290.0,13540.0,13250.0,2079.9,1910.0,918.4196,1.471555,5.243093
5,sqft_lot,520.0,1651359.0,1650839.0,15106.97,7618.0,41419.55,13.060019,285.07782
6,floors,1.0,3.5,2.5,1.494309,1.5,0.5399764,0.616177,-0.484723
7,waterfront,0.0,1.0,1.0,0.007541757,0.0,0.0865152,11.385108,127.632494
8,view,0.0,4.0,4.0,0.2343034,0.0,0.7662998,3.39575,10.893022
9,condition,1.0,5.0,4.0,3.40943,3.0,0.650728,1.032805,0.525764


In [20]:

# Summary statistics from pandas, transposed so each attribute is a row.
num_attributes.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
id,21613.0,4580302000.0,2876566000.0,1000102.0,2123049000.0,3904930000.0,7308900000.0,9900000000.0
price,21613.0,540088.1,367127.2,75000.0,321950.0,450000.0,645000.0,7700000.0
bedrooms,21613.0,3.370842,0.9300618,0.0,3.0,3.0,4.0,33.0
bathrooms,21613.0,2.114757,0.7701632,0.0,1.75,2.25,2.5,8.0
sqft_living,21613.0,2079.9,918.4409,290.0,1427.0,1910.0,2550.0,13540.0
sqft_lot,21613.0,15106.97,41420.51,520.0,5040.0,7618.0,10688.0,1651359.0
floors,21613.0,1.494309,0.5399889,1.0,1.0,1.5,2.0,3.5
waterfront,21613.0,0.007541757,0.0865172,0.0,0.0,0.0,0.0,1.0
view,21613.0,0.2343034,0.7663176,0.0,0.0,0.0,0.0,4.0
condition,21613.0,3.40943,0.650743,1.0,3.0,3.0,4.0,5.0


In [22]:
# Number of distinct values (levels) per numeric attribute; dropna=False counts
# a missing value as its own level.
num_attributes.nunique(dropna=False)

id               21436
price             4028
bedrooms            13
bathrooms           30
sqft_living       1038
sqft_lot          9782
floors               6
waterfront           2
view                 5
condition            5
grade               12
sqft_above         946
sqft_basement      306
yr_built           116
yr_renovated        70
zipcode             70
lat               5034
long               752
sqft_living15      777
sqft_lot15        8689
dtype: int64