In [None]:
import pandas as pd
import numpy as np
import matplotlib as mp
import matplotlib.pyplot as plt
import seaborn as sns
import csv
import re
import sys, os, random

from sklearn.linear_model import LinearRegression

# NOTE(review): the path below is a machine-specific absolute location of a local
# xgboost checkout; it is put first on sys.path so the import that follows can
# resolve from it. TODO confirm whether an installed xgboost package can be used instead.
sys.path.insert(0, "/Users/schwalmdaniel/github/xgboost/python-package")
#sys.path.insert(0, "e:/xgboost/python-package")
from xgboost import XGBRegressor

from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error, r2_score


%matplotlib inline

# reproducible results
os.environ['PYTHONHASHSEED'] = '0'
np.random.seed(37)
random.seed(17)

pd.set_option('display.max_rows',1000)
pd.set_option('display.max_columns',50)

root = '/Users/schwalmdaniel/github/kaggle/ml_training/session2'
#root = 'e:/kaggle/house_prices_kaggle'

# data explanation here: https://rstudio-pubs-static.s3.amazonaws.com/155304_cc51f448116744069664b35e7762999f.html

train=pd.read_csv(root + "/kc_house_data.csv")

# have a look at the ds
train.head()

In [None]:
# First look at the dimensions of the data: DataFrame.shape is (rows, columns).

train.shape

In [None]:
# Data type of every column. pandas infers the types while parsing the CSV and
# defaults to wide ('fat') types such as int64/float64 rather than the smallest
# type that would hold the values.

train.dtypes

In [None]:
# number of missing values in each column

train.isna().sum()

In [None]:
# Distribution of the target variable (price) drawn on a wide canvas.
# NOTE: seaborn has deprecated distplot in favour of histplot / displot.

fig, ax = plt.subplots(figsize=(20, 8))
sns.distplot(train['price'], ax=ax)

In [None]:
# quick histogram of the target column
train['price'].hist()

In [None]:
# Annotated heatmap of the pairwise correlations between the numeric columns.
# Numeric columns are selected explicitly: pandas >= 2.0 no longer drops
# non-numeric columns silently inside DataFrame.corr() and raises instead
# (the raw `date` column is presumably a string -- verify against train.dtypes).
plt.figure(figsize=(14,10))
sns.heatmap(train.select_dtypes(include='number').corr(),annot=True)

In [None]:
# List the features ordered by their correlation with the price (strongest positive first).
# Numeric columns are selected explicitly because DataFrame.corr() raises on
# non-numeric columns in pandas >= 2.0 instead of silently skipping them.

train.select_dtypes(include='number').corr()['price'].sort_values(ascending=False)

In [None]:
# Look for strongly correlated variable pairs: print the 20 pairs with the highest
# absolute correlation (no fixed threshold is applied; judge the printed values,
# e.g. anything above 0.99 would be a candidate for removal).
#
# Numeric columns are selected explicitly because DataFrame.corr() raises on
# non-numeric columns in pandas >= 2.0.

c = train.select_dtypes(include='number').corr().abs().unstack().sort_values(ascending=False)

# The unstacked matrix holds every pair twice ((a, b) and (b, a)) plus each column
# paired with itself. Keeping only name-ordered pairs lists each pair once and drops
# the diagonal by position, so a genuinely perfect correlation between two different
# columns is still reported (a `value < 1.0` filter would hide it).
top_pairs = [(pair, value) for pair, value in c.items() if pair[0] < pair[1]][:20]
for pair, value in top_pairs:
    print("{} {} {}".format(pair[0], pair[1], value))

# no strongly correlating features that should be removed

In [None]:
# The data set consists of a target variable (the label) plus many features.
# By convention 'X' holds the features and 'y' the target, so split the frame that way.
# 'id' and 'date' are left out of the features together with the target itself.

non_feature_cols = ['id','date','price']
y = train['price']
X = train.drop(columns=non_feature_cols)

In [None]:
# Hold out 20% of the rows as a validation set so the model can be scored on data
# it has not seen during fitting; the fixed random_state keeps the split repeatable.

split_parts = train_test_split(X, y, test_size=0.20, random_state=42, shuffle=True)
X_train, X_test, y_train, y_test = split_parts
print ('Training shape: %s, test shape: %s' % (X_train.shape, X_test.shape))

In [None]:
# fit a plain linear regression on the training rows and predict the held-out rows

lr = LinearRegression().fit(X_train, y_train)
y_pred = lr.predict(X_test)

In [None]:
# Root Mean Squared Error (RMSE) of the model on the held-out set, expressed in the
# units of the target. Lower is better; large errors are penalised more than small ones.
# scikit-learn metrics take the ground truth first: metric(y_true, y_pred).
rms = np.sqrt(mean_squared_error(y_test, y_pred))
print(rms)

# R-squared (R^2, coefficient of determination) of the model.
# 1.0 is a perfect fit, 0.0 is as good as always predicting the mean of y_true, and
# the score can be negative for a model that does worse than that.
# R^2 is not symmetric in its arguments, so the order (y_true, y_pred) matters.
# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.r2_score.html

print(r2_score(y_test, y_pred))

## Model optimization

In [None]:
# scatter each of these features against the price
room_and_area_cols = ['bedrooms','bathrooms','sqft_living','sqft_lot']
sns.pairplot(
    train,
    x_vars=room_and_area_cols,
    y_vars='price',
    height=7,
    aspect=0.7,
)

In [None]:
# scatter each of these features against the price
above_basement_15_cols = ['sqft_above','sqft_basement','sqft_living15','sqft_lot15']
sns.pairplot(
    train,
    x_vars=above_basement_15_cols,
    y_vars='price',
    height=7,
    aspect=0.7,
)

In [None]:
# summary statistics of the bedrooms column
train['bedrooms'].describe()

In [None]:
# keep only the rows with fewer than 12 bedrooms
bedrooms_below_12 = train['bedrooms'] < 12
train = train.loc[bedrooms_below_12]

In [None]:
# summary statistics of the sqft_living column
train['sqft_living'].describe()

In [None]:
# keep only the rows whose sqft_living is below 8000
living_below_8000 = train['sqft_living'] < 8000
train = train.loc[living_below_8000]

In [None]:
# summary statistics of the sqft_lot column
train['sqft_lot'].describe()

In [None]:
# keep only the rows whose sqft_lot is below 1,000,000
lot_below_1m = train['sqft_lot'] < 1000000
train = train.loc[lot_below_1m]

In [None]:
# summary statistics of the sqft_lot15 column
train['sqft_lot15'].describe()

In [None]:
# keep only the rows whose sqft_lot15 is below 50,000
lot15_below_50k = train['sqft_lot15'] < 50000
train = train.loc[lot15_below_50k]

In [None]:
# Difference between each `*15` column and the property's own living / lot area.
# DataFrame.assign returns a new frame, so the columns are not written into the
# result of the earlier boolean filters; assigning onto such a filtered frame can
# raise SettingWithCopyWarning on pandas versions without copy-on-write.
train = train.assign(
    sqft_living_diff=train['sqft_living15'] - train['sqft_living'],
    sqft_lot_diff=train['sqft_lot15'] - train['sqft_lot'],
)

In [None]:
# NOTE(review): despite its name, `house_age` is 150 minus the age of the house in
# 2020, so the value is larger for newer houses -- confirm this is intended.
age_in_2020 = 2020.0 - train['yr_built']
train['house_age'] = 150 - age_in_2020

# 1 when a renovation year is recorded (> 0) and lies less than 15 years before 2020, else 0
renovation_year = train['yr_renovated']
has_renovation = renovation_year > 0
is_recent = (2020 - renovation_year) < 15
train['recently_renovated'] = (has_renovation & is_recent).astype('int64')

In [None]:
# Generate additional features from feature interactions: the square of every listed
# column and, for every pair of listed columns, both ratios a/b and b/a.
#
# The target column 'price' is deliberately NOT part of this list: any feature
# derived from it (price_pow2, price_per_*, *_per_price) stays in X after 'price'
# itself is dropped and leaks the label into the model.

from itertools import combinations

colpairs = ['sqft_living','sqft_above','sqft_living15','bathrooms',
            'sqft_basement','bedrooms','floors','sqft_lot','sqft_lot15','sqft_living_diff','sqft_lot_diff']

# Squared terms, computed once per column (this also covers the last entry of the
# list, which never appears as the first element of a combination).
for col in colpairs:
    train[col + '_pow2'] = pow(train[col], 2)

# Ratio terms in both directions for every unordered pair of columns.
for col1, col2 in combinations(colpairs, 2):
    for numerator, denominator in ((col1, col2), (col2, col1)):
        ratio = train[numerator].div(train[denominator])
        # division by zero yields inf (or NaN for 0/0); replace every non-finite value with 0.0
        train[numerator + '_per_' + denominator] = ratio.where(np.isfinite(ratio), 0.0)


In [None]:
# Split the (now filtered and extended) frame into the feature matrix 'X' and the
# target 'y' again. 'id' and 'date' are left out of the features together with the
# target column itself.

excluded_cols = ['id','date','price']
y = train['price']
X = train.drop(columns=excluded_cols)

In [None]:
# Hold out 20% of the rows as a validation set so the model can be scored on data
# it has not seen during fitting; the fixed random_state keeps the split repeatable.

split_parts = train_test_split(X, y, test_size=0.20, random_state=42, shuffle=True)
X_train, X_test, y_train, y_test = split_parts
print ('Training shape: %s, test shape: %s' % (X_train.shape, X_test.shape))

In [None]:
# fit a plain linear regression on the training rows and predict the held-out rows

lr = LinearRegression().fit(X_train, y_train)
y_pred = lr.predict(X_test)

In [None]:
# Root Mean Squared Error (RMSE) of the model on the held-out set, expressed in the
# units of the target. Lower is better.
# scikit-learn metrics take the ground truth first: metric(y_true, y_pred).
rms = np.sqrt(mean_squared_error(y_test, y_pred))
print(rms)
# R^2 is not symmetric in its arguments, so the order (y_true, y_pred) matters.
print(r2_score(y_test, y_pred))