# Melbourne House Sale Prices Advanced Regression Techniques
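This notebook explores the data and trains a regression model to predict the sale price of a house in Melbourne. A Decision Tree Regressor was the original model; the current training cell fits a bagging ensemble of linear regressors tuned with grid search. The dataset is provided by Kaggle.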



In [None]:
import pickle
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import metrics
from sklearn.ensemble import BaggingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor
warnings.filterwarnings('ignore')

In [None]:
# Both CSVs are read from the ../input directory, relative to the notebook.
file_path_train, file_path_test = '../input/train.csv', '../input/test.csv'
data_train, data_test = (
    pd.read_csv(csv_path) for csv_path in (file_path_train, file_path_test)
)

## Data Exploration
Before creating our model, we'd like to explore relationships between our dependent SalePrice variable and its independent features.

In [None]:
# Summary statistics of the target column.
data_train.loc[:, 'SalePrice'].describe()

In [None]:
# Distribution of the target. `sns.distplot` is deprecated in seaborn; a
# density-scaled `histplot` with a KDE overlay draws the equivalent figure.
sale_price = data_train['SalePrice']
sns.histplot(sale_price, kde=True, stat='density');
print('Skewness: %f' % sale_price.skew())
# Series.kurt() computes kurtosis, so label the value with the standard term.
print('Kurtosis: %f' % sale_price.kurt())

### Numerical Features Data Exploration

In [None]:
# One scatter plot per numerical feature, each against SalePrice.
cols = [
    'LotArea',
    'TotalBsmtSF',
    'GrLivArea',
    'GarageArea',
    'MiscVal',
    'PoolArea',
]
for col in cols:
    # Two-column frame: the target plus the feature being inspected.
    data = data_train[['SalePrice', col]]
    data.plot.scatter(x=col, y='SalePrice');

#### In Summary
1. It looks like the features that have little to no relationship with SalePrice are: LotArea, MiscVal, and PoolArea
2. It looks like the features that have at least a moderate correlation with SalePrice are: TotalBsmtSF, GrLivArea, and GarageArea

### Categorical Features Data Exploration

In [None]:
# Box plot of SalePrice grouped by each discrete feature, one figure per feature.
cols = ['OverallQual', 'OverallCond', 'YearBuilt']
for col in cols:
    data = data_train[['SalePrice', col]]
    f, ax = plt.subplots(figsize=(12, 6))
    # Draw explicitly on the axes that were just created.
    fig = sns.boxplot(data=data, x=col, y='SalePrice', ax=ax)

#### In Summary
1. OverallCond has little to no relationship with SalePrice
2. OverallQual is strongly linearly related with SalePrice, YearBuilt is mildly linearly related with SalePrice


## Correlated Features

In [None]:
# Pairwise Pearson correlation between the numeric columns only.
# Recent pandas versions raise if DataFrame.corr() is given non-numeric
# (string) columns, so those are filtered out explicitly first.
corr_matrix = data_train.select_dtypes(include='number').corr()
f, ax = plt.subplots(figsize=(12, 9))
# vmax caps the colour scale at 0.8 so strong correlations saturate.
sns.heatmap(corr_matrix, vmax=0.8, square=True, ax=ax);

We can see here that TotalBsmtSF and 1stFlrSF are strongly correlated. This is also true for the relationship between GarageCars and GarageArea.

In [None]:
# Number of top correlated variables for heatmap
k = 20
# The k columns with the largest correlation to SalePrice (SalePrice itself included).
cols = corr_matrix.nlargest(k, 'SalePrice')['SalePrice'].index
# DataFrame.corr() excludes missing values pairwise. np.corrcoef would instead
# return NaN for every selected column that still contains a missing value,
# and missing data has not been cleaned up at this point in the notebook.
cm = data_train[cols].corr().to_numpy()
sns.set(font_scale=1.25)
f, ax = plt.subplots(figsize=(12, 9))
hm = sns.heatmap(cm, cbar=True, annot=True, square=True, fmt='.2f',
                 annot_kws={'size': 10}, yticklabels=cols.values,
                 xticklabels=cols.values, ax=ax)
plt.show()

In [None]:
# Print the column labels currently held in `cols`.
print(cols)

## Missing Data

In [None]:
#missing data
total = data_train.isnull().sum().sort_values(ascending=False)
percent = (data_train.isnull().sum() / data_train.isnull().count()).sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_data.head(20)

In [None]:
#dealing with missing data
data_train = data_train.drop((missing_data[missing_data['Total'] > 1]).index,1)
data_train = data_train.drop(data_train.loc[data_train['Electrical'].isnull()].index)
data_train.isnull().sum().max()

## Outliers

In [None]:
# We'll standardize our data and remove outliers that aren't within a certain spread range
saleprice_scaled = StandardScaler().fit_transform(data_train['SalePrice'][:,np.newaxis]);
low_range = saleprice_scaled[saleprice_scaled[:,0].argsort()][:10]
high_range= saleprice_scaled[saleprice_scaled[:,0].argsort()][-10:]

In [None]:
# Re-plot the area features against SalePrice to look for outlying points.
cols = ['TotalBsmtSF', 'GrLivArea', 'GarageArea']
for col in cols:
    data = data_train.loc[:, ['SalePrice', col]]
    data.plot.scatter(x=col, y='SalePrice');

In [None]:
# The two rows with the largest GrLivArea. This is not the last expression in
# the cell, so the notebook does not display it.
data_train.sort_values(by='GrLivArea', ascending=False).head(2)
# Remove the rows with Id 1299 and Id 524 by their index labels.
outlier_rows = data_train.index[data_train['Id'].isin([1299, 524])]
data_train = data_train.drop(outlier_rows)

## Training

In [None]:
# Feature columns fed to the model.
features = [
    'OverallQual',
    'GrLivArea',
    'GarageCars',
    'GarageArea',
    'TotalBsmtSF',
    '1stFlrSF',
    'FullBath',
    'TotRmsAbvGrd',
    'YearBuilt',
]
# Design matrix and target taken from the cleaned training frame.
X = data_train.loc[:, features]
y = data_train['SalePrice']

In [None]:
# Candidates are ranked by R^2 on the held-out folds.
scoring = metrics.make_scorer(metrics.r2_score)

# Bagging ensemble of linear regressors; the grid searches the ensemble size.
# (An earlier DecisionTreeRegressor grid over `min_samples_split` was replaced
# by this model.)
#
# cv=10 instead of the previous cv=1000: with a thousand folds each validation
# fold holds only a handful of rows (possibly a single one), and R^2 is
# undefined for fewer than two samples and extremely noisy for a few.
# random_state makes the bootstrap sampling reproducible on re-run.
# n_jobs=-1 uses all available cores instead of a hard-coded count.
grid_cv = GridSearchCV(
    BaggingRegressor(LinearRegression(), random_state=0),
    param_grid={'n_estimators': [10, 100]},
    cv=10, refit=True, scoring=scoring, verbose=1, n_jobs=-1)

grid_cv.fit(X, y);

### Dealing with Missing Data in the Test Set

In [None]:
# Model features from the test set, plus the per-column count of missing values.
X_test = data_test[features]
missing_data = X_test.isnull().sum()
# Impute each column with the median of the same column in the training data.
# The previous code filled every gap with the mean of the null *counts*
# (a number unrelated to the feature values), which produced bogus inputs.
# Training statistics are used so no information from the test set is needed.
X_test = X_test.fillna(data_train[features].median())

In [None]:
# Remaining missing values per test feature after imputation.
X_test.isna().sum()

In [None]:
# Predict with the estimator refit on the best parameters found by the grid search.
best_model = grid_cv.best_estimator_
predictions = best_model.predict(X_test.loc[:, features])

In [None]:
# Pair each test Id with its predicted price; both Series are aligned on index.
results = pd.DataFrame({
    'Id': data_test['Id'],
    'SalePrice': pd.Series(predictions),
})
results.describe()

In [None]:
# Write the submission file without the DataFrame index column.
results.to_csv('./submission.csv', index=False)

In [None]:
# Persist the fitted grid search object. The context manager guarantees the
# file is flushed and closed even if pickling fails part-way through.
# NOTE: only unpickle this file from a trusted source; pickle.load can execute
# arbitrary code.
with open('model', 'wb') as model_file:
    pickle.dump(grid_cv, model_file)