# Documentation

https://www.kaggle.com/code/pmarcelino/comprehensive-data-exploration-with-python

https://www.kaggle.com/code/serigne/stacked-regressions-top-4-on-leaderboard

https://www.kaggle.com/code/lavanyashukla01/how-i-made-top-0-3-on-a-kaggle-competition

In [None]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
from sklearn import preprocessing
from xgboost import XGBRegressor
import sklearn.metrics as metrics
import math
from scipy.stats import norm, skew
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

# Loading the data

In [None]:
# Read the competition's train and test CSV files into DataFrames.
train = pd.read_csv('../input/house-prices-advanced-regression-techniques/train.csv')
test = pd.read_csv('../input/house-prices-advanced-regression-techniques/test.csv')
# Last expression of the cell: shows the first rows of the training frame.
train.head()

In [None]:
# (rows, columns) of the train and test frames, shown together as a tuple.
train.shape, test.shape

# Data analysis

Keep in mind that the column we are going to predict is SalePrice.

In [None]:
# Summary statistics of the target column.
train['SalePrice'].describe()

In [None]:
# Plot the distribution of the raw (not yet transformed) SalePrice target.
sns.set_style("white")
sns.set_color_codes(palette='bright')
f, ax = plt.subplots(figsize=(8, 7))
# Draw onto the axes created above.
sns.distplot(train['SalePrice'], color="b", ax=ax);
ax.xaxis.grid(False)
# Axis labels and title set in a single call.
ax.set(
    ylabel="Frequency",
    xlabel="SalePrice",
    title="SalePrice distribution",
)
sns.despine(trim=True, left=True)
plt.show()

### Numerical features & SalePrice
We are going to analyze the relationship between 'SalePrice' and some numerical variables, in particular those we expect to be strongly related to it.


In [None]:
# GrLivArea is likely to be strongly related to SalePrice: usually, the bigger
# the house, the higher the price.
var = 'GrLivArea'
# Two-column frame holding the target and the feature under study.
data = train[['SalePrice', var]]
data.plot.scatter(x=var, y='SalePrice', ylim=(0, 800000));

In [None]:
# TotalBsmtSF: the basement area also seems to be related to the SalePrice of
# the house.
var = 'TotalBsmtSF'
# Two-column frame holding the target and the feature under study.
data = train.loc[:, ['SalePrice', var]]
data.plot.scatter(x=var, y='SalePrice', ylim=(0, 800000));

### Categorical features & SalePrice

In [None]:
# OverallQual is expected to be strongly related to SalePrice, so plot the
# price distribution for each quality level.
var = 'OverallQual'
data = train[['SalePrice', var]]
f, ax = plt.subplots(figsize=(8, 6))
# Draw on the axes created above; the returned object is that same Axes.
fig = sns.boxplot(x=var, y="SalePrice", data=data, ax=ax)
fig.axis(ymin=0, ymax=800000);

In [None]:
# YearBuilt might have some influence on SalePrice: one box per build year.
var = 'YearBuilt'
data = train[['SalePrice', var]]
f, ax = plt.subplots(figsize=(16, 8))
# Draw on the axes created above; the returned object is that same Axes.
fig = sns.boxplot(x=var, y="SalePrice", data=data, ax=ax)
fig.axis(ymin=0, ymax=800000);
# One tick per year: rotate the labels so they stay readable.
plt.xticks(rotation=90);

It's not a big correlation, but we can see that older houses tend to be cheaper than newer ones.

Let's make a heatmap of all numeric features to see their correlation with SalePrice.

In [None]:
# Pairwise correlations between the numeric columns only. The frame also holds
# string-valued (categorical) columns; DataFrame.corr() raises on those with
# pandas >= 2.0, while older versions silently skipped them, so selecting the
# numeric columns explicitly gives the same matrix on every version.
corrmat = train.select_dtypes(include=[np.number]).corr()
f, ax = plt.subplots(figsize=(12, 9))
sns.heatmap(corrmat, vmax=.8, cmap="Spectral", square=True);

We can see OverallQual has a very strong correlation with SalePrice, followed by GrLivArea.

Plotting the 10 variables most correlated to 'SalePrice'.

In [None]:
k = 10  # number of variables shown in the heatmap
# Rows of the correlation matrix with the k largest correlations to SalePrice
# (SalePrice itself included); their labels are the columns to plot.
top_rows = corrmat.nlargest(k, 'SalePrice')
cols = top_rows['SalePrice'].index
# Correlation matrix recomputed on just those columns (one variable per row).
cm = np.corrcoef(train[cols].values.T)
sns.set(font_scale=1.25)
hm = sns.heatmap(
    cm,
    cbar=True,
    annot=True,
    cmap="Spectral",
    square=True,
    fmt='.2f',
    annot_kws={'size': 10},
    yticklabels=cols.values,
    xticklabels=cols.values,
)
plt.show()

Our top three are **'OverallQual'**, **'GrLivArea'** and **'GarageCars'**. 

Note that GarageCars and GarageArea are very strongly correlated with each other, because the number of cars that fit in the garage depends on its area. We will consider dropping one of the two, leaning toward dropping GarageArea since it is less correlated with 'SalePrice'.

Also note that 'TotRmsAbvGrd' and 'GrLivArea' do, in some cases, represent the same data. 

In [None]:
# Focus on the features that have a high correlation with SalePrice.
sns.set()
cols = ['SalePrice', 'OverallQual', 'GrLivArea', 'GarageCars', 'TotalBsmtSF', 'FullBath', 'YearBuilt']
# `height` is the current name of pairplot's per-facet size argument; the old
# `size` keyword was renamed in seaborn 0.9 and is rejected by newer releases.
sns.pairplot(train[cols], height=2.5)
plt.show();

Since SalePrice is not normally distributed, a transformation of the data will have to be applied.

In [None]:
# Replace SalePrice with its natural logarithm. Note this is a plain np.log
# (not log1p), so np.exp is the exact inverse of this transformation.
log_price = np.log(train['SalePrice'])
train['SalePrice'] = log_price

# Distribution of the transformed target with a fitted normal curve overlaid.
sns.distplot(train['SalePrice'], fit=norm);

## Outliers

In [None]:
# Scatter of GrLivArea against the (log-transformed) SalePrice to spot outliers.
fig, ax = plt.subplots()
ax.scatter(train['GrLivArea'], train['SalePrice'])
ax.set_xlabel('GrLivArea', fontsize=13)
ax.set_ylabel('SalePrice', fontsize=13)
plt.show()

There are two houses in the bottom right corner that, although they have a large GrLivArea, cost little money. These are huge outliers that will mess with the model's predictions, so it's best to delete them.

In [None]:
# Delete the outliers: houses with a very large GrLivArea that sold cheaply.
# SalePrice has already been log-transformed at this point, so the 300000 price
# threshold must be expressed on the log scale as well. Comparing the logged
# prices against the raw 300000 is true for every row, which would drop every
# row with GrLivArea > 4000 regardless of its price.
price_threshold = np.log(300000)
outlier_mask = (train['GrLivArea'] > 4000) & (train['SalePrice'] < price_threshold)
train = train.drop(train[outlier_mask].index)

# Check the graphic again
fig, ax = plt.subplots()
ax.scatter(train['GrLivArea'], train['SalePrice'])
plt.ylabel('SalePrice', fontsize=13)
plt.xlabel('GrLivArea', fontsize=13)
plt.show()

## Missing data

Before doing this, we will concatenate train and test datasets so that the same features are erased on both of them. We will split them again later.

In [None]:
# Keep the target and the test ids aside before they are removed from the
# combined frame.
y_train = train['SalePrice']
test_id = test['Id']
# Stack train on top of test, then drop the id and target columns so only
# features remain.
stacked = pd.concat([train, test], axis=0, sort=False)
all_data = stacked.drop(columns=['Id', 'SalePrice'])

In [None]:
# Per-column count and percentage of missing values, largest first.
null_mask = all_data.isnull()
null_counts = null_mask.sum()
row_counts = null_mask.count()
Total = null_counts.sort_values(ascending=False)
percent = (null_counts / row_counts * 100).sort_values(ascending=False)
missing_data = pd.concat([Total, percent], axis=1, keys=['Total', 'Percent'])
missing_data.head(25)

Variables with a high percentage of missing data should be deleted for the model's sake. Features like PoolQC, MiscFeature, FireplaceQu will be erased. 

All Garage features have the same amount of missing data, which could mean it refers to the same rows. We can delete GarageX variables and just keep GarageCars.

MasVnrArea and MasVnrType are strongly correlated with YearBuilt and OverallQual, which are already considered, so they can also be deleted.

Since only one row of Electrical has a missing value, we will preserve the feature and fill in that single value (with the most common category) rather than dropping the column.

In [None]:
# Drop every column with more than 5 missing values across train + test, then
# print the largest remaining per-column missing count.
sparse_columns = missing_data.loc[missing_data['Total'] > 5].index
all_data.drop(columns=sparse_columns, inplace=True)
print(all_data.isnull().sum().max())

In [None]:
# Missing-value counts of the columns that are left, largest first.
remaining_nulls = all_data.isnull().sum()
total = remaining_nulls.sort_values(ascending=False)
total.head(20)

As we still have some missing values, we are going to fill them in.

In [None]:
# Numeric columns whose missing entries are replaced with 0.
numeric_missed = [
    'BsmtFinSF1',
    'BsmtFinSF2',
    'BsmtUnfSF',
    'TotalBsmtSF',
    'BsmtFullBath',
    'BsmtHalfBath',
    'GarageArea',
    'GarageCars',
]

# Fill all of them in one vectorized assignment.
all_data[numeric_missed] = all_data[numeric_missed].fillna(0)

In [None]:
# Categorical columns whose missing entries are replaced with the column's
# most frequent value.
categorical_missed = [
    'Exterior1st',
    'Exterior2nd',
    'SaleType',
    'MSZoning',
    'Electrical',
    'KitchenQual',
]

for column in categorical_missed:
    # mode() returns the most frequent value(s); take the first one.
    most_common = all_data[column].mode()[0]
    all_data[column] = all_data[column].fillna(most_common)

In [None]:
# Fill the remaining missing 'Functional' entries with 'Typ', taken here to be
# the most common value for this feature.
functional_filled = all_data['Functional'].fillna(value='Typ')
all_data['Functional'] = functional_filled

In [None]:
# Remove the 'Utilities' column from the combined frame, in place.
all_data.drop(columns=['Utilities'], inplace=True)

In [None]:
# Largest per-column count of missing values; 0 means nothing is missing any more.
all_data.isnull().sum().max() # sanity check that no missing data remains

## Feature Engineering

Searching for Normality

Having the data follow a normal distribution helps the model get a clearer insight into the data. Also, certain models, such as linear regression, assume that the data is normally distributed.

We will check now the distribution of GrLivArea.

In [None]:
# Histogram (with a fitted normal curve) and normal probability plot of
# GrLivArea.
from scipy import stats

sns.distplot(all_data['GrLivArea'], fit=norm);
# Open a second figure so the probability plot is not drawn over the
# histogram, then actually draw it; without the probplot call this figure
# would be left empty.
fig = plt.figure()
res = stats.probplot(all_data['GrLivArea'], plot=plt)

In [None]:
# Data transformation: replace GrLivArea with its natural logarithm.
from scipy import stats

all_data['GrLivArea'] = np.log(all_data['GrLivArea'])

# Histogram (with a fitted normal curve) and normal probability plot of the
# transformed feature.
sns.distplot(all_data['GrLivArea'], fit=norm);
# Open a second figure so the probability plot is not drawn over the
# histogram, then actually draw it; without the probplot call this figure
# would be left empty.
fig = plt.figure()
res = stats.probplot(all_data['GrLivArea'], plot=plt)

Adding a new feature 'TotalSF'

In [None]:
# New feature: sum of the basement, first-floor and second-floor area columns.
basement_plus_first = all_data['TotalBsmtSF'].add(all_data['1stFlrSF'])
all_data['TotalSF'] = basement_plus_first.add(all_data['2ndFlrSF'])

## Converting categorical variables

In [None]:
# Convert the categorical variables into dummy (indicator) columns. This is
# done on the combined train + test frame, so both parts end up with the same
# set of columns.
all_data = pd.get_dummies(all_data)
# Last expression of the cell: shows the first rows of the encoded frame.
all_data.head()

In [None]:
# Split the combined frame back by position: the first len(y_train) rows are
# the training rows, everything after them is the test set.
n_train = len(y_train)
x_train = all_data.iloc[:n_train]
x_test = all_data.iloc[n_train:]

In [None]:
# Shapes of the test and train feature matrices (test first), as a tuple.
x_test.shape , x_train.shape

## Modeling

In [None]:
# from sklearn.metrics import make_scorer
# from sklearn.model_selection import KFold, cross_val_score
# from sklearn.metrics import mean_squared_error


### XGBoost model

In [None]:
import xgboost
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid for the search: only `subsample` and `n_estimators`
# take several values, every other parameter is pinned to a single one.
param_grid = [
    {
        'subsample': [0.45, 0.5, 0.55],
        'n_estimators': [1200, 1400],
        'max_depth': [5],
        'learning_rate': [0.02],
        'colsample_bytree': [0.4],
        'colsample_bylevel': [0.5],
        'reg_alpha': [1],
        'reg_lambda': [1],
        'min_child_weight': [2],
    },
]

# Base estimator whose parameters the grid search will vary.
xgb_model = XGBRegressor(eval_metric='rmse')

In [None]:
# 5-fold cross-validated grid search over `param_grid`, scored by negated RMSE
# and run with 10 parallel jobs.
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    scoring='neg_root_mean_squared_error',
    cv=5,
    n_jobs=10,
    verbose=True,
)

In [None]:
# Run the grid search on the training features and the log-transformed target.
grid_search.fit(x_train, y_train)

In [None]:
# Best cross-validated score. The scorer is 'neg_root_mean_squared_error', so
# this is a negated RMSE measured on the log-transformed SalePrice.
print(grid_search.best_score_)

In [None]:
# Predict with the best estimator found by the grid search; the model was
# trained on log(SalePrice), so its output is on the log scale.
log_predictions = grid_search.best_estimator_.predict(x_test)
# The target was transformed with np.log, so np.exp is its exact inverse
# (np.expm1 would make every prediction 1 too low before flooring).
y_predict = np.floor(np.exp(log_predictions))
y_predict

In [None]:
# Build the submission table (test ids next to their predicted prices) and
# write it to disk without the index column.
sub = pd.DataFrame({'Id': test_id, 'SalePrice': y_predict})
sub.to_csv('./submission.csv', index=False)