In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the Kaggle input directory.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

This notebook is my first attempt at a machine learning problem. It was inspired by the following notebook:
https://www.kaggle.com/pralabhpoudel/house-price-prediction/notebook

In [None]:
#Important Libraries
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
%matplotlib inline

# Loading Data: read the competition's train and test CSV files into DataFrames.
train_csv = '/kaggle/input/house-prices-advanced-regression-techniques/train.csv'
test_csv = '/kaggle/input/house-prices-advanced-regression-techniques/test.csv'
train_data = pd.read_csv(train_csv)
test_data = pd.read_csv(test_csv)

In [None]:
# Display pandas' default summary statistics for the training data.
train_data.describe()

In [None]:
# Show the (rows, columns) shape of the training and test frames as a pair.
tuple(frame.shape for frame in (train_data, test_data))

In [None]:
# Missing-value handling: compute the percentage of missing entries in every
# training column. Columns that are more than 80% missing will be removed.
null_counts = train_data.isnull().sum()
missing_percent = (null_counts / len(train_data)) * 100
df = missing_percent.to_frame()
df

The full table is hard to read because most columns have no missing values, so we keep only the columns that do have missing values and sort them in descending order.

In [None]:
# Keep only the columns that have at least one missing value, largest share first.
has_missing = missing_percent > 0
missing_percent = missing_percent.loc[has_missing].sort_values(ascending=False)

In [None]:
# Display the filtered, sorted missing-value percentages as a one-column table.
df = missing_percent.to_frame()
df

In [None]:
# Drop the columns that are more than 80% missing, plus Id, which has no
# practical relation to predicting the sale price.
columns_to_drop = ['PoolQC','Id','Alley','Fence','MiscFeature']
train_data = train_data.drop(columns=columns_to_drop)

In [None]:
# Same missing-value report for the test data: percentage missing per column,
# restricted to columns with at least one missing value, sorted descending.
test_null_share = test_data.isnull().sum() / len(test_data)
missing_percent2 = test_null_share * 100
missing_percent2 = missing_percent2.loc[missing_percent2 > 0].sort_values(ascending=False)
df = missing_percent2.to_frame()
df

In [None]:
# Keep the Id column aside (it is needed for the submission file) and then
# remove the same columns from the test data that were removed from the training data.
test_id = test_data['Id']
test_columns_to_drop = ['PoolQC','Id','Alley','Fence','MiscFeature']
test_data = test_data.drop(columns=test_columns_to_drop)

To handle the remaining features with missing values, we can fill them with the median (for numeric columns), with the most frequent value, or with a placeholder such as 'None' (for categorical columns).

In [None]:
# Fill the remaining missing values in the training data, one rule per column.
# Fix: MasVnrType used to be overwritten with the (already filled) MasVnrArea
# values, which replaced the categorical veneer type with numeric areas.
train_fill_values = {
    # Categorical columns get a placeholder category
    # (presumably NaN means the house lacks that feature -- verify against the data description).
    'FireplaceQu': 'None',
    'GarageCond': 'None',
    'GarageQual': 'None',
    'GarageFinish': 'None',
    'GarageType': 'None',
    'BsmtFinType2': 'None',
    'BsmtFinType1': 'None',
    'BsmtExposure': 'None',
    'BsmtCond': 'None',
    'BsmtQual': 'None',
    'Electrical': 'None',
    # Numeric columns: median of the observed values (median() skips NaN).
    'LotFrontage': train_data['LotFrontage'].median(),
    'GarageYrBlt': train_data['GarageYrBlt'].median(),
    # Masonry veneer: missing area becomes 0, missing type becomes "Others".
    'MasVnrArea': 0,
    'MasVnrType': 'Others',
}
# A dict passed to fillna applies each value only to the column named by its key.
train_data = train_data.fillna(value=train_fill_values)

In [None]:
# Fill missing values in the test data with the same per-column rules used for
# the training data (medians here are computed from the test data itself).
# Fix: MasVnrType used to be overwritten with the (already filled) MasVnrArea
# values, which replaced the categorical veneer type with numeric areas.
test_fill_values = {
    # Categorical columns get a placeholder category
    # (presumably NaN means the house lacks that feature -- verify against the data description).
    'FireplaceQu': 'None',
    'GarageCond': 'None',
    'GarageQual': 'None',
    'GarageFinish': 'None',
    'GarageType': 'None',
    'BsmtFinType2': 'None',
    'BsmtFinType1': 'None',
    'BsmtExposure': 'None',
    'BsmtCond': 'None',
    'BsmtQual': 'None',
    'Electrical': 'None',
    # Numeric columns: median of the observed values (median() skips NaN).
    'LotFrontage': test_data['LotFrontage'].median(),
    'GarageYrBlt': test_data['GarageYrBlt'].median(),
    # Masonry veneer: missing area becomes 0, missing type becomes "Others".
    'MasVnrArea': 0,
    'MasVnrType': 'Others',
}
# A dict passed to fillna applies each value only to the column named by its key.
test_data = test_data.fillna(value=test_fill_values)

Now we investigate how the features correlate with each other and with SalePrice.

In [None]:
# Pairwise correlations between the numeric columns only. Calling corr() on the
# whole frame fails on recent pandas versions because the string columns cannot
# be converted to float; select_dtypes works on old and new pandas alike.
correlation = train_data.select_dtypes(include='number').corr()

# Columns whose absolute correlation with SalePrice exceeds 0.4. Because of
# abs(), strongly negative correlations are kept too, despite the variable name.
Positive_Related = correlation.index[abs(correlation["SalePrice"]) > 0.4]

# Annotated heatmap of the correlations among the selected columns.
plt.figure(figsize=(12,12))
Corr_Heatmap = sns.heatmap(train_data[Positive_Related].corr(), annot=True, cmap="GnBu")

So OverallQual and GrLivArea have the highest positive correlation with SalePrice.

In [None]:
# OverallQual is treated as categorical, so draw one SalePrice box per quality
# level to check for outliers.
boxplot_size = (10, 8)
sns.set(rc={'figure.figsize': boxplot_size})
sns.boxplot(data=train_data, x='OverallQual', y='SalePrice')

In [None]:
# GrLivArea is continuous, so use a scatterplot against SalePrice to look for outliers.
scatter_size = (8, 8)
sns.set(rc={'figure.figsize': scatter_size})
sns.scatterplot(data=train_data, x='GrLivArea', y='SalePrice')


In [None]:
# Remove the rows flagged as GrLivArea outliers: living area above 4000 AND
# sale price above 400000.
# NOTE(review): this removes the large houses with HIGH prices. If the intent was
# to remove large houses that sold unusually cheaply, the price condition would
# need to be inverted -- confirm against the scatterplot.
is_outlier = (train_data['GrLivArea'] > 4000) & (train_data['SalePrice'] > 400000)
outlier_index = train_data[is_outlier].index
train_data = train_data.drop(outlier_index)

In [None]:
# Convert the string columns to integer codes so the models can consume them.
#
# Fixes compared with factorizing every column of each frame separately:
#   * numeric columns are left untouched, so their magnitudes are preserved
#     instead of being replaced by first-appearance codes;
#   * train and test are encoded together, so the same string maps to the same
#     code in both frames;
#   * SalePrice is not encoded.
# Missing strings become -1 (pd.factorize's sentinel). Missing numeric values
# are left as NaN.
# Like the original cell, this overwrites test_data, so run it once per session.
feature_columns = [col for col in train_data.columns if col != 'SalePrice']
combined = pd.concat(
    [train_data[feature_columns], test_data[feature_columns]],
    keys=['train', 'test'],
)

# Every non-numeric column is treated as categorical.
text_columns = combined.select_dtypes(exclude='number').columns
combined[text_columns] = combined[text_columns].apply(lambda col: pd.factorize(col)[0])

# Split back into the two frames; .loc on the outer key restores the original row index.
new_train_data = combined.loc['train'].assign(SalePrice=train_data['SalePrice'])
test_data = combined.loc['test']

In [None]:
# Separate the model inputs from the target. The target is taken from
# train_data (not the encoded frame) so the original prices are used.
X_train = new_train_data.drop(columns='SalePrice')
Y_train = train_data['SalePrice']
X_test = test_data
tuple(part.shape for part in (X_train, Y_train, X_test))

In [None]:
# Fit a random forest and print its score on the data it was trained on.
# random_state is fixed so the forest, and therefore the printed score, is
# the same on every run.
# Note: this is the score on the TRAINING data, so it is an optimistic figure
# and not an estimate of performance on unseen houses.
model = RandomForestRegressor(n_estimators=50, random_state=42)
model.fit(X_train, Y_train)
print(model.score(X_train, Y_train))

In [None]:
from xgboost import XGBRegressor

# Fit a gradient-boosted model (500 trees, learning rate 0.05) and print its
# score on the same data it was trained on.
xgb_params = dict(learning_rate=0.05, n_estimators=500)
modelX = XGBRegressor(**xgb_params)
modelX.fit(X_train, Y_train)
train_score = modelX.score(X_train, Y_train)
print(train_score)

In [None]:
# Prediction using XGBoost: predict a sale price for every row of the test features
# with the fitted modelX.
preds_set = modelX.predict(X_test)

In [None]:
# Build the submission table: the saved test Ids next to their predicted prices.
submission_columns = {'Id': test_id, 'SalePrice': preds_set}
output = pd.DataFrame(submission_columns)
print(output)

In [None]:
# Write the submission table to a CSV file in the current working directory;
# index=False leaves the DataFrame index out of the file.
output.to_csv("HousePricev1.csv",index=False)