In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from scipy.stats import norm,skew
# Ignore any warning whose message starts with "numpy.dtype size changed".
warnings.filterwarnings('ignore', message='numpy.dtype size changed')

Inspired by [Stacked Regressions : Top 4% on LeaderBoard](https://www.kaggle.com/serigne/stacked-regressions-top-4-on-leaderboard)

In [None]:
# Read the train and test CSVs and keep the test Ids aside before the Id
# column is removed from the frames.
train, test = (
    pd.read_csv(csv_path).copy()
    for csv_path in ('../input/train.csv', '../input/test.csv')
)
test_id = test['Id']

### Basic description of data

In [None]:
# Report (rows, columns) for both frames; DataFrame.shape is a 2-tuple.
print('Train data contains {} rows and {} columns'.format(*train.shape))
print('Test data contains {} rows and {} columns'.format(*test.shape))

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Column dtypes and non-null counts of the training frame.
train.info()

#### Outliers

In [None]:
# Scatter of living area against sale price, used to look for outliers.
# Axis labels and a title are set so the figure can be read on its own.
fig, ax = plt.subplots()
ax.scatter(train['GrLivArea'], train['SalePrice'])
ax.set_xlabel('GrLivArea')
ax.set_ylabel('SalePrice')
ax.set_title('SalePrice vs. GrLivArea')
plt.show()

In [None]:
# Remove the outliers: very large houses (GrLivArea > 4000) that sold for a
# LOW price (SalePrice < 300000).  Note the `<`: large houses with a high
# price follow the overall trend and must be kept; comparing with `>` would
# drop those instead and leave the real outliers in the training set.
outlier_rows = train[(train['GrLivArea'] > 4000) & (train['SalePrice'] < 300000)].index
train = train.drop(outlier_rows)


In [None]:
# Same scatter as before, redrawn after the outlier rows were dropped.
# Axis labels and a title are set so the figure can be read on its own.
fig, ax = plt.subplots()
ax.scatter(train['GrLivArea'], train['SalePrice'])
ax.set_xlabel('GrLivArea')
ax.set_ylabel('SalePrice')
ax.set_title('SalePrice vs. GrLivArea (after outlier removal)')
plt.show()

###### Analysing target variable : SalePrice

In [None]:
# Distribution of the raw target with a fitted normal curve overlaid.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases —
# consider sns.histplot/sns.displot when upgrading.
plt.figure()
sns.distplot(train['SalePrice'], norm_hist=True, fit=norm)
(mu, sigma) = norm.fit(train['SalePrice'])
# Raw string: "\m" and "\s" are not valid Python escape sequences, so the
# LaTeX backslashes must not go through escape processing.
plt.legend([r'Normal dist $\mu=$ {:.2f} and $\sigma=$ {:.2f}'.format(mu, sigma)])
plt.show()

###### Log transformation of target variable

In [None]:
# Replace the target by log(1 + SalePrice) and redraw its distribution.
# Caution: this overwrites the column, so re-running the cell applies the
# log a second time.
train['SalePrice'] = np.log1p(train['SalePrice'])
plt.figure()
sns.distplot(train['SalePrice'], norm_hist=True, fit=norm)
(mu, sigma) = norm.fit(train['SalePrice'])
# Raw string: "\m" and "\s" are not valid Python escape sequences, so the
# LaTeX backslashes must not go through escape processing.
plt.legend([r'Normal dist $\mu=$ {:.2f} and $\sigma=$ {:.2f}'.format(mu, sigma)])
plt.show()

#### Dropping the Id column

In [None]:
# Remove Id from both frames, set the (already log-transformed) target aside,
# and stack train on top of test with a fresh 0..n-1 index so that feature
# engineering is applied to both at once.
train = train.drop(columns='Id')
test = test.drop(columns='Id')
y_train = train['SalePrice'].values
all_data = pd.concat([train, test], ignore_index=True)

In [None]:
# Correlation heatmap.  Only numeric columns are correlated: calling
# DataFrame.corr() on a frame that still contains string columns raises on
# pandas >= 2.0 (older versions silently skipped them).
numeric_data = all_data.select_dtypes(include='number')
plt.figure(figsize=(15, 15))
sns.heatmap(numeric_data.corr(), annot=False, linewidths=.5)
plt.title('Correlation between numeric features')
plt.show()

#### Missing data

In [None]:
# Percentage of missing values per feature (target column removed first),
# sorted from most to least missing; features without gaps are left out.
all_data = all_data.drop(columns='SalePrice')
null_counts = all_data.isnull().sum().sort_values(ascending=False)
missing = null_counts / len(all_data) * 100
missing = missing[missing != 0]
missing = missing.to_frame('percent of missing')

In [None]:
# Bar chart of the missing-value percentage per feature.
plt.figure(figsize=(10, 8))
sns.barplot(x=missing.index, y=missing['percent of missing'])
# Rotate the existing tick labels with a numeric angle.  A string such as
# '90' is rejected by current matplotlib, and re-setting the labels by hand
# via set_xticklabels is unnecessary.
plt.xticks(rotation=90)
plt.xlabel('feature')
plt.show()

###### Imputing missing values

PoolQC: describes the quality of the pool. NA here means there is no pool in the house.

MiscFeature: describes miscellaneous features not covered in other categories. NA here means there is no such feature.

Alley: type of alley access to the property. NA here means no alley access.

Fence: fence quality. NA means no fence.

FireplaceQu: fireplace quality. NA here means no fireplace.

###### LotFrontage: Linear feet of street connected to property

In [None]:
def _fill_with_group_median(frontage):
    """Return the group's values with NaNs replaced by the group's own median."""
    return frontage.fillna(frontage.median())


# Fill missing LotFrontage with the median LotFrontage of the same Neighborhood.
all_data['LotFrontage'] = (
    all_data.groupby('Neighborhood')['LotFrontage'].transform(_fill_with_group_median)
)

In [None]:
# Columns whose missing entries are filled with the string 'None'.
fill_with_none = (
    'PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu',
    'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
    'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
    'MasVnrType', 'MSSubClass',
)
for col in fill_with_none:
    all_data[col] = all_data[col].fillna('None')

# Columns whose missing entries are filled with 0.
fill_with_zero = (
    'GarageYrBlt', 'GarageArea', 'GarageCars',
    'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
    'BsmtFullBath', 'BsmtHalfBath',
    'MasVnrArea',
)
for col in fill_with_zero:
    all_data[col] = all_data[col].fillna(0)

# Columns whose missing entries are filled with the column's most frequent value.
fill_with_mode = (
    'MSZoning', 'Electrical', 'KitchenQual',
    'Exterior1st', 'Exterior2nd', 'SaleType',
)
for col in fill_with_mode:
    all_data[col] = all_data[col].fillna(all_data[col].mode()[0])

# Missing Functional entries are filled with "Typ".
all_data['Functional'] = all_data['Functional'].fillna('Typ')

# Dropping an unwanted column.
all_data = all_data.drop(columns=['Utilities'])


###### Transforming some numerical variables into categorical (string) ones

In [None]:
# Cast these numerically coded columns to strings so they are handled as
# categories by the encoding steps that follow.
for col in ('MSSubClass', 'OverallCond', 'YrSold', 'MoSold'):
    all_data[col] = all_data[col].astype(str)

###### Label encoding

In [None]:

from sklearn.preprocessing import LabelEncoder
# Columns to be replaced by integer label codes.
cols = (
    'FireplaceQu', 'BsmtQual', 'BsmtCond', 'GarageQual', 'GarageCond',
    'ExterQual', 'ExterCond', 'HeatingQC', 'PoolQC', 'KitchenQual',
    'BsmtFinType1', 'BsmtFinType2', 'Functional', 'Fence', 'BsmtExposure',
    'GarageFinish', 'LandSlope', 'LotShape', 'PavedDrive', 'Street', 'Alley',
    'CentralAir', 'MSSubClass', 'OverallCond', 'YrSold', 'MoSold',
)
for c in cols:
    # A fresh encoder per column; fit_transform learns the labels and encodes
    # in a single call.
    encoder = LabelEncoder()
    all_data[c] = encoder.fit_transform(list(all_data[c].values))


In [None]:
## Total square-feet area of the house: basement + first floor + second floor
all_data['TotalSF'] = (
    all_data['TotalBsmtSF']
    .add(all_data['1stFlrSF'])
    .add(all_data['2ndFlrSF'])
)


###### Skewness of features

In [None]:
# Skewness of every non-object column (NaNs ignored), most skewed first.
is_numeric = all_data.dtypes != 'object'
numerical_feat = all_data.dtypes[is_numeric].index
skewness = (
    all_data[numerical_feat]
    .apply(lambda column: skew(column.dropna()))
    .sort_values(ascending=False)
)
skewness.head(10)

In [None]:
from scipy.special import boxcox1p
# Apply the Box-Cox transform of (1 + x), with a fixed lambda, to every
# feature whose absolute skewness exceeds 0.75.
skewness = skewness[skewness.abs() > .75]
skew_feat = skewness.index
lam = .15
for column in skew_feat:
    all_data[column] = boxcox1p(all_data[column], lam)
    

In [None]:
# Number of features that were Box-Cox transformed.
len(skew_feat)

In [None]:
# One-hot encode the remaining categorical columns, then show the new shape.
all_data=pd.get_dummies(all_data)
all_data.shape


In [None]:
# all_data was built as train rows followed by test rows, so the first
# len(train) rows are the training features and the rest are the test features.
n_train = train.shape[0]
train = all_data.iloc[:n_train]
test = all_data.iloc[n_train:]


##### Modelling

In [None]:
from sklearn.linear_model import ElasticNet, Lasso,  BayesianRidge, LassoLarsIC
from sklearn.ensemble import RandomForestRegressor,  GradientBoostingRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error
import xgboost as xgb

#import lightgbm as lgb


#### Cross-validation strategy

In [None]:


# Validation function
n_folds = 5

def rmsle_cv(model):
    """Return the per-fold cross-validated RMSE of ``model`` on the training set.

    ``y_train`` holds log1p(SalePrice), so the RMSE computed here is an RMSLE
    with respect to the original prices.

    Parameters
    ----------
    model : estimator
        Any scikit-learn compatible regressor; ``cross_val_score`` clones and
        fits it once per fold.

    Returns
    -------
    numpy.ndarray
        One RMSE value per fold (``n_folds`` values).
    """
    # Pass the KFold splitter itself as ``cv``.  Calling ``.get_n_splits()`` on
    # it would return the plain integer ``n_folds``, and ``cv=<int>`` makes
    # scikit-learn fall back to an unshuffled KFold, silently discarding
    # ``shuffle=True`` and ``random_state``.
    kf = KFold(n_folds, shuffle=True, random_state=42)
    neg_mse = cross_val_score(model, train.values, y_train,
                              scoring="neg_mean_squared_error", cv=kf)
    return np.sqrt(-neg_mse)



##### Xgboost

In [None]:
# Hyper-parameters of the XGBoost regressor.
# NOTE(review): `silent` and `nthread` appear to be superseded by `verbosity`
# and `n_jobs` in newer xgboost releases — confirm against the installed version.
xgb_params = dict(
    n_estimators=2200,
    learning_rate=0.05,
    max_depth=3,
    min_child_weight=1.7817,
    gamma=0.0468,
    subsample=0.5213,
    colsample_bytree=0.4603,
    reg_alpha=0.4640,
    reg_lambda=0.8571,
    silent=1,
    random_state=7,
    nthread=-1,
)
model_xgb = xgb.XGBRegressor(**xgb_params)


##### Gradient boosting

In [None]:
# Hyper-parameters of the scikit-learn gradient boosting regressor (Huber loss).
gboost_params = dict(
    loss='huber',
    n_estimators=3000,
    learning_rate=0.05,
    max_depth=4,
    max_features='sqrt',
    min_samples_leaf=15,
    min_samples_split=10,
    random_state=5,
)
GBoost = GradientBoostingRegressor(**gboost_params)



##### Lasso

In [None]:
# Lasso regression preceded by a RobustScaler step in the same pipeline.
lasso=make_pipeline(RobustScaler(),Lasso(alpha=.005,random_state=1))


###### ENet

In [None]:
# Elastic net (l1_ratio=.9) preceded by a RobustScaler step in the same pipeline.
ENet = make_pipeline(RobustScaler(), ElasticNet(alpha=0.0005, l1_ratio=.9, random_state=3))


In [None]:
# Kernel ridge regression with a degree-2 polynomial kernel.
KRR = KernelRidge(alpha=0.6, kernel='polynomial', degree=2, coef0=2.5)

In [None]:
class AveragingModels(BaseEstimator, RegressorMixin, TransformerMixin):
    """Ensemble regressor that predicts the plain mean of its base models.

    Parameters
    ----------
    models : sequence of estimators
        Base regressors. They are never fitted themselves: ``fit`` trains
        clones of them, stored in ``models_``.
    """

    def __init__(self, models):
        self.models = models

    def fit(self, X, y):
        """Clone every base model, fit each clone on ``(X, y)`` and return ``self``."""
        # Working on clones leaves the estimators passed to __init__ untouched.
        self.models_ = list(map(clone, self.models))
        for fitted in self.models_:
            fitted.fit(X, y)
        return self

    def predict(self, X):
        """Return, for each row of ``X``, the mean of the fitted clones' predictions."""
        # One column per model, then average across the columns.
        per_model = [fitted.predict(X) for fitted in self.models_]
        return np.column_stack(per_model).mean(axis=1)

    
        

In [None]:
# Per-fold cross-validated RMSE of the gradient boosting model alone.
rmsle_cv(GBoost)

In [None]:
# Average of elastic net, lasso, XGBoost and gradient boosting, scored with
# the same cross-validation helper.
Average_=AveragingModels(models=(ENet,lasso,model_xgb,GBoost))
rmsle_cv(Average_)




### Making my submission

In [None]:
# Fit the averaged ensemble on the full training set and predict the test set.
# y_train is log1p(SalePrice), so `prediction` is on that log scale as well.
# Note: DataFrames are passed here, whereas rmsle_cv passes train.values.
Average_.fit(train,y_train)
prediction=Average_.predict(test)


In [None]:
# Build and write the submission file.
# The models were trained on log1p(SalePrice), so the predictions are mapped
# back to prices with expm1 before being written.  The price column is named
# 'SalePrice', the spelling used by the training data (presumably also what
# the competition's scorer expects — verify against the sample submission).
sub = pd.DataFrame({'Id': test_id, 'SalePrice': np.expm1(prediction)})
sub.to_csv('submission.csv', index=False)