In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib
import seaborn as sns
import matplotlib.pyplot as plt
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory


from scipy.stats import norm
from sklearn.preprocessing import StandardScaler
from scipy import stats
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

In [None]:
# 1. Load the competition files: training set, sample submission and test set
#    (read in that order, each into its own DataFrame).
df_train, df_sample, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/train.csv',
        '../input/sample_submission.csv',
        '../input/test.csv',
    )
)

In [None]:
# Set the 'Id' columns aside and remove them from the feature frames in one
# step: `pop` returns the column and deletes it from the DataFrame in place.
# 'Id' is unnecessary for the prediction process; test_ID is reused later when
# the submission files are built.
train_ID = df_train.pop('Id')
test_ID = df_test.pop('Id')

# Check the data size again after dropping the 'Id' variable.
print("\nThe train data size after dropping Id feature is : {} ".format(df_train.shape))
print("The test data size after dropping Id feature is : {} ".format(df_test.shape))

## Part One Outliers

**Check the scatter plot**
#df_train.columns
#df_train.plot.scatter(x = 'GrLivArea',y = 'SalePrice')


In [None]:
# 1. Outliers: plot living area against sale price to spot unusual points.
fig, ax = plt.subplots()
ax.scatter(df_train['GrLivArea'], df_train['SalePrice'])
ax.set_ylabel('SalePrice')
ax.set_xlabel('GrLivArea')
plt.show()

In [None]:
# Remove the outliers on the right of the scatter plot: very large living area
# (> 4000) combined with a low sale price (< 300000).
outlier_mask = (df_train['GrLivArea'] > 4000) & (df_train['SalePrice'] < 300000)
df_train = df_train.drop(index=df_train.index[outlier_mask])

# Re-plot to confirm the points are gone.
df_train.plot.scatter(x='GrLivArea', y='SalePrice')

## Target Variable

In [None]:
# Histogram of the raw target with a fitted normal curve overlaid.
sns.distplot(df_train['SalePrice'], fit=norm)

In [None]:
# Fit a normal distribution to the raw sale prices and report its parameters.
mu, sigma = norm.fit(df_train['SalePrice'])
print('\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma))

# Continue on a copy so that transforming `train` leaves df_train unchanged.
train = df_train.copy()

### Log-transformation of the target variable


In [None]:
# Replace the target in `train` by log(1 + SalePrice).
# NOTE(review): only the copy `train` is transformed. The models further down
# are fitted on df_train.SalePrice, i.e. on the untransformed prices.
train["SalePrice"] = np.log1p(train["SalePrice"])

In [None]:
# Distribution of the log-transformed target with a fitted normal curve overlaid.
sns.distplot(train['SalePrice'], fit=norm);

# Get the fitted parameters used by the function.
(mu, sigma) = norm.fit(train['SalePrice'])
print('\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma))

# Label the fitted curve. The label is a raw string because '\m' and '\s' are
# invalid escape sequences in a regular string literal (a warning today, an
# error in a future Python). The rendered text is unchanged.
plt.legend([r'Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)],
           loc='best')
plt.ylabel('Frequency')
plt.title('SalePrice distribution')

# QQ-plot of the transformed target against the normal distribution, drawn in
# a separate figure.
fig = plt.figure()
res = stats.probplot(train['SalePrice'], plot=plt)
plt.show()

In [None]:
# Keep the row indexes of both frames (taken before they are concatenated).
train_id, test_id = df_train.index, df_test.index

In [None]:
# Stack the training features (target removed) on top of the test features so
# that cleaning and encoding are applied to both at once. `ignore_index=True`
# renumbers the rows 0..n-1.
df_all = pd.concat(
    [df_train.drop(columns=['SalePrice']), df_test],
    ignore_index=True,
)
df_all.shape

### Missing Values

In [None]:
# Overall status: percentage of missing values per feature (train + test
# combined), keeping only features that have any, sorted from most to least.
all_miss = df_all.isnull().sum() / len(df_all) * 100
all_miss = all_miss[all_miss > 0].sort_values(ascending=False)
all_miss_plt = all_miss.iloc[:20]  # plot only the 20 worst features

f, ax = plt.subplots(figsize=(15, 12))
sns.barplot(x=all_miss_plt.index, y=all_miss_plt, ax=ax)

# The rotation must be numeric: matplotlib only accepts a number, 'vertical'
# or 'horizontal', so the string '45' raises ValueError on current versions.
# It is applied after the bars are drawn so it acts on the final tick labels.
plt.xticks(rotation=45)
ax.set_xlabel('Features', fontsize=15)
ax.set_ylabel('Percent of missing values', fontsize=15)
ax.set_title('Percent missing data by feature', fontsize=15)
plt.show()


In [None]:
# Step 1: drop every feature with more than 10% missing values.
high_mis_vars = all_miss.index[all_miss > 10]
df_all = df_all.drop(columns=high_mis_vars)

In [None]:
#all_miss_plt
# Display the names of the features that exceeded the 10% missing-value threshold.
high_mis_vars

In [None]:
# Recompute the missing-value percentages on the remaining columns. The
# original note reported that 28 of 73 variables still had gaps at this point;
# the count depends on the data.
all_miss = df_all.isna().sum().div(len(df_all)).mul(100)
all_miss = all_miss.loc[all_miss.gt(0)].sort_values(ascending=False)

len(all_miss)

In [None]:
# Preview the first rows of the columns that still contain missing values.
df_all[all_miss.index].head()

In [None]:
# Fill the remaining missing values according to the variable type:
#   * object (categorical) columns get the literal category 'None';
#   * all other (numeric) columns get the column mean rounded to an integer.
# The filled column is assigned back to df_all instead of calling
# `df_all[col].fillna(..., inplace=True)`: that form mutates a temporary
# selection, which pandas warns about and which silently does nothing once
# Copy-on-Write is enabled.
for col in all_miss.index:
    if df_all[col].dtype == 'object':
        df_all[col] = df_all[col].fillna('None')
    else:
        df_all[col] = df_all[col].fillna(round(df_all[col].mean(), 0))

### Encoding (one-hot encoding)

In [None]:
#Encoding
# Count how many columns there are of each dtype, to see how many need encoding.
df_all.dtypes.value_counts()

In [None]:
#get categorical variables
#cat_col = df_all.select_dtypes(include = ['object']).columns
#cat_col.columns

In [None]:
# Split the features by dtype: float64/int64 columns are used as-is, object
# columns are one-hot encoded afterwards.
# NOTE(review): a column of any other dtype (e.g. int32, bool, category) would
# land in neither frame and be dropped silently.
X_numb = df_all.select_dtypes(include = ['float64','int64'])
X_cat = df_all.select_dtypes(include = ['object'])
# Peek at one categorical column.
X_cat['MSZoning'].head()

In [None]:
# Frequency of each MSZoning category.
X_cat['MSZoning'].value_counts()

In [None]:
# One-hot encode every categorical column. The generated column names have the
# form "<feature>::<category>".
X_cat = pd.get_dummies(
    X_cat,
    columns=list(X_cat.columns),
    prefix_sep="::",
)

In [None]:
# Preview the one-hot encoded categorical features.
X_cat.head()

In [None]:
# Final design matrix: numeric columns followed by the one-hot encoded ones,
# placed side by side and aligned on the row index.
X = pd.concat((X_numb, X_cat), axis='columns')

In [None]:
# (rows, columns) of the numeric feature block.
X_numb.shape

In [None]:
# (rows, columns) of the one-hot encoded feature block.
X_cat.shape

In [None]:
# Undo the earlier concatenation: the first len(df_train) rows of X are the
# training rows and the rest are the test rows.
X_train, X_test = X.iloc[:len(df_train)], X.iloc[len(df_train):]

# Target: the sale price as stored in df_train (not the log-transformed copy
# held in `train`).
y_train = df_train['SalePrice']

### Training Model

In [None]:
# 1. Random forest: 500 trees, each limited to 16 leaves, trained on all cores.
# `random_state` is fixed so that Restart & Run All reproduces the same forest,
# and therefore the same predictions and feature importances.
from sklearn.ensemble import RandomForestRegressor

rnd_reg = RandomForestRegressor(
    n_estimators=500,
    max_leaf_nodes=16,
    n_jobs=-1,
    random_state=42,
)
rnd_reg.fit(X_train, y_train)




In [None]:
# 2. GBRT (gradient boosted regression trees): 500 stages of depth-8 trees.
# `random_state` is fixed so the fitted model is reproducible across runs.
# NOTE(review): learning_rate=1.0 applies no shrinkage; combined with deep
# trees this configuration is prone to overfitting and is worth tuning.
from sklearn.ensemble import GradientBoostingRegressor

gbrt = GradientBoostingRegressor(
    max_depth=8,
    n_estimators=500,
    learning_rate=1.0,
    random_state=42,
)
gbrt.fit(X_train, y_train)


In [None]:
# Predict the test set with the boosted model and write the result in the
# competition's two-column submission format (Id, SalePrice).
y_gbrt = gbrt.predict(X_test)
my_submission = pd.DataFrame({'Id': test_ID}).assign(SalePrice=y_gbrt)
my_submission.to_csv('gbrt_submission20181007.csv', index=False)

In [None]:
# Model test & evaluation session: hold out part of the training data and fit
# a GBRT with the same hyper-parameters on the remainder.
# Both the split and the model are seeded. Without a seed the hold-out set,
# and so the validation errors derived from it, change on every run.
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

X_t, X_v, y_t, y_v = train_test_split(X_train, y_train, random_state=42)
gbrt_mock = GradientBoostingRegressor(
    max_depth=8,
    n_estimators=500,
    learning_rate=1.0,
    random_state=42,
)
gbrt_mock.fit(X_t, y_t)

In [None]:
# Validation MSE after each boosting stage: staged_predict yields the
# prediction of the ensemble after 1, 2, ..., n_estimators trees.
errors = [mean_squared_error(y_v, y_pred) for y_pred in gbrt_mock.staged_predict(X_v)]

# np.argmin returns a 0-based position, while the list starts at one tree, so
# the best number of estimators is that position plus one.
bst_n_esimators = np.argmin(errors) + 1

In [None]:
# Display the value selected from the validation errors.
bst_n_esimators

In [None]:
# Feature importances of the fitted random forest, one value per column of X_train.
print(rnd_reg.feature_importances_)

In [None]:
# Despite its name, y_test holds the random-forest *predictions* for the test
# rows, not ground-truth labels.
y_test =rnd_reg.predict(X_test)

In [None]:
# Display the predicted values.
y_test

In [None]:
# Write the random-forest predictions in the competition's two-column
# submission format (Id, SalePrice). The dict entries must be separated by a
# comma; without it the cell does not parse.
my_submission = pd.DataFrame({'Id': test_ID, 'SalePrice': y_test})
my_submission.to_csv('submission.csv', index=False)


In [None]:
#print(os.listdir("../input"))

In [None]:
#print(os.listdir("../input"))
#import os
# Show the current working directory; the submission CSVs above are written
# with relative paths, so this is where they end up.
os.getcwd()

In [None]:
# List the contents of the current working directory.
os.listdir()