In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import pandas as pd
import os

# Load the training set. The path is relative, so the notebook has to be
# started from the directory that holds the CSV file.
TRAIN_CSV = 'train.csv'

data = pd.read_csv(TRAIN_CSV)

# (rows, columns) of the raw frame.
data.shape

(1460, 81)

In [4]:
## Find all columns with dtype object.

# Only the column labels are needed, so the selected sub-frame is not copied;
# `.columns` returns the same Index that `.keys()` does on a DataFrame.
obj_df = data.select_dtypes(include=['object']).columns

obj_df


Index([u'MSZoning', u'Street', u'Alley', u'LotShape', u'LandContour',
       u'Utilities', u'LotConfig', u'LandSlope', u'Neighborhood',
       u'Condition1', u'Condition2', u'BldgType', u'HouseStyle', u'RoofStyle',
       u'RoofMatl', u'Exterior1st', u'Exterior2nd', u'MasVnrType',
       u'ExterQual', u'ExterCond', u'Foundation', u'BsmtQual', u'BsmtCond',
       u'BsmtExposure', u'BsmtFinType1', u'BsmtFinType2', u'Heating',
       u'HeatingQC', u'CentralAir', u'Electrical', u'KitchenQual',
       u'Functional', u'FireplaceQu', u'GarageType', u'GarageFinish',
       u'GarageQual', u'GarageCond', u'PavedDrive', u'PoolQC', u'Fence',
       u'MiscFeature', u'SaleType', u'SaleCondition'],
      dtype='object')

In [5]:
## Drop all categorical (object-dtype) columns.

cols = [
    'MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour',
    'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood',
    'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle',
    'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
    'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond',
    'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating',
    'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual',
    'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish',
    'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC', 'Fence',
    'MiscFeature', 'SaleType', 'SaleCondition',
]

# A single drop that rebinds `data` replaces the per-column in-place loop.
# errors='ignore' skips labels that are already gone, so re-running this cell
# on an already-reduced frame is a no-op instead of a KeyError.
data = data.drop(cols, axis=1, errors='ignore')

data.shape

(1460, 38)

In [6]:
## Check for NaN values.

# Count the missing cells per column once, largest first.
total = data.isnull().sum().sort_values(ascending=False)

# Express each count as a percentage of the row count. Deriving this from
# `total` keeps both columns on one shared index, so the concat below never
# has to re-align two independently sorted Series.
percent = 100 * (total / float(len(data)))

missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_data.head(20)

Unnamed: 0,Total,Percent
LotFrontage,259,17.739726
GarageYrBlt,81,5.547945
MasVnrArea,8,0.547945
BsmtFinSF1,0,0.0
LowQualFinSF,0,0.0
2ndFlrSF,0,0.0
1stFlrSF,0,0.0
TotalBsmtSF,0,0.0
BsmtUnfSF,0,0.0
BsmtFinSF2,0,0.0


In [7]:
## Drop the columns that contain NaN values.

col = ['LotFrontage', 'GarageYrBlt', 'MasVnrArea']

# One rebinding drop instead of an in-place loop; errors='ignore' makes the
# cell safe to re-run after the columns have already been removed.
data = data.drop(col, axis=1, errors='ignore')



In [8]:
# Display the frame's (rows, columns) after the column drops above.
data.shape

(1460, 35)

In [9]:
# Split the frame into training and hold-out sets.

from sklearn.model_selection import train_test_split

# Every remaining column except the target is used as a feature.
# NOTE(review): if the frame still carries a row-identifier column, it is
# being passed to the models as a feature here - confirm this is intended.
X = data.drop('SalePrice', axis=1)
y = data['SalePrice']

# A quarter of the rows are held out; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=5
)

In [10]:
# Print the (features, target) shapes of the training split, then the test split.
for features, target in ((X_train, y_train), (X_test, y_test)):
    print(features.shape, target.shape)

((1095, 34), (1095L,))
((365, 34), (365L,))


In [11]:
import math

def rmsle(y_pred, y_test):
    """Return the Root Mean Squared Logarithmic Error (RMSLE) of a prediction.

    Computes ``sqrt(mean((log(1 + y_pred) - log(1 + y_test)) ** 2))``.

    Both inputs are converted to float NumPy arrays first, so values are
    paired strictly by position. Without that, two pandas Series would be
    subtracted by index label, silently pairing the wrong rows (or producing
    NaNs that the mean then skips) whenever their indexes differ.

    Parameters
    ----------
    y_pred : array-like
        Predicted values (list, ndarray or Series).
    y_test : array-like
        Ground-truth values, same length as ``y_pred``.

    Returns
    -------
    numpy.float64
        The RMSLE. The result is NaN if any value is below -1, because the
        logarithm is undefined there.

    Raises
    ------
    AssertionError
        If the two inputs differ in length.
    """
    predicted = np.asarray(y_pred, dtype=float)
    actual = np.asarray(y_test, dtype=float)
    assert len(actual) == len(predicted)
    # log1p(x) is log(1 + x), evaluated accurately for x close to zero.
    log_gap = np.log1p(predicted) - np.log1p(actual)
    return np.sqrt(np.mean(log_gap ** 2))



In [12]:
#predict with different models

from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn import linear_model
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from math import sqrt


def fit_and_report(label, model):
    """Fit `model` on the training split and print its hold-out RMSLE.

    Reads the notebook-level `X_train`, `y_train`, `X_test` and `y_test`
    and scores with the notebook's `rmsle` function.

    Parameters
    ----------
    label : str
        Text printed in front of the score.
    model : estimator
        Object exposing `fit(X, y)` and `predict(X)`; it is fitted in place.

    Returns
    -------
    The predictions for `X_test`, as returned by `model.predict`.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    print(label, rmsle(predictions, y_test))
    return predictions


# Linear regression
reg = linear_model.LinearRegression()
pred_LR = fit_and_report("LR:", reg)

# Support vector regression
svc = SVR(C=1.0, epsilon=0.2)
pred_SVM = fit_and_report("SVR:", svc)

# Decision tree
decision_tree = DecisionTreeRegressor(random_state=21)
predictions_DT = fit_and_report("Decision Tree:", decision_tree)

# Neural net
# random_state is pinned (it was None) so that the weight initialisation and
# batch shuffling, and therefore the reported score, repeat between runs like
# the other seeded models in this cell.
nn = MLPRegressor(
    hidden_layer_sizes=(10,), activation='relu', solver='adam', alpha=0.001,
    batch_size='auto', learning_rate='constant', learning_rate_init=0.01,
    power_t=0.5, max_iter=1000, shuffle=True, random_state=0, tol=0.0001,
    verbose=False, warm_start=False, momentum=0.9, nesterovs_momentum=True,
    early_stopping=False, validation_fraction=0.1, beta_1=0.9, beta_2=0.999,
    epsilon=1e-08,
)
pred_nn = fit_and_report("NN:", nn)

# K-nearest neighbours
knn = KNeighborsRegressor(n_neighbors=3)
predictions_KNN = fit_and_report("KNN:", knn)

# Random forest
rf = RandomForestRegressor(max_depth=2, random_state=0)
predictions_RF = fit_and_report("Random Forest:", rf)

  


('LR:', 0.1941078993444466)
('SVR:', 0.40764451335102303)
('Decision Tree:', 0.21123528359650748)
('NN:', 0.19878175769801112)
('KNN:', 0.23163294727640638)
('Random Forest:', 0.27234668354748)
