In [29]:
# Dependencies
import pandas as pd
import matplotlib.pylab as plt
import seaborn as sns
from matplotlib.pyplot import cm
import scipy.stats as stats
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error,mean_absolute_error
from math import sqrt,log
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from xgboost import XGBRegressor
from sklearn.preprocessing import StandardScaler 
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import GridSearchCV

# Set max columns
pd.set_option('display.max_columns', 100)


In [13]:
data = pd.read_csv('train.csv', header=0)

In [14]:
# Bathrooms
def fullbathrooms(x):
    return x['BsmtFullBath'] + x['FullBath']

data['FullBathNet'] = data.apply(lambda x: fullbathrooms(x), axis=1)
data['FullBathNet'].describe()

count    1460.000000
mean        1.990411
std         0.732046
min         0.000000
25%         1.000000
50%         2.000000
75%         2.000000
max         6.000000
Name: FullBathNet, dtype: float64

In [15]:
modes = ['BsmtQual','KitchenQual','MSZoning']
for m in modes:
    values = data[m].mode()
    data[m].fillna(values[0], inplace=True)

medians = ['TotalBsmtSF','GarageCars','GarageArea','FullBathNet']
for m in medians:
    data[m].fillna(data[m].median(), inplace=True)

data.loc[data['1stFlrSF'] > 2500, '1stFlrSF'] = data['1stFlrSF'].median()
data.loc[data['FullBathNet'] == 0, 'FullBathNet'] = data['FullBathNet'].median()

data['FireFlag'] = data['Fireplaces'].map(lambda a: 1 if a > 0 else 0)


selected=['1stFlrSF','GarageArea','GarageCars','GrLivArea','TotalBsmtSF',
          'TotRmsAbvGrd','FullBathNet','OverallQual','YearBuilt','YearRemodAdd','BldgType',
          'BsmtQual','CentralAir','FireFlag','ExterCond','HouseStyle','KitchenQual',
          'LotShape','MSZoning','Neighborhood','SalePrice']

data=data[selected]

In [16]:
data.isnull().any()

1stFlrSF        False
GarageArea      False
GarageCars      False
GrLivArea       False
TotalBsmtSF     False
TotRmsAbvGrd    False
FullBathNet     False
OverallQual     False
YearBuilt       False
YearRemodAdd    False
BldgType        False
BsmtQual        False
CentralAir      False
FireFlag        False
ExterCond       False
HouseStyle      False
KitchenQual     False
LotShape        False
MSZoning        False
Neighborhood    False
SalePrice       False
dtype: bool

In [17]:
bldg = pd.get_dummies(data['BldgType'], prefix='BT')
data = data.join(bldg)
data.drop('BldgType', axis=1, inplace=True)

bsmt = pd.get_dummies(data['BsmtQual'], prefix='BSMT')
data = data.join(bsmt)
data.drop('BsmtQual', axis=1, inplace=True)

ec = pd.get_dummies(data['ExterCond'], prefix='EC')
data = data.join(ec)
data.drop('ExterCond', axis=1, inplace=True)

house = pd.get_dummies(data['HouseStyle'], prefix='HS')
data = data.join(house)
data.drop('HouseStyle', axis=1, inplace=True)

kit = pd.get_dummies(data['KitchenQual'], prefix='K')
data = data.join(kit)
data.drop('KitchenQual', axis=1, inplace=True)

lot = pd.get_dummies(data['LotShape'], prefix='L')
data = data.join(lot)
data.drop('LotShape', axis=1, inplace=True)

zoning = pd.get_dummies(data['MSZoning'], prefix='Z')
data = data.join(zoning)
data.drop('MSZoning', axis=1, inplace=True)

neighborhood = pd.get_dummies(data['Neighborhood'], prefix='N')
data = data.join(neighborhood)
data.drop('Neighborhood', axis=1, inplace=True)

data['AC'] = data['CentralAir'].map(lambda x: 1 if x == 'Y' else 0)
data.drop('CentralAir', axis=1, inplace=True)

In [18]:
data.head()

Unnamed: 0,1stFlrSF,GarageArea,GarageCars,GrLivArea,TotalBsmtSF,TotRmsAbvGrd,FullBathNet,OverallQual,YearBuilt,YearRemodAdd,FireFlag,SalePrice,BT_1Fam,BT_2fmCon,BT_Duplex,BT_Twnhs,BT_TwnhsE,BSMT_Ex,BSMT_Fa,BSMT_Gd,BSMT_TA,EC_Ex,EC_Fa,EC_Gd,EC_Po,EC_TA,HS_1.5Fin,HS_1.5Unf,HS_1Story,HS_2.5Fin,HS_2.5Unf,HS_2Story,HS_SFoyer,HS_SLvl,K_Ex,K_Fa,K_Gd,K_TA,L_IR1,L_IR2,L_IR3,L_Reg,Z_C (all),Z_FV,Z_RH,Z_RL,Z_RM,N_Blmngtn,N_Blueste,N_BrDale,N_BrkSide,N_ClearCr,N_CollgCr,N_Crawfor,N_Edwards,N_Gilbert,N_IDOTRR,N_MeadowV,N_Mitchel,N_NAmes,N_NPkVill,N_NWAmes,N_NoRidge,N_NridgHt,N_OldTown,N_SWISU,N_Sawyer,N_SawyerW,N_Somerst,N_StoneBr,N_Timber,N_Veenker,AC
0,856.0,548,2,1710,856,8,3.0,7,2003,2003,0,208500,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1,1262.0,460,2,1262,1262,6,2.0,6,1976,1976,1,181500,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1
2,920.0,608,2,1786,920,6,3.0,7,2001,2002,1,223500,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
3,961.0,642,3,1717,756,7,2.0,7,1915,1970,1,140000,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
4,1145.0,836,3,2198,1145,9,3.0,8,2000,2000,1,250000,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1


In [19]:
array=data.values
Y=array[:,11]
Y

array([ 208500.,  181500.,  223500., ...,  266500.,  142125.,  147500.])

In [20]:
x1=array[:,:11]
x2=array[:,12:]

X=np.hstack((x1,x2))

X[1]

array([  1.26200000e+03,   4.60000000e+02,   2.00000000e+00,
         1.26200000e+03,   1.26200000e+03,   6.00000000e+00,
         2.00000000e+00,   6.00000000e+00,   1.97600000e+03,
         1.97600000e+03,   1.00000000e+00,   1.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         1.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         1.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         1.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         1.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   1.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   1.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,

In [33]:
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.05)

xgb = XGBRegressor(n_estimators=1000, learning_rate=0.05)
xgb.fit(X, Y, early_stopping_rounds=5, eval_set=[(X_test, Y_test)], 
        verbose=False)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.05, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=1000,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [34]:
ptrain = xgb.predict(X_train)
ptest = xgb.predict(X_test)

score_train = xgb.score(X_train, Y_train)
score_test = xgb.score(X_test, Y_test)
rmsle_train = np.mean(np.log10((ptrain)/(Y_train)))
rmsle_test = np.mean(np.log10((ptest)/(Y_test)))
                      
print('|metric \t|train\t\t|test\t\t| \n|score \t\t|'
      +str(score_train)+'\t|'
      +str(score_test)+'\t|\n|rmsle \t\t|'
      +str(rmsle_train)+'|'+str(rmsle_test)+'|')

|metric 	|train		|test		| 
|score 		|0.952742524845	|0.968690781096	|
|rmsle 		|0.00339510344739|-0.000284965873891|


In [36]:
test_data = pd.read_csv('test.csv', header=0)

In [37]:
def fullbathrooms(x):
    return x['BsmtFullBath'] + x['FullBath']

test_data['FullBathNet'] = test_data.apply(lambda x: fullbathrooms(x), axis=1)

modes = ['BsmtQual','KitchenQual','MSZoning']
for m in modes:
    values = test_data[m].mode()
    test_data[m].fillna(values[0], inplace=True)

medians = ['TotalBsmtSF','GarageCars','GarageArea','FullBathNet']
for m in medians:
    test_data[m].fillna(test_data[m].median(), inplace=True)

test_data.loc[test_data['1stFlrSF'] > 2500, '1stFlrSF'] = test_data['1stFlrSF'].median()
test_data.loc[test_data['FullBathNet'] == 0, 'FullBathNet'] = test_data['FullBathNet'].median()

test_data['FireFlag'] = test_data['Fireplaces'].map(lambda a: 1 if a > 0 else 0)


selected=['1stFlrSF','GarageArea','GarageCars','GrLivArea','TotalBsmtSF',
          'TotRmsAbvGrd','FullBathNet','OverallQual','YearBuilt','YearRemodAdd','BldgType',
          'BsmtQual','CentralAir','FireFlag','ExterCond','HouseStyle','KitchenQual',
          'LotShape','MSZoning','Neighborhood']

test_data=test_data[selected]

In [38]:
bldg = pd.get_dummies(test_data['BldgType'], prefix='BT')
test_data = test_data.join(bldg)
test_data.drop('BldgType', axis=1, inplace=True)

bsmt = pd.get_dummies(test_data['BsmtQual'], prefix='BSMT')
test_data = test_data.join(bsmt)
test_data.drop('BsmtQual', axis=1, inplace=True)

ec = pd.get_dummies(test_data['ExterCond'], prefix='EC')
test_data = test_data.join(ec)
test_data.drop('ExterCond', axis=1, inplace=True)

house = pd.get_dummies(test_data['HouseStyle'], prefix='HS')
# Correct for missing column
house2 = house[['HS_1.5Fin','HS_1.5Unf','HS_1Story']]
house2['HS_2.5Fin']=0
house3=house[['HS_2.5Unf','HS_2Story','HS_SFoyer','HS_SLvl']]
house2 = house2.join(house3)
test_data = test_data.join(house2)
test_data.drop('HouseStyle', axis=1, inplace=True)

kit = pd.get_dummies(test_data['KitchenQual'], prefix='K')
test_data = test_data.join(kit)
test_data.drop('KitchenQual', axis=1, inplace=True)

lot = pd.get_dummies(test_data['LotShape'], prefix='L')
test_data = test_data.join(lot)
test_data.drop('LotShape', axis=1, inplace=True)

zoning = pd.get_dummies(test_data['MSZoning'], prefix='Z')
test_data = test_data.join(zoning)
test_data.drop('MSZoning', axis=1, inplace=True)

neighborhood = pd.get_dummies(test_data['Neighborhood'], prefix='N')
test_data = test_data.join(neighborhood)
test_data.drop('Neighborhood', axis=1, inplace=True)

test_data['AC'] = test_data['CentralAir'].map(lambda x: 1 if x == 'Y' else 0)
test_data.drop('CentralAir', axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


In [39]:
l1 = list(data)
l2 = list(test_data)

# https://stackoverflow.com/questions/3462143/get-difference-between-two-lists
diff = [x for x in l1 if x not in l2]
diff

['SalePrice']

In [40]:
test_array = test_data.values
test_array[0]

array([  8.96000000e+02,   7.30000000e+02,   1.00000000e+00,
         8.96000000e+02,   8.82000000e+02,   5.00000000e+00,
         1.00000000e+00,   5.00000000e+00,   1.96100000e+03,
         1.96100000e+03,   0.00000000e+00,   1.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   1.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         1.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         1.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         1.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   1.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   1.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,

In [41]:
submission = xgb.predict(test_array)
submission[0]

116402.34

In [42]:
ks = pd.read_csv('test.csv', header=0)

In [43]:
output = pd.DataFrame({'Id': ks['Id'], 'SalePrice': submission})
output.to_csv('submission.csv', index=False)

In [47]:
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.3)
scaler = StandardScaler().fit(X_train)

X_train = scaler.transform(X_train)    
X_test = scaler.transform(X_test)    

In [48]:
xgb = XGBRegressor(n_estimators=1000, learning_rate=0.05)
xgb.fit(X, Y, early_stopping_rounds=5, eval_set=[(X_test, Y_test)], 
        verbose=False)

ptrain = xgb.predict(X_train)
ptest = xgb.predict(X_test)

score_train = xgb.score(X_train, Y_train)
score_test = xgb.score(X_test, Y_test)
rmsle_train = np.sqrt(np.square(np.log(ptrain + 1) - np.log(Y_train + 1)).mean())
rmsle_test = np.sqrt(np.square(np.log(ptest + 1) - np.log(Y_test + 1)).mean())
                      
print('|metric \t|train\t\t|test\t\t| \n|score \t\t|'
      +str(score_train)+'\t|'
      +str(score_test)+'\t|\n|rmsle \t\t|'
      +str(rmsle_train)+'|'+str(rmsle_test)+'|')

|metric 	|train		|test		| 
|score 		|-1.37474105169	|-1.36395081678	|
|rmsle 		|0.787024973493|0.757120998996|


In [49]:
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE



In [51]:
lms=LinearRegression()

lms.fit(X_train, Y_train)
ptrain = lms.predict(X_train)
ptest = lms.predict(X_test)

score_train = lms.score(X_train, Y_train)
score_test = lms.score(X_test, Y_test)
rmsle_train = np.sqrt(np.square(np.log(ptrain + 1) - np.log(Y_train + 1)).mean())
rmsle_test = np.sqrt(np.square(np.log(ptest + 1) - np.log(Y_test + 1)).mean())
                      
print('|metric \t|train\t\t|test\t\t| \n|score \t\t|'
      +str(score_train)+'\t|'
      +str(score_test)+'\t|\n|rmsle \t\t|'
      +str(rmsle_train)+'|'+str(rmsle_test)+'|')

|metric 	|train		|test		| 
|score 		|0.86708938793	|-3.61680502161e+22	|
|rmsle 		|0.147604234699|1.39919840363|
