In [1]:
#Competition: https://www.kaggle.com/c/house-prices-advanced-regression-techniques
#Score: 0.165

import numpy as np
import pandas as pd
from fancyimpute import KNN, SimpleFill
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import csv
import os
import xgboost
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV



def load_data(data_dir="/kaggle/input/house-prices-advanced-regression-techniques"):
    """Read the competition's train and test CSV files.

    Parameters
    ----------
    data_dir : str, optional
        Directory that contains ``train.csv`` and ``test.csv``. Defaults to the
        Kaggle input directory, so existing ``load_data()`` calls keep working.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_df, test_df)`` as parsed by ``pd.read_csv``.
    """
    train_df = pd.read_csv(os.path.join(data_dir, "train.csv"))
    test_df = pd.read_csv(os.path.join(data_dir, "test.csv"))

    return train_df, test_df


# Load both competition splits once; later cells drop columns from these frames in place.
train_df, test_df = load_data()

Using TensorFlow backend.


In [2]:
def convert_to_categorical(df, column_names):
    """One-hot encode the given columns and return the result as a new frame.

    Each name in ``column_names`` that exists in ``df`` is replaced by its
    dummy columns, named ``<col>_<category>``. The first category of every
    column is dropped (``drop_first=True``). Names that are not columns of
    ``df`` are ignored. The dummy columns are appended after the remaining
    original columns, in the order the names are processed.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified.
    column_names : iterable of str
        Columns to encode.

    Returns
    -------
    pandas.DataFrame
        Frame with the encoded columns replaced by their dummy columns.
    """
    for col in column_names:
        if col not in df.columns:
            continue

        # Cast on a temporary Series so the caller's frame is never mutated.
        dummies = pd.get_dummies(df[col].astype('category'), prefix=col, drop_first=True)
        df = pd.concat([df.drop(columns=[col]), dummies], axis=1)

    return df

In [3]:
#Preprocessing
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing import StandardScaler

# Keep the test-set Ids now: test_df is later restricted to the selected features,
# and the Ids are needed again when the submission file is written.
test_ids = test_df['Id'].values


# Checking for Missing Data

# Drop, from both frames, every feature whose share of NaNs in the training set
# (rounded to 2 decimals) is higher than 0.45.
nan_columns = train_df.columns.values[train_df.isna().any()]

for nan_col in nan_columns:
    percentage_null = round(train_df[nan_col].isnull().sum() / len(train_df), 2)
    #print("Nan Percentage of {}: {}".format(nan_col, percentage_null))

    if percentage_null > 0.45:
        train_df.drop(nan_col, axis=1, inplace=True)
        test_df.drop(nan_col, axis=1, inplace=True)


# Analysis of feature data types

# Disabled experiment, kept for reference only.
# Nan in Garages or Basement, mean there are none, so they could be filled
# explicitly instead of being imputed. Each frame must be filled from its OWN
# column (the earlier draft filled test_df from train_df by mistake).
#
# for col in ['GarageYrBlt', 'GarageArea', 'GarageCars']:
#     train_df[col] = train_df[col].fillna(0)
#     test_df[col] = test_df[col].fillna(0)
#
# for col in ['GarageType', 'GarageFinish', 'GarageQual', 'GarageCond']:
#     train_df[col] = train_df[col].fillna('None')
#     test_df[col] = test_df[col].fillna('None')
#
# for col in ['BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2']:
#     train_df[col] = train_df[col].fillna('None')
#     test_df[col] = test_df[col].fillna('None')


# These columns are stored as numbers; cast them to strings in both frames so
# they are handled as labels rather than as quantities.
for frame in (train_df, test_df):
    for code_col in ('MSSubClass', 'YrSold', 'MoSold'):
        frame[code_col] = frame[code_col].astype(str)


# Preprocess categorical variables

# Split the columns into categorical (non-numeric dtype) and non-categorical ones,
# and integer-encode the categorical ones. Missing values are left as NaN so the
# imputation step can fill them afterwards.
labels_categorical = np.array([])
labels_non_categorical = np.array([])
for col in train_df.columns:

    # Decide by dtype. Testing every value with isinstance(x, (str, float)) also
    # matched purely numeric float columns, which were then label-encoded as if
    # they were categories.
    if not pd.api.types.is_numeric_dtype(train_df[col]):

        train_known = train_df[col].notnull()
        test_known = test_df[col].notnull()

        # One encoder per column, fitted on the labels of BOTH frames, so a given
        # label maps to the same integer in train and test.
        labelencoder_X = LabelEncoder()
        labelencoder_X.fit(
            pd.concat([train_df.loc[train_known, col], test_df.loc[test_known, col]])
        )

        train_df.loc[train_known, col] = labelencoder_X.transform(train_df.loc[train_known, col])
        test_df.loc[test_known, col] = labelencoder_X.transform(test_df.loc[test_known, col])

        labels_categorical = np.append(labels_categorical, col)
    elif col != 'SalePrice':
        # The target is not a feature, so it is kept out of the feature lists.
        labels_non_categorical = np.append(labels_non_categorical, col)
        
        

#Fill missing values

#using the most similar instance to fill the remaining fields
# Each frame is imputed independently and the result is written back through
# .loc[:, :] so the existing index and column labels are kept.
# NOTE(review): presumably KNN(k=1) copies each missing entry from the single
# nearest row — confirm against the fancyimpute documentation.
# NOTE(review): train_df still contains 'Id' and 'SalePrice' at this point, so they
# are part of the matrix handed to the imputer — confirm this is intended.
train_df.loc[:,:] = KNN(k=1).fit_transform(train_df)
test_df.loc[:,:] = KNN(k=1).fit_transform(test_df)


#Feature selection: Correlation analysis

# Pairwise correlation of the training columns
cor = train_df.corr()

# Optional visual check of the matrix:
#plt.figure(figsize=(12,10))
#sns.heatmap(cor, annot=True, cmap=plt.cm.Reds)
#plt.show()

# Keep the features whose absolute correlation with the target exceeds 0.55;
# the target itself is removed from the selection.
cor_price = cor['SalePrice'].abs()
strongly_correlated = cor_price.loc[cor_price > 0.55]
relevant_features = strongly_correlated.index.drop(labels=['SalePrice'])

print('Relevant Features {}'.format(relevant_features))

# Set the target aside before the frames are narrowed to the selected features
train_target = train_df['SalePrice']

train_df = train_df.loc[:, relevant_features]
test_df = test_df.loc[:, relevant_features]


# Only the non-categorical columns that survived feature selection are scaled
labels_non_categorical = [
    name for name in labels_non_categorical if name in relevant_features
]

#Normalize data
# The scaler is fitted on the training frame only and then applied to both frames.
sc = StandardScaler()
sc.fit(train_df[labels_non_categorical])
train_df[labels_non_categorical] = sc.transform(train_df[labels_non_categorical])
test_df[labels_non_categorical] = sc.transform(test_df[labels_non_categorical])


Imputing row 1/1460 with 0 missing, elapsed time: 0.991
Imputing row 101/1460 with 1 missing, elapsed time: 0.994
Imputing row 201/1460 with 0 missing, elapsed time: 0.997
Imputing row 301/1460 with 0 missing, elapsed time: 1.000
Imputing row 401/1460 with 0 missing, elapsed time: 1.002
Imputing row 501/1460 with 0 missing, elapsed time: 1.004
Imputing row 601/1460 with 0 missing, elapsed time: 1.007
Imputing row 701/1460 with 0 missing, elapsed time: 1.010
Imputing row 801/1460 with 0 missing, elapsed time: 1.012
Imputing row 901/1460 with 1 missing, elapsed time: 1.014
Imputing row 1001/1460 with 5 missing, elapsed time: 1.017
Imputing row 1101/1460 with 0 missing, elapsed time: 1.019
Imputing row 1201/1460 with 0 missing, elapsed time: 1.022
Imputing row 1301/1460 with 1 missing, elapsed time: 1.024
Imputing row 1401/1460 with 0 missing, elapsed time: 1.026
Imputing row 1/1459 with 0 missing, elapsed time: 0.924
Imputing row 101/1459 with 5 missing, elapsed time: 0.926
Imputing row 

In [4]:
#Use One Hot Encoding
train_df = convert_to_categorical(train_df, labels_categorical)
test_df = convert_to_categorical(test_df, labels_categorical)

# The two frames are encoded separately, so a category present in only one of them
# would give them different dummy columns. Align test to train: add missing dummies
# as all-zero columns, drop extras, and use the same column order.
test_df = test_df.reindex(columns=train_df.columns, fill_value=0)

In [5]:
#XGBOOST

# Hyper-parameter grid explored by the search (2 x 2 x 2 = 8 combinations)
parameters = dict(
    max_depth=[1, 5],
    n_estimators=[1000, 5000],
    learning_rate=[0.01, 0.05],
)

#Cross Validation
# 10-fold grid search over the grid above, run with 10 parallel jobs.
# NOTE(review): no `scoring` is passed, so the estimator's default score is used
# to rank the candidates — confirm that this matches the competition metric.
xgb_regressor = xgboost.XGBRegressor()
grid_search = GridSearchCV(
    estimator=xgb_regressor,
    param_grid=parameters,
    cv=10,
    n_jobs=10,
)

grid_search.fit(train_df, train_target)

# Keep the best estimator found by the search for the prediction step
best_model = grid_search.best_estimator_
print(grid_search.best_params_)




  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \


{'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 1000}


In [6]:
# Predict the sale price of every test row with the tuned model
results = best_model.predict(test_df)

#export the results
# newline='' lets the csv module control the line endings itself (otherwise extra
# blank rows can appear on Windows). The with-block closes the file, so no
# explicit close() is needed afterwards.
with open('test_results.csv', 'w', newline='') as writeFile:

    writer = csv.writer(writeFile)
    writer.writerow(['Id', 'SalePrice'])

    # One row per test sample: the Id captured before preprocessing, then its prediction
    writer.writerows(zip(test_ids, results))