# Técnicas de regresión para la predicción del costo de casas



## Obtención de Datos

In [6]:
# Library imports
import numpy as np
import pandas as pd

# Load the training data from CSV and preview its first ten rows
train = pd.read_csv('./data/train.csv')
train.head(n=10)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000
5,6,50,RL,85.0,14115,Pave,,IR1,Lvl,AllPub,...,0,,MnPrv,Shed,700,10,2009,WD,Normal,143000
6,7,20,RL,75.0,10084,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,307000
7,8,60,RL,,10382,Pave,,IR1,Lvl,AllPub,...,0,,,Shed,350,11,2009,WD,Normal,200000
8,9,50,RM,51.0,6120,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2008,WD,Abnorml,129900
9,10,190,RL,50.0,7420,Pave,,Reg,Lvl,AllPub,...,0,,,,0,1,2008,WD,Normal,118000


## Exploración de Datos

In [None]:
# Descriptive summary statistics of the training frame (NaN values are excluded)
train.describe()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Two side-by-side histograms: the raw sale price and its natural logarithm.
# sns.histplot replaces the deprecated sns.distplot(..., kde=False); explicit
# axes are used so each panel is configured without relying on pyplot's
# "current axes" state.
fig, (ax_price, ax_log_price) = plt.subplots(1, 2, figsize=(12, 5))

# Left panel: distribution of sale prices
sns.histplot(train['SalePrice'], ax=ax_price)
ax_price.set_xlabel('Sale price')
ax_price.axis([0, 800000, 0, 180])

# Right panel: distribution of log(sale price)
sns.histplot(np.log(train['SalePrice']), ax=ax_log_price)
ax_log_price.set_xlabel('Log (sale price)')
ax_log_price.axis([10, 14, 0, 180]);

In [None]:
# Correlation heatmap: pairwise correlations between the float64/int64 columns,
# skipping the first such column (iloc[:, 1:])
numeric_cols = train.select_dtypes(include=['float64', 'int64'])
corr = numeric_cols.iloc[:, 1:].corr()

sns.set(font_scale=1)
sns.heatmap(corr, vmax=1, square=True)

In [None]:
# Features ranked by their correlation with SalePrice, strongest first.
# The first entry of the sorted series is skipped (expected to be SalePrice's
# correlation with itself).
saleprice_corr_sorted = corr['SalePrice'].sort_values(ascending=False)
corr_list = saleprice_corr_sorted.iloc[1:]
corr_list.head(10)

In [None]:
# Scatter plots of the six features most correlated with the sale price, each
# overlaid with a fitted regression line.
# plt.subplots(2, 3) builds the 2x3 grid directly; the previous string position
# spec ('231', '232', ...) is rejected by current matplotlib, which requires
# integers.
fig, axes = plt.subplots(2, 3, figsize=(18, 8))
for ax, feature in zip(axes.flat, corr_list.index[:6]):
    ax.scatter(train[feature], train['SalePrice'],
               facecolors='none', edgecolors='k', s=75)
    sns.regplot(x=feature, y='SalePrice', data=train,
                scatter=False, color='Blue', ax=ax)
    # Same y-range on every panel so the plots are directly comparable
    ax.set_ylim([0, 800000])


## Preprocesamiento

In [3]:
import pandas as pd
import numpy as np
def data_preprocess(train,test):
    """Build model-ready feature matrices and the log-transformed target.

    Steps, in order:
      1. Drop a fixed list of outlier rows (selected by position) from the
         training set.
      2. Stack the train and test feature columns
         ('MSSubClass' through 'SaleCondition').
      3. Drop the columns Alley, FireplaceQu, PoolQC, Fence and MiscFeature.
      4. Apply log1p to every numeric feature whose skewness, measured on the
         remaining training rows, exceeds 0.75.
      5. One-hot encode with ``pd.get_dummies`` and fill remaining NaNs with
         the column means of the combined train+test frame.

    Neither input frame is modified, so the function can be called repeatedly
    on the same data and yields the same result each time.

    Parameters
    ----------
    train : pandas.DataFrame
        Training data. Must contain the columns 'MSSubClass' .. 'SaleCondition'
        (as a contiguous label slice), the five dropped columns, and
        'SalePrice'. Must have a row at every position in the outlier list
        (the largest position is 1447).
    test : pandas.DataFrame
        Test data with the same feature columns.

    Returns
    -------
    X_train : pandas.DataFrame
        Processed features for the retained training rows.
    X_test : pandas.DataFrame
        Processed features for the test rows.
    y : pandas.Series
        ``log1p`` of 'SalePrice' for the retained training rows; invert with
        ``np.expm1``.
    """
    from scipy.stats import skew

    outlier_idx = [4,11,13,20,46,66,70,167,178,185,199, 224,261, 309,313,318, 349,412,423,440,454,477,478, 523,540, 581,588,595,654,688, 691, 774, 798, 875, 898,926,970,987,1027,1109, 1169,1182,1239, 1256,1298,1324,1353,1359,1405,1442,1447]
    to_delete = ['Alley','FireplaceQu','PoolQC','Fence','MiscFeature']
    skew_threshold = 0.75

    # drop() without inplace returns a new frame, so the caller's `train`
    # keeps all of its rows and its original SalePrice values.
    train_kept = train.drop(train.index[outlier_idx])

    all_data = pd.concat((train_kept.loc[:,'MSSubClass':'SaleCondition'],
                          test.loc[:,'MSSubClass':'SaleCondition']))
    all_data = all_data.drop(to_delete,axis=1)

    # Log-transform the target into a new Series instead of overwriting the
    # column on the input frame.
    y = np.log1p(train_kept["SalePrice"])

    # Select numeric columns by dtype family rather than "not object", so
    # string columns stored with a non-object dtype are never passed to skew().
    numeric_feats = all_data.select_dtypes(include="number").columns
    skewness = train_kept[numeric_feats].apply(lambda col: skew(col.dropna()))
    skewed_feats = skewness[skewness > skew_threshold].index
    if len(skewed_feats) > 0:
        all_data[skewed_feats] = np.log1p(all_data[skewed_feats])

    all_data = pd.get_dummies(all_data)
    all_data = all_data.fillna(all_data.mean())

    # Training rows were stacked first, so a positional split recovers them.
    n_train = train_kept.shape[0]
    X_train = all_data[:n_train]
    X_test = all_data[n_train:]

    return X_train,X_test,y


# Reload both raw datasets from disk and run them through data_preprocess,
# printing a message before and after
train = pd.read_csv("./data/train.csv")
test = pd.read_csv("./data/test.csv")
print("Data preprocessing ...")
Xtrain,Xtest,ytrain = data_preprocess(train,test)
print("Data preprocessed!")

Data preprocessing ...
Data preprocessed!




## Models

In [4]:
# NOTE(review): the star import hides which names come from models.base;
# consider importing the model functions explicitly once their names are
# confirmed against that module.
from models.base import *

# Exactly one model call is active; the commented-out lines are alternatives
# with the same call shape. The result is unpacked into (test_predict, score).
#test_predict,score = model_random_forecast(Xtrain,Xtest,ytrain)
#test_predict,score = model_extra_trees_regression(Xtrain,Xtest,ytrain)
#test_predict,score = model_gradient_boosting_tree(Xtrain,Xtest,ytrain)
test_predict,score = model_regression_multivariable(Xtrain,Xtest,ytrain)
#test_predict,score = model_regression_multivariable_own(Xtrain,Xtest,ytrain)

## Submission

In [5]:
import numpy as np
import datetime
import pandas as pd
def create_submission(prediction,score,test):
    """Write predictions to a timestamped CSV file in the working directory.

    The file is named ``submission_<score>_<YYYY-mm-dd-HH-MM>.csv`` and holds
    two columns: 'Id' (the values of ``test['Id']``) and 'SalePrice'
    (``prediction``). A message is printed before and after writing.

    Parameters
    ----------
    prediction : array-like
        Predicted values, one per row of ``test``.
    score : object
        Value embedded in the file name via ``str(score)``.
    test : pandas.DataFrame
        Frame providing the 'Id' column.
    """
    timestamp = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M")
    sub_file = '_'.join(['submission', str(score), timestamp]) + '.csv'
    print ('Creating submission: ', sub_file)
    submission = pd.DataFrame({'Id': test['Id'].values, 'SalePrice': prediction})
    submission.to_csv(sub_file, index=False)
    print ('Submission created!')

# data_preprocess transforms the target with np.log1p, so np.expm1 is the exact
# inverse; np.exp would overstate every predicted price by 1.
# (Assumes test_predict is in the same log1p space as ytrain - confirm against
# the model functions.)
create_submission(np.expm1(test_predict),score,test)

Creating submission:  submission_0.9507046874992898_2018-10-18-23-53.csv
Submission created!
