In [1]:
# Importar bibliotecas
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pprint import pprint
from sklearn import linear_model
from sklearn import svm
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.preprocessing import LabelBinarizer, MinMaxScaler
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.ensemble import AdaBoostRegressor

In [13]:

# Importar dados do conjunto de treinamento
features = pd.read_csv("conjunto_de_treinamento.csv")
features = features.sample(frac=1, random_state=12345)

regioes = {
    "centro": ["Recife", "Sto Amaro", "Boa Vista", "Cabanga", "Ilha do Leite", "Paissandu", "Sto Antonio", "S Jose", "Soledade", "Coelhos", "Ilha Joana Bezerra"],
    "norte": ["Arruda", "Campina do Barreto", "Campo Grande", "Encruzilhada", "Hipodromo", "Peixinhos", "Ponto de Parada", "Rosarinho", "Torreao", "Agua Fria", "Alto Santa Terezinha", "Bomba do Hemeterio", "Cajueiro", "Fundao", "Porto da Madeira", "Beberibe", "Dois Unidos", "Linha do Tiro"],
    "noroeste": ["Aflitos", "Alto do Mandu", "Apipucos", "Casa Amarela", "Casa Forte", "Derby", "Dois Irmaos", "Espinheiro", "Gracas", "Jaqueira", "Monteiro", "Parnamirim", "Poco da Panela", "Santana", "Tamarineira", "Sitio dos Pintos", "Alto Jose Bonifácio", "Alto Jose do Pinho", "Mangabeira", "Morro da Conceicao", "Vasco da Gama", "Brejo da Guabiraba", "Brejo do Beberibe", "Corrego do Jenipapo", "Guabiraba", "Macaxeira", "Nova Descoberta", "Passarinho", "Pau Ferro"],
    "oeste": ["Cordeiro", "Ilha do Retiro", "Iputinga", "Madalena", "Prado", "Torre", "Zumbi", "Engenho do Meio", "Torroes", "Caxanga", "Cid Universitaria", "Varzea"],
    "sudoeste": ["Afogados", "Bongi", "Mangueira", "Mustardinha", "San Martin", "Areias", "Cacote", "Estancia", "Jiquia", "Barro", "Coqueiral", "Curado", "Jd S Paulo", "Sancho", "Tejipio", "Toto"],
    "sul": ["Boa Viagem", "Brasilia Teimosa", "Imbiribeira", "Ipsep", "Pina", "Ibura", "Jordao", "Cohab"]
}

for i in range(len(features)):
    if features.loc[i, "bairro"] in regioes["centro"]:
        features.loc[i, "regiao"] = "centro"
    elif features.loc[i, "bairro"] in regioes["norte"]:
        features.loc[i, "regiao"] = "norte"
    elif features.loc[i, "bairro"] in regioes["noroeste"]:
        features.loc[i, "regiao"] = "noroeste"
    elif features.loc[i, "bairro"] in regioes["oeste"]:
        features.loc[i, "regiao"] = "oeste"
    elif features.loc[i, "bairro"] in regioes["sudoeste"]:
        features.loc[i, "regiao"] = "sudoeste"
    elif features.loc[i, "bairro"] in regioes["sul"]:
        features.loc[i, "regiao"] = "sul"
    else:
        features.loc[i, "regiao"] = "outro"
    
    if "copa" in features.loc[i, "diferenciais"].split(" e "):
        features.loc[i, "copa"] = 1
    else:
        features.loc[i, "copa"] = 0
    
    if "vestiario" in features.loc[i, "diferenciais"].split(" e "):
        features.loc[i, "vestiario"] = 1
    else:
        features.loc[i, "vestiario"] = 0
    
    if "children care" in features.loc[i, "diferenciais"].split(" e "):
        features.loc[i, "children_care"] = 1
    else:
        features.loc[i, "children_care"] = 0
    
    if "esquina" in features.loc[i, "diferenciais"].split(" e "):
        features.loc[i, "esquina"] = 1
    else:
        features.loc[i, "esquina"] = 0

# Remover colunas
features = features.drop(
    [
         "Id",
         "diferenciais",
         "bairro"
    ],
    axis=1
)

# Deletar outliers
for n in range(1,9):
    minPrice = features[features["quartos"] == n].preco.quantile(q=0.01)
    maxPrice = features[features["quartos"] == n].preco.quantile(q=0.90)
    
    features = features.drop(features[(features["quartos"] == n) & ((features["preco"] < minPrice) | (features["preco"] > maxPrice))].index)

for x in regioes.keys():
    if len(features[features["regiao"] == x]) > 1:
        q1 = features[features["regiao"] == x].preco.quantile(q=0.25)
        q3 = features[features["regiao"] == x].preco.quantile(q=0.75)
        iqr = q3 - q1
        features = features.drop(features[(features["regiao"] == x) & ((features["preco"] < q1-1.5*iqr) | (features["preco"] > q3+1.5*iqr))].index)    
            
for i in range (0, 6):
    q1 = features[features["suites"] == i].preco.quantile(q=0.25)
    q3 = features[features["suites"] == i].preco.quantile(q=0.75)
    iqr = q3 - q1
    features = features.drop(features[(features["suites"] == i) & ((features["preco"] < q1-1.5*iqr) | (features["preco"] > q3+1.5*iqr))].index)

# Divisão dos dados categóricos
features = pd.get_dummies(features,columns=
    [
         "tipo",
         "regiao"
    ]
)

# Padronização dos dados binários
binarizer = LabelBinarizer()

binaries = [
    "tipo_vendedor"
]

for v in binaries:
    features[v] = binarizer.fit_transform(features[v])

features.T

Unnamed: 0,101,1439,2478,1717,3274,2330,3445,3102,1603,3781,...,231,4139,4363,882,3441,546,3429,4478,3497,4578
tipo_vendedor,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
quartos,4.0,2.0,3.0,3.0,3.0,2.0,1.0,4.0,3.0,2.0,...,4.0,4.0,2.0,3.0,4.0,2.0,3.0,3.0,3.0,3.0
suites,3.0,0.0,1.0,1.0,1.0,1.0,1.0,4.0,1.0,0.0,...,1.0,2.0,1.0,2.0,2.0,1.0,1.0,1.0,1.0,1.0
vagas,2.0,0.0,2.0,1.0,1.0,1.0,1.0,2.0,1.0,1.0,...,2.0,3.0,0.0,2.0,2.0,1.0,1.0,1.0,1.0,2.0
area_util,1304.0,68.0,77.0,66.0,108.0,75.0,32.0,150.0,68.0,55.0,...,138.0,162.0,49.0,108.0,118.0,49.0,78.0,84.0,106.0,87.0
area_extra,0.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,22.0,0.0,0.0,0.0,0.0,0.0
churrasqueira,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
estacionamento,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
piscina,1.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,...,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0
playground,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [14]:
# Separação do conjunto em label e features
label = np.array(features["preco"]).astype(int)
features = features.drop("preco", axis=1)
feature_list = list(features.columns)

In [15]:
# Determinar os atributos com maior correlação com o preço
best_features = SelectKBest(score_func=f_regression, k=10)
fit = best_features.fit(features, label)

datafr = {"name": feature_list, "score": fit.scores_}
featureScores = pd.DataFrame(datafr)

print(featureScores)

                name        score
0      tipo_vendedor     1.401256
1            quartos  4490.030332
2             suites  5990.586936
3              vagas  1604.934746
4          area_util  1397.492626
5         area_extra    20.268696
6      churrasqueira    26.356878
7     estacionamento     3.949794
8            piscina    41.196304
9         playground     0.454859
10            quadra     8.913946
11          s_festas     8.299654
12           s_jogos     0.055122
13       s_ginastica     0.000857
14             sauna    26.746264
15         vista_mar    37.072408
16              copa    71.408796
17         vestiario     0.444008
18     children_care     0.725311
19           esquina     0.955224
20  tipo_Apartamento     1.024756
21         tipo_Casa     1.866142
22         tipo_Loft     0.152712
23    tipo_Quitinete     3.270119
24     regiao_centro     0.003753
25   regiao_noroeste    64.289489
26      regiao_norte    64.674140
27      regiao_oeste   173.632007
28      regiao

In [16]:
# Excluir as colunas com baixa correlação
used = featureScores.sort_values(by="score").tail(30).name

new_features = features[features.columns.intersection(used)]

In [17]:
# MinMax Scaler
scaler = MinMaxScaler().fit(new_features)
scaled_features = scaler.transform(new_features)

In [18]:
# Separar conjunto em treinamento e teste
train_features, test_features, train_label, test_label = train_test_split(scaled_features, label, test_size = 0.3, random_state = 12345)

In [8]:
# Busca pelo melhor modelo
classifiers = [
    svm.SVR(),
    linear_model.SGDRegressor(),
    linear_model.BayesianRidge(),
    linear_model.ARDRegression(),
    linear_model.PassiveAggressiveRegressor(),
    linear_model.TheilSenRegressor(),
    linear_model.LinearRegression(),
    linear_model.Ridge(alpha=.5),
    linear_model.Lasso(alpha=0.1),
    linear_model.LassoLars(alpha=.1),
    linear_model.ElasticNet(random_state=12345),
    linear_model.LogisticRegression()
]

for classifier in classifiers:
    print(classifier)
    clf = classifier
    clf.fit(train_features, train_label)
    print(clf.score(train_features, train_label))
    predictions = clf.predict(test_features)
    train_predictions = clf.predict(train_features)
    print("RMSPE (train) = ", np.sqrt(np.mean(np.square((train_predictions - train_label)/train_label))))
    print("RMSPE (test) = ", np.sqrt(np.mean(np.square((test_label - predictions)/test_label))))
    print("")

SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',
    kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)
-0.11580964471420674
RMSPE (train) =  0.4967792874103713
RMSPE (test) =  0.510339703682172

SGDRegressor(alpha=0.0001, average=False, early_stopping=False, epsilon=0.1,
             eta0=0.01, fit_intercept=True, l1_ratio=0.15,
             learning_rate='invscaling', loss='squared_loss', max_iter=1000,
             n_iter_no_change=5, penalty='l2', power_t=0.25, random_state=None,
             shuffle=True, tol=0.001, validation_fraction=0.1, verbose=0,
             warm_start=False)
0.7451349389462849
RMSPE (train) =  0.30229667156554785
RMSPE (test) =  0.32553818252290395

BayesianRidge(alpha_1=1e-06, alpha_2=1e-06, alpha_init=None,
              compute_score=False, copy_X=True, fit_intercept=True,
              lambda_1=1e-06, lambda_2=1e-06, lambda_init=None, n_iter=300,
              normalize=False, tol=0.001, verbose=False)
0.7549



0.3563444858274084
RMSPE (train) =  0.36308389541299674
RMSPE (test) =  0.36294171158185584

TheilSenRegressor(copy_X=True, fit_intercept=True, max_iter=300,
                  max_subpopulation=10000, n_jobs=None, n_subsamples=None,
                  random_state=None, tol=0.001, verbose=False)
0.5349823720375226
RMSPE (train) =  0.3766398432983391
RMSPE (test) =  0.3988577821272065

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)
0.7545595846981816
RMSPE (train) =  0.2957867540985325
RMSPE (test) =  0.31931660675600065

Ridge(alpha=0.5, copy_X=True, fit_intercept=True, max_iter=None,
      normalize=False, random_state=None, solver='auto', tol=0.001)
0.7547320049656426
RMSPE (train) =  0.29372922521843775
RMSPE (test) =  0.31566349250616776

Lasso(alpha=0.1, copy_X=True, fit_intercept=True, max_iter=1000,
      normalize=False, positive=False, precompute=False, random_state=None,
      selection='cyclic', tol=0.0001, warm_start=False)


  positive)


0.7550611270997545
RMSPE (train) =  0.293234144265023
RMSPE (test) =  0.3145412120685285

LassoLars(alpha=0.1, copy_X=True, eps=2.220446049250313e-16, fit_intercept=True,
          fit_path=True, max_iter=500, normalize=True, positive=False,
          precompute='auto', verbose=False)
0.7550611195373022
RMSPE (train) =  0.2932305667477519
RMSPE (test) =  0.3145275527965112

ElasticNet(alpha=1.0, copy_X=True, fit_intercept=True, l1_ratio=0.5,
           max_iter=1000, normalize=False, positive=False, precompute=False,
           random_state=12345, selection='cyclic', tol=0.0001,
           warm_start=False)
0.14410363387190817
RMSPE (train) =  0.6064295696433031
RMSPE (test) =  0.6289553358496548

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
    

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [9]:
# Modelo base
model = DecisionTreeRegressor(random_state=12345)

model.fit(train_features, train_label)

train_predictions = model.predict(train_features)
print("RMSPE (train) = ", np.sqrt(np.mean(np.square((train_predictions - train_label)/train_label))))

test_predictions = model.predict(test_features)
print("RMSPE (test) = ", np.sqrt(np.mean(np.square((test_predictions - test_label)/test_label))))

RMSPE (train) =  0.0532861046149525
RMSPE (test) =  0.26089148694432573


In [19]:
# AdaBoostRegressor
regr = AdaBoostRegressor(DecisionTreeRegressor(random_state=12345), n_estimators=105, random_state=12345)
regr.fit(train_features, train_label)
regrtrain = regr.predict(train_features)
print("RMSPE (train) = ", np.sqrt(np.mean(np.square((regrtrain - train_label)/train_label))))
regrpredic = regr.predict(test_features)
print("RMSPE (test) = ", np.sqrt(np.mean(np.square((regrpredic - test_label)/test_label))))

RMSPE (train) =  0.1374336369300811
RMSPE (test) =  0.22622336654909803


In [20]:
# Conjunto de teste
# Importar dados
test_features = pd.read_csv("conjunto_de_teste.csv")

# Corrigir colunas
for i in range(len(test_features)):
    if test_features.loc[i, "bairro"] in regioes["centro"]:
        test_features.loc[i, "regiao"] = "centro"
    elif test_features.loc[i, "bairro"] in regioes["norte"]:
        test_features.loc[i, "regiao"] = "norte"
    elif test_features.loc[i, "bairro"] in regioes["noroeste"]:
        test_features.loc[i, "regiao"] = "noroeste"
    elif test_features.loc[i, "bairro"] in regioes["oeste"]:
        test_features.loc[i, "regiao"] = "oeste"
    elif test_features.loc[i, "bairro"] in regioes["sudoeste"]:
        test_features.loc[i, "regiao"] = "sudoeste"
    elif test_features.loc[i, "bairro"] in regioes["sul"]:
        test_features.loc[i, "regiao"] = "sul"
    else:
        test_features.loc[i, "regiao"] = "outro"
    
    if "copa" in test_features.loc[i, "diferenciais"].split(" e "):
        test_features.loc[i, "copa"] = 1
    else:
        test_features.loc[i, "copa"] = 0
    
    if "vestiario" in test_features.loc[i, "diferenciais"].split(" e "):
        test_features.loc[i, "vestiario"] = 1
    else:
        test_features.loc[i, "vestiario"] = 0
    
    if "children care" in test_features.loc[i, "diferenciais"].split(" e "):
        test_features.loc[i, "children_care"] = 1
    else:
        test_features.loc[i, "children_care"] = 0
    
    if "esquina" in test_features.loc[i, "diferenciais"].split(" e "):
        test_features.loc[i, "esquina"] = 1
    else:
        test_features.loc[i, "esquina"] = 0

test_features["tipo_Quitinete"] = 0

# Excluir colunas
id_series = test_features["Id"]
test_features = test_features.drop(
    [
         "Id",
         "diferenciais",
         "bairro"
    ],
    axis=1
)

# Colunas categóricas
test_features = pd.get_dummies(test_features,columns=
    [
         "tipo",
         "regiao"
    ]
)

# Colunas binárias
binarizer = LabelBinarizer()

binaries = [
    "tipo_vendedor"
]

for v in binaries:
    test_features[v] = binarizer.fit_transform(test_features[v])

# MinMax scaler
new_test_features = test_features[test_features.columns.intersection(used)]
scaled_test_features = scaler.transform(new_test_features)

test_features = np.array(test_features)

predictions = pd.DataFrame(id_series)
predictions = predictions.set_index("Id")
predictions["preco"] = regr.predict(scaled_test_features)
predictions.to_csv("result.csv")