In [1]:
import pandas as pd
import numpy as np
import pickle
import os

In [2]:
home = os.path.expanduser( '~' )
with open('../train.pickle', 'rb') as f:
    train = pickle.load(f)

In [3]:
df = pd.DataFrame.from_dict(train)

In [4]:
import re

def helper_regex(string, pat):
    string = str(string)
    if pat == 'm2':
        pattern = r'\b(\d+(\.\d+)?) m2\b'
    elif pat == 'bedrooms':
        pattern = r'\b(\d+) hab\b'
    else:
        pattern = r'\b(\d+) bañ(o|os)\b'
    match = re.search(pattern, string)
    return float(match.group(1)) if match else np.nan

def clean_df(df, valid_df=None, valid=False):
    df['m2'] = df.features.apply(helper_regex, args=('m2',))
    df['bedrooms'] = df.features.apply(helper_regex, args=('bedrooms',))
    df['bathrooms'] = df.features.apply(helper_regex, args=('bathrooms',))
    df['m2'].fillna(-1, inplace=True)
    df['bedrooms'].fillna(-1, inplace=True)
    df['bathrooms'].fillna(-1, inplace=True)
    if valid:
        valid_df['m2'] = valid_df.features.apply(helper_regex, args=('m2',))
        valid_df['bedrooms'] = valid_df.features.apply(helper_regex, args=('bedrooms',))
        valid_df['bathrooms'] = valid_df.features.apply(helper_regex, args=('bathrooms',))
        valid_df['m2'].fillna(-1, inplace=True)
        valid_df['bedrooms'].fillna(-1, inplace=True)
        valid_df['bathrooms'].fillna(-1, inplace=True)
        return valid_df[['m2','bedrooms','bathrooms','loc_string','type','desc']]
    return df[['m2','bedrooms','bathrooms','loc_string','type','desc']], df['price'].str.split(' ', expand=True)[0].astype(float)

In [5]:
df_clean, target = clean_df(df)

In [6]:
df_clean

Unnamed: 0,m2,bedrooms,bathrooms,loc_string,type,desc
0,85.0,2.0,1.0,Barcelona - Sant Antoni,FLAT,Piso en última planta a reformar en calle Tall...
1,65.0,2.0,1.0,Barcelona - Dreta de l´Eixample,FLAT,"Ubicado en la zona del Camp de l’Arpa, cerca d..."
2,77.0,2.0,1.0,Barcelona - Dreta de l´Eixample,FLAT,"En pleno centro de Barcelona, justo al lado de..."
3,96.0,3.0,2.0,Barcelona - Sant Antoni,FLAT,"Vivienda espaciosa en Sant Antoni, cerca de Pl..."
4,84.0,2.0,1.0,Barcelona - Sagrada Família,FLAT,"En el corazón de Barcelona, en una hermosa fin..."
...,...,...,...,...,...,...
861,115.0,3.0,1.0,Barcelona - Navas,FLAT,"HANNAN-PIPER Real Estate les presenta, en excl..."
862,82.0,3.0,1.0,Barcelona - Navas,FLAT,¡ OPORTUNIDAD !\n\nLa Casa Agency vende: Vivie...
863,79.0,4.0,2.0,Barcelona - Navas,FLAT,"Piso totalmente REFORMADO y a ESTRENAR, con MU..."
864,63.0,1.0,1.0,Barcelona - Navas,FLAT,Presentamos la oportunidad de comprar un bonit...


In [7]:
from sklearn.model_selection import train_test_split

# Train-test split
X = df_clean[['m2','bedrooms','bathrooms','loc_string','type']]
X['loc_string'] = X.loc_string.astype('category')
X['type'] = X.type.astype('category')
y = target.to_numpy()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['loc_string'] = X.loc_string.astype('category')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['type'] = X.type.astype('category')


In [8]:
from xgboost import XGBRegressor
from sklearn.metrics import r2_score

depth = [i for i in range(1,10)]
eta = [0.03,0.04,0.05,0.06,0.07,0.08]
params = []
r2 = []
for i in depth:
    for j in eta:
        model = XGBRegressor(max_depth=i, eta=j, tree_method="hist", enable_categorical=True)
        # fit model
        model.fit(X_train, y_train)
        yhat = model.predict(X_test)
        params.append((i,j))
        r2.append(r2_score(y_test, yhat))

In [9]:
xgb = pd.DataFrame(list(zip(params, r2)), columns =['params', 'r2'])

In [10]:
xgb

Unnamed: 0,params,r2
0,"(1, 0.03)",0.465761
1,"(1, 0.04)",0.491768
2,"(1, 0.05)",0.510086
3,"(1, 0.06)",0.520588
4,"(1, 0.07)",0.522009
5,"(1, 0.08)",0.520265
6,"(2, 0.03)",0.531952
7,"(2, 0.04)",0.542579
8,"(2, 0.05)",0.54716
9,"(2, 0.06)",0.544822


In [11]:
home = os.path.expanduser( '~' )
with open(home + '/data/test_kaggle.pickle', 'rb') as f:
    test = pickle.load(f) 
df_test = pd.DataFrame.from_dict(test)
test_clean = clean_df(df, df_test, True)
X = test_clean[['m2','bedrooms','bathrooms','loc_string','type']]
X['loc_string'] = X.loc_string.astype('category')
X['type'] = X.type.astype('category')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['loc_string'] = X.loc_string.astype('category')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['type'] = X.type.astype('category')


In [12]:
X

Unnamed: 0,m2,bedrooms,bathrooms,loc_string,type
0,87.0,4.0,1.0,Barcelona - El Parc i la Llacuna del Poblenou,FLAT
1,78.0,4.0,1.0,Barcelona - Poblenou,FLAT
2,65.0,1.0,1.0,Barcelona - L´Antiga Esquerra de l´Eixample,FLAT
3,88.0,3.0,1.0,Barcelona - Poblenou,FLAT
4,82.0,2.0,1.0,Barcelona - Sant Antoni,FLAT
...,...,...,...,...,...
127,89.0,3.0,1.0,Barcelona - Dreta de l´Eixample,FLAT
128,65.0,3.0,1.0,Barcelona - El Parc i la Llacuna del Poblenou,FLAT
129,75.0,4.0,1.0,Barcelona - Sagrada Família,FLAT
130,75.0,3.0,2.0,Barcelona - Poblenou,APARTMENT


In [18]:
model = XGBRegressor(max_depth=5, eta=0.05, tree_method="hist", enable_categorical=True)
# fit model
model.fit(X_train, y_train)
yhat = model.predict(X)

In [19]:
out = pd.DataFrame(yhat)
out = out.rename(columns={0: 'price'})
out.index.names = ['id']
out.to_csv('solution.csv')

In [20]:
yhat = model.predict(X_test)
r2_score(y_test, yhat)

0.5634377117820251