In [1]:
import pandas as pd
import numpy as np
import pickle
import os

In [2]:
home = os.path.expanduser( '~' )
with open('../train.pickle', 'rb') as f:
    train = pickle.load(f)

In [3]:
df = pd.DataFrame.from_dict(train)

In [4]:
import re

def helper_regex(string, pat):
    string = str(string)
    if pat == 'm2':
        pattern = r'\b(\d+(\.\d+)?) m2\b'
    elif pat == 'bedrooms':
        pattern = r'\b(\d+) hab\b'
    else:
        pattern = r'\b(\d+) bañ(o|os)\b'
    match = re.search(pattern, string)
    return float(match.group(1)) if match else np.nan

def clean_df(df, valid_df=None, valid=False):
    df['m2'] = df.features.apply(helper_regex, args=('m2',))
    df['bedrooms'] = df.features.apply(helper_regex, args=('bedrooms',))
    df['bathrooms'] = df.features.apply(helper_regex, args=('bathrooms',))
    df['m2'].fillna(-1, inplace=True)
    df['bedrooms'].fillna(-1, inplace=True)
    df['bathrooms'].fillna(-1, inplace=True)
    if valid:
        valid_df['m2'] = valid_df.features.apply(helper_regex, args=('m2',))
        valid_df['bedrooms'] = valid_df.features.apply(helper_regex, args=('bedrooms',))
        valid_df['bathrooms'] = valid_df.features.apply(helper_regex, args=('bathrooms',))
        valid_df['m2'].fillna(-1, inplace=True)
        valid_df['bedrooms'].fillna(-1, inplace=True)
        valid_df['bathrooms'].fillna(-1, inplace=True)
        return valid_df[['m2','bedrooms','bathrooms','loc_string','type','desc']]
    return df[['m2','bedrooms','bathrooms','loc_string','type','desc']], df['price'].str.split(' ', expand=True)[0].astype(float)

In [5]:
df_clean, target = clean_df(df)

In [6]:
df_clean

Unnamed: 0,m2,bedrooms,bathrooms,loc_string,type,desc
0,85.0,2.0,1.0,Barcelona - Sant Antoni,FLAT,Piso en última planta a reformar en calle Tall...
1,65.0,2.0,1.0,Barcelona - Dreta de l´Eixample,FLAT,"Ubicado en la zona del Camp de l’Arpa, cerca d..."
2,77.0,2.0,1.0,Barcelona - Dreta de l´Eixample,FLAT,"En pleno centro de Barcelona, justo al lado de..."
3,96.0,3.0,2.0,Barcelona - Sant Antoni,FLAT,"Vivienda espaciosa en Sant Antoni, cerca de Pl..."
4,84.0,2.0,1.0,Barcelona - Sagrada Família,FLAT,"En el corazón de Barcelona, en una hermosa fin..."
...,...,...,...,...,...,...
861,115.0,3.0,1.0,Barcelona - Navas,FLAT,"HANNAN-PIPER Real Estate les presenta, en excl..."
862,82.0,3.0,1.0,Barcelona - Navas,FLAT,¡ OPORTUNIDAD !\n\nLa Casa Agency vende: Vivie...
863,79.0,4.0,2.0,Barcelona - Navas,FLAT,"Piso totalmente REFORMADO y a ESTRENAR, con MU..."
864,63.0,1.0,1.0,Barcelona - Navas,FLAT,Presentamos la oportunidad de comprar un bonit...


In [7]:
def convert_dummies(df):
    return pd.get_dummies(df, columns=['loc_string','type'])

In [8]:
df_clean = convert_dummies(df_clean)

In [9]:
from transformers import AutoModel, AutoTokenizer
model_name = "lxyuan/distilbert-base-multilingual-cased-sentiments-student"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model1 = AutoModel.from_pretrained(model_name)

In [10]:
import torch
def generate_embeddings(text):
    tokenized_text = tokenizer(text, padding=True, truncation=True, return_tensors="pt")
    with torch.no_grad():
        outputs = model1(**tokenized_text)
    embeddings = outputs.last_hidden_state.mean(dim=1)
    return embeddings

In [11]:
embeddings = df_clean['desc'].apply(generate_embeddings)

In [12]:
from sklearn.preprocessing import StandardScaler
X_numeric = df_clean[['m2','bedrooms','bathrooms']]
scaler = StandardScaler()
scaler.fit(X_numeric)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [21]:
x_scale = scaler.transform(X_numeric)
x_loc = df_clean[[col for col in df_clean.columns if 'loc_string' in col or 'type' in col]].astype(float).to_numpy()

In [22]:
from sklearn.model_selection import train_test_split

# Train-test split
X = torch.cat((torch.from_numpy(x_scale),torch.from_numpy(x_loc),torch.cat(embeddings.tolist())),1).float()
y = torch.from_numpy(target.to_numpy())
X_train, y_train = X, y

In [23]:
import torch.nn as nn

# Define PyTorch model
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(796, 128) 
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(128, 1)
        
    def forward(self, x):
        x = x.float()
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        return x

In [24]:
import torch.optim as optim
# Initialize model, loss function, and optimizer
model2 = Net()
criterion = nn.MSELoss()
optimizer = optim.Adam(model2.parameters(), lr=0.001)
# optimizer = optim.Adam(model2.parameters(), lr=0.1) 
# optimizer = optim.Adam(model2.parameters(), lr=0.01)

In [25]:
from sklearn.metrics import r2_score
import torch.nn.functional as F
import torch.optim as optim

def val_metric(model, valid_dl):
    model.eval()
    losses = []
    y_hats = []
    ys = []
    for x, y in valid_dl:
        y_hat = model(x.float())
        loss = F.mse_loss(y_hat, y.float().unsqueeze(1))
        y_hats.append(y_hat.detach().numpy())
        ys.append(y.numpy())
        losses.append(loss.item())
    
    ys = np.concatenate(ys)
    y_hats = np.concatenate(y_hats)
    return np.mean(losses), r2_score(ys, y_hats)

def train_loop(model, train_dl, optimizer, epochs):
    losses = []
    for i in range(epochs):
        model.train()
        for x, y in train_dl:
            y_hat = model(x.float())
            loss = F.mse_loss(y_hat, y.float().unsqueeze(1))
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            losses.append(loss.item())
        
        train_loss = np.mean(losses)
        if i%50 == 0:
            print("train loss %.3f" % 
                  (train_loss))

In [26]:
from torch.utils.data import TensorDataset, DataLoader

# Prepare DataLoader
train_dataset = TensorDataset(X_train, y_train)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)

In [27]:
train_loop(model2, train_loader, optimizer, epochs=400)

train loss 121400.119
train loss 15888.523
train loss 9798.998
train loss 7512.246
train loss 6268.480
train loss 5476.640
train loss 4925.066
train loss 4518.606


In [28]:
# hyperparam search results

# EPOCHS:
# 400 selected as close to optimal
# train loss 123146.778 valid loss 117264.086 R2 -20.045
# train loss 19529.198 valid loss 4557.824 R2 0.213
# train loss 11841.983 valid loss 3529.190 R2 0.390
# train loss 9006.410 valid loss 2975.565 R2 0.485
# train loss 7472.548 valid loss 2658.420 R2 0.537
# train loss 6489.845 valid loss 2497.027 R2 0.562
# train loss 5798.061 valid loss 2437.986 R2 0.569
# train loss 5282.371 valid loss 2423.802 R2 0.569
# train loss 4883.059 valid loss 2446.239 R2 0.563
# train loss 4563.317 valid loss 2495.837 R2 0.553
# train loss 4301.058 valid loss 2551.174 R2 0.543
# train loss 4081.709 valid loss 2539.044 R2 0.544
# train loss 3894.907 valid loss 2616.675 R2 0.530
# train loss 3733.773 valid loss 2602.759 R2 0.531
# train loss 3592.785 valid loss 2636.798 R2 0.525
# train loss 3468.017 valid loss 2728.841 R2 0.509

# Learning rate: 0.01
# train loss 93144.997 valid loss 37365.315 R2 -5.702
# train loss 5396.729 valid loss 2490.490 R2 0.559
# train loss 3732.610 valid loss 2454.221 R2 0.560
# train loss 3077.187 valid loss 2524.390 R2 0.547
# train loss 2699.771 valid loss 2494.183 R2 0.552
# train loss 2431.254 valid loss 2577.741 R2 0.536
# train loss 2232.138 valid loss 2651.122 R2 0.523
# train loss 2063.081 valid loss 2694.784 R2 0.516

# Learning rate 0.1
# train loss 29403.618 valid loss 4523.429 R2 0.221
# train loss 3030.731 valid loss 2756.205 R2 0.512
# train loss 2208.557 valid loss 3329.776 R2 0.401
# train loss 1681.966 valid loss 4040.415 R2 0.281
# train loss 1354.280 valid loss 4537.712 R2 0.191
# train loss 1133.233 valid loss 4730.176 R2 0.153
# train loss 969.536 valid loss 4680.087 R2 0.162
# train loss 854.781 valid loss 4665.918 R2 0.171

In [29]:
home = os.path.expanduser( '~' )
with open(home + '/data/test_kaggle.pickle', 'rb') as f:
    test = pickle.load(f) 
df_test = pd.DataFrame.from_dict(test)
test_clean = clean_df(df, df_test, True)

In [30]:
test_embed = df_test['desc'].apply(generate_embeddings)

In [31]:
test = convert_dummies(test_clean)


In [32]:
for col in df_clean.columns:
    if ('loc_string' in col) or 'type_' in col:
        if col not in test.columns:
            test[col] = False

In [33]:
x_test_scale = scaler.transform(test[['m2','bedrooms','bathrooms']])
x_test_loc = test[[col for col in df_clean.columns if ('loc_string' in col) or 'type_' in col]].astype(float).to_numpy()

In [34]:
X = torch.cat((torch.from_numpy(x_test_scale),torch.from_numpy(x_test_loc),torch.cat(test_embed.tolist())),1).float()

In [35]:
yhat = model2(X)

In [36]:
from xgboost import XGBRegressor
model = XGBRegressor(max_depth=4, eta=0.08)
# fit model
model.fit(X_train, y_train)

In [38]:
y_pred = np.average( np.array([ model2(X).detach().numpy().flatten(), model.predict(X) ]), weights=[0.5,0.5], axis=0 )

In [39]:
y_pred

array([352.01066589, 358.08151245, 279.99972534, 325.40510559,
       348.45541382, 357.58998108, 381.3079834 , 223.58557129,
       285.94612122, 263.77397156, 379.61616516, 419.2905426 ,
       421.81385803, 366.35014343, 404.56719971, 327.65568542,
       302.96418762, 410.04556274, 434.00343323, 309.56140137,
       294.29801941, 337.60255432, 278.10189819, 389.8381958 ,
       268.75813293, 303.85971069, 399.14006042, 341.24937439,
       365.4130249 , 425.17485046, 336.34669495, 344.41622925,
       376.60171509, 319.93681335, 442.46965027, 311.37174988,
       417.75595093, 323.61358643, 406.09364319, 342.97799683,
       388.93161011, 376.60171509, 286.68196106, 287.96488953,
       395.3473053 , 426.59767151, 329.39526367, 412.76502991,
       365.8416748 , 408.31678772, 287.4092865 , 397.12503052,
       313.21864319, 295.40283203, 383.90316772, 239.15110016,
       441.18609619, 344.30078125, 320.37319946, 381.3727417 ,
       267.17189789, 332.4055481 , 351.659729  , 273.61

In [40]:
out = pd.DataFrame(y_pred)
out

Unnamed: 0,0
0,352.010666
1,358.081512
2,279.999725
3,325.405106
4,348.455414
...,...
127,381.278488
128,314.502045
129,309.289948
130,319.717957


In [41]:
out = out.rename(columns={0: 'price'})
out.index.names = ['id']
out.to_csv('solution.csv')