In [1]:
import pandas as pd
import numpy as np
import pickle
import os

In [2]:
home = os.path.expanduser( '~' )
with open('../train.pickle', 'rb') as f:
    train = pickle.load(f)

In [3]:
df = pd.DataFrame.from_dict(train)

In [4]:
import re

def helper_regex(string, pat):
    string = str(string)
    if pat == 'm2':
        pattern = r'\b(\d+(\.\d+)?) m2\b'
    elif pat == 'bedrooms':
        pattern = r'\b(\d+) hab\b'
    else:
        pattern = r'\b(\d+) bañ(o|os)\b'
    match = re.search(pattern, string)
    return float(match.group(1)) if match else np.nan

def clean_df(df, valid_df=None, valid=False):
    df['m2'] = df.features.apply(helper_regex, args=('m2',))
    df['bedrooms'] = df.features.apply(helper_regex, args=('bedrooms',))
    df['bathrooms'] = df.features.apply(helper_regex, args=('bathrooms',))
    df['m2'].fillna(-1, inplace=True)
    df['bedrooms'].fillna(-1, inplace=True)
    df['bathrooms'].fillna(-1, inplace=True)
    if valid:
        valid_df['m2'] = valid_df.features.apply(helper_regex, args=('m2',))
        valid_df['bedrooms'] = valid_df.features.apply(helper_regex, args=('bedrooms',))
        valid_df['bathrooms'] = valid_df.features.apply(helper_regex, args=('bathrooms',))
        valid_df['m2'].fillna(-1, inplace=True)
        valid_df['bedrooms'].fillna(-1, inplace=True)
        valid_df['bathrooms'].fillna(-1, inplace=True)
        return valid_df[['m2','bedrooms','bathrooms','loc_string','type','desc']]
    return df[['m2','bedrooms','bathrooms','loc_string','type','desc']], df['price'].str.split(' ', expand=True)[0].astype(float)

In [5]:
df_clean, target = clean_df(df)

In [6]:
df_clean

Unnamed: 0,m2,bedrooms,bathrooms,loc_string,type,desc
0,85.0,2.0,1.0,Barcelona - Sant Antoni,FLAT,Piso en última planta a reformar en calle Tall...
1,65.0,2.0,1.0,Barcelona - Dreta de l´Eixample,FLAT,"Ubicado en la zona del Camp de l’Arpa, cerca d..."
2,77.0,2.0,1.0,Barcelona - Dreta de l´Eixample,FLAT,"En pleno centro de Barcelona, justo al lado de..."
3,96.0,3.0,2.0,Barcelona - Sant Antoni,FLAT,"Vivienda espaciosa en Sant Antoni, cerca de Pl..."
4,84.0,2.0,1.0,Barcelona - Sagrada Família,FLAT,"En el corazón de Barcelona, en una hermosa fin..."
...,...,...,...,...,...,...
861,115.0,3.0,1.0,Barcelona - Navas,FLAT,"HANNAN-PIPER Real Estate les presenta, en excl..."
862,82.0,3.0,1.0,Barcelona - Navas,FLAT,¡ OPORTUNIDAD !\n\nLa Casa Agency vende: Vivie...
863,79.0,4.0,2.0,Barcelona - Navas,FLAT,"Piso totalmente REFORMADO y a ESTRENAR, con MU..."
864,63.0,1.0,1.0,Barcelona - Navas,FLAT,Presentamos la oportunidad de comprar un bonit...


In [7]:
def convert_dummies(df):
    return pd.get_dummies(df, columns=['loc_string','type'])

In [8]:
df_clean = convert_dummies(df_clean)

In [9]:
from transformers import AutoModel, AutoTokenizer
model_name = "lxyuan/distilbert-base-multilingual-cased-sentiments-student"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model1 = AutoModel.from_pretrained(model_name)

In [10]:
import torch
def generate_embeddings(text):
    tokenized_text = tokenizer(text, padding=True, truncation=True, return_tensors="pt")
    with torch.no_grad():
        outputs = model1(**tokenized_text)
    embeddings = outputs.last_hidden_state.mean(dim=1)
    return embeddings

In [11]:
embeddings = df_clean['desc'].apply(generate_embeddings)

In [12]:
from sklearn.preprocessing import StandardScaler
X_numeric = df_clean[['m2','bedrooms','bathrooms']]
scaler = StandardScaler()
scaler.fit(X_numeric)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [13]:
x_scale = scaler.transform(X_numeric)
x_loc = df_clean[[col for col in df_clean.columns if 'loc_string' in col or 'type' in col]].astype(float).to_numpy()

In [14]:
from sklearn.model_selection import train_test_split

# Train-test split
X = torch.cat((torch.from_numpy(x_scale),torch.from_numpy(x_loc),torch.cat(embeddings.tolist())),1).float()
y = torch.from_numpy(target.to_numpy())
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [44]:
import torch.nn as nn

# Define PyTorch model
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(796, 128) 
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(128, 1)
        
    def forward(self, x):
        x = x.float()
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        return x

In [64]:
import torch.optim as optim
# Initialize model, loss function, and optimizer
model2 = Net()
criterion = nn.MSELoss()
optimizer = optim.Adam(model2.parameters(), lr=0.001)
# optimizer = optim.Adam(model2.parameters(), lr=0.1) 
# optimizer = optim.Adam(model2.parameters(), lr=0.01)

In [65]:
from sklearn.metrics import r2_score
import torch.nn.functional as F
import torch.optim as optim

def val_metric(model, valid_dl):
    model.eval()
    losses = []
    y_hats = []
    ys = []
    for x, y in valid_dl:
        y_hat = model(x.float())
        loss = F.mse_loss(y_hat, y.float().unsqueeze(1))
        y_hats.append(y_hat.detach().numpy())
        ys.append(y.numpy())
        losses.append(loss.item())
    
    ys = np.concatenate(ys)
    y_hats = np.concatenate(y_hats)
    return np.mean(losses), r2_score(ys, y_hats)

def train_loop(model, train_dl, valid_dl, optimizer, epochs):
    losses = []
    for i in range(epochs):
        model.train()
        for x, y in train_dl:
            y_hat = model(x.float())
            loss = F.mse_loss(y_hat, y.float().unsqueeze(1))
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            losses.append(loss.item())
        
        train_loss = np.mean(losses)
        valid_loss, valid_auc = val_metric(model, valid_dl)
        if i%50 == 0:
            print("train loss %.3f valid loss %.3f R2 %.3f" % 
                  (train_loss, valid_loss, valid_auc))

In [66]:
from torch.utils.data import TensorDataset, DataLoader

# Prepare DataLoader
train_dataset = TensorDataset(X_train, y_train)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
valid_dataset = TensorDataset(X_test, y_test)
valid_loader = DataLoader(valid_dataset, batch_size=64, shuffle=False)

In [67]:
train_loop(model2, train_loader, valid_loader, optimizer, epochs=400)

train loss 122658.525 valid loss 116732.398 R2 -19.949
train loss 19485.865 valid loss 4544.778 R2 0.215
train loss 11819.015 valid loss 3528.908 R2 0.391
train loss 8991.517 valid loss 2965.841 R2 0.486
train loss 7462.532 valid loss 2661.147 R2 0.537
train loss 6482.899 valid loss 2486.100 R2 0.564
train loss 5792.862 valid loss 2449.183 R2 0.568
train loss 5279.049 valid loss 2421.530 R2 0.570


In [59]:
# hyperparam search results

# EPOCHS:
# 400 selected as close to optimal
# train loss 123146.778 valid loss 117264.086 R2 -20.045
# train loss 19529.198 valid loss 4557.824 R2 0.213
# train loss 11841.983 valid loss 3529.190 R2 0.390
# train loss 9006.410 valid loss 2975.565 R2 0.485
# train loss 7472.548 valid loss 2658.420 R2 0.537
# train loss 6489.845 valid loss 2497.027 R2 0.562
# train loss 5798.061 valid loss 2437.986 R2 0.569
# train loss 5282.371 valid loss 2423.802 R2 0.569
# train loss 4883.059 valid loss 2446.239 R2 0.563
# train loss 4563.317 valid loss 2495.837 R2 0.553
# train loss 4301.058 valid loss 2551.174 R2 0.543
# train loss 4081.709 valid loss 2539.044 R2 0.544
# train loss 3894.907 valid loss 2616.675 R2 0.530
# train loss 3733.773 valid loss 2602.759 R2 0.531
# train loss 3592.785 valid loss 2636.798 R2 0.525
# train loss 3468.017 valid loss 2728.841 R2 0.509

# Learning rate: 0.01
# train loss 93144.997 valid loss 37365.315 R2 -5.702
# train loss 5396.729 valid loss 2490.490 R2 0.559
# train loss 3732.610 valid loss 2454.221 R2 0.560
# train loss 3077.187 valid loss 2524.390 R2 0.547
# train loss 2699.771 valid loss 2494.183 R2 0.552
# train loss 2431.254 valid loss 2577.741 R2 0.536
# train loss 2232.138 valid loss 2651.122 R2 0.523
# train loss 2063.081 valid loss 2694.784 R2 0.516

# Learning rate 0.1
# train loss 29403.618 valid loss 4523.429 R2 0.221
# train loss 3030.731 valid loss 2756.205 R2 0.512
# train loss 2208.557 valid loss 3329.776 R2 0.401
# train loss 1681.966 valid loss 4040.415 R2 0.281
# train loss 1354.280 valid loss 4537.712 R2 0.191
# train loss 1133.233 valid loss 4730.176 R2 0.153
# train loss 969.536 valid loss 4680.087 R2 0.162
# train loss 854.781 valid loss 4665.918 R2 0.171

In [20]:
home = os.path.expanduser( '~' )
with open(home + '/data/test_kaggle.pickle', 'rb') as f:
    test = pickle.load(f) 
df_test = pd.DataFrame.from_dict(test)
test_clean = clean_df(df, df_test, True)

In [34]:
test_embed = df_test['desc'].apply(generate_embeddings)

In [35]:
test = convert_dummies(test_clean)


In [36]:
for col in df_clean.columns:
    if ('loc_string' in col) or 'type_' in col:
        if col not in test.columns:
            test[col] = False

In [37]:
x_test_scale = scaler.transform(test[['m2','bedrooms','bathrooms']])
x_test_loc = test[[col for col in df_clean.columns if ('loc_string' in col) or 'type_' in col]].astype(float).to_numpy()

In [38]:
X = torch.cat((torch.from_numpy(x_test_scale),torch.from_numpy(x_test_loc),torch.cat(test_embed.tolist())),1).float()

In [39]:
yhat = model2(X)

In [120]:
depth = [i for i in range(1,10)]
eta = [0.03,0.04,0.05,0.06,0.07,0.08]
params = []
r2 = []
for i in depth:
    for j in eta:
        model = XGBRegressor(max_depth=i, eta=j)
        # fit model
        model.fit(X_train, y_train)
        yhat = model.predict(X_test)
        params.append((i,j))
        r2.append(r2_score(y_test, yhat))

In [121]:
xgb = pd.DataFrame(list(zip(params, r2)), columns =['params', 'r2'])
xgb

Unnamed: 0,params,r2
0,"(1, 0.03)",0.481014
1,"(1, 0.04)",0.512584
2,"(1, 0.05)",0.531326
3,"(1, 0.06)",0.542734
4,"(1, 0.07)",0.550384
5,"(1, 0.08)",0.556924
6,"(2, 0.03)",0.531595
7,"(2, 0.04)",0.547445
8,"(2, 0.05)",0.548821
9,"(2, 0.06)",0.561329


In [122]:
from xgboost import XGBRegressor
model = XGBRegressor(max_depth=4, eta=0.08)
# fit model
model.fit(X_train, y_train)

In [123]:
for i in range(11):
    w1 = i/10
    w2 = 1-w1
    y_pred = np.average( np.array([ model2(X_test).detach().numpy().flatten(), model.predict(X_test) ]), weights = [w1,w2], axis=0 )
    print(w1, w2, r2_score(y_test, y_pred))

0.0 1.0 0.5722461616302965
0.1 0.9 0.5874523222289074
0.2 0.8 0.5990462410810272
0.3 0.7 0.6070279181866559
0.4 0.6 0.6113973535457935
0.5 0.5 0.6121545471584398
0.6 0.4 0.609299499024595
0.7 0.30000000000000004 0.6028322091442591
0.8 0.19999999999999996 0.5927526775174319
0.9 0.09999999999999998 0.5790609041441137
1.0 0.0 0.5617568890243043


In [106]:
p1 = torch.reshape(torch.from_numpy(model.predict(X_train)), (-1,1))
p2 = model2(X_train)
t1 = torch.reshape(torch.from_numpy(model.predict(X_test)), (-1,1))
t2 = model2(X_test)

In [111]:
xtrain_mod = torch.cat((X_train, p1, p2), axis=1)
xtest_mod = torch.cat((X_test, t1, t2), axis=1)

In [117]:
depth = [i for i in range(1,10)]
eta = [0.03,0.04,0.05,0.06,0.07,0.08]
params = []
r2 = []
for i in depth:
    for j in eta:
        model3 = XGBRegressor(max_depth=i, eta=j)
        # fit model
        model3.fit(xtrain_mod.detach(), y_train)
        yhat = model3.predict(xtest_mod.detach())
        params.append((i,j))
        r2.append(r2_score(y_test, yhat))

In [118]:
xgb = pd.DataFrame(list(zip(params, r2)), columns =['params', 'r2'])
xgb

Unnamed: 0,params,r2
0,"(1, 0.03)",0.523491
1,"(1, 0.04)",0.523231
2,"(1, 0.05)",0.520601
3,"(1, 0.06)",0.518823
4,"(1, 0.07)",0.516524
5,"(1, 0.08)",0.516309
6,"(2, 0.03)",0.519511
7,"(2, 0.04)",0.514189
8,"(2, 0.05)",0.51087
9,"(2, 0.06)",0.505869


In [124]:
y_pred = np.mean( np.array([ model2(X).detach().numpy().flatten(), model.predict(X) ]), axis=0 )

In [125]:
r2_score(y_test, model.predict(X_test))

0.5722461616302965

In [126]:
y_pred

array([353.94666, 348.92538, 277.738  , 320.78235, 359.33618, 353.0382 ,
       386.13763, 216.6109 , 274.40436, 271.8137 , 384.7299 , 424.28333,
       422.5855 , 377.6869 , 384.96637, 333.08768, 303.48062, 392.3758 ,
       437.54016, 311.47696, 297.18857, 337.53143, 278.37848, 395.42105,
       267.13434, 288.36646, 384.24115, 340.2953 , 372.06427, 428.0089 ,
       335.58264, 335.342  , 380.8922 , 328.6947 , 428.77374, 306.6346 ,
       402.1638 , 308.71243, 392.58234, 336.66583, 404.05518, 380.8922 ,
       282.18964, 296.92126, 386.2401 , 421.7724 , 316.34372, 406.56763,
       357.77533, 409.98975, 296.8744 , 383.26422, 321.62006, 289.99567,
       392.80322, 233.72089, 441.32166, 352.04346, 308.06406, 388.92972,
       242.63597, 335.43018, 353.51404, 278.88757, 335.42047, 380.60223,
       355.32336, 371.696  , 374.633  , 359.693  , 340.68958, 328.21692,
       398.20288, 403.3898 , 276.4377 , 328.81512, 357.14282, 316.52673,
       335.91455, 346.8915 , 420.49683, 364.1969 , 

In [127]:
out = pd.DataFrame(y_pred)
out

Unnamed: 0,0
0,353.946655
1,348.925385
2,277.738007
3,320.782349
4,359.336182
...,...
127,383.985291
128,315.334839
129,309.582581
130,306.902405


In [128]:
out = out.rename(columns={0: 'price'})
out.index.names = ['id']
out.to_csv('solution.csv')