In [35]:
import copy

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

import pandas as pd
from tqdm import tqdm
import numpy as np

import hopsworks

# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
# device = 'cpu'

In [36]:
# import os
# HOPSWORKS_API_KEY = os.environ.get('HOPSWORKS_API_KEY')

In [37]:
# project = hopsworks.login(project='id2223_enric', api_key_value=HOPSWORKS_API_KEY)
# fs = project.get_feature_store()

In [38]:
# fs.get_feature_group("hackernews_fg")

In [39]:
# Load the combined dataset and keep a random half of its rows.
# The subsample is seeded so that the positional train/validation split made
# further down selects the same rows on every Restart & Run All.
SEED = 42
df_c = pd.read_csv('../data/pd_combined.csv')
df_c = df_c.sample(frac=0.50, random_state=SEED).reset_index(drop=True)
# Replace every missing value (in any column) with a single space so that the
# text columns are always strings.
# NOTE(review): this also turns a missing 'score' into " ", which would fail
# when the dataset divides the score by 280 -- confirm 'score' has no NaNs.
df_c = df_c.fillna(value=" ")

In [None]:
from feature_processing import load_text_encoder, to_embedding

In [None]:
class Model(nn.Module):
    """Two-layer MLP that scores a pair of 768-d embeddings with one scalar.

    The input must be shaped (batch, 2, 768): the linear layers act on the
    last dimension and both ``BatchNorm1d(2)`` layers treat dimension 1
    (size 2) as the channel axis. The two channels are summed before the
    final projection, so the output is shaped (batch, 1).
    """

    def __init__(self):
        super().__init__()
        # First hidden stage: 768 -> 1024, normalised per channel, ELU.
        self.ll1 = nn.Linear(768, 1024)
        self.bn1 = nn.BatchNorm1d(2)
        self.elu1 = nn.ELU()
        # Second hidden stage: 1024 -> 512, normalised per channel, ELU.
        self.ll2 = nn.Linear(1024, 512)
        self.bn2 = nn.BatchNorm1d(2)
        self.elu2 = nn.ELU()
        # Output head: 512 -> 1.
        self.llf = nn.Linear(512, 1)

    def forward(self, x):
        """Return one raw (unbounded) score per batch element."""
        hidden = self.ll1(x)
        hidden = self.bn1(hidden)
        hidden = self.elu1(hidden)

        hidden = self.ll2(hidden)
        hidden = self.bn2(hidden)
        hidden = self.elu2(hidden)

        # Collapse the size-2 channel axis by summation: (B, 2, 512) -> (B, 512).
        pooled = torch.sum(hidden, dim=1)
        return self.llf(pooled)

In [None]:
# Instantiate the network and move its parameters to the selected device.
# model_1 is the instance the optimizer and the training loop operate on.
model_1 = Model().to(device)

In [None]:
# Independent deep copy of the freshly initialised model_1 (same weights).
# NOTE(review): model_2 is not used in any cell visible in this notebook.
model_2 = copy.deepcopy(model_1)

In [None]:
def rearray(arr_str):
    """Parse the printed form of a NumPy array back into a float array.

    Surrounding single quotes are stripped, every newline and square bracket
    is deleted, and the remaining whitespace-separated tokens are converted
    to ``float``.

    Args:
        arr_str: Text such as ``"[1.0 2.5\\n 3.]"``.

    Returns:
        A 1-D ``numpy.ndarray`` with dtype ``float``.
    """
    # One translation table removes all three characters the original
    # string form adds around the numbers.
    drop_chars = str.maketrans('', '', '\n[]')
    tokens = arr_str.strip("'").translate(drop_chars).split()
    return np.array(tokens, dtype=float)

In [None]:
import re

def extract_words_from_link(link):
    """Turn a URL into a space-separated string of its word tokens.

    Every run of word characters is extracted from ``link``; the tokens
    ``https``, ``http`` and ``www`` (exact, case-sensitive matches) are
    dropped. Each kept token is followed by a single space, so a non-empty
    result always ends with a trailing space.

    Args:
        link: The URL (or any string) to tokenise.

    Returns:
        The kept tokens joined as ``"tok1 tok2 ... "``, or ``""`` when
        nothing is kept.
    """
    ignored = ('https', 'http', 'www')
    tokens = re.findall(r'\b\w+\b', link)
    return "".join(tok + " " for tok in tokens if tok not in ignored)

In [None]:
# class DfDataset(Dataset):
#     def __init__(self, df, col):
#         self.df = df
#         self.col = col
    
#     def __len__(self):
#         return len(self.df)
    
#     def __getitem__(self, idx):
#         val = self.df[self.col].iloc[idx]
#         reg_lbl = self.df['score'].iloc[idx]
#         if reg_lbl <= 1:
#             cls_lbl = 0
#             reg_lbl = reg_lbl
#         else:
#             cls_lbl = 1
#             reg_lbl = reg_lbl / 2800
#         arr = rearray(val)
#         return arr, cls_lbl, reg_lbl
    
from feature_processing import to_embedding
class DfDataset(Dataset):
    """Dataset that embeds each row's title and URL on access.

    Only the 'title', 'url' and 'score' columns of the given frame are kept.
    Embeddings are computed with ``to_embedding`` every time an item is
    fetched; nothing is cached.
    """

    def __init__(self, df):
        wanted_columns = ['title', 'url', 'score']
        self.df = df[wanted_columns]

    def __len__(self):
        return self.df.shape[0]

    def __getitem__(self, idx):
        """Return ``(embeddings, score)`` for the row at position ``idx``.

        ``embeddings`` is the title embedding stacked on the URL-words
        embedding along dim 0, passed through a softmax over that same
        dimension. ``score`` is the row's score divided by 280.
        """
        # to_embedding is given a one-element list for each text field.
        title_batch = [self.df['title'].iloc[idx]]
        url_batch = [extract_words_from_link(self.df['url'].iloc[idx])]
        scaled_score = self.df['score'].iloc[idx] / 280

        title_vec = to_embedding(title_batch)
        url_vec = to_embedding(url_batch)
        # assumes each embedding is (1, dim), giving a (2, dim) stack -- TODO confirm
        stacked = torch.cat([title_vec, url_vec], dim=0)
        return F.softmax(stacked, dim=0), scaled_score
        

In [None]:
# Positional split: the first 70000 rows train the model, the rest validate it.
# NOTE(review): 70000 is hard-coded; if df_c has 70000 rows or fewer, val_df is empty.
train_df, val_df = df_c[:70000], df_c[70000:]

In [None]:
# Wrap each split in a DfDataset, which yields (embeddings, scaled score) pairs.
train_ds, val_ds = DfDataset(train_df), DfDataset(val_df)

In [None]:
# Batch both splits in groups of 12, loading with 2 worker processes each.
# NOTE(review): shuffle=True on the validation loader only changes which rows
# share a batch; shuffle=False would make evaluation order reproducible.
train_loader = DataLoader(train_ds, batch_size=12, 
                          num_workers=2, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=12,
                        num_workers=2, shuffle=True)

In [None]:
# Training schedule. `epochs` is the single source of truth for the run
# length: the scheduler is sized from it, so changing `epochs` keeps the
# one-cycle schedule and the training loop in step.
epochs = 500
optimizer = optim.AdamW(model_1.parameters(), lr=1e-5)
# The scheduler is stepped once per training batch, hence steps_per_epoch.
# NOTE(review): OneCycleLR appears to derive its own starting rate from
# max_lr, so the lr=1e-5 passed to AdamW is probably not the rate actually
# used -- confirm against the OneCycleLR documentation.
lr_scheduler = optim.lr_scheduler.OneCycleLR(
    optimizer,
    max_lr=0.01,
    epochs=epochs,
    steps_per_epoch=len(train_loader),
)
# Mean-reduced criteria shared by loss_fn below and by the training loop.
mse_loss = nn.MSELoss(reduction='mean')
bce_loss = nn.BCELoss(reduction='mean')


def loss_fn(output, Y):
    """Combined classification + masked regression loss for a two-column output.

    Args:
        output: Tensor indexed as ``output[:, 0]`` (classification logit) and
            ``output[:, 1]`` (regression prediction).
        Y: Indexable pair; ``Y[0]`` is the class label and ``Y[1]`` the
            regression label.

    Returns:
        Binary cross-entropy on the sigmoid of the logit, plus the MSE of the
        regression prediction against its label, with both multiplied by the
        class label first (so rows whose class label is 0 add no regression
        error).
    """
    class_label = Y[0]
    value_label = Y[1]

    class_prob = torch.sigmoid(output[:, 0])
    value_pred = output[:, 1]

    classification_term = bce_loss(class_prob, class_label)
    regression_term = mse_loss(value_pred * class_label, value_label * class_label)
    return classification_term + regression_term

In [None]:
# Train model_1 for `epochs` passes, validating after each one.
# Fixes relative to the previous version of this cell:
#   * the model is switched between train() and eval() so the BatchNorm layers
#     use (and stop updating) their running statistics during validation;
#   * the validation prediction is squeezed to shape (B,) like the training
#     one -- comparing (B, 1) with (B,) broadcasts to (B, B) and reports a
#     wrong loss;
#   * squeeze(-1) is used instead of squeeze() so a final batch of size 1
#     keeps its batch dimension.
for epoch in range(epochs):
    tr_loss_per_batch = []
    val_loss_per_batch = []

    # ---- training pass ----
    model_1.train()
    for X, Y in tqdm(train_loader):
        X, Y = X.to(torch.float32).to(device), Y.to(torch.float32).to(device)
        pred = model_1(X).squeeze(-1)  # (B, 1) -> (B,)
        loss = mse_loss(pred, Y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        tr_loss_per_batch.append(loss.item())
        # The one-cycle schedule is defined per batch, not per epoch.
        lr_scheduler.step()

    # ---- validation pass (no gradients, BatchNorm in inference mode) ----
    model_1.eval()
    with torch.no_grad():
        for X, Y in tqdm(val_loader):
            X, Y = X.to(torch.float32).to(device), Y.to(torch.float32).to(device)
            pred = model_1(X).squeeze(-1)  # match the (B,) label shape
            loss = mse_loss(pred, Y)
            val_loss_per_batch.append(loss.item())
            
    print(f"Epoch: {epoch+1}/{epochs}")
    print(f"Training loss: {np.mean(tr_loss_per_batch)} Validation Loss: {np.mean(val_loss_per_batch)}")

  2%|▏         | 90/5834 [03:40<3:54:36,  2.45s/it]


KeyboardInterrupt: 