In [24]:
import copy

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

import pandas as pd
from tqdm import tqdm
import numpy as np

import hopsworks

device = 'cuda' if torch.cuda.is_available() else 'cpu'
# device = 'cpu'

In [25]:
from feature_processing import load_text_encoder, to_embedding

In [26]:
class Model(nn.Module):
    def __init__(self):
        super().__init__()
        self.ll1 = nn.Linear(768, 1024)
        self.bn1 = nn.BatchNorm1d(1024)
        self.elu1 = nn.ELU()
        self.ll2 = nn.Linear(1024, 512)
        self.bn2 = nn.BatchNorm1d(512)
        self.elu2 = nn.ELU()
        self.llf = nn.Linear(512, 2)
        # self.bn3 = nn.BatchNorm1d(124),
        # self.elu3 = nn.ELU()
        # self.llf = nn.Linear(124, 2)
        
    def forward(self, x):
        x = self.elu1(self.bn1(self.ll1(x)))
        x = self.elu2(self.bn2(self.ll2(x)))
        print(x.shape)
        x = self.llf(x)
        return x

In [13]:
model_1 = nn.Sequential(
    nn.Linear(768, 360),
    nn.BatchNorm1d(360),
    nn.ELU(),
    nn.Linear(360, 124),
    nn.BatchNorm1d(124),
    nn.ELU(),
    nn.Linear(124, 2),
    # nn.ELU(),
    # nn.Linear(128, 2)
).to(device)

In [14]:
model_2 = copy.deepcopy(model_1)

In [15]:
def rearray(arr_str):
    arr_str = arr_str.strip("'").replace('\n', '').replace('[', '').replace(']', '').split()
    numpy_array = np.array(arr_str, dtype=float)
    return numpy_array

In [20]:
import re

def extract_words_from_link(link):
    # Match alphanumeric sequences
    url_str = ""
    words = re.findall(r'\b\w+\b', link)
    remove_list = ['https', 'http', 'www']
    final_words = [w for w in words if not(w in remove_list)]
    for w in final_words:
        url_str += w + " "
    return url_str

In [21]:
# class DfDataset(Dataset):
#     def __init__(self, df, col):
#         self.df = df
#         self.col = col
    
#     def __len__(self):
#         return len(self.df)
    
#     def __getitem__(self, idx):
#         val = self.df[self.col].iloc[idx]
#         reg_lbl = self.df['score'].iloc[idx]
#         if reg_lbl <= 1:
#             cls_lbl = 0
#             reg_lbl = reg_lbl
#         else:
#             cls_lbl = 1
#             reg_lbl = reg_lbl / 2800
#         arr = rearray(val)
#         return arr, cls_lbl, reg_lbl
    
from feature_processing import to_embedding
class DfDataset(Dataset):
    def __init__(self, df):
        self.df = df[['title', 'url', 'score']]
    
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        title = [self.df['title'].iloc[idx]]
        url = [self.df['url'].iloc[idx]]
        score = self.df['score'].iloc[idx]/2800
        
        title_embedding = torch.tensor(to_embedding(title))
        url_embedding = torch.tensor(to_embedding(url))
        
        print(title_embedding.shape)
        print(url_embedding.shape)
        return title_embedding, url_embedding, score
        

In [22]:
df_c = pd.read_csv('../data/compiled.csv')
df_c = df_c.sample(frac=1).reset_index(drop=True)

FileNotFoundError: [Errno 2] No such file or directory: '../data/compiled.csv'

In [23]:
train_df, val_df = df_c[:70000], df_c[70000:]
train_title_df = train_df[['title', 'score']]
val_title_df = val_df[['title', 'score']]

train_url_df = train_df[['url', 'score']]
val_url_df = val_df[['url', 'score']]

NameError: name 'df_c' is not defined

In [19]:
train_ds, val_ds = DfDataset(train_title_df, col='title'), DfDataset(val_title_df, col='title')

In [20]:
train_loader = DataLoader(train_ds, batch_size=12, 
                          num_workers=2, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=12,
                        num_workers=2, shuffle=True)

In [21]:
epochs = 500
optimizer = optim.AdamW(model_1.parameters(), lr=1e-5)
lr_scheduler = optim.lr_scheduler.OneCycleLR(optimizer, max_lr=0.01, epochs=500, steps_per_epoch=len(train_loader))
mse_loss = nn.MSELoss()
bce_loss = nn.BCELoss()
def loss_fn(output, Y):
    cls_lbl, reg_lbl = Y[0], Y[1]
    cls_op, reg_op = F.sigmoid(output[:, 0]), output[:, 1]
    bce_l = bce_loss(cls_op, cls_lbl)
    mse_l = mse_loss(reg_op * cls_lbl, reg_lbl *  cls_lbl)
    return bce_l + mse_l

In [22]:
for epoch in range(epochs):
    tr_loss_per_batch = []
    val_loss_per_batch = []
    for sample in train_loader:
        X, Y_cls, Y_reg = sample
        X, Y_cls, Y_reg = X.to(torch.float32).to(device), Y_cls.to(torch.float32).to(device), Y_reg.to(torch.float32).to(device)
        target = model_1(X)
        loss = loss_fn(target, [Y_cls, Y_reg])
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        tr_loss_per_batch.append(loss.item())
        lr_scheduler.step()
    with torch.no_grad():
        for sample in val_loader:
            X, Y_cls, Y_reg = sample
            X, Y_cls, Y_reg = X.to(torch.float32).to(device), Y_cls.to(torch.float32).to(device), Y_reg.to(torch.float32).to(device)
            target = model_1(X)
            loss = loss_fn(target, [Y_cls, Y_reg])
            val_loss_per_batch.append(loss.item())
            
    print(f"Epoch: {epoch+1}/{epochs}")
    print(f"Training loss: {np.mean(tr_loss_per_batch)} Validation Loss: {np.mean(val_loss_per_batch)}")

Epoch: 1/500
Training loss: 0.652201490080189 Validation Loss: 0.6487126372915377
Epoch: 2/500
Training loss: 0.6419460509619894 Validation Loss: 0.6399197356878139
Epoch: 3/500
Training loss: 0.6388908415767155 Validation Loss: 0.6419165067964321
Epoch: 4/500
Training loss: 0.6360472890155382 Validation Loss: 0.6393506557321091
Epoch: 5/500
Training loss: 0.6338058447016401 Validation Loss: 0.6411273953297155
Epoch: 6/500
Training loss: 0.6318212653068717 Validation Loss: 0.6365471083840593
Epoch: 7/500
Training loss: 0.6300207763746826 Validation Loss: 0.6364051406975273
Epoch: 8/500
Training loss: 0.6272066328668169 Validation Loss: 0.6374095342785334
Epoch: 9/500
Training loss: 0.627186828278539 Validation Loss: 0.6356038334701273
Epoch: 10/500
Training loss: 0.625744749753288 Validation Loss: 0.6388505454972494
Epoch: 11/500
Training loss: 0.6239972193998387 Validation Loss: 0.6349751482264315
Epoch: 12/500
Training loss: 0.6222809226567262 Validation Loss: 0.6374529532605796
Epoc

KeyboardInterrupt: 