In [1]:
import copy

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

from sklearn.model_selection import train_test_split
import pandas as pd
from tqdm import tqdm
import numpy as np

import hopsworks

device = 'cuda' if torch.cuda.is_available() else 'cpu'
# device = 'cpu'

%load_ext dotenv
%dotenv

In [2]:
project = hopsworks.login(project='id2223_enric')
fs = project.get_feature_store()

Connected. Call `.close()` to terminate connection gracefully.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/197783
Connected. Call `.close()` to terminate connection gracefully.


In [3]:
hackernews_fg = fs.get_feature_group("hackernews_fg", 1)
query = hackernews_fg.select_all()
feature_view = fs.get_or_create_feature_view(name="hackernews_fv",
                                  version=1,
                                  description="Hackernews feature view",
                                  labels=["score"],
                                  query=query)

In [5]:
num_samples = 1000
bins = np.linspace(df['score'].min(), df['score'].max(), num_samples + 1)
df['score_bin'] = pd.cut(df['score'], bins=bins, labels=False, include_lowest=True)
df_sampled = df.groupby('score_bin', group_keys=False).apply(lambda x: x.sample(1))
df_sampled = df_sampled.drop(columns=['score_bin'])
df = df_sampled

Finished: Reading data from Hopsworks, using ArrowFlight (7.77s) 




[              id                                              title  \
0        6196387  In Star Trek, does the transporter conserve th...   
3        5258770     'Get Me a Hair Appointment and Empty My Inbox'   
4       11296156  TripMode for Windows – Limit cellular hotspot ...   
5        2903935                     How To: Get Out of Working Rut   
6       20797513        Debating the Cryptographic Autonomy License   
...          ...                                                ...   
140035        64  Largest archive of online books about religion...   
140036       -60                        AI Is Already Killing Books   
140038         0                                      Book of Kells   
140039       -32  Fortran vs Python: The counter-intuitive rise ...   
140042        86              Alaska Airlines grounds 737 Max fleet   

                                                      url          time  \
0               http://scifi.stackexchange.com/q/39295/63  1.376256e+09

In [6]:
from feature_processing import load_text_encoder, to_embedding

In [7]:
class Model(nn.Module):
    def __init__(self):
        super().__init__()
        self.ll1 = nn.Linear(768, 1024)
        self.bn1 = nn.BatchNorm1d(2)
        self.elu1 = nn.ELU()
        self.ll2 = nn.Linear(1024, 512)
        self.bn2 = nn.BatchNorm1d(2)
        self.elu2 = nn.ELU()
        self.llf = nn.Linear(512, 1)
        
    def forward(self, x):
        x = self.elu1(self.bn1(self.ll1(x)))
        x = self.elu2(self.bn2(self.ll2(x)))
        x = torch.sum(x, dim=1)
        x = self.llf(x)
        return x

In [8]:
model_1 = Model().to(device)

In [9]:
model_2 = copy.deepcopy(model_1)

In [10]:
def rearray(arr_str):
    arr_str = arr_str.strip("'").replace('\n', '').replace('[', '').replace(']', '').split()
    numpy_array = np.array(arr_str, dtype=float)
    return numpy_array

In [12]:
import re

def extract_words_from_link(link):
    # Match alphanumeric sequences
    url_str = ""
    words = re.findall(r'\b\w+\b', link)
    remove_list = ['https', 'http', 'www']
    final_words = [w for w in words if not(w in remove_list)]
    for w in final_words:
        url_str += w + " "
    return url_str

In [13]:
# class DfDataset(Dataset):
#     def __init__(self, df, col):
#         self.df = df
#         self.col = col
    
#     def __len__(self):
#         return len(self.df)
    
#     def __getitem__(self, idx):
#         val = self.df[self.col].iloc[idx]
#         reg_lbl = self.df['score'].iloc[idx]
#         if reg_lbl <= 1:
#             cls_lbl = 0
#             reg_lbl = reg_lbl
#         else:
#             cls_lbl = 1
#             reg_lbl = reg_lbl / 2800
#         arr = rearray(val)
#         return arr, cls_lbl, reg_lbl
    
from feature_processing import to_embedding
class DfDataset(Dataset):
    def __init__(self, df):
        self.df = df[['title', 'url', 'score']]
    
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        title = [self.df['title'].iloc[idx]]
        url = [extract_words_from_link(self.df['url'].iloc[idx])]
        score = self.df['score'].iloc[idx]/280
        
        title_embedding = to_embedding(title)
        url_embedding = to_embedding(url)
        embeddings = torch.cat([title_embedding, url_embedding], dim=0)
        embeddings = F.softmax(embeddings, dim=0)
        return embeddings, score
        

In [15]:
train_df, val_df = train_test_split(df, test_size=0.1)

In [16]:
train_ds, val_ds = DfDataset(train_df), DfDataset(val_df)

In [17]:
train_loader = DataLoader(train_ds, batch_size=12, 
                          num_workers=2, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=12,
                        num_workers=2, shuffle=True)

In [19]:
epochs = 500
optimizer = optim.AdamW(model_1.parameters(), lr=1e-5)
lr_scheduler = optim.lr_scheduler.OneCycleLR(optimizer, max_lr=0.01, epochs=500, steps_per_epoch=len(train_loader))
mse_loss = nn.MSELoss()
bce_loss = nn.BCELoss()
def loss_fn(output, Y):
    cls_lbl, reg_lbl = Y[0], Y[1]
    cls_op, reg_op = F.sigmoid(output[:, 0]), output[:, 1]
    bce_l = bce_loss(cls_op, cls_lbl)
    mse_l = mse_loss(reg_op * cls_lbl, reg_lbl *  cls_lbl)
    return bce_l + mse_l

In [20]:
for epoch in range(epochs):
    tr_loss_per_batch = []
    val_loss_per_batch = []
    for sample in tqdm(train_loader):
        X, Y = sample
        X, Y = X.to(torch.float32).to(device), Y.to(torch.float32).to(device)
        target = model_1(X)
        loss = mse_loss(target.squeeze(), Y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        tr_loss_per_batch.append(loss.item())
        lr_scheduler.step()
    with torch.no_grad():
        for sample in tqdm(val_loader):
            X, Y = sample
            X, Y = X.to(torch.float32).to(device), Y.to(torch.float32).to(device)
            target = model_1(X)
            loss = mse_loss(target, Y)
            val_loss_per_batch.append(loss.item())
            
    print(f"Epoch: {epoch+1}/{epochs}")
    print(f"Training loss: {np.mean(tr_loss_per_batch)} Validation Loss: {np.mean(val_loss_per_batch)}")

100%|██████████| 27/27 [01:04<00:00,  2.39s/it]
100%|██████████| 3/3 [00:10<00:00,  3.54s/it]


Epoch: 1/500
Training loss: 13.85665206776725 Validation Loss: 3.585315704345703


100%|██████████| 27/27 [01:01<00:00,  2.29s/it]
100%|██████████| 3/3 [00:08<00:00,  2.99s/it]


Epoch: 2/500
Training loss: 3.3704053119376853 Validation Loss: 3.551585594813029


100%|██████████| 27/27 [01:01<00:00,  2.27s/it]
100%|██████████| 3/3 [00:09<00:00,  3.02s/it]


Epoch: 3/500
Training loss: 3.0964578964092113 Validation Loss: 3.294563035170237


100%|██████████| 27/27 [01:02<00:00,  2.30s/it]
100%|██████████| 3/3 [00:08<00:00,  2.98s/it]


Epoch: 4/500
Training loss: 2.5227972997559442 Validation Loss: 3.147269527117411


100%|██████████| 27/27 [01:01<00:00,  2.29s/it]
100%|██████████| 3/3 [00:08<00:00,  3.00s/it]


Epoch: 5/500
Training loss: 2.633529824239236 Validation Loss: 3.0897798935572305


100%|██████████| 27/27 [01:02<00:00,  2.31s/it]
100%|██████████| 3/3 [00:09<00:00,  3.23s/it]


Epoch: 6/500
Training loss: 2.685106619640633 Validation Loss: 3.031854510307312


100%|██████████| 27/27 [01:03<00:00,  2.34s/it]
100%|██████████| 3/3 [00:09<00:00,  3.03s/it]


Epoch: 7/500
Training loss: 2.749568396144443 Validation Loss: 3.2761598428090415


100%|██████████| 27/27 [01:02<00:00,  2.33s/it]
100%|██████████| 3/3 [00:08<00:00,  2.92s/it]


Epoch: 8/500
Training loss: 2.858276888176247 Validation Loss: 3.5787360270818076


100%|██████████| 27/27 [01:01<00:00,  2.29s/it]
100%|██████████| 3/3 [00:08<00:00,  2.92s/it]


Epoch: 9/500
Training loss: 3.219050963719686 Validation Loss: 3.20870574315389


100%|██████████| 27/27 [01:02<00:00,  2.31s/it]
100%|██████████| 3/3 [00:09<00:00,  3.00s/it]


Epoch: 10/500
Training loss: 2.908951030837165 Validation Loss: 3.87276283899943


100%|██████████| 27/27 [01:02<00:00,  2.30s/it]
100%|██████████| 3/3 [00:08<00:00,  2.99s/it]


Epoch: 11/500
Training loss: 2.8415291044447155 Validation Loss: 3.1664156119028726


100%|██████████| 27/27 [01:03<00:00,  2.37s/it]
100%|██████████| 3/3 [00:09<00:00,  3.10s/it]


Epoch: 12/500
Training loss: 2.710576366495203 Validation Loss: 3.120285073916117


100%|██████████| 27/27 [01:01<00:00,  2.29s/it]
100%|██████████| 3/3 [00:08<00:00,  2.96s/it]


Epoch: 13/500
Training loss: 2.6788518406726696 Validation Loss: 3.272860328356425


100%|██████████| 27/27 [01:02<00:00,  2.31s/it]
100%|██████████| 3/3 [00:08<00:00,  3.00s/it]


Epoch: 14/500
Training loss: 2.6753017284252025 Validation Loss: 3.095897356669108


100%|██████████| 27/27 [01:02<00:00,  2.33s/it]
100%|██████████| 3/3 [00:09<00:00,  3.06s/it]


Epoch: 15/500
Training loss: 2.7082230779859753 Validation Loss: 3.1101580460866294


100%|██████████| 27/27 [01:02<00:00,  2.30s/it]
100%|██████████| 3/3 [00:09<00:00,  3.01s/it]


Epoch: 16/500
Training loss: 2.8006770787415682 Validation Loss: 3.1490522623062134


100%|██████████| 27/27 [01:03<00:00,  2.35s/it]
100%|██████████| 3/3 [00:09<00:00,  3.21s/it]


Epoch: 17/500
Training loss: 2.6450834848262645 Validation Loss: 3.01206107934316


100%|██████████| 27/27 [01:02<00:00,  2.31s/it]
100%|██████████| 3/3 [00:09<00:00,  3.07s/it]


Epoch: 18/500
Training loss: 2.5891454882091947 Validation Loss: 3.1453728477160134


100%|██████████| 27/27 [01:02<00:00,  2.30s/it]
100%|██████████| 3/3 [00:08<00:00,  2.97s/it]


Epoch: 19/500
Training loss: 2.7218753739639565 Validation Loss: 3.011244614919027


100%|██████████| 27/27 [01:02<00:00,  2.30s/it]
100%|██████████| 3/3 [00:08<00:00,  2.96s/it]


Epoch: 20/500
Training loss: 2.7542148055853666 Validation Loss: 3.108887791633606


100%|██████████| 27/27 [01:01<00:00,  2.29s/it]
100%|██████████| 3/3 [00:09<00:00,  3.05s/it]


Epoch: 21/500
Training loss: 2.732511398968873 Validation Loss: 3.0114109913508096


100%|██████████| 27/27 [01:02<00:00,  2.31s/it]
100%|██████████| 3/3 [00:10<00:00,  3.55s/it]


Epoch: 22/500
Training loss: 2.7968618913933083 Validation Loss: 3.6678649187088013


100%|██████████| 27/27 [01:02<00:00,  2.30s/it]
100%|██████████| 3/3 [00:08<00:00,  2.98s/it]


Epoch: 23/500
Training loss: 2.716928565943683 Validation Loss: 3.7217389742533364


100%|██████████| 27/27 [01:04<00:00,  2.37s/it]
100%|██████████| 3/3 [00:08<00:00,  2.95s/it]


Epoch: 24/500
Training loss: 2.803291791015201 Validation Loss: 3.2440579334894815


100%|██████████| 27/27 [01:02<00:00,  2.30s/it]
100%|██████████| 3/3 [00:08<00:00,  2.96s/it]


Epoch: 25/500
Training loss: 2.779134311057903 Validation Loss: 2.998934745788574


100%|██████████| 27/27 [01:02<00:00,  2.30s/it]
100%|██████████| 3/3 [00:08<00:00,  2.99s/it]


Epoch: 26/500
Training loss: 2.628841604347582 Validation Loss: 3.0650745232899985


100%|██████████| 27/27 [01:02<00:00,  2.31s/it]
100%|██████████| 3/3 [00:10<00:00,  3.55s/it]


Epoch: 27/500
Training loss: 2.662438538339403 Validation Loss: 3.200525999069214


100%|██████████| 27/27 [01:02<00:00,  2.31s/it]
100%|██████████| 3/3 [00:08<00:00,  2.95s/it]


Epoch: 28/500
Training loss: 2.6280510635287673 Validation Loss: 3.192010283470154


100%|██████████| 27/27 [01:01<00:00,  2.28s/it]
100%|██████████| 3/3 [00:09<00:00,  3.00s/it]


Epoch: 29/500
Training loss: 2.7524828160250627 Validation Loss: 3.0805430809656777


100%|██████████| 27/27 [01:01<00:00,  2.28s/it]
100%|██████████| 3/3 [00:08<00:00,  2.99s/it]


Epoch: 30/500
Training loss: 2.574722916991622 Validation Loss: 3.0820460319519043


100%|██████████| 27/27 [01:02<00:00,  2.30s/it]
100%|██████████| 3/3 [00:08<00:00,  2.95s/it]


Epoch: 31/500
Training loss: 3.104647832888144 Validation Loss: 3.169434408346812


100%|██████████| 27/27 [01:01<00:00,  2.27s/it]
100%|██████████| 3/3 [00:09<00:00,  3.24s/it]


Epoch: 32/500
Training loss: 2.911080007199888 Validation Loss: 3.6468025843302407


100%|██████████| 27/27 [01:01<00:00,  2.30s/it]
100%|██████████| 3/3 [00:08<00:00,  2.96s/it]


Epoch: 33/500
Training loss: 2.9830464875256575 Validation Loss: 3.1277193625768027


100%|██████████| 27/27 [01:01<00:00,  2.28s/it]
100%|██████████| 3/3 [00:08<00:00,  2.91s/it]


Epoch: 34/500
Training loss: 2.743401456762243 Validation Loss: 3.1260719299316406


100%|██████████| 27/27 [01:02<00:00,  2.32s/it]
100%|██████████| 3/3 [00:08<00:00,  2.99s/it]


Epoch: 35/500
Training loss: 2.807856358863689 Validation Loss: 3.216425657272339


100%|██████████| 27/27 [01:02<00:00,  2.30s/it]
100%|██████████| 3/3 [00:08<00:00,  2.90s/it]


Epoch: 36/500
Training loss: 3.1046393469527915 Validation Loss: 3.373560150464376


100%|██████████| 27/27 [01:01<00:00,  2.29s/it]
100%|██████████| 3/3 [00:08<00:00,  2.96s/it]


Epoch: 37/500
Training loss: 2.626341340718446 Validation Loss: 3.0548277695973716


100%|██████████| 27/27 [01:00<00:00,  2.25s/it]
100%|██████████| 3/3 [00:10<00:00,  3.63s/it]


Epoch: 38/500
Training loss: 2.6495977882985717 Validation Loss: 3.2027082045873008


100%|██████████| 27/27 [01:01<00:00,  2.28s/it]
100%|██████████| 3/3 [00:08<00:00,  2.96s/it]


Epoch: 39/500
Training loss: 2.874914050102234 Validation Loss: 3.085273543993632


100%|██████████| 27/27 [01:01<00:00,  2.29s/it]
100%|██████████| 3/3 [00:08<00:00,  2.93s/it]


Epoch: 40/500
Training loss: 3.0121877325905695 Validation Loss: 3.1983945965766907


100%|██████████| 27/27 [01:01<00:00,  2.28s/it]
100%|██████████| 3/3 [00:08<00:00,  2.91s/it]


Epoch: 41/500
Training loss: 2.667955504523383 Validation Loss: 3.138283848762512


100%|██████████| 27/27 [01:01<00:00,  2.27s/it]
100%|██████████| 3/3 [00:08<00:00,  2.96s/it]


Epoch: 42/500
Training loss: 2.764554230151353 Validation Loss: 3.26604962348938


100%|██████████| 27/27 [01:01<00:00,  2.29s/it]
100%|██████████| 3/3 [00:09<00:00,  3.05s/it]


Epoch: 43/500
Training loss: 2.7886421238934553 Validation Loss: 3.064253787199656


100%|██████████| 27/27 [01:01<00:00,  2.29s/it]
100%|██████████| 3/3 [00:09<00:00,  3.15s/it]


Epoch: 44/500
Training loss: 2.6322112966466835 Validation Loss: 3.109840909639994


100%|██████████| 27/27 [01:01<00:00,  2.29s/it]
100%|██████████| 3/3 [00:08<00:00,  2.93s/it]


Epoch: 45/500
Training loss: 2.702819643197236 Validation Loss: 3.1918026010195413


 37%|███▋      | 10/27 [00:26<00:45,  2.68s/it]


KeyboardInterrupt: 

In [21]:
torch.save(model_1,'model.pth')

In [29]:
model_1 = torch.load('model.pth')

In [32]:
from hsml.schema import Schema
from hsml.model_schema import ModelSchema

In [34]:
X, Y = next(iter(train_loader))
input_schema = Schema(X.cpu().numpy())
output_schema = Schema(Y.cpu().numpy())
model_schema = ModelSchema(input_schema, output_schema)

Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7f06d7a436d0>
Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/site-packages/torch/utils/data/dataloader.py", line 1479, in __del__
    self._shutdown_workers()
  File "/opt/conda/lib/python3.10/site-packages/torch/utils/data/dataloader.py", line 1462, in _shutdown_workers
    if w.is_alive():
Exception ignored in:   File "/opt/conda/lib/python3.10/multiprocessing/process.py", line 160, in is_alive
<function _MultiProcessingDataLoaderIter.__del__ at 0x7f06d7a436d0>
    assert self._parent_pid == os.getpid(), 'can only test a child process'Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/site-packages/torch/utils/data/dataloader.py", line 1479, in __del__

AssertionError    : self._shutdown_workers()
can only test a child process  File "/opt/conda/lib/python3.10/site-packages/torch/utils/data/dataloader.py", line 1462, in _shutdown_workers

    if w.is_alive():
  File "/op

In [24]:
import os
HOPSWORKS_API_KEY = os.environ.get('HOPSWORKS_API_KEY')

In [30]:
project = hopsworks.login()


Connection closed.
Connected. Call `.close()` to terminate connection gracefully.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/223381


In [36]:
mr = project.get_model_registry()
hopsworks_model = mr.python.create_model(
        name="hopsworks_pred",
        model_schema=model_schema,
        description="Hopsworks upvote predictor"
    )

Connected. Call `.close()` to terminate connection gracefully.


In [39]:
hopsworks_model.save("model.pth")

  0%|          | 0/6 [00:00<?, ?it/s]

Uploading: 0.000%|          | 0/5257745 elapsed<00:00 remaining<?

Uploading: 0.000%|          | 0/214 elapsed<00:00 remaining<?

Model created, explore it at https://c.app.hopsworks.ai:443/p/223381/models/hopsworks_pred/1


Model(name: 'hopsworks_pred', version: 1)