In [32]:
import ast
import numpy as np
import pandas as pd
import pytorch_lightning as pl
import torch
import torch.nn as nn
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint
from pytorch_lightning.loggers import TensorBoardLogger
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from torch.utils.data import DataLoader, Dataset
from torchmetrics.functional import accuracy, auroc, f1_score
from tqdm.auto import tqdm
from transformers import (AdamW, AutoModel, AutoTokenizer, BertConfig,
                          BertModel, BertTokenizer)
from transformers import get_linear_schedule_with_warmup

In [35]:
df = pd.read_csv('../data/post_categories/dataset.csv')

In [36]:
df

Unnamed: 0,title,label
0,Four Steps To Turn Big Data Into Action,['big data']
1,9 Amazing Ways Big Data Is Used Today to Chang...,['big data']
2,Data Science and Moneyball: A Profile of Pete ...,['data science']
3,Stanford Algorithm Analyzes Sentence Sentiment...,['machine learning']
4,Machine Learning for Relevance and Serendipity,['machine learning']
...,...,...
328,Word2Vec Skip The Gram Tutorial,"['nlp', 'machine learning']"
329,150 successful machine learning models: 6 less...,"['machine learning', 'application']"
330,Introduction to Adversarial Machine Learning,['machine learning']
331,The Craft of Writing Effectively,['career']


In [37]:
vocabs = []
for d in df['label']:
  x_list = ast.literal_eval(d)
  vocabs.append(set(x_list))
print(vocabs)

[{'big data'}, {'big data'}, {'data science'}, {'machine learning'}, {'machine learning'}, {'python', 'data science'}, {'data visualization'}, {'computer vision'}, {'data'}, {'interview'}, {'use cases', 'big data'}, {'machine learning'}, {'healthcare', 'big data'}, {'engineering'}, {'data science'}, {'big data'}, {'interview'}, {'data science'}, {'forecasting'}, {'neural network', 'deep learning'}, {'use case'}, {'guide'}, {'nlp'}, {'industry'}, {'nlp'}, {'nlp'}, {'guide'}, {'python', 'designed patterns'}, {'interview'}, {'reinforcement learning'}, {'review', 'data science'}, {'question', 'machine learning'}, {'algorithm'}, {'algorithm'}, {'deep learning'}, {'deep learning'}, {'data science'}, {'question', 'data science'}, {'libraries'}, {'deep learning'}, {'trading'}, {'computer vision'}, {'applications', 'machine learning'}, {'how-to', 'data science'}, {'data shift', 'mlops'}, {'interview', 'business', 'data science'}, {'deep learning'}, {'computer science'}, {'healthcare', 'data sci

In [38]:
mlb = MultiLabelBinarizer()
preds = mlb.fit_transform(vocabs)
print(preds)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [39]:
y = df['label'].apply(ast.literal_eval)
y = y.apply(lambda x: [set(x)])
y = y.apply(mlb.transform)
y = y.apply(lambda x: x[0])
print(y)

0      [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...
1      [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...
2      [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
3      [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
4      [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
                             ...                        
328    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
329    [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
330    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
331    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ...
332    [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
Name: label, Length: 333, dtype: object


In [40]:
x = df['title']
y = y
df_cpy = df

In [41]:
dummies = df_cpy['label'].apply(lambda x: eval(x)).str.join('|').str.get_dummies()
LABEL_COLUMNS = dummies.columns
print(LABEL_COLUMNS)

Index(['ai', 'algorithm', 'algorithms', 'application', 'applications',
       'bandits', 'bayesian', 'best practice', 'best practices', 'big data',
       'book', 'books', 'business', 'career', 'cloud', 'clustering',
       'computer science', 'computer vision', 'course', 'courses', 'dashboard',
       'data', 'data engineering', 'data mining', 'data science', 'data shift',
       'data visualization', 'dataset', 'decision making', 'deep learning',
       'design patterns', 'designed patterns', 'engineering',
       'experimentation', 'forecasting', 'gis', 'graph', 'guide', 'hardware',
       'healthcare', 'hiring', 'how-to', 'improvement', 'industry',
       'infrastructure', 'interview', 'interviews', 'iot', 'libraries',
       'library', 'machine learning', 'mathematics', 'mlops',
       'mobile development', 'neural network', 'nlp', 'notes', 'paper',
       'papers', 'predictive analytics', 'probability', 'product',
       'programming', 'python', 'question', 'rant', 'real-time',
 

In [42]:
print(dummies)

     ai  algorithm  algorithms  application  applications  bandits  bayesian  \
0     0          0           0            0             0        0         0   
1     0          0           0            0             0        0         0   
2     0          0           0            0             0        0         0   
3     0          0           0            0             0        0         0   
4     0          0           0            0             0        0         0   
..   ..        ...         ...          ...           ...      ...       ...   
328   0          0           0            0             0        0         0   
329   0          0           0            1             0        0         0   
330   0          0           0            0             0        0         0   
331   0          0           0            0             0        0         0   
332   0          0           0            1             0        0         0   

     best practice  best practices  big

In [43]:
df_final = pd.merge(df_cpy, dummies, left_index=True, right_index=True).dropna()
print(df_final)

                                                 title  \
0              Four Steps To Turn Big Data Into Action   
1    9 Amazing Ways Big Data Is Used Today to Chang...   
2    Data Science and Moneyball: A Profile of Pete ...   
3    Stanford Algorithm Analyzes Sentence Sentiment...   
4       Machine Learning for Relevance and Serendipity   
..                                                 ...   
328                    Word2Vec Skip The Gram Tutorial   
329  150 successful machine learning models: 6 less...   
330       Introduction to Adversarial Machine Learning   
331                   The Craft of Writing Effectively   
332                       Use Dalle 2 to generate logo   

                                   label  ai  algorithm  algorithms  \
0                           ['big data']   0          0           0   
1                           ['big data']   0          0           0   
2                       ['data science']   0          0           0   
3                  

In [44]:
# First Split for Train and Test
train, test = train_test_split(df_final, random_state=42, test_size=0.30, shuffle=True)
train, val = train_test_split(train, test_size=0.2, random_state=42,shuffle=True)
print(train.shape, test.shape, val.shape)

(184, 91) (99, 91) (46, 91)


In [45]:
train[LABEL_COLUMNS].sum()

ai                 5
algorithm          9
algorithms         1
application        5
applications       0
                  ..
use case           3
use cases          1
ux                 2
visualization      2
web development    6
Length: 89, dtype: int64

In [46]:
sample_row = train.iloc[16]
sample_comment = sample_row.title
sample_labels = sample_row[LABEL_COLUMNS]

print(sample_comment)
print()
print(sample_labels.to_dict())

This Amazing Image Algorithm Learns to Spot Objects Without Human Help

{'ai': 0, 'algorithm': 0, 'algorithms': 0, 'application': 0, 'applications': 0, 'bandits': 0, 'bayesian': 0, 'best practice': 0, 'best practices': 0, 'big data': 0, 'book': 0, 'books': 0, 'business': 0, 'career': 0, 'cloud': 0, 'clustering': 0, 'computer science': 0, 'computer vision': 1, 'course': 0, 'courses': 0, 'dashboard': 0, 'data': 0, 'data engineering': 0, 'data mining': 0, 'data science': 0, 'data shift': 0, 'data visualization': 0, 'dataset': 0, 'decision making': 0, 'deep learning': 0, 'design patterns': 0, 'designed patterns': 0, 'engineering': 0, 'experimentation': 0, 'forecasting': 0, 'gis': 0, 'graph': 0, 'guide': 0, 'hardware': 0, 'healthcare': 0, 'hiring': 0, 'how-to': 0, 'improvement': 0, 'industry': 0, 'infrastructure': 0, 'interview': 0, 'interviews': 0, 'iot': 0, 'libraries': 0, 'library': 0, 'machine learning': 0, 'mathematics': 0, 'mlops': 0, 'mobile development': 0, 'neural network': 0, 'nlp

In [47]:
# loading tokenizer of bert base version
BERT_MODEL_NAME = "bert-base-cased"
tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_NAME)

encoding = tokenizer.encode_plus(
    sample_comment, 
    add_special_tokens=True,
    max_length=512,
    return_token_type_ids=False,
    padding='max_length',
    return_attention_mask=True,
    return_tensors="pt"
)
encoding.keys()

dict_keys(['input_ids', 'attention_mask'])

In [48]:
print(encoding["input_ids"].shape, encoding["attention_mask"].shape)
print(encoding["input_ids"].squeeze()[:20])
print(encoding["attention_mask"].squeeze()[:20])
print(encoding["input_ids"].squeeze().shape)

torch.Size([1, 512]) torch.Size([1, 512])
tensor([  101,  1188, 16035, 15065,  2586, 18791,  7088,  1306, 12958, 20163,
         1106, 28091,   152, 24380,  1116,  4914,  4243, 12056,   102,     0])
tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0])
torch.Size([512])


In [49]:
tokenizer.convert_ids_to_tokens(encoding["input_ids"].squeeze())[:20]

['[CLS]',
 'This',
 'Amazing',
 'Image',
 'Al',
 '##gor',
 '##ith',
 '##m',
 'Lea',
 '##rns',
 'to',
 'Spot',
 'O',
 '##bject',
 '##s',
 'Without',
 'Human',
 'Help',
 '[SEP]',
 '[PAD]']

In [50]:
tokenizer.save_pretrained('../model/tokenizer')

('../model/tokenizer/tokenizer_config.json',
 '../model/tokenizer/special_tokens_map.json',
 '../model/tokenizer/vocab.txt',
 '../model/tokenizer/added_tokens.json')

In [18]:
class PostTitleDataset(Dataset):

    def __init__(self, data: pd.DataFrame, tokenizer: BertTokenizer, max_token_len: int = 128):

        self.data = data
        self.tokenizer = tokenizer
        self.max_token_len = max_token_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index: int):
        
        data_row = self.data.iloc[index]

        title = data_row.title
        labels = data_row[LABEL_COLUMNS]

        encoding = self.tokenizer.encode_plus(
            title,
            add_special_tokens=True,
            max_length=self.max_token_len,
            return_token_type_ids=False,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_tensors="pt"
        )
        
        return dict(
            title=title,
            input_ids=encoding["input_ids"].flatten(),
            attention_mask=encoding["attention_mask"].flatten(),
            labels=torch.FloatTensor(labels)
        )

In [19]:
train_dataset = PostTitleDataset(train, tokenizer)

In [20]:
sample_item = train_dataset[0]
print(sample_item.keys())
print(sample_item["title"])
print(sample_item["labels"])
print(sample_item["input_ids"].shape)


dict_keys(['title', 'input_ids', 'attention_mask', 'labels'])
Algorithmic Trading Models - Machine Learning
tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])
torch.Size([128])


In [21]:
bert_model = BertModel.from_pretrained(BERT_MODEL_NAME, return_dict=True)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [22]:
sample_item["input_ids"].unsqueeze(dim=0).shape
prediction = bert_model(sample_item["input_ids"].unsqueeze(dim=0), sample_item["attention_mask"].unsqueeze(dim=0))
prediction.last_hidden_state.shape, prediction.pooler_output.shape


(torch.Size([1, 128, 768]), torch.Size([1, 768]))

In [23]:
class PostTitleDataModule(pl.LightningDataModule):
    def __init__(self, train_df, test_df, tokenizer, batch_size=0, max_token_len=128):
        super().__init__
        self.train_df = train_df
        self.test_df = test_df
        self.tokenizer = tokenizer
        self.batch_size = batch_size
        self.max_token_len = max_token_len
        self.prepare_data_per_node = True
        self._log_hyperparams = True

    def setup(self, stage):
        self.train_dataset = PostTitleDataset(
            self.train_df,
            self.tokenizer,
            self.max_token_len
        )

        self.test_dataset = PostTitleDataset(
            self.test_df,
            self.tokenizer,
            self.max_token_len
        )

    def train_dataloader(self):
        return DataLoader(
            self.train_dataset,
            batch_size=self.batch_size,
            shuffle=True,
            num_workers=4
        )

    def val_dataloader(self):
        return DataLoader(
            self.test_dataset,
            batch_size=self.batch_size,
            shuffle=True,
            num_workers=4
        )

    def val_dataloader(self):
        return DataLoader(self.test_dataset, batch_size=1, num_workers=4)

    def test_dataloader(self):
        return DataLoader(self.test_dataset, batch_size=1, num_workers=4)

In [24]:
N_EPOCHS = 10
BATCH_SIZE = 32

data_module = PostTitleDataModule(train, test, tokenizer, batch_size=BATCH_SIZE)
data_module.setup(stage="None")

In [25]:
# demo what BCE is
criterion = nn.BCELoss()
prediction = torch.FloatTensor(
    [10.95873564, 1.07321467, 1.58524066, 0.03839076, 15.72987556, 1.09513213]
)
labels = torch.FloatTensor(
  [1., 0., 0., 0., 1., 0.]
)
print(torch.sigmoid(prediction))
output = criterion(torch.sigmoid(prediction), labels)
print(output)

tensor([1.0000, 0.7452, 0.8299, 0.5096, 1.0000, 0.7493])
tensor(0.8725)


In [26]:
class PostTitleCategoryModel(pl.LightningModule):

  def __init__(self, n_classes: int, n_training_steps=None, n_warmup_steps=None):
    super().__init__()
    self.bert = BertModel.from_pretrained(BERT_MODEL_NAME, return_dict=True)
    self.classifier = nn.Linear(self.bert.config.hidden_size, n_classes)
    self.n_training_steps = n_training_steps
    self.n_warmup_steps = n_warmup_steps
    self.criterion = nn.BCELoss()

  def forward(self, input_ids, attention_mask, labels=None):
    output = self.bert(input_ids, attention_mask=attention_mask)
    output = self.classifier(output.pooler_output)
    output = torch.sigmoid(output)    
    loss = 0
    if labels is not None:
        loss = self.criterion(output, labels)
    return loss, output

  def training_step(self, batch, batch_idx):
    input_ids = batch["input_ids"]
    attention_mask = batch["attention_mask"]
    labels = batch["labels"]
    loss, outputs = self(input_ids, attention_mask, labels)
    self.log("train_loss", loss, prog_bar=True, logger=True)
    return {"loss": loss, "predictions": outputs, "labels": labels}

  def validation_step(self, batch, batch_idx):
    input_ids = batch["input_ids"]
    attention_mask = batch["attention_mask"]
    labels = batch["labels"]
    loss, outputs = self(input_ids, attention_mask, labels)
    self.log("val_loss", loss, prog_bar=True, logger=True)
    return loss

  def test_step(self, batch, batch_idx):
    input_ids = batch["input_ids"]
    attention_mask = batch["attention_mask"]
    labels = batch["labels"]
    loss, outputs = self(input_ids, attention_mask, labels)
    self.log("test_loss", loss, prog_bar=True, logger=True)
    return loss

  def training_epoch_end(self, outputs):
    
    labels = []
    predictions = []
    for output in outputs:
      for out_labels in output["labels"].detach().cpu():
        labels.append(out_labels)
      for out_predictions in output["predictions"].detach().cpu():
        predictions.append(out_predictions)

    labels = torch.stack(labels).int()
    predictions = torch.stack(predictions)

    for i, name in enumerate(LABEL_COLUMNS):
      class_roc_auc = auroc(predictions[:, i], labels[:, i])
      self.logger.experiment.add_scalar(f"{name}_roc_auc/Train", class_roc_auc, self.current_epoch)


  def configure_optimizers(self):

    optimizer = AdamW(self.parameters(), lr=2e-5)

    scheduler = get_linear_schedule_with_warmup(
      optimizer,
      num_warmup_steps=self.n_warmup_steps,
      num_training_steps=self.n_training_steps
    )

    return dict(
      optimizer=optimizer,
      lr_scheduler=dict(
        scheduler=scheduler,
        interval='step'
      )
    )

In [27]:
steps_per_epoch=len(train) // BATCH_SIZE
total_training_steps = steps_per_epoch * N_EPOCHS
warmup_steps = total_training_steps // 5
warmup_steps, total_training_steps

(10, 50)

In [28]:
model = PostTitleCategoryModel(
  n_classes=len(LABEL_COLUMNS),
  n_warmup_steps=warmup_steps,
  n_training_steps=total_training_steps 
)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [30]:
model.bert.save_pretrained('../model/bert_pretrained')

In [29]:
checkpoint_callback = ModelCheckpoint(
  dirpath="checkpoints",
  filename="best-checkpoint",
  save_top_k=1,
  verbose=True,
  monitor="val_loss",
  mode="min"
)

In [30]:
early_stopping_callback = EarlyStopping(monitor='val_loss', patience=2)


In [31]:
trainer = pl.Trainer(
  logger=True,
  callbacks=[early_stopping_callback, checkpoint_callback],
  max_epochs=N_EPOCHS)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [32]:
trainer.fit(model, data_module)


Missing logger folder: /Users/schen/wecloud/mlops_labs_term2/training/lightning_logs

  | Name       | Type      | Params
-----------------------------------------
0 | bert       | BertModel | 108 M 
1 | classifier | Linear    | 68.4 K
2 | criterion  | BCELoss   | 0     
-----------------------------------------
108 M     Trainable params
0         Non-trainable params
108 M     Total params
433.515   Total estimated model params size (MB)


                                                                           



Epoch 0: 100%|██████████| 105/105 [03:19<00:00,  1.90s/it, loss=0.717, v_num=0, train_loss=0.704, val_loss=0.693]



Epoch 0: 100%|██████████| 105/105 [03:19<00:00,  1.90s/it, loss=0.717, v_num=0, train_loss=0.704, val_loss=0.693]

Epoch 0, global step 6: 'val_loss' reached 0.69255 (best 0.69255), saving model to '/Users/schen/wecloud/mlops_labs_term2/training/checkpoints/best-checkpoint.ckpt' as top 1


Epoch 1: 100%|██████████| 105/105 [02:44<00:00,  1.57s/it, loss=0.694, v_num=0, train_loss=0.655, val_loss=0.652]

Epoch 1, global step 12: 'val_loss' reached 0.65192 (best 0.65192), saving model to '/Users/schen/wecloud/mlops_labs_term2/training/checkpoints/best-checkpoint.ckpt' as top 1


Epoch 2: 100%|██████████| 105/105 [02:11<00:00,  1.26s/it, loss=0.675, v_num=0, train_loss=0.618, val_loss=0.611]

Epoch 2, global step 18: 'val_loss' reached 0.61060 (best 0.61060), saving model to '/Users/schen/wecloud/mlops_labs_term2/training/checkpoints/best-checkpoint.ckpt' as top 1


Epoch 3: 100%|██████████| 105/105 [01:59<00:00,  1.14s/it, loss=0.643, v_num=0, train_loss=0.587, val_loss=0.575]

Epoch 3, global step 24: 'val_loss' reached 0.57528 (best 0.57528), saving model to '/Users/schen/wecloud/mlops_labs_term2/training/checkpoints/best-checkpoint.ckpt' as top 1


Epoch 4: 100%|██████████| 105/105 [02:54<00:00,  1.66s/it, loss=0.606, v_num=0, train_loss=0.555, val_loss=0.545]

Epoch 4, global step 30: 'val_loss' reached 0.54469 (best 0.54469), saving model to '/Users/schen/wecloud/mlops_labs_term2/training/checkpoints/best-checkpoint.ckpt' as top 1


Epoch 5: 100%|██████████| 105/105 [02:56<00:00,  1.68s/it, loss=0.573, v_num=0, train_loss=0.529, val_loss=0.518]

Epoch 5, global step 36: 'val_loss' reached 0.51811 (best 0.51811), saving model to '/Users/schen/wecloud/mlops_labs_term2/training/checkpoints/best-checkpoint.ckpt' as top 1


Epoch 6: 100%|██████████| 105/105 [02:34<00:00,  1.47s/it, loss=0.545, v_num=0, train_loss=0.509, val_loss=0.504]

Epoch 6, global step 42: 'val_loss' reached 0.50354 (best 0.50354), saving model to '/Users/schen/wecloud/mlops_labs_term2/training/checkpoints/best-checkpoint.ckpt' as top 1


Epoch 7: 100%|██████████| 105/105 [02:34<00:00,  1.47s/it, loss=0.524, v_num=0, train_loss=0.500, val_loss=0.495]

Epoch 7, global step 48: 'val_loss' reached 0.49491 (best 0.49491), saving model to '/Users/schen/wecloud/mlops_labs_term2/training/checkpoints/best-checkpoint.ckpt' as top 1


Epoch 8: 100%|██████████| 105/105 [01:51<00:00,  1.06s/it, loss=0.51, v_num=0, train_loss=0.503, val_loss=0.494] 

Epoch 8, global step 54: 'val_loss' reached 0.49399 (best 0.49399), saving model to '/Users/schen/wecloud/mlops_labs_term2/training/checkpoints/best-checkpoint.ckpt' as top 1


Epoch 9: 100%|██████████| 105/105 [01:44<00:00,  1.00it/s, loss=0.503, v_num=0, train_loss=0.499, val_loss=0.494]

Epoch 9, global step 60: 'val_loss' was not in top 1
`Trainer.fit` stopped: `max_epochs=10` reached.


Epoch 9: 100%|██████████| 105/105 [01:44<00:00,  1.00it/s, loss=0.503, v_num=0, train_loss=0.499, val_loss=0.494]


In [33]:
model.freeze()
model.eval()

TitleTagger(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)


In [35]:
torch.save(model.state_dict(), '../model/model_2.pt')

In [36]:
test_comment = "Deep Learning course at UofT"

encoding = tokenizer.encode_plus(
  test_comment,
  add_special_tokens=True,
  max_length=512,
  return_token_type_ids=False,
  padding="max_length",
  return_attention_mask=True,
  return_tensors='pt',
)

In [37]:
_, test_prediction = model(encoding["input_ids"], encoding["attention_mask"])
test_prediction = test_prediction.flatten().numpy()

for label, prediction in zip(LABEL_COLUMNS, test_prediction):
  print(f"{label}: {prediction}")

ai: 0.5314832329750061
algorithm: 0.3316115736961365
algorithms: 0.2727877199649811
application: 0.48776936531066895
applications: 0.5289465188980103
bandits: 0.2312105894088745
bayesian: 0.35741063952445984
best practice: 0.30665919184684753
best practices: 0.18563441932201385
big data: 0.449453741312027
book: 0.4171124994754791
books: 0.4235944151878357
business: 0.31198379397392273
career: 0.3096870183944702
cloud: 0.37051311135292053
clustering: 0.5439156293869019
computer science: 0.42995527386665344
computer vision: 0.5262809991836548
course: 0.40591341257095337
courses: 0.5615496039390564
dashboard: 0.46855491399765015
data: 0.5176693797111511
data engineering: 0.24762529134750366
data mining: 0.47928252816200256
data science: 0.36023592948913574
data shift: 0.3331622779369354
data visualization: 0.4803379774093628
dataset: 0.38014107942581177
decision making: 0.26210933923721313
deep learning: 0.43653497099876404
design patterns: 0.3018726706504822
designed patterns: 0.41125494

In [38]:
THRESHOLD = 0.5

test_comment = "Big Data at Air Miles"
encoding = tokenizer.encode_plus(
  test_comment,
  add_special_tokens=True,
  max_length=512,
  return_token_type_ids=False,
  padding="max_length",
  return_attention_mask=True,
  return_tensors='pt',
)

_, test_prediction = model(encoding["input_ids"], encoding["attention_mask"])
test_prediction = test_prediction.flatten().numpy()

for label, prediction in zip(LABEL_COLUMNS, test_prediction):
  if prediction < THRESHOLD:
    continue
  print(f"{label}: {prediction}")

clustering: 0.5440770983695984
computer vision: 0.5074830055236816
courses: 0.5072101354598999
healthcare: 0.5016406178474426
iot: 0.5198387503623962
nlp: 0.6035922765731812
ux: 0.5572955012321472
web development: 0.5504329800605774


In [39]:
MAX_TOKEN_COUNT = 512

In [41]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
trained_model = model.to(device)

val_dataset = PostTitleDataset(
  val,
  tokenizer,
  max_token_len=MAX_TOKEN_COUNT
)

predictions = []
labels = []

for item in tqdm(val_dataset):
  _, prediction = trained_model(
    item["input_ids"].unsqueeze(dim=0).to(device), 
    item["attention_mask"].unsqueeze(dim=0).to(device)
  )
  predictions.append(prediction.flatten())
  labels.append(item["labels"].int())

predictions = torch.stack(predictions).detach().cpu()
labels = torch.stack(labels).detach().cpu()

100%|██████████| 46/46 [00:35<00:00,  1.31it/s]


In [None]:
accuracy(predictions, labels, threshold=THRESHOLD)

tensor(0.8801)

In [None]:
for i, name in enumerate(LABEL_COLUMNS):
  tag_auroc = auroc(predictions[:, i], labels[:, i], pos_label=1)
  print(f"{name}: {tag_auroc}")

AUROC per tag
ai: 0.8222222328186035
algorithm: 0.5555555820465088
algorithms: 0.6022727489471436
application: 0.7596899271011353
applications: 0.0
bandits: 0.0
bayesian: 0.0
best practice: 0.0
best practices: 0.0
big data: 0.4545454680919647
book: 0.0
books: 0.0
business: 0.0
career: 0.19999998807907104
cloud: 0.0
clustering: 0.0
computer science: 0.511904776096344
computer vision: 0.738636314868927
course: 0.5227272510528564
courses: 0.0
dashboard: 0.0
data: 0.2888888716697693
data engineering: 0.0
data mining: 0.6666666269302368
data science: 0.27906978130340576
data shift: 0.0
data visualization: 0.0
dataset: 0.0
decision making: 0.9545454382896423
deep learning: 0.4728682041168213
design patterns: 0.0
designed patterns: 0.0
engineering: 0.0
experimentation: 0.0
forecasting: 0.7111110687255859
gis: 0.1111111044883728
graph: 0.35555553436279297
guide: 0.5333333015441895
hardware: 0.0
healthcare: 0.0
hiring: 0.0
how-to: 0.0
improvement: 0.8888888359069824
industry: 0.6222221851348877



In [None]:
from sklearn.metrics import classification_report
y_pred = predictions.numpy()
y_true = labels.numpy()

upper, lower = 1, 0

y_pred = np.where(y_pred > THRESHOLD, upper, lower)

print(classification_report(
  y_true, 
  y_pred, 
  target_names=LABEL_COLUMNS, 
  zero_division=0
))

                        precision    recall  f1-score   support

                    ai       0.04      1.00      0.07         1
             algorithm       0.00      0.00      0.00         1
            algorithms       0.00      0.00      0.00         2
           application       0.00      0.00      0.00         3
          applications       0.00      0.00      0.00         0
               bandits       0.00      0.00      0.00         0
              bayesian       0.00      0.00      0.00         0
         best practice       0.00      0.00      0.00         0
        best practices       0.00      0.00      0.00         0
              big data       0.00      0.00      0.00         2
                  book       0.00      0.00      0.00         0
                 books       0.00      0.00      0.00         0
              business       0.00      0.00      0.00         0
                career       0.02      1.00      0.04         1
                 cloud       0.00      