In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up live, without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import math

import numpy as np
import pandas as pd
import torch
from torch.utils.tensorboard import SummaryWriter
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from tqdm import tqdm, trange

from dataset import Collator, Dataset
from metrics import compute_metrics_on_df
from train import train
from utils import chunks, set_global_seed

2022-05-29 23:39:00.404416: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudart.so.10.1


In [3]:
# Seed the run once, up front, so repeated executions are comparable.
SEED = 42
set_global_seed(SEED)

In [4]:
# Hyper-parameters of the run; the keys are read by the cells below.
config = dict(
    MODEL_NAME='distilroberta-base',
    BATCH_SIZE=64,
    LEARNING_RATE=1e-5,
    N_EPOCHS=10,
    CLASS_WEIGHT=None,
)

In [5]:
# TensorBoard: the run name encodes the tunable settings so that runs with
# different settings are written to different log directories.
experiment_name = (
    f"MODEL_{config['MODEL_NAME']}"
    f"_BATCH_{config['BATCH_SIZE']}"
    f"_LR_{config['LEARNING_RATE']}"
    f"_CLASS_WEIGHT_{config['CLASS_WEIGHT']}"
)

writer = SummaryWriter(log_dir=f"runs/{experiment_name}")

In [6]:
# Prefer the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
device

device(type='cuda')

### prepare data

In [7]:
# Read only the three columns used downstream and replace missing values
# with empty strings.
df = pd.read_csv(
    './data/train.csv',
    usecols=['Title', 'BodyMarkdown', 'OpenStatus'],
).fillna('')

# Lower-case both free-text columns.
# NOTE(review): config['MODEL_NAME'] looks like a case-sensitive checkpoint —
# confirm that lower-casing the input is intended.
for text_column in ('Title', 'BodyMarkdown'):
    df[text_column] = df[text_column].str.lower()

In [8]:
# Preview the loaded frame; the rendered output elides the middle rows.
df

Unnamed: 0,Title,BodyMarkdown,OpenStatus
0,decimal vs double?,"i'm new to c#, and i want to use a trackbar fo...",open
1,percentage width child in absolutely positione...,i've got an absolutely positioned div containi...,open
2,tools for porting j# code to c#,are there any conversion tools for porting vis...,open
3,how do i calculate someone's age in c#?,"given a datetime representing their birthday, ...",open
4,retrieve data from nsuserdefaults to tableview,i save values of two labels through nsuserdefa...,open
...,...,...,...
3370523,dividing an array by filter function,i have a javascript array that i would like to...,open
3370524,javascript link extractor,i am interested in extracting links from sites...,open
3370525,selenium remote webdriver insane memory usage,i've created a small python script to run test...,open
3370526,searching through an array of dictionaries,i'm making an iphone app which displays inform...,open


In [9]:
# Class balance of the raw data, before undersampling.
df['OpenStatus'].value_counts()

open                   3300392
not a real question      30789
off topic                17530
not constructive         15659
too localized             6158
Name: OpenStatus, dtype: int64

In [10]:
# Undersampling: keep a fixed-size random subset of the dominant 'open' class
# and every row of the remaining classes.
is_open = df['OpenStatus'] == 'open'

df = pd.concat([
    df[is_open].sample(100000, random_state=42),
    df[~is_open],
])

In [11]:
# Class balance after undersampling the 'open' class.
df['OpenStatus'].value_counts()

open                   100000
not a real question     30789
off topic               17530
not constructive        15659
too localized            6158
Name: OpenStatus, dtype: int64

In [12]:
# Stratified, shuffled 75/25 split so both parts keep the class proportions.
split_kwargs = dict(
    test_size=0.25,
    random_state=42,
    shuffle=True,
    stratify=df['OpenStatus'],
)
df_train, df_test = train_test_split(df, **split_kwargs)

In [13]:
# Report the number of rows in each split.
print(f'Train size: {len(df_train)}')
print(f'Test size: {len(df_test)}')

Train size: 127602
Test size: 42534


In [14]:
# Encode the string class names as integer ids.
#
# The encoder is fitted on the string labels still held in `df`, and the ids
# are looked up from `df` by row index, so this cell is safe to re-run.
# Fitting on the already-overwritten split column would refit on integers and
# silently drop the class names from `le.classes_`.
le = LabelEncoder().fit(df['OpenStatus'])

# `.assign` builds new frames instead of writing into the objects returned by
# train_test_split, which avoids pandas' SettingWithCopyWarning.
# The index lookup relies on `df` having a unique index.
df_train = df_train.assign(
    OpenStatus=le.transform(df.loc[df_train.index, 'OpenStatus']),
)
df_test = df_test.assign(
    OpenStatus=le.transform(df.loc[df_test.index, 'OpenStatus']),
)

In [15]:
# Wrap each split in the project's Dataset class.
train_dataset, test_dataset = (
    Dataset(df=split_df) for split_df in (df_train, df_test)
)

### load tokenizer and build dataloaders

In [16]:
# Tokenizer matching the configured checkpoint.
model_name = config['MODEL_NAME']
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [17]:
# Tokenizer options, passed to the collator and later to the metric helper:
# PyTorch tensors, padding enabled, truncation at 512 tokens.
tokenizer_kwargs = dict(
    return_tensors='pt',
    padding=True,
    truncation=True,
    max_length=512,
)

collate_fn = Collator(tokenizer, tokenizer_kwargs)

In [18]:
# Both loaders share the batch size and collator; only the training loader
# shuffles its samples.
loader_kwargs = dict(
    batch_size=config['BATCH_SIZE'],
    collate_fn=collate_fn,
)

train_dataloader = torch.utils.data.DataLoader(train_dataset, shuffle=True, **loader_kwargs)
test_dataloader = torch.utils.data.DataLoader(test_dataset, shuffle=False, **loader_kwargs)

### train model

In [19]:
# Sequence-classification model with one output per encoded class, moved to
# the selected device.
n_classes = df_train['OpenStatus'].nunique()

model = AutoModelForSequenceClassification.from_pretrained(
    config['MODEL_NAME'],
    num_labels=n_classes,
)
model = model.to(device)

Some weights of the model checkpoint at distilroberta-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.out_proj.weig

In [20]:
# Optimiser over all model parameters at the configured learning rate.
optimizer = torch.optim.Adam(model.parameters(), lr=config['LEARNING_RATE'])

# config['CLASS_WEIGHT'] is part of the experiment name, so it is applied to
# the loss here rather than ignored. None keeps the plain unweighted loss.
# NOTE(review): a non-None value is assumed to be a sequence holding one float
# per encoded class — confirm the intended format.
class_weight = config['CLASS_WEIGHT']
if class_weight is not None:
    # created on `device` so the weights sit alongside the model
    class_weight = torch.as_tensor(class_weight, dtype=torch.float, device=device)

criterion = torch.nn.CrossEntropyLoss(weight=class_weight)

In [21]:
# Run the project's training loop with the objects built above.
# NOTE(review): in the logged output the test loss is lowest at epoch 2
# (0.7346) and climbs to 0.9737 by epoch 10 while the train loss keeps
# falling — consider fewer epochs or early stopping.
train_kwargs = dict(
    n_epochs=config['N_EPOCHS'],
    model=model,
    train_dataloader=train_dataloader,
    test_dataloader=test_dataloader,
    optimizer=optimizer,
    criterion=criterion,
    writer=writer,
    device=device,
)
train(**train_kwargs)

Epoch [1 / 10]



loop over train batches: 100%|██████████| 1994/1994 [41:43<00:00,  1.26s/it]


Train loss: 0.8184811789299564

Train metrics:
{'accuracy': 0.7033823921255152, 'precision_micro': 0.7033823921255152, 'precision_macro': 0.5951942809818445, 'precision_weighted': 0.6796833224894905, 'recall_micro': 0.7033823921255152, 'recall_macro': 0.46699959597897245, 'recall_weighted': 0.7033823921255152, 'f1_micro': 0.7033823921255152, 'f1_macro': 0.47999248439166237, 'f1_weighted': 0.6743940307439246}



loop over test batches: 100%|██████████| 665/665 [03:43<00:00,  2.98it/s]


Test loss:  0.7559711185613073

Test metrics:
{'accuracy': 0.718648610523346, 'precision_micro': 0.718648610523346, 'precision_macro': 0.5980827994340795, 'precision_weighted': 0.6964913018250483, 'recall_micro': 0.718648610523346, 'recall_macro': 0.5017709451084473, 'recall_weighted': 0.718648610523346, 'f1_micro': 0.7186486105233462, 'f1_macro': 0.5114446445242382, 'f1_weighted': 0.6951597918162564}

Epoch [2 / 10]



loop over train batches: 100%|██████████| 1994/1994 [41:45<00:00,  1.26s/it]


Train loss: 0.7324333888746671

Train metrics:
{'accuracy': 0.7340167082020658, 'precision_micro': 0.7340167082020658, 'precision_macro': 0.6254522927544571, 'precision_weighted': 0.7142542537488618, 'recall_micro': 0.7340167082020658, 'recall_macro': 0.5198465191631366, 'recall_weighted': 0.7340167082020658, 'f1_micro': 0.7340167082020658, 'f1_macro': 0.5314538968973465, 'f1_weighted': 0.7120671007627537}



loop over test batches: 100%|██████████| 665/665 [03:43<00:00,  2.97it/s]


Test loss:  0.7346269326102465

Test metrics:
{'accuracy': 0.7251375370292001, 'precision_micro': 0.7251375370292001, 'precision_macro': 0.6117721970807868, 'precision_weighted': 0.7101906391552263, 'recall_micro': 0.7251375370292001, 'recall_macro': 0.5245240615980962, 'recall_weighted': 0.7251375370292001, 'f1_micro': 0.7251375370292, 'f1_macro': 0.5272707577639622, 'f1_weighted': 0.7087657428246096}

Epoch [3 / 10]



loop over train batches: 100%|██████████| 1994/1994 [41:43<00:00,  1.26s/it]


Train loss: 0.6959317921396483

Train metrics:
{'accuracy': 0.7505133148383254, 'precision_micro': 0.7505133148383254, 'precision_macro': 0.646984387273849, 'precision_weighted': 0.7330559044104187, 'recall_micro': 0.7505133148383254, 'recall_macro': 0.54611511538195, 'recall_weighted': 0.7505133148383254, 'f1_micro': 0.7505133148383254, 'f1_macro': 0.55959155304883, 'f1_weighted': 0.73123802181237}



loop over test batches: 100%|██████████| 665/665 [03:43<00:00,  2.97it/s]


Test loss:  0.7380969932652954

Test metrics:
{'accuracy': 0.7249259415996614, 'precision_micro': 0.7249259415996614, 'precision_macro': 0.5911216142390494, 'precision_weighted': 0.7057743198524082, 'recall_micro': 0.7249259415996614, 'recall_macro': 0.5258626479303828, 'recall_weighted': 0.7249259415996614, 'f1_micro': 0.7249259415996614, 'f1_macro': 0.5297520953783601, 'f1_weighted': 0.7072749137618286}

Epoch [4 / 10]



loop over train batches: 100%|██████████| 1994/1994 [41:44<00:00,  1.26s/it]


Train loss: 0.6614364838917014

Train metrics:
{'accuracy': 0.7675114810112694, 'precision_micro': 0.7675114810112694, 'precision_macro': 0.6759125043000467, 'precision_weighted': 0.7529096392934906, 'recall_micro': 0.7675114810112694, 'recall_macro': 0.5736884256014733, 'recall_weighted': 0.7675114810112694, 'f1_micro': 0.7675114810112694, 'f1_macro': 0.5895947270092627, 'f1_weighted': 0.7504593569774639}



loop over test batches: 100%|██████████| 665/665 [03:43<00:00,  2.98it/s]


Test loss:  0.7445018477009652

Test metrics:
{'accuracy': 0.7245732825504303, 'precision_micro': 0.7245732825504303, 'precision_macro': 0.5985884560996526, 'precision_weighted': 0.7045441702741724, 'recall_micro': 0.7245732825504303, 'recall_macro': 0.514608394000209, 'recall_weighted': 0.7245732825504303, 'f1_micro': 0.7245732825504303, 'f1_macro': 0.5325697601989459, 'f1_weighted': 0.7081697962973493}

Epoch [5 / 10]



loop over train batches: 100%|██████████| 1994/1994 [41:43<00:00,  1.26s/it]


Train loss: 0.6239474756398435

Train metrics:
{'accuracy': 0.7862807792981301, 'precision_micro': 0.7862807792981301, 'precision_macro': 0.7011537767777876, 'precision_weighted': 0.7734473220450934, 'recall_micro': 0.7862807792981301, 'recall_macro': 0.6009468106758643, 'recall_weighted': 0.7862807792981301, 'f1_micro': 0.7862807792981301, 'f1_macro': 0.6196929916211101, 'f1_weighted': 0.7710875582420484}



loop over test batches: 100%|██████████| 665/665 [03:43<00:00,  2.97it/s]


Test loss:  0.7620926093338127

Test metrics:
{'accuracy': 0.7229510509239667, 'precision_micro': 0.7229510509239667, 'precision_macro': 0.5859331533126578, 'precision_weighted': 0.705043492789509, 'recall_micro': 0.7229510509239667, 'recall_macro': 0.5328830701601186, 'recall_weighted': 0.7229510509239667, 'f1_micro': 0.7229510509239667, 'f1_macro': 0.5436463253571163, 'f1_weighted': 0.7093577217360413}

Epoch [6 / 10]



loop over train batches: 100%|██████████| 1994/1994 [41:44<00:00,  1.26s/it]


Train loss: 0.5864982044505499

Train metrics:
{'accuracy': 0.8082318458958323, 'precision_micro': 0.8082318458958323, 'precision_macro': 0.7311708730345157, 'precision_weighted': 0.7976707345369578, 'recall_micro': 0.8082318458958323, 'recall_macro': 0.6355779988656571, 'recall_weighted': 0.8082318458958323, 'f1_micro': 0.8082318458958323, 'f1_macro': 0.6554900114981475, 'f1_weighted': 0.7956150723717246}



loop over test batches: 100%|██████████| 665/665 [03:43<00:00,  2.98it/s]


Test loss:  0.7832789295149926

Test metrics:
{'accuracy': 0.721563925330324, 'precision_micro': 0.721563925330324, 'precision_macro': 0.5884022750855279, 'precision_weighted': 0.7009154463745565, 'recall_micro': 0.721563925330324, 'recall_macro': 0.5199870804823169, 'recall_weighted': 0.721563925330324, 'f1_micro': 0.721563925330324, 'f1_macro': 0.5334190921994525, 'f1_weighted': 0.7039324370091681}

Epoch [7 / 10]



loop over train batches: 100%|██████████| 1994/1994 [41:44<00:00,  1.26s/it]


Train loss: 0.5460589570380979

Train metrics:
{'accuracy': 0.830198586229056, 'precision_micro': 0.830198586229056, 'precision_macro': 0.7617375203436245, 'precision_weighted': 0.8214340436371951, 'recall_micro': 0.830198586229056, 'recall_macro': 0.6681581413878158, 'recall_weighted': 0.830198586229056, 'f1_micro': 0.830198586229056, 'f1_macro': 0.6898340667333732, 'f1_weighted': 0.8193331526577744}



loop over test batches: 100%|██████████| 665/665 [03:43<00:00,  2.98it/s]


Test loss:  0.8160091431069195

Test metrics:
{'accuracy': 0.7171674425165749, 'precision_micro': 0.7171674425165749, 'precision_macro': 0.5768822489021793, 'precision_weighted': 0.696564918162977, 'recall_micro': 0.7171674425165749, 'recall_macro': 0.5180061506271783, 'recall_weighted': 0.7171674425165749, 'f1_micro': 0.7171674425165749, 'f1_macro': 0.5263460313080269, 'f1_weighted': 0.6990692919065121}

Epoch [8 / 10]



loop over train batches: 100%|██████████| 1994/1994 [41:43<00:00,  1.26s/it]


Train loss: 0.5046283833032386

Train metrics:
{'accuracy': 0.8533251829908622, 'precision_micro': 0.8533251829908622, 'precision_macro': 0.7957658325034263, 'precision_weighted': 0.8465321593220076, 'recall_micro': 0.8533251829908622, 'recall_macro': 0.7042623009584099, 'recall_weighted': 0.8533251829908622, 'f1_micro': 0.8533251829908622, 'f1_macro': 0.7281124134599621, 'f1_weighted': 0.8443472172803523}



loop over test batches: 100%|██████████| 665/665 [03:44<00:00,  2.97it/s]


Test loss:  0.8659704671766525

Test metrics:
{'accuracy': 0.7105844735975925, 'precision_micro': 0.7105844735975925, 'precision_macro': 0.5722154268097495, 'precision_weighted': 0.691006668193529, 'recall_micro': 0.7105844735975925, 'recall_macro': 0.5170324888365937, 'recall_weighted': 0.7105844735975925, 'f1_micro': 0.7105844735975926, 'f1_macro': 0.5279646364194146, 'f1_weighted': 0.6942095523644903}

Epoch [9 / 10]



loop over train batches: 100%|██████████| 1994/1994 [41:44<00:00,  1.26s/it]


Train loss: 0.4614418593712652

Train metrics:
{'accuracy': 0.8763890848105829, 'precision_micro': 0.8763890848105829, 'precision_macro': 0.8313372366016184, 'precision_weighted': 0.8714233442238934, 'recall_micro': 0.8763890848105829, 'recall_macro': 0.7413619518071218, 'recall_weighted': 0.8763890848105829, 'f1_micro': 0.8763890848105829, 'f1_macro': 0.7675370494894075, 'f1_weighted': 0.8693622275301071}



loop over test batches: 100%|██████████| 665/665 [03:43<00:00,  2.97it/s]


Test loss:  0.928220957696886

Test metrics:
{'accuracy': 0.7095970282597451, 'precision_micro': 0.7095970282597451, 'precision_macro': 0.564789875037949, 'precision_weighted': 0.6903220518817095, 'recall_micro': 0.7095970282597451, 'recall_macro': 0.5181428159729595, 'recall_weighted': 0.7095970282597451, 'f1_micro': 0.7095970282597451, 'f1_macro': 0.5295437103413234, 'f1_weighted': 0.6935374479805282}

Epoch [10 / 10]



loop over train batches: 100%|██████████| 1994/1994 [41:45<00:00,  1.26s/it]


Train loss: 0.4232616942215229

Train metrics:
{'accuracy': 0.8972116424507453, 'precision_micro': 0.8972116424507453, 'precision_macro': 0.8580666702782921, 'precision_weighted': 0.8933400742687718, 'recall_micro': 0.8972116424507453, 'recall_macro': 0.7760858490684399, 'recall_weighted': 0.8972116424507453, 'f1_micro': 0.8972116424507453, 'f1_macro': 0.8022100051513744, 'f1_weighted': 0.8918298440963974}



loop over test batches: 100%|██████████| 665/665 [03:43<00:00,  2.97it/s]

Test loss:  0.9737076680911215

Test metrics:
{'accuracy': 0.7017209761602483, 'precision_micro': 0.7017209761602483, 'precision_macro': 0.5519381955143166, 'precision_weighted': 0.6872232160875683, 'recall_micro': 0.7017209761602483, 'recall_macro': 0.5255390589761373, 'recall_weighted': 0.7017209761602483, 'f1_micro': 0.7017209761602483, 'f1_macro': 0.5319719340876865, 'f1_weighted': 0.6897758100082133}






In [22]:
# Persist the fine-tuned model under the experiment name.
model.save_pretrained(experiment_name)
# Store the tokenizer in the same directory so the checkpoint can be reloaded
# without knowing config['MODEL_NAME'].
tokenizer.save_pretrained(experiment_name);

### evaluate model

In [23]:
# Reload the saved checkpoint, move it to the device and switch it to
# evaluation mode.
model = AutoModelForSequenceClassification.from_pretrained(experiment_name).to(device)
model = model.eval()

In [24]:
# Score the reloaded model on the training split.
inference_kwargs = dict(
    model=model,
    tokenizer=tokenizer,
    tokenizer_kwargs=tokenizer_kwargs,
    batch_size=config['BATCH_SIZE'],
    device=device,
)
train_metrics = compute_metrics_on_df(df=df_train, **inference_kwargs)

inference: 100%|██████████| 1994/1994 [10:48<00:00,  3.08it/s]


In [25]:
# Metrics of the reloaded model on the training split.
train_metrics

{'accuracy': 0.911788216485635,
 'precision_micro': 0.911788216485635,
 'precision_macro': 0.8650380102826796,
 'precision_weighted': 0.9093525124280475,
 'recall_micro': 0.911788216485635,
 'recall_macro': 0.8159305242700556,
 'recall_weighted': 0.911788216485635,
 'f1_micro': 0.9117882164856351,
 'f1_macro': 0.8344911825400277,
 'f1_weighted': 0.9085508986516707}

In [26]:
# Score the reloaded model on the held-out test split.
inference_kwargs = dict(
    model=model,
    tokenizer=tokenizer,
    tokenizer_kwargs=tokenizer_kwargs,
    batch_size=config['BATCH_SIZE'],
    device=device,
)
test_metrics = compute_metrics_on_df(df=df_test, **inference_kwargs)

inference: 100%|██████████| 665/665 [03:36<00:00,  3.08it/s]


In [27]:
# Metrics of the reloaded model on the test split.
test_metrics

{'accuracy': 0.7017209761602483,
 'precision_micro': 0.7017209761602483,
 'precision_macro': 0.5519381955143166,
 'precision_weighted': 0.6872232160875683,
 'recall_micro': 0.7017209761602483,
 'recall_macro': 0.5255390589761373,
 'recall_weighted': 0.7017209761602483,
 'f1_micro': 0.7017209761602483,
 'f1_macro': 0.5319719340876865,
 'f1_weighted': 0.6897758100082133}