In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import math

import numpy as np
import pandas as pd
import torch
from torch.utils.tensorboard import SummaryWriter
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from tqdm import tqdm, trange

from dataset import Collator, Dataset
from metrics import compute_metrics_on_df
from train import train
from utils import chunks, set_global_seed

2022-05-30 11:23:27.295950: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudart.so.10.1


In [3]:
# reproducibility: seed the run through the project helper from utils.py
# (presumably seeds the python / numpy / torch RNGs — confirm in utils.py).
# The same value, 42, is reused as random_state for the undersampling and the
# train/test split further down.
set_global_seed(42)

In [4]:
# Experiment parameters, read by the cells below.
config = {
    # checkpoint name handed to the tokenizer / model `from_pretrained` calls
    'MODEL_NAME': 'distilroberta-base',
    # batch size for both data loaders and for the final metric computation
    'BATCH_SIZE': 64,
    # learning rate given to the optimizer
    'LEARNING_RATE': 1e-5,
    # number of epochs passed to train()
    'N_EPOCHS': 5,
    # class-weighting scheme label (part of the experiment name)
    'CLASS_WEIGHT': 'balanced',
}

In [5]:
# TensorBoard logging.
# The run name encodes the main hyper-parameters so that runs can be told
# apart under runs/; the same name is reused later as the model save directory.
experiment_name = (
    f"MODEL_{config['MODEL_NAME']}"
    f"_BATCH_{config['BATCH_SIZE']}"
    f"_LR_{config['LEARNING_RATE']}"
    f"_CLASS_WEIGHT_{config['CLASS_WEIGHT']}"
)

writer = SummaryWriter(log_dir=f"runs/{experiment_name}")

In [6]:
# Pick the GPU when CUDA is available, otherwise fall back to the CPU.
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)
device

device(type='cuda')

### prepare data

In [7]:
# Load only the text fields and the label; missing values become empty strings.
df = pd.read_csv(
    './data/train.csv',
    usecols=['Title', 'BodyMarkdown', 'OpenStatus'],
).fillna('')

# Lower-case both text columns.
for text_column in ('Title', 'BodyMarkdown'):
    df[text_column] = df[text_column].str.lower()

In [8]:
# Preview the loaded frame (the rendered repr is truncated to the first and last rows).
df

Unnamed: 0,Title,BodyMarkdown,OpenStatus
0,decimal vs double?,"i'm new to c#, and i want to use a trackbar fo...",open
1,percentage width child in absolutely positione...,i've got an absolutely positioned div containi...,open
2,tools for porting j# code to c#,are there any conversion tools for porting vis...,open
3,how do i calculate someone's age in c#?,"given a datetime representing their birthday, ...",open
4,retrieve data from nsuserdefaults to tableview,i save values of two labels through nsuserdefa...,open
...,...,...,...
3370523,dividing an array by filter function,i have a javascript array that i would like to...,open
3370524,javascript link extractor,i am interested in extracting links from sites...,open
3370525,selenium remote webdriver insane memory usage,i've created a small python script to run test...,open
3370526,searching through an array of dictionaries,i'm making an iphone app which displays inform...,open


In [9]:
# Label distribution of the full data set, before any undersampling.
df['OpenStatus'].value_counts()

open                   3300392
not a real question      30789
off topic                17530
not constructive         15659
too localized             6158
Name: OpenStatus, dtype: int64

In [10]:
# Undersampling: keep a fixed-size random sample of the 'open' rows and
# every row of the remaining statuses.
is_open = df['OpenStatus'] == 'open'
open_sample = df.loc[is_open].sample(n=100000, random_state=42)
not_open = df.loc[~is_open]

df = pd.concat([open_sample, not_open])

In [11]:
# Label distribution after undersampling the 'open' class.
df['OpenStatus'].value_counts()

open                   100000
not a real question     30789
off topic               17530
not constructive        15659
too localized            6158
Name: OpenStatus, dtype: int64

In [12]:
# Shuffled hold-out split (25% test), stratified on the label column so both
# parts keep the same status proportions.
labels = df['OpenStatus']
df_train, df_test = train_test_split(
    df, stratify=labels, shuffle=True, test_size=0.25, random_state=42,
)

In [13]:
# Report how many rows ended up in each split.
print(
    f'Train size: {df_train.shape[0]}',
    f'Test size: {df_test.shape[0]}',
    sep='\n',
)

Train size: 127602
Test size: 42534


In [14]:
# Learn the label -> integer mapping from the training split only, then apply
# that same mapping to both splits (the label column is overwritten in place).
le = LabelEncoder().fit(df_train['OpenStatus'])

df_train['OpenStatus'] = le.transform(df_train['OpenStatus'])
df_test['OpenStatus'] = le.transform(df_test['OpenStatus'])

In [15]:
# Wrap each split in the project Dataset (dataset.py).
train_dataset, test_dataset = (
    Dataset(df=split_df) for split_df in (df_train, df_test)
)

### load tokenizer and build data loaders

In [16]:
# Tokenizer for the checkpoint named in config['MODEL_NAME'].
tokenizer = AutoTokenizer.from_pretrained(config['MODEL_NAME'])

In [17]:
# Options handed to Collator together with the tokenizer (presumably forwarded
# to the tokenizer call for every batch — confirm in dataset.py). The same
# dict is reused for the final metric computation.
tokenizer_kwargs = {
    'return_tensors': 'pt',
    'padding': True,
    'truncation': True,
    'max_length': 512,
}

collate_fn = Collator(tokenizer, tokenizer_kwargs)

In [18]:
# Both loaders share the batch size and the collate function; only the
# training loader shuffles its samples.
loader_kwargs = {
    'batch_size': config['BATCH_SIZE'],
    'collate_fn': collate_fn,
}

train_dataloader = torch.utils.data.DataLoader(train_dataset, shuffle=True, **loader_kwargs)
test_dataloader = torch.utils.data.DataLoader(test_dataset, shuffle=False, **loader_kwargs)

### train model

In [19]:
# Size the classification head to the number of distinct (encoded) labels in
# the training split, then move the model to the selected device.
n_classes = df_train['OpenStatus'].nunique()

model = AutoModelForSequenceClassification.from_pretrained(
    config['MODEL_NAME'], num_labels=n_classes,
)
model = model.to(device)

Some weights of the model checkpoint at distilroberta-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.weight', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.out_proj.bias

In [20]:
# Adam over all model parameters; the learning rate comes from config['LEARNING_RATE'].
optimizer = torch.optim.Adam(model.parameters(), lr=config['LEARNING_RATE'])

In [21]:
# Per-class loss weights.
# The weighting scheme is read from config['CLASS_WEIGHT'] — the same value that
# is written into experiment_name — so the run name and the weights actually
# used for the loss cannot drift apart. scikit-learn accepts 'balanced', None
# (uniform weights) or an explicit {class: weight} dict here.
class_weight = compute_class_weight(
    class_weight=config['CLASS_WEIGHT'],
    classes=np.unique(df_train['OpenStatus']),
    y=df_train['OpenStatus'].values,
)

# compute_class_weight returns a float64 numpy array; build a float32 tensor
# directly on the target device so it can serve as the loss `weight`.
class_weight = torch.tensor(class_weight, dtype=torch.float32, device=device)
class_weight

tensor([1.1052, 2.1731, 1.9412, 0.3403, 5.5251], device='cuda:0')

In [22]:
# Cross-entropy loss, weighted per class with the class_weight tensor computed above.
criterion = torch.nn.CrossEntropyLoss(weight=class_weight)

In [23]:
# Run the project training loop (train.py) for config['N_EPOCHS'] epochs.
# It receives both loaders, the optimizer / loss pair, the TensorBoard writer
# and the target device.
train(
    model=model,
    optimizer=optimizer,
    criterion=criterion,
    train_dataloader=train_dataloader,
    test_dataloader=test_dataloader,
    n_epochs=config['N_EPOCHS'],
    device=device,
    writer=writer,
)

Epoch [1 / 5]



loop over train batches: 100%|██████████| 1994/1994 [41:41<00:00,  1.25s/it]


Train loss: 1.100256936988955

Train metrics:
{'accuracy': 0.5575226093634896, 'precision_micro': 0.5575226093634896, 'precision_macro': 0.4730906891364981, 'precision_weighted': 0.6895011861027994, 'recall_micro': 0.5575226093634896, 'recall_macro': 0.5697467381006119, 'recall_weighted': 0.5575226093634896, 'f1_micro': 0.5575226093634896, 'f1_macro': 0.4813732844935303, 'f1_weighted': 0.5942956841467766}



loop over test batches: 100%|██████████| 665/665 [03:43<00:00,  2.97it/s]


Test loss:  1.0228022185483374

Test metrics:
{'accuracy': 0.5662763906521842, 'precision_micro': 0.5662763906521842, 'precision_macro': 0.49356763708248524, 'precision_weighted': 0.720319042467236, 'recall_micro': 0.5662763906521842, 'recall_macro': 0.6005731990619637, 'recall_weighted': 0.5662763906521842, 'f1_micro': 0.5662763906521842, 'f1_macro': 0.4943120258206526, 'f1_weighted': 0.6057944057478977}

Epoch [2 / 5]



loop over train batches: 100%|██████████| 1994/1994 [41:41<00:00,  1.25s/it]


Train loss: 0.9781770593185004

Train metrics:
{'accuracy': 0.6166361028824, 'precision_micro': 0.6166361028824, 'precision_macro': 0.5206169071665674, 'precision_weighted': 0.7287762387627867, 'recall_micro': 0.6166361028824, 'recall_macro': 0.6302939882970201, 'recall_weighted': 0.6166361028824, 'f1_micro': 0.6166361028824, 'f1_macro': 0.5356855659589488, 'f1_weighted': 0.6488616691587002}



loop over test batches: 100%|██████████| 665/665 [03:43<00:00,  2.97it/s]


Test loss:  0.9972877860069275

Test metrics:
{'accuracy': 0.6068086707104904, 'precision_micro': 0.6068086707104904, 'precision_macro': 0.5110351347449565, 'precision_weighted': 0.7248661303851771, 'recall_micro': 0.6068086707104904, 'recall_macro': 0.6098378012685325, 'recall_weighted': 0.6068086707104904, 'f1_micro': 0.6068086707104904, 'f1_macro': 0.5211384310475953, 'f1_weighted': 0.6415076154832634}

Epoch [3 / 5]



loop over train batches: 100%|██████████| 1994/1994 [41:44<00:00,  1.26s/it]


Train loss: 0.9216405626763791

Train metrics:
{'accuracy': 0.6379758937947682, 'precision_micro': 0.6379758937947682, 'precision_macro': 0.5408290651312329, 'precision_weighted': 0.7433540659778224, 'recall_micro': 0.6379758937947682, 'recall_macro': 0.6597751442664628, 'recall_weighted': 0.6379758937947682, 'f1_micro': 0.6379758937947682, 'f1_macro': 0.5597005813116028, 'f1_weighted': 0.6678168206576129}



loop over test batches: 100%|██████████| 665/665 [03:44<00:00,  2.97it/s]


Test loss:  0.9961506276202381

Test metrics:
{'accuracy': 0.6064325010579772, 'precision_micro': 0.6064325010579772, 'precision_macro': 0.5049156086846154, 'precision_weighted': 0.7254055641863599, 'recall_micro': 0.6064325010579772, 'recall_macro': 0.6102200641527542, 'recall_weighted': 0.6064325010579772, 'f1_micro': 0.6064325010579772, 'f1_macro': 0.5203925490713935, 'f1_weighted': 0.6393880191128092}

Epoch [4 / 5]



loop over train batches: 100%|██████████| 1994/1994 [41:47<00:00,  1.26s/it]


Train loss: 0.8632925877958506

Train metrics:
{'accuracy': 0.6554677826366357, 'precision_micro': 0.6554677826366357, 'precision_macro': 0.5586974664788299, 'precision_weighted': 0.7566510600414196, 'recall_micro': 0.6554677826366357, 'recall_macro': 0.6907942154915248, 'recall_weighted': 0.6554677826366357, 'f1_micro': 0.6554677826366357, 'f1_macro': 0.5816427931037501, 'f1_weighted': 0.6828457939787904}



loop over test batches: 100%|██████████| 665/665 [03:44<00:00,  2.97it/s]


Test loss:  1.0357164797029996

Test metrics:
{'accuracy': 0.6429914891616119, 'precision_micro': 0.6429914891616119, 'precision_macro': 0.5210909490402752, 'precision_weighted': 0.7206425626393085, 'recall_micro': 0.6429914891616119, 'recall_macro': 0.604661009781694, 'recall_weighted': 0.6429914891616119, 'f1_micro': 0.6429914891616119, 'f1_macro': 0.5397012883053967, 'f1_weighted': 0.6688691510186983}

Epoch [5 / 5]



loop over train batches: 100%|██████████| 1994/1994 [41:49<00:00,  1.26s/it]


Train loss: 0.8046740387064762

Train metrics:
{'accuracy': 0.6737668688578549, 'precision_micro': 0.6737668688578549, 'precision_macro': 0.5795982572857316, 'precision_weighted': 0.7697782689508855, 'recall_micro': 0.6737668688578549, 'recall_macro': 0.7256667340603846, 'recall_weighted': 0.6737668688578549, 'f1_micro': 0.6737668688578549, 'f1_macro': 0.6064150593120464, 'f1_weighted': 0.698732140216473}



loop over test batches: 100%|██████████| 665/665 [03:44<00:00,  2.96it/s]

Test loss:  1.0579450194996998

Test metrics:
{'accuracy': 0.6041754831428975, 'precision_micro': 0.6041754831428975, 'precision_macro': 0.5143968604645925, 'precision_weighted': 0.72382863220028, 'recall_micro': 0.6041754831428975, 'recall_macro': 0.6036832982628126, 'recall_weighted': 0.6041754831428975, 'f1_micro': 0.6041754831428975, 'f1_macro': 0.5206008760158995, 'f1_weighted': 0.6414028734173254}






In [24]:
# Save the fine-tuned model under a path named after the experiment;
# the evaluation section reloads it from the same path.
model.save_pretrained(experiment_name)

### evaluate model

In [25]:
# Reload the saved model, move it to the device and switch it to eval mode.
# (.to() and .eval() both return the module itself, so the chain is safe.)
model = (
    AutoModelForSequenceClassification
    .from_pretrained(experiment_name)
    .to(device)
    .eval()
)

In [26]:
# Score the reloaded model on the training split, using the same tokenizer
# options and batch size as during training.
train_metrics = compute_metrics_on_df(
    df=df_train,
    model=model,
    device=device,
    tokenizer=tokenizer,
    tokenizer_kwargs=tokenizer_kwargs,
    batch_size=config['BATCH_SIZE'],
)

inference: 100%|██████████| 1994/1994 [10:51<00:00,  3.06it/s]


In [27]:
# Metrics of the reloaded model on the training split.
train_metrics

{'accuracy': 0.6757339226657889,
 'precision_micro': 0.6757339226657889,
 'precision_macro': 0.5985318031314628,
 'precision_weighted': 0.7866684036510283,
 'recall_micro': 0.6757339226657889,
 'recall_macro': 0.7512254049866977,
 'recall_weighted': 0.6757339226657889,
 'f1_micro': 0.6757339226657889,
 'f1_macro': 0.6187349648220093,
 'f1_weighted': 0.704496895839678}

In [28]:
# Score the reloaded model on the held-out test split, using the same
# tokenizer options and batch size as during training.
test_metrics = compute_metrics_on_df(
    df=df_test,
    model=model,
    device=device,
    tokenizer=tokenizer,
    tokenizer_kwargs=tokenizer_kwargs,
    batch_size=config['BATCH_SIZE'],
)

inference: 100%|██████████| 665/665 [03:36<00:00,  3.07it/s]


In [29]:
# Metrics of the reloaded model on the held-out test split.
test_metrics

{'accuracy': 0.6041754831428975,
 'precision_micro': 0.6041754831428975,
 'precision_macro': 0.5143968604645925,
 'precision_weighted': 0.72382863220028,
 'recall_micro': 0.6041754831428975,
 'recall_macro': 0.6036832982628126,
 'recall_weighted': 0.6041754831428975,
 'f1_micro': 0.6041754831428975,
 'f1_macro': 0.5206008760158995,
 'f1_weighted': 0.6414028734173254}