<a href="https://colab.research.google.com/github/huashuai1997/Colab20210803/blob/main/xunfei_paper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
from tqdm.notebook import tqdm

!pip install transformers

from transformers import BertTokenizer
from torch.utils.data import TensorDataset

from transformers import BertForSequenceClassification
import pandas as pd

import numpy as np

from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from transformers import AdamW, get_linear_schedule_with_warmup

from sklearn.metrics import f1_score
import random

Collecting transformers
  Downloading transformers-4.9.1-py3-none-any.whl (2.6 MB)
[K     |████████████████████████████████| 2.6 MB 12.4 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.45-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 35.9 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636 kB)
[K     |████████████████████████████████| 636 kB 45.9 MB/s 
[?25hCollecting huggingface-hub==0.0.12
  Downloading huggingface_hub-0.0.12-py3-none-any.whl (37 kB)
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 35.2 MB/s 
Installing collected packages: tokenizers, sacremoses, pyyaml, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstalling PyYAML-3.13:
      Successfully 

In [None]:
# Report whether a CUDA device is usable. The device name is only queried
# when CUDA is actually present, because torch.cuda.get_device_name(0)
# raises on a CPU-only runtime.
a = torch.cuda.is_available()
b = torch.cuda.device_count()
c = torch.cuda.get_device_name(0) if a and b > 0 else "n/a (no CUDA device)"

print(f"gpu is availiable? {a}")
print(f"device count: {b}")
print(f"device name: {c}")

gpu is availiable? True
device count: 1
device name: Tesla K80


In [None]:
# Read the pre-processed training set and print the row count of each
# column that the rest of the notebook relies on.
pro_train_path = '/content/sample_data/train_data_finished.csv'
df = pd.read_csv(pro_train_path)

for column_name in ('paperid', 'abstract', 'categories'):
    print(len(df[column_name]))

50000
50000
50000


# Data information

In [None]:
# Number of papers per category.
data_info = df.categories.value_counts()
print(data_info)

cs.CV    11038
cs.CL     4260
cs.NI     3218
cs.CR     2798
cs.AI     2706
cs.DS     2509
cs.DC     1994
cs.SE     1940
cs.RO     1884
cs.LO     1741
cs.LG     1352
cs.SY     1292
cs.CY     1228
cs.DB      998
cs.GT      984
cs.HC      943
cs.PL      841
cs.IR      770
cs.CC      719
cs.NE      704
cs.CG      683
cs.OH      677
cs.SI      603
cs.DL      537
cs.DM      523
cs.FL      469
cs.AR      363
cs.CE      362
cs.GR      314
cs.MM      261
cs.ET      230
cs.MA      210
cs.NA      176
cs.SC      172
cs.SD      140
cs.PF      139
cs.MS      105
cs.OS       99
cs.GL       18
Name: categories, dtype: int64


# Encoding the labels:

In [None]:
# Give every category a consecutive integer id, numbered in order of first
# appearance in the training data.
possible_labels = df.categories.unique()
label_dict = {possible_label: index for index, possible_label in enumerate(possible_labels)}
print(f"label dict: {label_dict}")

# Series.map performs a plain per-element dictionary lookup. Unlike
# Series.replace it does not go through pandas' replace/downcast machinery
# (which emits a FutureWarning on recent pandas versions) and it yields an
# integer column directly, since every category is a key of label_dict.
df['label'] = df.categories.map(label_dict)

label dict: {'cs.CL': 0, 'cs.NE': 1, 'cs.DL': 2, 'cs.CV': 3, 'cs.LG': 4, 'cs.DS': 5, 'cs.IR': 6, 'cs.RO': 7, 'cs.DM': 8, 'cs.CR': 9, 'cs.AR': 10, 'cs.NI': 11, 'cs.AI': 12, 'cs.SE': 13, 'cs.CG': 14, 'cs.LO': 15, 'cs.SY': 16, 'cs.GR': 17, 'cs.PL': 18, 'cs.SI': 19, 'cs.OH': 20, 'cs.HC': 21, 'cs.MA': 22, 'cs.GT': 23, 'cs.ET': 24, 'cs.FL': 25, 'cs.CC': 26, 'cs.DB': 27, 'cs.DC': 28, 'cs.CY': 29, 'cs.CE': 30, 'cs.MM': 31, 'cs.NA': 32, 'cs.PF': 33, 'cs.OS': 34, 'cs.SD': 35, 'cs.SC': 36, 'cs.MS': 37, 'cs.GL': 38}


# Train and Validation Split
# Because the labels are imbalanced, we split the data set in a stratified fashion


In [None]:
# Stratified 85/15 split over the row indices; stratifying on the label
# keeps the class proportions approximately equal in both partitions.
X_train, X_val, y_train, y_val = train_test_split(
    df.index.values,
    df.label.values,
    test_size=0.15,
    random_state=2,
    stratify=df.label.values,
)

# Tag every row with the partition it was assigned to.
df['data_type'] = 'not_set'
df.loc[X_train, 'data_type'] = 'train'
df.loc[X_val, 'data_type'] = 'val'

# Row counts per (category, label id, partition).
info = df.groupby(['categories', 'label', 'data_type']).count()
print(info)

                            paperid  title  abstract
categories label data_type                          
cs.AI      12    train         2300   2300      2300
                 val            406    406       406
cs.AR      10    train          309    309       309
                 val             54     54        54
cs.CC      26    train          611    611       611
...                             ...    ...       ...
cs.SE      13    val            291    291       291
cs.SI      19    train          513    513       513
                 val             90     90        90
cs.SY      16    train         1098   1098      1098
                 val            194    194       194

[78 rows x 3 columns]


# BertTokenizer and Encoding the Data

In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)


def _encode_abstracts(abstracts, max_length=200):
    """Tokenize abstracts into fixed-length PyTorch tensors.

    Args:
        abstracts: iterable of abstract strings.
        max_length: number of tokens every sequence is padded or truncated to.

    Returns:
        The tokenizer's batch encoding; this notebook reads its
        'input_ids' and 'attention_mask' entries.
    """
    return tokenizer.batch_encode_plus(
        list(abstracts),
        add_special_tokens=True,
        return_attention_mask=True,
        # `pad_to_max_length=True` is deprecated; `padding='max_length'` is
        # its replacement. Truncation is requested explicitly instead of
        # relying on the implicit fallback the tokenizer warns about.
        padding='max_length',
        truncation=True,
        max_length=max_length,
        return_tensors='pt',
    )


# Select each partition once and reuse it for both the text and the labels.
train_rows = df[df.data_type == 'train']
val_rows = df[df.data_type == 'val']

encoded_data_train = _encode_abstracts(train_rows.abstract.values)
encoded_data_val = _encode_abstracts(val_rows.abstract.values)

input_ids_train = encoded_data_train['input_ids']
attention_masks_train = encoded_data_train['attention_mask']
labels_train = torch.tensor(train_rows.label.values)

input_ids_val = encoded_data_val['input_ids']
attention_masks_val = encoded_data_val['attention_mask']
labels_val = torch.tensor(val_rows.label.values)

# Each dataset item is an (input_ids, attention_mask, label) triple.
dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)
dataset_val = TensorDataset(input_ids_val, attention_masks_val, labels_val)
print(f"len dataset train: {len(dataset_train)}")
print(f"len dataset val: {len(dataset_val)}")


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=28.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466062.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=570.0, style=ProgressStyle(description_…

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.







len dataset train: 42500
len dataset val: 7500


# BERT Pre-trained Model
## `num_labels` sets the number of output labels. We don’t need `output_attentions` or `output_hidden_states`, so both are disabled.

In [None]:
# Load pre-trained BERT for sequence classification with one output label
# per category in label_dict. Attention maps and hidden states are not
# requested.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(label_dict),
    output_hidden_states=False,
    output_attentions=False,
)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

# Data Loaders
## DataLoader combines a dataset and a sampler; we use RandomSampler for training and SequentialSampler for validation.

In [None]:
# Mini-batch size used by the loaders below.
batch_size = 3

# Training batches are drawn in random order.
train_sampler = RandomSampler(dataset_train)
dataloader_train = DataLoader(dataset_train, sampler=train_sampler, batch_size=batch_size)

# Validation batches keep the dataset order.
validation_sampler = SequentialSampler(dataset_val)
dataloader_validation = DataLoader(dataset_val, sampler=validation_sampler, batch_size=batch_size)

# Optimizer & Scheduler

In [None]:
# Number of passes over the training set.
epochs = 6

optimizer = AdamW(model.parameters(), lr=1e-5, eps=1e-8)

# Linear learning-rate schedule without warm-up, stepped once per training
# batch, so the total step count is batches-per-epoch times epochs.
total_training_steps = len(dataloader_train) * epochs
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_training_steps,
)

# Performance Metrics

In [None]:
def f1_score_func(preds, labels):
    """Return the weighted F1 score of the arg-max predictions.

    Args:
        preds: 2-D array of per-class scores; the predicted class of each
            row is the index of its largest value along axis 1.
        labels: array of true class ids; flattened before scoring.
    """
    predicted_classes = np.argmax(preds, axis=1).flatten()
    true_classes = labels.flatten()
    return f1_score(y_true=true_classes, y_pred=predicted_classes, average='weighted')


def accuracy_per_class(preds, labels):
    """Print, for every class present in ``labels``, correct hits over class size.

    Args:
        preds: 2-D array of per-class scores; the predicted class of each
            row is the index of its largest value along axis 1.
        labels: array of true class ids; flattened before comparison.
    """
    # Reverse lookup: integer id -> category name.
    id_to_name = {class_id: name for name, class_id in label_dict.items()}

    predicted = np.argmax(preds, axis=1).flatten()
    actual = labels.flatten()

    for class_id in np.unique(actual):
        in_class = actual == class_id
        n_total = np.count_nonzero(in_class)
        n_correct = np.count_nonzero(predicted[in_class] == class_id)
        print(f'Class: {id_to_name[class_id]}')
        print(f'Accuracy: {n_correct}/{n_total}\n')

# Training Loop

In [None]:
# Seed Python, NumPy and PyTorch (CPU and all CUDA devices) with one value.
seed_val = 17
for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    seed_fn(seed_val)

# Use the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
model.to(device)

print(device)
print('*' * 40)

def evaluate(dataloader_val):
    """Run the global ``model`` over ``dataloader_val`` without gradients.

    Args:
        dataloader_val: loader whose batches are indexed as
            (input_ids, attention_mask, labels).

    Returns:
        A tuple ``(loss_val_avg, predictions, true_vals)``: the summed batch
        loss divided by the number of batches, the concatenated logits, and
        the concatenated label ids (both as NumPy arrays).
    """
    model.eval()

    running_loss = 0
    logit_chunks = []
    label_chunks = []

    for batch in dataloader_val:
        on_device = [tensor.to(device) for tensor in batch]
        labels = on_device[2]

        with torch.no_grad():
            outputs = model(
                input_ids=on_device[0],
                attention_mask=on_device[1],
                labels=labels,
            )

        # With labels supplied, the loss comes first and the logits second.
        loss, logits = outputs[0], outputs[1]
        running_loss += loss.item()

        logit_chunks.append(logits.detach().cpu().numpy())
        label_chunks.append(labels.cpu().numpy())

    loss_val_avg = running_loss / len(dataloader_val)
    predictions = np.concatenate(logit_chunks, axis=0)
    true_vals = np.concatenate(label_chunks, axis=0)

    return loss_val_avg, predictions, true_vals


cuda
****************************************


# Train

In [None]:
# Fine-tune for `epochs` passes. After every epoch the weights are saved and
# the model is scored on the validation loader.
for epoch in tqdm(range(1, epochs + 1)):
    # Release cached GPU memory and print the allocator report. These calls
    # are CUDA-only, so they are skipped when `device` fell back to the CPU.
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        info = torch.cuda.memory_summary(device=None, abbreviated=False)
        print(info)

    model.train()

    loss_train_total = 0

    progress_bar = tqdm(dataloader_train, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
    for batch in progress_bar:

        model.zero_grad()

        batch = tuple(b.to(device) for b in batch)

        inputs = {'input_ids': batch[0],
                  'attention_mask': batch[1],
                  'labels': batch[2],
                  }

        outputs = model(**inputs)

        # With labels supplied, the first output is the loss.
        loss = outputs[0]
        batch_loss = loss.item()
        loss_train_total += batch_loss
        loss.backward()

        # Clip the global gradient norm to 1.0 before the optimizer step.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        # The learning-rate schedule advances once per batch.
        scheduler.step()

        # Show the batch loss as returned by the model. `len(batch)` is the
        # number of tensors in the batch tuple (always 3), not the number of
        # samples, so dividing by it only distorted the displayed value.
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(batch_loss)})


    # One checkpoint per epoch.
    torch.save(model.state_dict(), f'/content/sample_data/finetuned_BERT_epoch_{epoch}.model')

    tqdm.write(f'\nEpoch {epoch}')

    loss_train_avg = loss_train_total / len(dataloader_train)
    tqdm.write(f'Training loss: {loss_train_avg}')

    val_loss, predictions, true_vals = evaluate(dataloader_validation)
    val_f1 = f1_score_func(predictions, true_vals)
    tqdm.write(f'Validation loss: {val_loss}')
    tqdm.write(f'F1 Score (Weighted): {val_f1}')

# Loading and Evaluating the Model

In [None]:
# Rebuild the classifier with the same number of labels, then restore the
# weights saved after epoch 1 and report per-class accuracy on validation.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(label_dict),
    output_attentions=False,
    output_hidden_states=False,
)
model.to(device)

# The checkpoint is read onto the CPU; load_state_dict then copies the
# values into the model's existing parameters.
checkpoint_path = '/content/sample_data/finetuned_BERT_epoch_1.model'
state_dict = torch.load(checkpoint_path, map_location=torch.device('cpu'))
model.load_state_dict(state_dict)

_, predictions, true_vals = evaluate(dataloader_validation)
accuracy_per_class(predictions, true_vals)

# Submit result

In [None]:
# Submit result: predict a category for every test abstract and write the
# predictions to a CSV file.
pro_test_path = '/content/sample_data/test_data_finished.csv'

df = pd.read_csv(pro_test_path)

encoded_data_test = tokenizer.batch_encode_plus(
    list(df.abstract.values),
    add_special_tokens=True,
    return_attention_mask=True,
    # `pad_to_max_length=True` is deprecated; pad and truncate explicitly.
    padding='max_length',
    truncation=True,
    max_length=200,
    return_tensors='pt'
)
# Input data: the test set has no labels, so each item is an
# (input_ids, attention_mask) pair.
input_ids_test = encoded_data_test['input_ids']
attention_masks_test = encoded_data_test['attention_mask']

dataset_test = TensorDataset(input_ids_test, attention_masks_test)
print(f"len dataset test: {len(dataset_test)}")

dataloader_test = DataLoader(dataset_test,
                             sampler=SequentialSampler(dataset_test),
                             batch_size=batch_size)

# Predict with the weights saved after epoch 1.
model = BertForSequenceClassification.from_pretrained("bert-base-uncased",
                                                      num_labels=len(label_dict),
                                                      output_attentions=False,
                                                      output_hidden_states=False)
model.to(device)
model.load_state_dict(torch.load('/content/sample_data/finetuned_BERT_epoch_1.model', map_location=torch.device('cpu')))

model.eval()
predict = []
for batch in dataloader_test:
    batch = tuple(b.to(device) for b in batch)

    inputs = {'input_ids': batch[0],
              'attention_mask': batch[1],
              }

    with torch.no_grad():
        outputs = model(**inputs)

    # No labels are passed here, so the model returns no loss and the logits
    # are the FIRST output (index 1 is out of range in this call).
    logits = outputs[0]

    predict.append(logits.detach().cpu().numpy())
# Stack the per-batch logits and take the highest-scoring class per row.
predict = np.concatenate(predict, axis=0)

preds_flat = np.argmax(predict, axis=1).flatten()

# Reverse of label_dict: integer id -> category name.
idx2label = {index: name for name, index in label_dict.items()}
str_label = [idx2label[label] for label in preds_flat]

paper_id = df['paperid'].tolist()
# Assemble the submission table: one row per paper id, in test-file order.
dictionary = {'paperid': paper_id, 'categories': str_label}
df = pd.DataFrame(dictionary)
# Save to CSV.
df.to_csv('/content/sample_data/result.csv', index=False)