# Classification of arXiv Abstracts Using BERT Models

Levent Güner <leventg@kth.se>

I. Erdem Demir <iedemir@kth.se>

### How to run the code?

* Download the data from https://www.kaggle.com/Cornell-University/arxiv
* Install the required libraries
* Run the lines one by one
* Google Colab is recommended after Part 3. The code is written for loading the data from Google Drive.
* The BERT part is both written for BERT Base Model and SciBERT.
* In sections 3.2 and 3.4.1, Change the required parts to use SciBERT.

In [1]:
!pip install transformers
import numpy as np
import pandas as pd
import json
import dask.bag as db

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import classification_report, confusion_matrix, f1_score, accuracy_score
from sklearn.preprocessing import MultiLabelBinarizer

stop_words = set(stopwords.words('english'))

import sklearn.metrics as mt
from sklearn.base import BaseEstimator, TransformerMixin
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import sklearn.model_selection as ms

import tensorflow as tf
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Dense, Embedding,  Dropout,  SpatialDropout1D, LSTM
from tensorflow.keras import backend as K
from keras.preprocessing.sequence import pad_sequences

import torch
from torch.nn import BCEWithLogitsLoss
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

from transformers import BertTokenizer, BertForSequenceClassification, AdamW

from tqdm import trange
import pickle

sns.set()

Collecting tokenizers<0.11,>=0.10.1
  Using cached tokenizers-0.10.3-cp39-cp39-macosx_12_0_arm64.whl
Installing collected packages: tokenizers
Successfully installed tokenizers-0.10.3


In [2]:
#define text cleaner

class CleanText(BaseEstimator, TransformerMixin):
    def remove_mentions(self, input_text):
        return re.sub(r'@\w+', '', input_text)
    
    def remove_urls(self, input_text):
        return re.sub(r'http.?://[^\s]+[\s]?', '', input_text)
    
    def emoji_oneword(self, input_text):
        # By compressing the underscore, the emoji is kept as one word
        return input_text.replace('_','')
    
    def remove_punctuation(self, input_text):
        # Make translation table
        punct = string.punctuation
        trantab = str.maketrans(punct, len(punct)*' ')  # Every punctuation symbol will be replaced by a space
        return input_text.translate(trantab)
    def remove_digits(self, input_text):
        return re.sub('\d+', '', input_text)
    
    def to_lower(self, input_text):
        return input_text.lower()
    
    def remove_stopwords(self, input_text):
        stopwords_list = stopwords.words('english')
        # Some words which might indicate a certain sentiment are kept via a whitelist
        whitelist = ["n't", "not", "no"]
        words = input_text.split() 
        clean_words = [word for word in words if (word not in stopwords_list or word in whitelist) and len(word) > 1] 
        return " ".join(clean_words) 
    
    def stemming(self, input_text):
        porter = PorterStemmer()
        words = input_text.split() 
        stemmed_words = [porter.stem(word) for word in words]
        return " ".join(stemmed_words)
    
    def fit(self, X, y=None, **fit_params):
        return self
    
    def transform(self, X, **transform_params):
        clean_X = X.apply(self.remove_mentions).apply(self.remove_urls).apply(self.emoji_oneword).apply(self.remove_punctuation).apply(self.remove_digits).apply(self.to_lower).apply(self.remove_stopwords)
        return clean_X

# 1 Data Handling

## 1.1 Open Data

In [3]:
#get data
data_file = 'arxiv-metadata-oai-snapshot.json'

def get_metadata():
    with open(data_file, 'r') as f:
        for line in f:
            yield line


#read data with dask
docs = db.read_text(data_file).map(json.loads)
print('count:',docs.count().compute())


#see an instance example
docs.take(1)

FileNotFoundError: [Errno 2] No such file or directory: '/Users/erdemdemir/Desktop/scalable-projects/ID2223-Final-Levent_Güner-Erdem_Demir/arxiv-metadata-oai-snapshot.json'

Traceback
---------
  File "/Users/erdemdemir/DataspellProjects/bert-arxiv/venv/lib/python3.9/site-packages/dask/local.py", line 220, in execute_task
    result = _execute_task(task, data)
  File "/Users/erdemdemir/DataspellProjects/bert-arxiv/venv/lib/python3.9/site-packages/dask/core.py", line 119, in _execute_task
    return func(*(_execute_task(a, cache) for a in args))
  File "/Users/erdemdemir/DataspellProjects/bert-arxiv/venv/lib/python3.9/site-packages/dask/core.py", line 119, in <genexpr>
    return func(*(_execute_task(a, cache) for a in args))
  File "/Users/erdemdemir/DataspellProjects/bert-arxiv/venv/lib/python3.9/site-packages/dask/core.py", line 113, in _execute_task
    return [_execute_task(a, cache) for a in arg]
  File "/Users/erdemdemir/DataspellProjects/bert-arxiv/venv/lib/python3.9/site-packages/dask/core.py", line 113, in <listcomp>
    return [_execute_task(a, cache) for a in arg]
  File "/Users/erdemdemir/DataspellProjects/bert-arxiv/venv/lib/python3.9/site-packages/dask/core.py", line 119, in _execute_task
    return func(*(_execute_task(a, cache) for a in args))
  File "/Users/erdemdemir/DataspellProjects/bert-arxiv/venv/lib/python3.9/site-packages/dask/bag/core.py", line 2440, in empty_safe_apply
    _, part = peek(part)
  File "/Users/erdemdemir/DataspellProjects/bert-arxiv/venv/lib/python3.9/site-packages/toolz/itertoolz.py", line 1000, in peek
    item = next(iterator)
  File "/Users/erdemdemir/DataspellProjects/bert-arxiv/venv/lib/python3.9/site-packages/dask/bag/core.py", line 2012, in __next__
    vals = [next(i) for i in self.iters]
  File "/Users/erdemdemir/DataspellProjects/bert-arxiv/venv/lib/python3.9/site-packages/dask/bag/core.py", line 2012, in <listcomp>
    vals = [next(i) for i in self.iters]
  File "/Users/erdemdemir/DataspellProjects/bert-arxiv/venv/lib/python3.9/site-packages/dask/bag/text.py", line 164, in file_to_blocks
    with lazy_file as f:
  File "/Users/erdemdemir/DataspellProjects/bert-arxiv/venv/lib/python3.9/site-packages/fsspec/core.py", line 103, in __enter__
    f = self.fs.open(self.path, mode=mode)
  File "/Users/erdemdemir/DataspellProjects/bert-arxiv/venv/lib/python3.9/site-packages/fsspec/spec.py", line 1030, in open
    f = self._open(
  File "/Users/erdemdemir/DataspellProjects/bert-arxiv/venv/lib/python3.9/site-packages/fsspec/implementations/local.py", line 155, in _open
    return LocalFileOpener(path, mode, fs=self, **kwargs)
  File "/Users/erdemdemir/DataspellProjects/bert-arxiv/venv/lib/python3.9/site-packages/fsspec/implementations/local.py", line 250, in __init__
    self._open()
  File "/Users/erdemdemir/DataspellProjects/bert-arxiv/venv/lib/python3.9/site-packages/fsspec/implementations/local.py", line 255, in _open
    self.f = open(self.path, mode=self.mode)


## 1.2 Convert to Pandas DF

In [None]:
#get latest versions and convert to df

get_latest_version = lambda x: x['versions'][-1]['created']

# get only necessary fields
trim = lambda x: {'id': x['id'],
                  'title': x['title'],
                  'category':x['categories'].split(' '),
                  'abstract':x['abstract']}
# filter for papers published on or after 2019-01-01
columns = ['id','category','abstract']
docs_df = (docs
             .filter(lambda x: int(get_latest_version(x).split(' ')[3]) > 2018)
             .map(trim)
             .compute())

# convert to pandas
docs_df = pd.DataFrame(docs_df)

# add general category. we are going to use as our target variable
docs_df['general_category'] = docs_df.category.apply(lambda x:[a.split('.')[0] for a in x])

In [None]:
docs_df['sub_category'] = docs_df.category.apply(lambda x:[a.split('.')[1] if ('.' in a) else a.split('.')[0]+'_nsc' for a in x])
docs_df['new_category'] = docs_df.category.apply(lambda x:[[a.split('.')[0],a.split('.')[1]] if ('.' in a) else [a.split('.')[0],a.split('.')[0]+'_nsc'] for a in x])

## 1.3 Prepare Categories

In [None]:
#prepare categories for prediction

mlb = MultiLabelBinarizer()
labels = mlb.fit_transform(docs_df.general_category)

mlb_sub = MultiLabelBinarizer()
labels_sub = mlb_sub.fit_transform(docs_df.sub_category)
labels_sub

In [None]:
# create category and sub-category dict

cats_sub_cats = {}
catvals = docs_df['new_category'].values
for item in mlb.classes_:
    cats_sub_cats[item] = []

for i in range(len(docs_df['new_category'])):
    for item in catvals[i]:
        cats_sub_cats[item[0]].append(item[1])
        
for item in mlb.classes_:
    cats_sub_cats[item] = list(set(cats_sub_cats[item]))
cats_sub_cats

In [None]:
df = pd.concat([docs_df[['abstract','title']], pd.DataFrame(labels), pd.DataFrame(labels_sub)], axis=1)
df.columns = ['abstract','title'] + list(mlb.classes_) + list(mlb_sub.classes_)
df.head(4)

## 1.4 Create Samples

In [None]:
# create samples

chosen_cols = ['cs','math','physics','cond-mat','astro-ph','quant-ph','hep-ph']
df_filtered = df[(df['cs']==1) | (df['math']==1) | (df['physics']==1) | (df['cond-mat']==1) | (df['astro-ph']==1) | (df['quant-ph']==1) | (df['hep-ph']==1) ][chosen_cols+['abstract']]
dfad = df_filtered.drop('abstract',axis=1)
idxs = []
for cat in chosen_cols:
    print(cat)
    sample_count=20000
    if cat=='cs':
        sample_count=15000
        dfad['axissum'] = dfad.sum(axis=1)
        id1 = dfad[(dfad[cat]==1) & dfad['axissum']==1].sample(sample_count).index
        idxs.append(np.array(id1))
        print(len(id1))
    else:
        id1 = df_filtered[df_filtered[cat]==1].sample(sample_count).index
        idxs.append(np.array(id1))

idx_list = list(set([j for i in idxs for j in i ]))
df_filtered_new = df_filtered.loc[idx_list,chosen_cols+['abstract']]
df_filtered_new[chosen_cols].sum()

In [None]:
df_filtered_new = df_filtered_new[['abstract', 'cs', 'math', 'physics', 'cond-mat',  'astro-ph', 'quant-ph',
       'hep-ph']].sample(frac=1)


train_size = int(len(df_filtered_new)*0.75)
train_df = df_filtered_new[:train_size]
test_df = df_filtered_new[train_size:]

#save dfs

train_df.to_csv('train_arxiv_2.csv',index=False)
test_df.to_csv('test_arxiv_2.csv',index=False)

# 2 LSTM

## 2.1 Open Sampled Data

In [None]:
train_df = pd.read_csv('train_arxiv_2.csv')
test_df = pd.read_csv('test_arxiv_2.csv')

In [None]:
X_train = train_df.abstract
y_train = train_df.drop('abstract',axis=1)
X_test = test_df.abstract
y_test = test_df.drop('abstract',axis=1)

## 2.2 Clean and Tokenize

In [None]:
#clean
ct = CleanText()
X_train = ct.fit_transform(X_train)
X_test = ct.transform(X_test)

print('cleaned')


#tokenization
max_features = 10000
tokenizer = Tokenizer(num_words=max_features, split=' ')
tokenizer.fit_on_texts(X_train) #sadece train ile yap

X_train = tokenizer.texts_to_sequences(X_train) #train ve test için ayrı ayrı yap
X_train = pad_sequences(X_train,maxlen=100) #train ve test için ayrı ayrı yap

X_test = tokenizer.texts_to_sequences(X_test) #train ve test için ayrı ayrı yap
X_test = pad_sequences(X_test,maxlen=100) #train ve test için ayrı ayrı yap

print('tokenized')

## 2.3 LSTM Model

In [None]:
#create the LSTM model
embed_dim = 64
lstm_out = 10

model = Sequential()
model.add(Embedding(max_features, embed_dim,input_length = X_train.shape[1]))
model.add(LSTM(5))
model.add(Dense(y_train.shape[1],activation='softmax'))
model.compile(loss = 'binary_crossentropy', optimizer='adam',metrics = ['accuracy'])
print(model.summary())

# fit model
batch_size = 128
history = model.fit(X_train, y_train, validation_split=0.1, epochs = 3, batch_size=batch_size, verbose = 1)

In [None]:
model.evaluate(X_test,y_test)

## 2.4 Get Predictions & Evaluate

In [None]:
class_names = y_train.columns
sums = y_test.sum(axis=1)

class_map = {i:class_names[i] for i in range(len(class_names))}
preds = model.predict(X_test) #predicteds

In [None]:
#confusion matrix
sns.heatmap(mt.confusion_matrix(np.argmax(y_test.values[np.where(sums==1)[0]],axis=1),
np.argmax(preds[np.where(sums==1)[0]],axis=1),normalize='true'),
annot=True,
fmt='.2f',
xticklabels=class_names,
yticklabels=class_names)

In [None]:
#classification report
clf_rep = mt.classification_report(y_test.astype(bool).values,(preds>0.4),target_names=class_names)

print(clf_rep)

# 3 BERT Model

## 3.1 Open Data

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
df = pd.read_csv('/content/drive/MyDrive/train_arxiv_2.csv')
df = df.sample(frac=1).reset_index()
df.head()

In [None]:
cols = df.columns
label_cols = list(cols[2:])
num_labels = len(label_cols)
print('Label columns: ', label_cols)
df['one_hot_labels'] = list(df[label_cols].values)
labels = list(df.one_hot_labels.values)
abstracts = list(df.abstract.values)

## 3.2 Tokenization

In [None]:
# allenai/scibert_scivocab_uncased for SciBERT
# bert-base-uncased for BERT Base Model

max_length = 100 # due to GPU memory issues
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True) # tokenizer
encodings = tokenizer.batch_encode_plus(abstracts,max_length=max_length,pad_to_max_length=True) # tokenizer's encoding method
print('tokenizer outputs: ', encodings.keys())

In [None]:
input_ids = encodings['input_ids'] # tokenized and encoded sentences
token_type_ids = encodings['token_type_ids'] # token type ids
attention_masks = encodings['attention_mask'] # attention masks

## 3.3 Validation Split

In [None]:
# Prepare for stratification
label_counts = df.one_hot_labels.astype(str).value_counts()
single_freq = label_counts[label_counts==1].keys()
single_freq_idxs = sorted(list(df[df.one_hot_labels.astype(str).isin(single_freq)].index), reverse=True)
single_freq_input_ids = [input_ids.pop(i) for i in single_freq_idxs]
single_freq_token_types = [token_type_ids.pop(i) for i in single_freq_idxs]
single_freq_attention_masks = [attention_masks.pop(i) for i in single_freq_idxs]
single_freq_labels = [labels.pop(i) for i in single_freq_idxs]

# Use train_test_split to split our data into train and validation sets

train_inputs, validation_inputs, train_labels, validation_labels, train_token_types, validation_token_types, train_masks, validation_masks = train_test_split(input_ids, labels, token_type_ids, attention_masks,
                                                            random_state=42, test_size=0.1, stratify = labels)

# Add single frequency data to train data
train_inputs.extend(single_freq_input_ids)
train_labels.extend(single_freq_labels)
train_masks.extend(single_freq_attention_masks)
train_token_types.extend(single_freq_token_types)


train_inputs = torch.tensor(train_inputs)
train_labels = torch.tensor(train_labels)
train_masks = torch.tensor(train_masks)
train_token_types = torch.tensor(train_token_types)

validation_inputs = torch.tensor(validation_inputs)
validation_labels = torch.tensor(validation_labels)
validation_masks = torch.tensor(validation_masks)
validation_token_types = torch.tensor(validation_token_types)

In [None]:
batch_size = 32

# Iterator with DataLoader

train_data = TensorDataset(train_inputs, train_masks, train_labels, train_token_types)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels, validation_token_types)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

torch.save(train_dataloader,'train_data_loader')
torch.save(validation_dataloader,'validation_data_loader')


## 3.4 Modeling

### 3.4.1 Load Pretrained Model

In [None]:
# allenai/scibert_scivocab_uncased for SciBERT
# bert-base-uncased for BERT Base Model
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=num_labels)
model.cuda()

In [None]:
# setting custom optimization parameters. You may implement a scheduler here as well.
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'gamma', 'beta']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.0}
]

In [None]:
optimizer = AdamW(optimizer_grouped_parameters,lr=2e-5,correct_bias=True)

### 3.4.2 Train the Model

In [2]:

train_loss_set = []

epochs = 2

# trange for tracking the progress
for _ in trange(epochs, desc="Epoch"):

  # Training
  
  # Set our model to training mode (as opposed to evaluation mode)
  model.train()

  # Tracking variables
  tr_loss = 0 #running loss
  num_train_examples, num_train_steps = 0, 0
  
  # Train the data for one epoch
  for step, batch in enumerate(train_dataloader):
    
    batch = tuple(t.to(device) for t in batch) # Add batch to GPU
    batch_input_ids, batch_input_mask, batch_labels, batch_token_types = batch
    optimizer.zero_grad() # Clear gradients

    # Forward pass
    outputs = model(batch_input_ids, token_type_ids=None, attention_mask=batch_input_mask)
    logits = outputs[0]
    loss_func = BCEWithLogitsLoss()
    loss = loss_func(logits.view(-1,num_labels),batch_labels.type_as(logits).view(-1,num_labels)) #convert labels to float for calculation
    train_loss_set.append(loss.item())    

    # Backward pass
    loss.backward()
    # Update parameters and take a step using the computed gradient
    optimizer.step()
    # scheduler.step()
    # Update tracking variables
    tr_loss += loss.item()
    num_train_examples += batch_input_ids.size(0)
    num_train_steps += 1

  print("Train loss: {}".format(tr_loss/num_train_steps))

  # Validation

  # Put model in evaluation state to calculate loss on the validation set
  model.eval()

  logit_preds,true_labels,pred_labels,tokenized_texts = [],[],[],[]

  # Predictions
  for i, batch in enumerate(validation_dataloader):
    batch = tuple(t.to(device) for t in batch) # Pass to GPU
    # Get inputs from our dataloader
    batch_input_ids, batch_input_mask, batch_labels, batch_token_types = batch
    with torch.no_grad():
      # Forward pass
      outs = model(batch_input_ids, token_type_ids=None, attention_mask=batch_input_mask)
      batch_logit_pred = outs[0]
      pred_label = torch.sigmoid(batch_logit_pred)

      batch_logit_pred = batch_logit_pred.detach().cpu().numpy()
      pred_label = pred_label.to('cpu').numpy()
      batch_labels = batch_labels.to('cpu').numpy()

    tokenized_texts.append(batch_input_ids)
    logit_preds.append(batch_logit_pred)
    true_labels.append(batch_labels)
    pred_labels.append(pred_label)

  # Flatten outputs
  pred_labels = [item for preds in pred_labels for item in preds]
  true_labels = [item for trues in true_labels for item in trues]

  # Calculate Accuracy
  threshold = 0.5
  pred_bools = [pl>threshold for pl in pred_labels]
  true_bools = [tl==1 for tl in true_labels]
  val_f1_accuracy = f1_score(true_bools,pred_bools,average='micro')*100
  val_accuracy = accuracy_score(true_bools, pred_bools)*100

  print('F1 Validation Accuracy: {:.2f} %'.format(val_f1_accuracy))
  print('Validation Accuracy: {:.2f} %'.format(val_accuracy))

NameError: name 'trange' is not defined

In [None]:
# Save model for the future
torch.save(model.state_dict(), '/content/drive/MyDrive/bert_model_arxiv') #

In [None]:
# Load model to evaluate
model.load_state_dict(torch.load('/content/drive/MyDrive/bert_model_arxiv'))

## 3.5 Test the Model

### 3.5.1 Load Test Data

In [None]:
test_df = pd.read_csv('/content/drive/MyDrive/test_arxiv_2.csv')
test_df['one_hot_labels'] = list(test_df[label_cols].values)
test_df.head()

In [None]:
test_labels = list(test_df.one_hot_labels.values)
test_abstracts = list(test_df.abstract.values)

In [None]:
# Encode input data
test_encodings = tokenizer.batch_encode_plus(test_abstracts,max_length=max_length,pad_to_max_length=True)
test_input_ids = test_encodings['input_ids']
test_token_type_ids = test_encodings['token_type_ids']
test_attention_masks = test_encodings['attention_mask']

In [None]:
# Make tensors & data loader from the data
test_inputs = torch.tensor(test_input_ids)
test_labels = torch.tensor(test_labels)
test_masks = torch.tensor(test_attention_masks)
test_token_types = torch.tensor(test_token_type_ids)
test_data = TensorDataset(test_inputs, test_masks, test_labels, test_token_types)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

torch.save(test_dataloader,'test_data_loader')

### 3.5.2 Get Predictions

In [None]:
# Put model into evaluation state to evaluate loss on the validation set
model.eval()

logit_preds,true_labels,pred_labels,tokenized_texts = [],[],[],[]

# Predict
for i, batch in enumerate(test_dataloader):
  batch = tuple(t.to(device) for t in batch)

  batch_input_ids, batch_input_mask, batch_labels, batch_token_types = batch
  with torch.no_grad():
    # Forward pass
    outs = model(batch_input_ids, token_type_ids=None, attention_mask=batch_input_mask)
    batch_logit_pred = outs[0]
    pred_label = torch.sigmoid(batch_logit_pred)

    batch_logit_pred = batch_logit_pred.detach().cpu().numpy()
    pred_label = pred_label.to('cpu').numpy()
    batch_labels = batch_labels.to('cpu').numpy()

  tokenized_texts.append(batch_input_ids)
  logit_preds.append(batch_logit_pred)
  true_labels.append(batch_labels)
  pred_labels.append(pred_label)

# Flatten outputs
tokenized_texts = [item for tok_text in tokenized_texts for item in tok_text]
pred_labels = [item for preds in pred_labels for item in preds]
true_labels = [item for trues in true_labels for item in trues]

true_bools = [tl==1 for tl in true_labels]

### 3.5.3 Classification Report

In [None]:
pred_bools = [pl>0.5 for pl in pred_labels] # Apply threshold


print('Test F1 Accuracy: ', f1_score(true_bools, pred_bools,average='micro'))
print('Test Flat Accuracy: ', accuracy_score(true_bools, pred_bools),'\n')
clf_report = classification_report(true_bools,pred_bools,target_names=label_cols)
pickle.dump(clf_report, open('classification_report.txt','wb')) #save report
print(clf_report)

### 3.5.4 Confusion Matrix

In [None]:
# confusion matrix

true_bools = np.array(true_bools)
pred_bools = np.array(pred_bools)

sums = true_bools.sum(axis=1)
yt = np.argmax(true_bools[np.where(sums==1)[0]],axis=1)
yp = np.argmax(pred_bools[np.where(sums==1)[0]],axis=1)
sns.heatmap(confusion_matrix(yt,yp,normalize='true'),annot=True,fmt='.2f',xticklabels=label_cols,yticklabels=label_cols)

### 3.5.5 Create Output DataFrame

In [None]:
idx2label = dict(zip(range(7),label_cols))
print(idx2label)

# Getting indices of where boolean one hot vector true_bools is True so we can use idx2label to gather label names
true_label_idxs, pred_label_idxs=[],[]
for vals in true_bools:
  true_label_idxs.append(np.where(vals)[0].flatten().tolist())
for vals in pred_bools:
  pred_label_idxs.append(np.where(vals)[0].flatten().tolist())

# Gathering vectors of label names using idx2label
true_label_texts, pred_label_texts = [], []
for vals in true_label_idxs:
  if vals:
    true_label_texts.append([idx2label[val] for val in vals])
  else:
    true_label_texts.append(vals)

for vals in pred_label_idxs:
  if vals:
    pred_label_texts.append([idx2label[val] for val in vals])
  else:
    pred_label_texts.append(vals)

# Decoding input ids to comment text
abstracts = [tokenizer.decode(text,skip_special_tokens=True,clean_up_tokenization_spaces=False) for text in tokenized_texts]

# Converting lists to df
comparisons_df = pd.DataFrame({'abstract': abstracts, 'true_labels': true_label_texts, 'pred_labels':pred_label_texts})
comparisons_df.to_csv('comparisons.csv')
comparisons_df.head()

comparisons_df.to_csv("/content/drive/MyDrive/comparisons.csv")