In [1]:
!pip install transformers==4.19.2

Collecting transformers==4.19.2
  Downloading transformers-4.19.2-py3-none-any.whl (4.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.2/4.2 MB[0m [31m16.9 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1 (from transformers==4.19.2)
  Downloading tokenizers-0.12.1-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.6/6.6 MB[0m [31m51.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, transformers
  Attempting uninstall: tokenizers
    Found existing installation: tokenizers 0.15.0
    Uninstalling tokenizers-0.15.0:
      Successfully uninstalled tokenizers-0.15.0
  Attempting uninstall: transformers
    Found existing installation: transformers 4.35.2
    Uninstalling transformers-4.35.2:
      Successfully uninstalled transformers-4.35.2
Successfully installed tokenizers-0.12.1 transformers-4.19.2


In [2]:
"""
Code to create distilbert model. Predicting distinctiveness indicator
"""

# Import Libraries
import sys
sys.path.append('../../python_packages/')
import pandas as pd
import numpy as np
import time
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import classification_report
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import AdamW
from tqdm import tqdm, trange
import torch
import torch.nn as nn
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
from time import time
import random
import gc
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Device available: ",device)

Device available:  cuda


In [3]:
# Input
data_path = '/content/drive/MyDrive/NLP_LSS/distinctiveness_indicator/distilbert/data/df_distilbert_input_unprocessed.pkl'

# DistilBERT
dbert_model_path = '/content/drive/MyDrive/NLP_LSS/distinctiveness_indicator/distilbert/model/distilbert_unprocessed_shuffled.pth'


In [4]:
# Load DistillBERT model
model_name = 'distilbert-base-cased'
model = DistilBertForSequenceClassification.from_pretrained(model_name, num_labels=2)
tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)
optimizer = torch.optim.Adam([
    {'params': model.distilbert.parameters(), 'lr': 1e-5},
    {'params': model.classifier.parameters(), 'lr': 1e-3}
])

# Read data
df = pd.read_pickle(data_path)
df['filing_dt'] = pd.to_datetime(df['filing_dt'])

# Incidence Rate
print("Incidence Rate: ", ((100*df['distinct_ind'].sum())/(df.shape[0])) )

Downloading:   0%|          | 0.00/465 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/251M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-cased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/208k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/426k [00:00<?, ?B/s]

Incidence Rate:  87.19046356698402


In [5]:
# Randomize input order
def get_shuffled_input(df):
  op_list = []
  for index, row in df.iterrows():
    col_vals = [row['mark_unprocessed'], row['statement_unprocessed'],
                row['mark_translated'], row['wordnet_text'], row['mark_length_text'],
                row['nice_cat_text'], row['nice_description_unpreprocessed'], row['pseudo_mark_unprocessed']]
    random.shuffle(col_vals)
    sep_str = ' [SEP] '.join(col_vals)
    op = '[CLS] ' + sep_str
    op_list.append(op)
  return op_list

shuffled_input = get_shuffled_input(df)
df['bert_input_shuffled'] = shuffled_input
df.head()

Unnamed: 0,serial_no,filing_dt,mark_processed,mark_unprocessed,distinct_ind,intl_class_cd,mark_len,wn_ind,mark_final,translated_ind,...,pseudo_mark_processed,pseudo_mark_unprocessed,statement_processed,statement_unprocessed,wordnet_text,mark_length_text,nice_cat_text,bert_input_processed,bert_input_unprocessed,bert_input_shuffled
0,85507018,2012-01-01,RODOPTICS,RODOPTICS,1,9,1,0,RODOPTICS,0.0,...,Pseudo mark is rod optics,Pseudo mark is ROD OPTICS,eyeglasses eyewear namely eyeglasses reading g...,"Eyeglasses; Eyewear, namely, eyeglasses, readi...",mark absent in Wordnet,mark length is 1,NICE category is 9,[CLS] RODOPTICS [SEP] eyeglasses eyewear namel...,"[CLS] RODOPTICS [SEP] Eyeglasses; Eyewear, nam...",[CLS] mark length is 1 [SEP] Pseudo mark is RO...
1,85507042,2012-01-01,TRIVEDI WINE,TRIVEDI WINE,0,35,2,1,TRIVEDI WINE,0.0,...,no Pseudo mark,no Pseudo mark,intent to use descriptor trivedi wine in conne...,"Intent to use descriptor ""TRIVEDI WINE"" in con...",mark present in Wordnet,mark length is 2,NICE category is 35,[CLS] TRIVEDI WINE [SEP] intent to use descrip...,[CLS] TRIVEDI WINE [SEP] Intent to use descrip...,[CLS] mark length is 2 [SEP] no translation re...
2,85507043,2012-01-01,101 GLASSES,101 GLASSES,1,9,2,1,101 GLASSES,0.0,...,no Pseudo mark,no Pseudo mark,glasses eyewear namely sunglasses eyeglasses o...,"""GLASSES"" Eyewear, namely, sunglasses, eyeglas...",mark present in Wordnet,mark length is 2,NICE category is 9,[CLS] 101 GLASSES [SEP] glasses eyewear namely...,"[CLS] 101 GLASSES [SEP] ""GLASSES"" Eyewear, nam...","[CLS] Scientific, research, navigation, survey..."
3,85507044,2012-01-01,MILLIONAIRE TREASURE HUNT,MILLIONAIRE TREASURE HUNT,1,41,3,1,MILLIONAIRE TREASURE HUNT,0.0,...,no Pseudo mark,no Pseudo mark,entertainment services in the nature of an ong...,Entertainment services in the nature of an on-...,mark present in Wordnet,mark length is 3,NICE category is 41,[CLS] MILLIONAIRE TREASURE HUNT [SEP] entertai...,[CLS] MILLIONAIRE TREASURE HUNT [SEP] Entertai...,[CLS] no translation required [SEP] MILLIONAIR...
4,85507045,2012-01-01,STONE WIND AND VINES,"STONE, WIND AND VINES",1,33,4,1,STONE WIND AND VINES,0.0,...,no Pseudo mark,no Pseudo mark,wine,Wine,mark present in Wordnet,mark length is 4,NICE category is 33,[CLS] STONE WIND AND VINES [SEP] wine [SEP] no...,"[CLS] STONE, WIND AND VINES [SEP] Wine [SEP] n...",[CLS] Wine [SEP] mark length is 4 [SEP] mark p...


In [6]:
# Divide data into train, test and validation
df_train = df[(df['filing_dt']>=pd.to_datetime('2012-01-01')) & (df['filing_dt']<=pd.to_datetime('2017-12-31'))]
df_val = df[(df['filing_dt']>=pd.to_datetime('2018-01-01')) & (df['filing_dt']<=pd.to_datetime('2018-12-31'))]
df_test = df[(df['filing_dt']>=pd.to_datetime('2019-01-01')) & (df['filing_dt']<=pd.to_datetime('2019-12-31'))]

print("Train data shape: ", df_train.shape)
print("Validation data shape: ", df_val.shape)
print("Test data shape: ", df_test.shape)


Train data shape:  (1001779, 23)
Validation data shape:  (234752, 23)
Test data shape:  (264827, 23)


In [7]:
# Create X and Y for modelling
X_train = np.array(df_train['bert_input_shuffled'])
y_train = np.array(df_train['distinct_ind'])
X_val = np.array(df_val['bert_input_shuffled'])
y_val = np.array(df_val['distinct_ind'])
X_test = np.array(df_test['bert_input_shuffled'])
y_test = np.array(df_test['distinct_ind'])

# Generate Batches
batch_size = 16
train_max_idx = batch_size * (len(X_train)//batch_size)
val_max_idx = batch_size * (len(X_val)//batch_size)
test_max_idx = batch_size * (len(X_test)//batch_size)

X_train = X_train[:train_max_idx]
X_train = X_train.reshape(-1, batch_size)
X_train = X_train.tolist()
y_train = y_train[:train_max_idx]
y_train = y_train.reshape(-1, batch_size)

X_val = X_val[:val_max_idx]
X_val = X_val.reshape(-1, batch_size)
X_val = X_val.tolist()
y_val = y_val[:val_max_idx]
y_val = y_val.reshape(-1, batch_size)

X_test = X_test[:test_max_idx]
X_test = X_test.reshape(-1, batch_size)
X_test = X_test.tolist()
y_test = y_test[:test_max_idx]
y_test = y_test.reshape(-1, batch_size)

print("Batches Generated: ")
print("X_train: ", len(X_train))
print("y_train: ", y_train.shape)

print("X_val: ", len(X_val))
print("y_val: ", y_val.shape)

Batches Generated: 
X_train:  62611
y_train:  (62611, 16)
X_val:  14672
y_val:  (14672, 16)


In [8]:
# Model Training
from transformers.utils import logging
logging.set_verbosity(40)

from tqdm import tqdm
model = model.to(device)

num_epochs = 2
for epoch in range(num_epochs):
  model.train()
  for text, labels in tqdm(zip(X_train, y_train), total=len(X_train)):
    # prepare model input through our tokenizer
    model_inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=256)
    # place everything on the right device
    model_inputs = {k:v.to(device) for k,v in model_inputs.items()}
    # labels have to be torch long tensors
    labels = torch.tensor(labels).long()
    labels = labels.to(device)
    # now, we can perform the forward pass
    output = model(**model_inputs, labels=labels)
    loss, logits = output[:2]
    # and the backward pass
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()

torch.save(model, dbert_model_path)
print("Model Saved")

100%|██████████| 62611/62611 [1:49:02<00:00,  9.57it/s]
100%|██████████| 62611/62611 [1:49:08<00:00,  9.56it/s]


Model Saved


In [9]:
# Predictions on validation dataset
print("Predicting on Validation Data")
loaded_model = torch.load(dbert_model_path)
predictions, targets = [], []
pred_logits = []
loaded_model.eval()

with torch.no_grad():
  for text, labels in tqdm(zip(X_val, y_val), total=len(X_val)):
    try:
      model_inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
      model_inputs = {k:v.to(device) for k,v in model_inputs.items()}

      output = loaded_model(**model_inputs)
      logits = output[0]
      pred_logits.append(logits)
      # prediction is the argmax of the logits
      predictions.extend(logits.argmax(dim=1).tolist())
      targets.extend(labels)
    except:
      print("Unable to process: ", text)
      continue

accuracy = metrics.accuracy_score(targets, predictions)
print ("accuracy", accuracy)
classification_report = metrics.classification_report(targets, predictions)
print (classification_report)

Predicting on Validation Data


100%|██████████| 14672/14672 [13:57<00:00, 17.52it/s]


accuracy 0.8997026649400218
              precision    recall  f1-score   support

           0       0.60      0.27      0.38     26001
           1       0.92      0.98      0.95    208751

    accuracy                           0.90    234752
   macro avg       0.76      0.63      0.66    234752
weighted avg       0.88      0.90      0.88    234752



In [10]:
# Predictions on Test data
print("Predicting on Test Data")
predictions, targets = [], []
pred_logits = []
loaded_model.eval()

with torch.no_grad():
  for text, labels in tqdm(zip(X_test, y_test), total=len(X_test)):
    try:
      model_inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
      model_inputs = {k:v.to(device) for k,v in model_inputs.items()}

      output = loaded_model(**model_inputs)
      logits = output[0]
      pred_logits.append(logits)
      # prediction is the argmax of the logits
      predictions.extend(logits.argmax(dim=1).tolist())
      targets.extend(labels)
    except:
      print("Unable to process: ", text)
      continue

from sklearn import metrics
accuracy = metrics.accuracy_score(targets, predictions)
print ("accuracy", accuracy)
classification_report = metrics.classification_report(targets, predictions)
print (classification_report)

Predicting on Test Data


100%|██████████| 16551/16551 [15:55<00:00, 17.33it/s]


accuracy 0.8542837290798139
              precision    recall  f1-score   support

           0       0.65      0.19      0.30     42333
           1       0.86      0.98      0.92    222483

    accuracy                           0.85    264816
   macro avg       0.76      0.59      0.61    264816
weighted avg       0.83      0.85      0.82    264816



In [None]:
from google.colab import runtime
runtime.unassign()