In [1]:
# Basic Python modules
from collections import defaultdict
import random
import pickle

# For downloading large files from Google Drive
# https://github.com/wkentaro/gdown
import gdown

# For working with gzip files
# https://docs.python.org/3/library/gzip.html
import gzip

# For working with JSON files
import json

# For data manipulation and analysis
import pandas as pd
import numpy as np

# For machine learning tools and evaluation
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate, cross_val_score, train_test_split

# For deep learning
# https://pytorch.org/tutorials/beginner/basics/quickstart_tutorial.html
import torch

# For plotting and data visualization
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import ticker
sns.set(style='ticks', font_scale=1.2)

In [2]:
!pip3 install transformers

Collecting transformers
  Downloading transformers-4.12.5-py3-none-any.whl (3.1 MB)
[K     |████████████████████████████████| 3.1 MB 11.9 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.1.2-py3-none-any.whl (59 kB)
[K     |████████████████████████████████| 59 kB 7.2 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 50.1 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 37.2 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 37.1 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers


In [3]:
# using DistilBERT for testing --> can switch to BERT once set up
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
from transformers import Trainer, TrainingArguments

In [4]:
# This is the name of the BERT model that we want to use. 
# We're using DistilBERT to save space (it's a distilled version of the full BERT model), 
# and we're going to use the cased (vs uncased) version.
model_name = 'distilbert-base-cased'  

# This is the name of the program management system for NVIDIA GPUs. We're going to send our code here.
device_name = 'cuda'       

# This is the maximum number of tokens in any document sent to BERT.
max_length = 512                                                        

# This is the name of the directory where we'll save our model. You can name it whatever you want.
cached_model_directory_name = 'distilbert-doctors'  

# File path for data
drive_url = 'drive/MyDrive/relationships.csv'

In [5]:
# get the data
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
relations_df = pd.read_csv(drive_url)

In [8]:
# Set up training and testing sets
X = relations_df["text"].to_list()
y = relations_df["DOCTORS"].to_list()

In [9]:
train_texts, test_texts, train_labels, test_labels = train_test_split(X, y, test_size = 0.25)

In [10]:
# load the DistilBERT tokenizer
tokenizer = DistilBertTokenizerFast.from_pretrained(model_name) # The model_name needs to match our pre-trained model.

Downloading:   0%|          | 0.00/208k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/426k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/411 [00:00<?, ?B/s]

In [11]:
test_texts[1:10]

["I get daily lower back pain, but I've had that for more than 10 years since working as a care assistant. I do get tired a lot, but I have epilepsy and sleep quality is an issue for me so it could be that. ",
 "\nI'm so SO fed up of being passed around being told it's not important. I have been in almost constant pain for over a year, had colonoscopy, so many ultrasounds, assessments and always told that it isnt anything.",
 "What exercises do you do for PT homework? I'm curious if they differ from what I have been doing.",
 'Does anyone share my problem and if so, does anyone have any advice for me? Thank you.',
 'Thank you for reading, and I hope this was coherent.',
 'Disclaimer: I understand everyone is different of course but I was looking for some feedback on these two choice. ',
 'Does anyone have an idea what I could possibly do or have advice for me? It would really help,',
 'Today we visited for a couple hours with a couple we’re friends with, I had to use my heating pad the

In [12]:
# PASS TO TOKENIZER, ADD PADDING AND TRUNCATE
train_encodings = tokenizer(train_texts,  truncation=True, padding=True)
test_encodings = tokenizer(test_texts,  truncation=True, padding=True)

## Convert into a Torch Dataset

In [13]:
# MAKE DATASET OBJECTS

class SCDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

In [14]:
train_dataset = SCDataset(train_encodings, train_labels)
test_dataset = SCDataset(test_encodings, test_labels)

## Fine tune on our training data

In [15]:
# Set up training arguments
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=20,   # batch size for evaluation
    learning_rate=5e-5,              # initial learning rate for Adam optimizer
    warmup_steps=50,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
    evaluation_strategy='steps',
)

In [16]:
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased").to(device_name)

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/256M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.weight', 'pre_classi

In [17]:
# Custom evaluation function 
def compute_metrics(pred):
  labels = pred.label_ids
  preds = pred.predictions.argmax(-1)
  acc = accuracy_score(labels, preds)
  return {
      'accuracy': acc,
  }

In [18]:
trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=test_dataset,            # evaluation dataset
    compute_metrics=compute_metrics      # custom evaluation function
)

In [19]:
trainer.train()

***** Running training *****
  Num examples = 750
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 141


Step,Training Loss,Validation Loss,Accuracy
10,0.6913,0.666403,0.648
20,0.6389,0.628308,0.656
30,0.6315,0.630269,0.676
40,0.587,0.619404,0.688
50,0.5328,0.680295,0.696
60,0.5984,0.601477,0.708
70,0.5352,0.590368,0.692
80,0.5587,0.550532,0.7
90,0.5136,0.563228,0.768
100,0.541,0.644261,0.668


***** Running Evaluation *****
  Num examples = 250
  Batch size = 20
***** Running Evaluation *****
  Num examples = 250
  Batch size = 20
***** Running Evaluation *****
  Num examples = 250
  Batch size = 20
***** Running Evaluation *****
  Num examples = 250
  Batch size = 20
***** Running Evaluation *****
  Num examples = 250
  Batch size = 20
***** Running Evaluation *****
  Num examples = 250
  Batch size = 20
***** Running Evaluation *****
  Num examples = 250
  Batch size = 20
***** Running Evaluation *****
  Num examples = 250
  Batch size = 20
***** Running Evaluation *****
  Num examples = 250
  Batch size = 20
***** Running Evaluation *****
  Num examples = 250
  Batch size = 20
***** Running Evaluation *****
  Num examples = 250
  Batch size = 20
***** Running Evaluation *****
  Num examples = 250
  Batch size = 20
***** Running Evaluation *****
  Num examples = 250
  Batch size = 20
***** Running Evaluation *****
  Num examples = 250
  Batch size = 20


Training completed

TrainOutput(global_step=141, training_loss=0.5301735905256677, metrics={'train_runtime': 319.6357, 'train_samples_per_second': 7.039, 'train_steps_per_second': 0.441, 'total_flos': 298051646976000.0, 'train_loss': 0.5301735905256677, 'epoch': 3.0})

In [20]:
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 250
  Batch size = 20


{'epoch': 3.0,
 'eval_accuracy': 0.796,
 'eval_loss': 0.4601612985134125,
 'eval_runtime': 8.4485,
 'eval_samples_per_second': 29.591,
 'eval_steps_per_second': 1.539}

## Assess performance

In [21]:
predicted_labels = trainer.predict(test_dataset)

***** Running Prediction *****
  Num examples = 250
  Batch size = 20


In [22]:
actual_predicted_labels = predicted_labels.predictions.argmax(-1)
len(actual_predicted_labels)

250

In [23]:
from sklearn.metrics import classification_report
class_report = classification_report(predicted_labels.label_ids.flatten(), actual_predicted_labels.flatten(), output_dict=True)
print(classification_report(predicted_labels.label_ids.flatten(), actual_predicted_labels.flatten()))

              precision    recall  f1-score   support

           0       0.82      0.87      0.85       162
           1       0.73      0.66      0.69        88

    accuracy                           0.80       250
   macro avg       0.78      0.76      0.77       250
weighted avg       0.79      0.80      0.79       250

