In [None]:
# Name of the label / model to work with.
#   - It must match a folder under chronic_conditions/code/output: the trained
#     model is read from that folder and all output is written back into it.
label_type = 'PERCEIVED-NEGLIGENCE'

# Root of the shared chronic conditions folder on the mounted Google Drive.
chronic_conditions_dir = "/content/drive/MyDrive/chronic_conditions/"

In [None]:
pip install transformers

Collecting transformers
  Downloading transformers-4.13.0-py3-none-any.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 5.6 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 33.3 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 34.8 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 40.4 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.2.1-py3-none-any.whl (61 kB)
[K     |████████████████████████████████| 61 kB 353 kB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
 

In [None]:
import json
import pandas as pd
import numpy as np
from numpy import mean
from collections import Counter
import pickle
import re
import random
import os

from google.colab import drive, files

import torch
from transformers import DistilBertForSequenceClassification, DistilBertTokenizerFast

In [None]:
# Mount Google Drive at /content/drive; chronic_conditions_dir points below this mount point.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Run on the GPU when the runtime has one; otherwise fall back to the CPU
# instead of failing when the model / tensors are moved to the device.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

dataset_path = os.path.join(chronic_conditions_dir, 'endo+endometriosis_parags.csv') # data that will be predicted
model_path = os.path.join(chronic_conditions_dir, 'code/output/', label_type) # where the fine-tuned model is saved


# Outputs go into the same per-label folder the model is loaded from.
output_path = os.path.join(chronic_conditions_dir, 'code/output/', label_type) # main output dir
raw_output_path = os.path.join(output_path, 'raw_output') # where to save raw predictions
combined_output_path = os.path.join(output_path, 'combined_output') # where to save combined predictions labels and text
combined_csv_path = os.path.join(output_path, 'combined_output.csv') # where to save main output doc


In [None]:
# Display the resolved model directory as a quick sanity check.
model_path

'/content/drive/MyDrive/chronic_conditions/code/output/PERCEIVED-NEGLIGENCE'

### Prepare Data

In [None]:
# Read every paragraph collected from the endometriosis reddit communities
# and keep the raw text column as a plain Python list for batching.
endo_df = pd.read_csv(dataset_path)
text_column = endo_df["text"]
worklist = text_column.tolist()

In [None]:
# Peek at nine paragraphs (positions 1 through 9 -- the slice starts after the first element).
worklist[1:10]

["I'm so very glad to see the postings becoming regular and the great contributions you have all made to this sub-reddit! Kudos for being awesome!",
 'I know that many in our community have been undergoing new treatment plans, several with recent surgeries and most dealing with major decisions. I wanted to touch base with everyone and hope you will post your recent struggles, triumphs and knowledge. It is so good to hear about those things. It reminds us that we are not alone, that progress is being made and successes are being had! ',
 "So, it's a pretty open forum, but I'm encouraging everyone to update the community on what is new in your Endo journey! And thanks for making this such a great community!",
 'EDIT: Woot! 62 readers! Welcome to all of our new readers! :D',
 "I'm a 20yo female (rather obviously) and have been having terrible abdominal pain since I was 16 - before this they thought it was just ovarian cysts causing pain (until the cysts were gone and the pain didn't stop)

### Make Predictions

In [None]:
# number of paragraphs sent through the model at once --> lower this if the GPU runs out of memory
batchsize = 8
predictions = []

# fine-tuned classifier stored in our per-label directory, moved onto the chosen device
model = DistilBertForSequenceClassification.from_pretrained(model_path)
model = model.to(device)

# tokenizer (must be the same type of tokenizer that was used when training)
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [None]:
# get predictions in batches

# Start from an empty list so that re-running this cell does not append a
# second copy of every batch to results left over from an earlier run.
predictions = []

# Inference only: no_grad() stops autograd from recording a graph for each
# forward pass, which keeps memory use down.
with torch.no_grad():
    for i in range(0, len(worklist), batchsize):
        batch = worklist[i:i+batchsize] # extract batch from worklist
        test_encodings = tokenizer(batch, truncation=True, padding=True, return_tensors="pt").to(device) # tokenize the posts
        output = model(**test_encodings) # make predictions with model on our test_encodings for this batch
        batch_predictions = torch.softmax(output.logits, dim=1).tolist() # per-class probabilities for each post in the batch
        predictions.append(batch_predictions) # one sub-list per batch; flattened later
        # report progress every 25 batches, independent of the chosen batch size
        if (i // batchsize) % 25 == 0:
            print(str(i)+" in "+str(len(worklist)))

0 in 86236
200 in 86236
400 in 86236
600 in 86236
800 in 86236
1000 in 86236
1200 in 86236
1400 in 86236
1600 in 86236
1800 in 86236
2000 in 86236
2200 in 86236
2400 in 86236
2600 in 86236
2800 in 86236
3000 in 86236
3200 in 86236
3400 in 86236
3600 in 86236
3800 in 86236
4000 in 86236
4200 in 86236
4400 in 86236
4600 in 86236
4800 in 86236
5000 in 86236
5200 in 86236
5400 in 86236
5600 in 86236
5800 in 86236
6000 in 86236
6200 in 86236
6400 in 86236
6600 in 86236
6800 in 86236
7000 in 86236
7200 in 86236
7400 in 86236
7600 in 86236
7800 in 86236
8000 in 86236
8200 in 86236
8400 in 86236
8600 in 86236
8800 in 86236
9000 in 86236
9200 in 86236
9400 in 86236
9600 in 86236
9800 in 86236
10000 in 86236
10200 in 86236
10400 in 86236
10600 in 86236
10800 in 86236
11000 in 86236
11200 in 86236
11400 in 86236
11600 in 86236
11800 in 86236
12000 in 86236
12200 in 86236
12400 in 86236
12600 in 86236
12800 in 86236
13000 in 86236
13200 in 86236
13400 in 86236
13600 in 86236
13800 in 86236
14000 i

In [None]:
# Save raw predictions to output path.
# The context manager closes the file handle, so the pickle is flushed to
# disk even if dump() raises part-way through.
with open(raw_output_path, "wb") as raw_file:
    pickle.dump(predictions, raw_file)

In [None]:
# if just loading existing pickled predictions:
# with open(raw_output_path, 'rb') as pickle_file:
#     predictions = pickle.load(pickle_file)

In [None]:
# choose the column name for this prediction label type
# (later cells read `col_name`, so the assignment must use exactly that name)
col_name = "predictions_"+label_type
col_name

'predictions_PERCEIVED-NEGLIGENCE'

In [None]:
# total number of predictions across all batches
sum(len(batch_predictions) for batch_predictions in predictions)

86236

In [None]:
# check length of text -- this must equal the total number of predictions,
# because the predictions are assigned to the dataframe row by row
len(worklist)

86236

In [None]:
# flatten the per-batch lists into one probability pair per paragraph
flat_list = []
for batch_probs in predictions:
    flat_list.extend(batch_probs)

# keep the raw pair on each row, then split it into one column per class
endo_df["predictions"] = flat_list
class_probs = pd.DataFrame(flat_list, index=endo_df.index)
endo_df[['prob_0','prob_1']] = class_probs

# binary label: 1 when the class-1 probability is above 0.5, else 0 -- this is the column we're interested in
is_positive = endo_df['prob_1'] > .50
endo_df[col_name] = np.where(is_positive, 1, 0)

In [None]:
# Pickle the whole dataframe to combined_output_path so it can be reloaded without re-running the model.
endo_df.to_pickle(combined_output_path)

In [None]:
# if just loading existing pickled predictions:
# with open(combined_output_path, 'rb') as pickle_file:
#     endo_df = pickle.load(pickle_file)

In [None]:
# Count how many paragraphs received each value in the predicted-label column.
Counter(endo_df[col_name])

Counter({0: 78764, 1: 7472})

In [None]:
# Eyeball up to 100 randomly chosen paragraphs that were labelled 1.
positive_rows = endo_df[endo_df[col_name] == 1]
# Cap n at the number of positives: sampling without replacement raises a
# ValueError when n exceeds the number of rows. The fixed random_state makes
# the displayed sample reproducible across runs.
positive_rows.sample(n=min(100, len(positive_rows)), random_state=0).text.to_list()

["I saw a gynaecologist privately at the end of August and found the appointment a waste of time, he barely listened to me about what was happening, putting an 18 pound weight gain in 3 weeks down to me eating too much when I had already stated my diet had not changed, he did an exterior scan and told me it definitely wasn't pcos and my uterus looked healthy, and told me to try the combined pill which I had already explained I couldn't and didn't feel comfortable taking. I felt like he was berating me when I repeated why I couldn't take it and he simply said he wasn't asking me to take it for long like I was a silly little girl who didn't know what was good for me.",
 'Hi everyone. I have been suffering with severe pain on my periods for a decade, and whenever I do an online search endo is the top result. But I’ve not had a lap to confirm and the most of the symptoms don’t match so just wanted to check what you might think of this? Doctors never showed too much interest, they always wa

In [None]:
# Display where the CSV output will be written.
combined_csv_path

'/content/drive/MyDrive/chronic_conditions/code/output/PERCEIVED-NEGLIGENCE/combined_output.csv'

In [None]:
# Write the dataframe to the main output CSV (the row index is written as the first column).
endo_df.to_csv(combined_csv_path)