In [1]:
# Name of the label to predict. It also selects which fine-tuned model directory
# is loaded and which output directory the predictions are written to.
label_type = 'PARTNER'

# Root of your project directory (here: a folder on the mounted Google Drive).
endometriosis_dir = "/content/drive/MyDrive/endometriosis/"

In [2]:
pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.19.2-py3-none-any.whl (4.2 MB)
[K     |████████████████████████████████| 4.2 MB 5.1 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 52.1 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.7.0-py3-none-any.whl (86 kB)
[K     |████████████████████████████████| 86 kB 6.5 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 63.1 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstallin

In [3]:
import json
import pandas as pd
import numpy as np
from numpy import mean
from collections import Counter
import pickle
import re
import random
import os

from google.colab import drive, files

import torch
from transformers import DistilBertForSequenceClassification, DistilBertTokenizerFast

In [4]:
# Mount Google Drive at /content/drive (only needed when running on Google Colab).
# The project directory configured in the first cell lives under this mount point.
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
# Use the GPU when one is available; otherwise fall back to the CPU so the
# notebook still runs (slowly) on a runtime without CUDA instead of crashing.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Make sure that the following paths align with your project directory
dataset_path = os.path.join(endometriosis_dir, 'data', 'endo+endometriosis_parags.pkl') # data that will be predicted
model_path = os.path.join(endometriosis_dir, 'code', 'output', 'PERSONAS', label_type) # where the fine-tuned model is saved


output_path = os.path.join(endometriosis_dir, 'output', 'predictions', 'PERSONAS', label_type) # main output dir
os.makedirs(output_path, exist_ok=True) # create the output dir (and any missing parents); no-op if it already exists
raw_output_path = os.path.join(output_path, 'raw_output.pkl') # where to save raw predictions
combined_output_path = os.path.join(output_path, 'combined_output.pkl') # where to save combined predictions labels and text
combined_csv_path = os.path.join(output_path, 'combined_output.csv') # where to save main output doc


In [6]:
# Sanity check: display the directory the fine-tuned model will be loaded from
model_path

'/content/drive/MyDrive/endometriosis/code/output/PERSONAS/PARTNER'

### Prepare Data

In [7]:
# Read the pickled dataframe holding all paragraphs from the endometriosis reddit communities
endo_df = pd.read_pickle(dataset_path)
# Plain Python list of the paragraph texts; this is what gets tokenized and classified
worklist = endo_df.loc[:, "text"].tolist()

### Make Predictions

In [8]:
# Number of paragraphs per forward pass --> reduce this if the GPU runs out of memory
batchsize = 8

# Collects one list of class probabilities per processed batch
predictions = list()

# Load the fine-tuned model from our directory, then move it onto the target device
model = DistilBertForSequenceClassification.from_pretrained(model_path)
model = model.to(device)

# Load the tokenizer (make sure this is the same type of tokenizer as what we used when training)
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [9]:
# get predictions in batches

# Start from an empty list so re-running this cell never appends duplicate batches
predictions = []

# Make sure dropout and similar training-only layers are disabled during prediction
model.eval()

# Gradients are not needed for inference; disabling them avoids storing autograd
# state for every batch, which saves GPU memory and time.
with torch.no_grad():
    for i in range(0, len(worklist), batchsize):
        batch = worklist[i:i+batchsize] # extract batch from worklist
        test_encodings = tokenizer(batch, truncation=True, padding=True, return_tensors="pt").to(device) # tokenize the posts
        output = model(**test_encodings) # make predictions with model on our test_encodings for this batch
        batch_predictions = torch.softmax(output.logits, dim=1).tolist() # per-paragraph class probabilities as plain Python lists
        predictions.append(batch_predictions) # one nested list per batch; flattened later
        if i % 100 == 0:
            print(str(i)+" in "+str(len(worklist)))

0 in 589814
200 in 589814
400 in 589814
600 in 589814
800 in 589814
1000 in 589814
1200 in 589814
1400 in 589814
1600 in 589814
1800 in 589814
2000 in 589814
2200 in 589814
2400 in 589814
2600 in 589814
2800 in 589814
3000 in 589814
3200 in 589814
3400 in 589814
3600 in 589814
3800 in 589814
4000 in 589814
4200 in 589814
4400 in 589814
4600 in 589814
4800 in 589814
5000 in 589814
5200 in 589814
5400 in 589814
5600 in 589814
5800 in 589814
6000 in 589814
6200 in 589814
6400 in 589814
6600 in 589814
6800 in 589814
7000 in 589814
7200 in 589814
7400 in 589814
7600 in 589814
7800 in 589814
8000 in 589814
8200 in 589814
8400 in 589814
8600 in 589814
8800 in 589814
9000 in 589814
9200 in 589814
9400 in 589814
9600 in 589814
9800 in 589814
10000 in 589814
10200 in 589814
10400 in 589814
10600 in 589814
10800 in 589814
11000 in 589814
11200 in 589814
11400 in 589814
11600 in 589814
11800 in 589814
12000 in 589814
12200 in 589814
12400 in 589814
12600 in 589814
12800 in 589814
13000 in 589814
1

In [10]:
# Sanity check before saving: display the file the raw predictions will be written to
raw_output_path

'/content/drive/MyDrive/endometriosis/output/predictions/PERSONAS/PARTNER/raw_output.pkl'

In [11]:
# Save raw predictions to output path.
# The context manager guarantees the file is flushed and closed even if pickling fails.
with open(raw_output_path, "wb") as raw_output_file:
    pickle.dump(predictions, raw_output_file)

In [12]:
# To skip inference and reuse previously saved raw predictions, uncomment the lines below:
# with open(raw_output_path, 'rb') as pickle_file:
#     predictions = pickle.load(pickle_file)

In [13]:
# Name of the dataframe column that will hold the binary prediction for this label type
col_name = f"predictions_{label_type}"
col_name

'predictions_PARTNER'

In [14]:
# Total number of individual predictions across all batches (should equal the number of paragraphs)
sum(len(batch_probs) for batch_probs in predictions)

589814

In [15]:
# Number of paragraphs that were sent for prediction, for comparison with the prediction count
len(worklist)

589814

In [16]:
# Attach the model output to the main dataframe.
# First un-nest the per-batch lists so there is one probability list per paragraph.
flat_list = []
for batch_probs in predictions:
    flat_list.extend(batch_probs)
endo_df["predictions"] = flat_list

# Spread the two class probabilities into their own columns
class_probs = pd.DataFrame(flat_list, index=endo_df.index)
endo_df[['prob_0','prob_1']] = class_probs

# This is the column we're interested in, since this is a binary label:
# 1 when the probability of class 1 exceeds 0.5, else 0.
is_positive = endo_df['prob_1'] > .50
endo_df[col_name] = np.where(is_positive, 1, 0)

In [17]:
# Save the dataframe (text plus prediction columns) as a pickle.
# protocol=4 is passed explicitly rather than relying on the library default.
endo_df.to_pickle(combined_output_path, protocol=4)

In [18]:
# To reuse a previously saved combined dataframe instead of recomputing it, uncomment the lines below:
# with open(combined_output_path, 'rb') as pickle_file:
#     endo_df = pickle.load(pickle_file)

In [19]:
# Display the prediction column name again before inspecting the labelled rows
col_name

'predictions_PARTNER'

In [None]:
# Checking that predicted paragraphs make sense: show up to 50 random paragraphs
# that were labelled 1. Capping the sample size at the number of positive rows
# avoids the ValueError that .sample(50) raises when fewer than 50 rows exist.
positive_rows = endo_df[endo_df[col_name] == 1]
positive_rows.sample(min(50, len(positive_rows))).text.to_list()

In [21]:
# Save the combined dataframe (text, probabilities and predicted label) as a CSV file
endo_df.to_csv(combined_csv_path)