In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

import torch as th

from transformers import TrainingArguments
from transformers import RobertaTokenizerFast, RobertaForSequenceClassification, RobertaConfig


from utils import train_grader_model, prepare_grader_data

In [2]:
# Set up environment: compute device, run flags, and data/model paths.
DEVICE = "cuda" if th.cuda.is_available() else "cpu"  # use the GPU when one is visible
DEBUG = True
study_number = 1
# NOTE: despite its name, results_dir is the path of the study's CSV file
# (it is passed straight to pd.read_csv below), not a directory.
results_dir = f"textwash_data/study{study_number}/intruder_test/full_data_study.csv"
# Plain string literal: there is nothing to interpolate, so no f-prefix is needed.
trained_model_path = "./trained_models/anon_grader.pt"

# Set seeds for the NumPy and PyTorch global random number generators.
SEED = 42
np.random.seed(SEED)
th.manual_seed(SEED)

<torch._C.Generator at 0x262839e3eb0>

In [None]:
# Set up the training arguments for the Hugging Face Trainer.
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=1,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    logging_dir='./logs',
    evaluation_strategy='steps',
    eval_steps=100,
    save_total_limit=1,
    load_best_model_at_end=True,
    # Use the string 'none' to switch off all reporting integrations.
    # Passing the Python value None does NOT disable them: Transformers v4
    # treats None as "all" and enables every installed integration
    # (e.g. wandb, tensorboard).
    report_to='none',
)

In [3]:
# Load the study CSV, keeping only the five columns used downstream, and give
# the outcome column "got_name_truth_q1" the shorter name "re_identify".
columns_to_read = ["type", "text", "file_id", "name", "got_name_truth_q1"]
column_renames = {"got_name_truth_q1": "re_identify"}
study_csv = pd.read_csv(results_dir, usecols=columns_to_read)
raw_data = study_csv.rename(columns=column_renames)

In [4]:
# Collapse the raw rows to one row per (type, file_id, name, text) combination;
# the mean of "re_identify" within each group is the re-identification rate.
group_keys = ["type", "file_id", "name", "text"]
re_identify_rate = raw_data.groupby(group_keys)["re_identify"].mean()
data = re_identify_rate.reset_index()


Unnamed: 0,type,file_id,name,text,re_identify
636,fict,fict_315_d_2_4.txt,brantley ratke,PERSON_FIRSTNAME_1 PERSON_LASTNAME_2 comes fro...,0.0
243,famous,famous_31_d_1_3.txt,emma watson,PERSON_FIRSTNAME_1 PERSON_LASTNAME_1 is an LOC...,0.25
1013,semifamous,semifamous_297_d_3_8.txt,aleksandr gryazin,PERSON_FIRSTNAME_1 PERSON_LASTNAME_1 is a reti...,0.0
1113,semifamous,semifamous_387_d_3_9.txt,walter chorn,PERSON_FIRSTNAME_1 PERSON_LASTNAME_1 was a sma...,0.0
58,famous,famous_153_d_1_1.txt,benedict cumberbatch,PERSON_FIRSTNAME_1 PERSON_LASTNAME_3 is an LOC...,0.333333


In [None]:
# Build the grader inputs from the aggregated frame using the project helper
# imported from utils; the call is unpacked into a (datasets, tokenizer) pair.
# NOTE(review): the structure of `datasets` (e.g. which splits it holds) is not
# visible in this notebook — confirm against utils.prepare_grader_data.
datasets, tokenizer = prepare_grader_data(data, SEED, DEVICE)

In [6]:
# Train the grader model via the project helper imported from utils.
# NOTE(review): `train_data` is not bound in any visible cell of this notebook —
# the preceding cell binds `datasets` and `tokenizer` instead. On a fresh kernel
# (Restart & Run All) this line would raise NameError. Presumably the training
# split held in `datasets` should be passed here — confirm against
# utils.train_grader_model before changing the argument.
model, trainer = train_grader_model(train_data, SEED, training_args, trained_model_path, DEVICE)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/48 [00:00<?, ?it/s]

  labels = th.tensor(self.labels[index]).squeeze()
Epoch:   0%|          | 0/1 [03:41<?, ?it/s]


KeyboardInterrupt: 

In [None]:
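# Save the model weights manually. Left disabled: train_grader_model is already
# given trained_model_path, so it presumably saves the model itself — TODO confirm
# against utils, then either re-enable this line or delete the cell.
# th.save(model.state_dict(), trained_model_path)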