In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

import torch as th

from transformers import TrainingArguments
from transformers import RobertaTokenizerFast, RobertaForSequenceClassification, RobertaConfig


from utils import train_grader_model, prepare_grader_data

In [2]:
# Set up environment
DEVICE = "cuda" if th.cuda.is_available() else "cpu"
DEBUG = True
study_number = 1
data_dir = f"textwash_data/study{study_number}/intruder_test/full_data_study.csv"
trained_model_path = f"./trained_models/anon_grader.pt"
# Cancel wandb logging
os.environ["WANDB_DISABLED"] = "true"

# Set seeds
SEED = 42
np.random.seed(SEED)
th.manual_seed(SEED)

<torch._C.Generator at 0x14989907ef0>

In [3]:
# Set up the training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=1,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    logging_dir='./logs',
    loggng_strategy='steps',
    logging_steps=10,
    evaluation_strategy='epoch',
    num_train_epochs=3,
    save_total_limit=1,
    save_strategy='no',
    load_best_model_at_end=True,
    report_to="tensorboard"
    )

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [4]:
# Read the data
columns_to_read = ["type", "text", "file_id", "name", "got_name_truth_q1"]
raw_data = pd.read_csv(data_dir, usecols=columns_to_read).rename(columns={"got_name_truth_q1": "re_identify"})

In [5]:
# Aggregate by file_id and calculate the rate of re-identification
data = raw_data.groupby(["type", "file_id", "name", "text"]).agg({"re_identify": "mean"}).reset_index()


In [6]:
datasets, tokenizer = prepare_grader_data(data, SEED, DEVICE)

In [7]:
model = train_grader_model(datasets, SEED, training_args, trained_model_path, DEVICE)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/120 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

Epoch:   0%|          | 0/1 [35:27<?, ?it/s]

{'eval_loss': 0.08179844170808792, 'eval_runtime': 232.0557, 'eval_samples_per_second': 0.517, 'eval_steps_per_second': 0.034, 'epoch': 0.83}


Epoch 0: 100%|██████████| 1/1 [42:42<00:00, 2562.36s/it]


{'train_runtime': 2562.1278, 'train_samples_per_second': 0.373, 'train_steps_per_second': 0.047, 'train_loss': 0.08755702177683512, 'epoch': 1.0}


RuntimeError: Parent directory ./trained_models does not exist.

In [8]:
# save model
# th.save(model.state_dict(), trained_model_path)
model

NameError: name 'model' is not defined