In [1]:
import os
import logging
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

import torch as th

from transformers import TrainingArguments
from transformers import (
    RobertaTokenizerFast,
    RobertaForSequenceClassification,
)

from clearml import Task


from utils import train_grader_model, prepare_grader_data

In [8]:
# Define constants
STUDY_NUMBER = 1
# Backward-compatible alias: the original (misspelled) name is kept so that any
# cell still referencing it keeps working.
SUDY_NUMBER = STUDY_NUMBER
DEVICE = "cuda" if th.cuda.is_available() else "cpu"
data_used = "famous_and_semi"

# Set up environment
trained_models_path = "./anon_grader/trained_models"
# NOTE: despite its name, data_dir holds the path to the study CSV file,
# not a directory.
data_dir = f"textwash_data/study{STUDY_NUMBER}/intruder_test/full_data_study.csv"
results_dir = "./anon_grader/logs"


# Set seeds (NumPy global RNG and the default torch generator)
SEED = 42
np.random.seed(SEED)
th.manual_seed(SEED)

<torch._C.Generator at 0x1f596945870>

In [3]:
# Load only the columns required downstream
columns_to_read = ["type", "text", "file_id", "name", "got_name_truth_q2"]
raw_data = pd.read_csv(data_dir, usecols=columns_to_read)


# Collapse to one row per (type, file_id, name, text) group; human_rate is the
# mean of got_name_truth_q2 within each group (the rate of re-identification).
data = (
    raw_data.groupby(["type", "file_id", "name", "text"])["got_name_truth_q2"]
    .mean()
    .reset_index(name="human_rate")
)


In [4]:
# Preview the aggregated frame (one row per type/file_id/name/text group)
data

Unnamed: 0,type,file_id,name,text,human_rate
0,famous,famous_100_d_1_5.txt,mick jagger,PERSON_FIRSTNAME_5 PERSON_LASTNAME_6 is the le...,0.000000
1,famous,famous_101_d_1_6.txt,adele,PERSON_FIRSTNAME_1 is a LOCATION_1 female sing...,0.666667
2,famous,famous_102_d_1_7.txt,daniel radcliffe,PERSON_FIRSTNAME_1 PERSON_LASTNAME_1 is an LOC...,0.000000
3,famous,famous_103_d_1_5.txt,mick jagger,PERSON_FIRSTNAME_1 PERSON_LASTNAME_1 is a NUME...,0.000000
4,famous,famous_104_d_1_2.txt,ed sheeran,A ginger-haired OTHER_IDENTIFYING_ATTRIBUTE_1 ...,1.000000
...,...,...,...,...,...
1191,semifamous,semifamous_96_d_3_6.txt,malcolm dustan,PERSON_FIRSTNAME_1 PERSON_LASTNAME_1 is an ex ...,0.000000
1192,semifamous,semifamous_97_d_3_4.txt,clyde fernandes,This person is a footballler and keen to be a ...,0.000000
1193,semifamous,semifamous_98_d_3_5.txt,kenny kramm,"PERSON_FIRSTNAME_1 is a relatively small, pale...",0.000000
1194,semifamous,semifamous_99_d_3_9.txt,walter chorn,PERSON_FIRSTNAME_1 PERSON_LASTNAME_1 played th...,0.000000


In [5]:
# Restrict the frame to rows whose type is "famous" or "semifamous"
data = data.loc[data["type"].isin(["famous", "semifamous"])]
data

Unnamed: 0,type,file_id,name,text,human_rate
0,famous,famous_100_d_1_5.txt,mick jagger,PERSON_FIRSTNAME_5 PERSON_LASTNAME_6 is the le...,0.000000
1,famous,famous_101_d_1_6.txt,adele,PERSON_FIRSTNAME_1 is a LOCATION_1 female sing...,0.666667
2,famous,famous_102_d_1_7.txt,daniel radcliffe,PERSON_FIRSTNAME_1 PERSON_LASTNAME_1 is an LOC...,0.000000
3,famous,famous_103_d_1_5.txt,mick jagger,PERSON_FIRSTNAME_1 PERSON_LASTNAME_1 is a NUME...,0.000000
4,famous,famous_104_d_1_2.txt,ed sheeran,A ginger-haired OTHER_IDENTIFYING_ATTRIBUTE_1 ...,1.000000
...,...,...,...,...,...
1191,semifamous,semifamous_96_d_3_6.txt,malcolm dustan,PERSON_FIRSTNAME_1 PERSON_LASTNAME_1 is an ex ...,0.000000
1192,semifamous,semifamous_97_d_3_4.txt,clyde fernandes,This person is a footballler and keen to be a ...,0.000000
1193,semifamous,semifamous_98_d_3_5.txt,kenny kramm,"PERSON_FIRSTNAME_1 is a relatively small, pale...",0.000000
1194,semifamous,semifamous_99_d_3_9.txt,walter chorn,PERSON_FIRSTNAME_1 PERSON_LASTNAME_1 played th...,0.000000


In [6]:
from re import sub, match


In [12]:
# Load roberta-base with a single-output classification head and move it to DEVICE
model = RobertaForSequenceClassification.from_pretrained(
    "roberta-base", num_labels=1
).to(DEVICE)

# Choose which parameters are trainable: only the classifier head and encoder
# layer 11 keep gradients; every other parameter is frozen.
# The dots are escaped so "." matches the literal name separator rather than any
# character, and the trailing "\." prevents the "layer.11" prefix from also
# matching a name such as "layer.110".
trainable_name_pattern = r"classifier\.|roberta\.encoder\.layer\.11\."

params = model.named_parameters()
top_layer_params = []
for name, para in params:
    # `match` anchors at the start of the parameter name
    if match(trainable_name_pattern, name):
        para.requires_grad = True
        top_layer_params.append(para)
    else:
        para.requires_grad = False

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.dense.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
# Sanity check: print the requires_grad flag of the classifier head and of
# encoder layer 10.
# NOTE: this cell deliberately does not reassign `top_layer_params`; resetting it
# to an empty list here would discard the trainable parameters collected by the
# cell that freezes the model.
params = model.named_parameters()
for name, para in params:
    # Escaped dots match the literal separators in the parameter names
    if match(r"classifier\.|roberta\.encoder\.layer\.10\.", name):
        print(name, para.requires_grad)

roberta.encoder.layer.10.attention.self.query.weight False
roberta.encoder.layer.10.attention.self.query.bias False
roberta.encoder.layer.10.attention.self.key.weight False
roberta.encoder.layer.10.attention.self.key.bias False
roberta.encoder.layer.10.attention.self.value.weight False
roberta.encoder.layer.10.attention.self.value.bias False
roberta.encoder.layer.10.attention.output.dense.weight False
roberta.encoder.layer.10.attention.output.dense.bias False
roberta.encoder.layer.10.attention.output.LayerNorm.weight False
roberta.encoder.layer.10.attention.output.LayerNorm.bias False
roberta.encoder.layer.10.intermediate.dense.weight False
roberta.encoder.layer.10.intermediate.dense.bias False
roberta.encoder.layer.10.output.dense.weight False
roberta.encoder.layer.10.output.dense.bias False
roberta.encoder.layer.10.output.LayerNorm.weight False
roberta.encoder.layer.10.output.LayerNorm.bias False
classifier.dense.weight True
classifier.dense.bias True
classifier.out_proj.weight True
