In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
import torch
from transformers import TrainingArguments, Trainer
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import EarlyStoppingCallback

In [2]:
# Problem 13

In [3]:
# Training Data Processing

In [4]:
# Load the training annotations CSV and preview the first rows.
train_data = pd.read_csv('si630w22-hw3-train.csv')
train_data.head()

Unnamed: 0,id,annotator_id,rating,group
0,t3_n27vu3,user_00,5.0,group_09
1,t3_n27vu3,user_01,5.0,group_09
2,t3_n27vu3,user_02,5.0,group_09
3,t3_n2az7m,user_00,5.0,group_09
4,t3_n2az7m,user_01,5.0,group_09


In [5]:
def remove_group(train_data, group_name):
    """Return the rows of ``train_data`` whose ``group`` value differs from ``group_name``.

    The input frame is not modified; the original row index is preserved.
    """
    keep_mask = train_data.group.ne(group_name)
    return train_data.loc[keep_mask]

In [6]:
# Hold out one annotator group: drop all of its rows from the training data.
# `group_name` is also used further down to split the dev data and to name the output CSVs.
group_name = 'group_15'
train_data = remove_group(train_data, group_name)
train_data.head()

Unnamed: 0,id,annotator_id,rating,group
0,t3_n27vu3,user_00,5.0,group_09
1,t3_n27vu3,user_01,5.0,group_09
2,t3_n27vu3,user_02,5.0,group_09
3,t3_n2az7m,user_00,5.0,group_09
4,t3_n2az7m,user_01,5.0,group_09


In [7]:
def calculate_avg(train_data):
    """Average ``rating`` per ``id``.

    Returns a new two-column frame (``id``, ``rating``) with one row per
    distinct id and a fresh 0..n-1 index.
    """
    mean_by_id = train_data.groupby('id')['rating'].mean()
    return mean_by_id.reset_index()

In [8]:
# Collapse the per-annotator rows into one mean rating per id.
train_data = calculate_avg(train_data)
train_data.head(10)

Unnamed: 0,id,rating
0,t3_n2714y,4.75
1,t3_n27873,3.75
2,t3_n27b1e,3.5
3,t3_n27qop,4.0
4,t3_n27vu3,4.4
5,t3_n28cas,2.0
6,t3_n28ge1,2.6
7,t3_n28jtz,4.4
8,t3_n28n5s,2.5
9,t3_n28tgd,4.5


In [9]:
# Dev Data Processing

In [10]:
# Load the dev annotations CSV and preview the first rows.
dev_data = pd.read_csv('si630w22-hw3-dev.csv')
dev_data.head()

Unnamed: 0,id,annotator_id,rating,group
0,t3_n2xpm3,user_00,5.0,group_09
1,t3_n2xpm3,user_01,5.0,group_09
2,t3_n2xpm3,user_02,5.0,group_09
3,t3_n2yp4z,user_00,1.0,group_09
4,t3_n2yp4z,user_01,1.0,group_09


In [11]:
# Split dev data into three disjoint sets:
#   set_B - ratings given by the held-out group (`group_name`)
#   set_C - ratings by the other groups on ids that the held-out group also rated
#   set_A - ratings by the other groups on ids the held-out group never rated
split_columns = ["id", "annotator_id", "rating", "group"]

in_held_out_group = dev_data["group"] == group_name
set_B = dev_data[in_held_out_group]
set_AC = dev_data[~in_held_out_group]

# Distinct ids rated by the held-out group, in order of first appearance.
group_ids = set_B["id"].unique().tolist()

# Vectorised membership test instead of appending rows one at a time; this
# avoids the quadratic row-by-row enlargement, keeps the column dtypes, and
# looks the id up by column name rather than by position.
shared_with_group = set_AC["id"].isin(group_ids)
set_C = set_AC.loc[shared_with_group, split_columns].reset_index(drop=True)
set_A = set_AC.loc[~shared_with_group, split_columns].reset_index(drop=True)

display(set_A.head())
display(set_B.head())
display(set_C.head())

Unnamed: 0,id,annotator_id,rating,group
0,t3_n2xpm3,user_00,5.0,group_09
1,t3_n2xpm3,user_01,5.0,group_09
2,t3_n2xpm3,user_02,5.0,group_09
3,t3_n2yp4z,user_00,1.0,group_09
4,t3_n2yp4z,user_01,1.0,group_09


Unnamed: 0,id,annotator_id,rating,group
1615,t3_n2fbfq,user_26,5.0,group_15
1616,t3_n2fbfq,user_27,5.0,group_15
1617,t3_n309tu,user_26,3.0,group_15
1618,t3_n309tu,user_27,4.0,group_15
1619,t3_n3po7o,user_26,5.0,group_15


Unnamed: 0,id,annotator_id,rating,group
0,t3_novy1c,user_00,3.0,group_09
1,t3_novy1c,user_01,3.0,group_09
2,t3_novy1c,user_02,5.0,group_09
3,t3_n2fbfq,user_03,3.0,group_03
4,t3_n2fbfq,user_04,4.0,group_03


In [12]:
# Load the question/reply text table; it is joined onto the ratings by question_id below.
qaa = pd.read_csv("si630w22-hw3-data.csv")
qaa.head()

Unnamed: 0,question_id,question_text,reply_id,reply_text,rlen
0,t3_n27vu3,What's something nice you like to do just to b...,gwhrhmf,Give compliments. It’s extremely easy to do an...,205
1,t3_n2az7m,So what is the best headphones for people who ...,gwiatps,I prefer Raycon Performance Ear Buds. They are...,178
2,t3_n2dzr9,How do you go on knowing a loved one only has ...,gwit1wj,Make it as memorable as the rest of your time ...,278
3,t3_n2iy9q,You’ve been dropped to the year 1800 with all ...,gwjhw8i,They're gonna burn me at the stake for being a...,52
4,t3_n2kuuq,Stuck in bad habits for years and I realized i...,gwkiiie,Using new environments as a way to create heal...,651


In [13]:
# Attach question/reply text to every ratings frame: rename "id" to
# "question_id", then left-join against `qaa` on that key.
train_data = (
    train_data
    .rename(columns={"id": "question_id"})
    .merge(qaa, how="left", on="question_id")
)
set_A = (
    set_A
    .rename(columns={"id": "question_id"})
    .merge(qaa, how="left", on="question_id")
)
set_B = (
    set_B
    .rename(columns={"id": "question_id"})
    .merge(qaa, how="left", on="question_id")
)
set_C = (
    set_C
    .rename(columns={"id": "question_id"})
    .merge(qaa, how="left", on="question_id")
)

In [14]:
# Preview the training frame after the text columns were merged in.
train_data.head()

Unnamed: 0,question_id,rating,question_text,reply_id,reply_text,rlen
0,t3_n2714y,4.75,"Is there someone you turned down in the past, ...",gwhmmsp,Idk if this counts but my when I was younger m...,391
1,t3_n27873,3.75,"What is, in your opinion, the saddest villain ...",gwhn3bt,My man Dr Heinz Doofenschmirts was born withou...,154
2,t3_n27b1e,3.5,ELI5: How do we still not know how eels reprod...,gwho8nq,"For a long time, it wasn't known how eels mate...",207
3,t3_n27qop,4.0,ELI5: Why can’t freshwater fish live in saltwa...,gwht547,A living cell is designed to work at specific ...,1145
4,t3_n27vu3,4.4,What's something nice you like to do just to b...,gwhrhmf,Give compliments. It’s extremely easy to do an...,205


In [15]:
# !pip install torchvision
# !pip install transformers

In [16]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
import torch
from transformers import TrainingArguments, Trainer
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import EarlyStoppingCallback


# Define pretrained tokenizer and model (single-output head: num_labels=1)
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=1)

# ----- 1. Preprocess data -----#
# Model input text is "<question> [SEP] <reply>".
train_data['text'] = train_data['question_text'] + " [SEP] " + train_data['reply_text']
set_A['text'] = set_A['question_text'] + " [SEP] " + set_A['reply_text']
set_B['text'] = set_B['question_text'] + " [SEP] " + set_B['reply_text']
set_C['text'] = set_C['question_text'] + " [SEP] " + set_C['reply_text']

# Discard rows with any missing value. The cleaned training frame is kept
# under its own name (`data`); the dev splits are replaced in place.
data = train_data.dropna()
set_A = set_A.dropna()
set_B = set_B.dropna()
set_C = set_C.dropna()

# Texts and ratings as plain Python lists, one pair per split.
X_train, y_train = list(data["text"]), list(data["rating"])
X_valA, y_valA = list(set_A['text']), list(set_A['rating'])
X_valB, y_valB = list(set_B['text']), list(set_B['rating'])
X_valC, y_valC = list(set_C['text']), list(set_C['rating'])

# Tokenize every split with padding and truncation at 512 tokens.
X_train_tokenized = tokenizer(X_train, padding=True, truncation=True, max_length=512)
X_valA_tokenized = tokenizer(X_valA, padding=True, truncation=True, max_length=512)
X_valB_tokenized = tokenizer(X_valB, padding=True, truncation=True, max_length=512)
X_valC_tokenized = tokenizer(X_valC, padding=True, truncation=True, max_length=512)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [17]:
# Create torch dataset
class Dataset(torch.utils.data.Dataset):
    """Torch dataset wrapping tokenizer encodings and optional labels.

    Args:
        encodings: mapping from feature name to a per-example sequence. It must
            contain an "input_ids" entry, which determines the dataset length.
        labels: optional per-example sequence of targets (list, NumPy array,
            ...). When it is None or empty, items carry no "labels" entry.
    """

    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return a dict of tensors for example ``idx`` (plus "labels" if available)."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # Check for None / emptiness explicitly: truth-testing a NumPy array or
        # tensor of labels raises "truth value ... is ambiguous".
        if self.labels is not None and len(self.labels) > 0:
            item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of examples, taken from the "input_ids" encoding."""
        return len(self.encodings["input_ids"])

# Labeled training set.
train_dataset = Dataset(X_train_tokenized, y_train)
# Set A is wrapped twice: without labels (`valA_dataset`) and with labels (`val`).
valA_dataset = Dataset(X_valA_tokenized)
val = Dataset(X_valA_tokenized, y_valA)
# Sets B and C are wrapped without labels.
valB_dataset = Dataset(X_valB_tokenized)
valC_dataset = Dataset(X_valC_tokenized)

In [18]:
# Number of examples in the training set and in dev set A.
len(train_dataset), len(valA_dataset)

(3766, 3530)

In [19]:
# Use the first CUDA device when one is available, otherwise the CPU,
# and move the model there. The chosen device is displayed as the cell output.
import torch
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = model.to(device)
device

device(type='cuda', index=0)

In [None]:
# ----- 2. Fine-tune pretrained model -----#
# Define Trainer parameters
from sklearn.metrics import mean_squared_error

def compute_metrics(eval_pred):
    """Compute the root-mean-squared error between predictions and labels.

    Args:
        eval_pred: a ``(predictions, labels)`` pair of array-likes holding the
            same number of values; both are flattened before comparison, so an
            ``(n, 1)`` prediction array lines up with ``(n,)`` labels.

    Returns:
        dict with a single key ``"rmse"`` mapped to a Python float.
    """
    predictions, labels = eval_pred
    # Flatten both sides so the subtraction is element-wise rather than a
    # broadcast of (n, 1) against (n,).
    predictions = np.asarray(predictions, dtype=float).reshape(-1)
    labels = np.asarray(labels, dtype=float).reshape(-1)
    # Computed with NumPy so it does not rely on the `squared=False` keyword of
    # sklearn's mean_squared_error, which newer scikit-learn releases dropped.
    rmse = float(np.sqrt(np.mean((predictions - labels) ** 2)))
    return {"rmse": rmse}

# Define Trainer
# Evaluation runs every 500 steps on `val` (the labeled copy of set A), and
# the best checkpoint is requested at the end via load_best_model_at_end.
args = TrainingArguments(
    output_dir="output",
    evaluation_strategy="steps",
    eval_steps=500,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    seed=0,
    load_best_model_at_end=True,
)
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val,
    compute_metrics=compute_metrics,
    # NOTE(review): patience of 3 presumably counts evaluation rounds without
    # improvement — confirm against the transformers docs.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

# Train pre-trained model
trainer.train()

***** Running training *****
  Num examples = 3766
  Num Epochs = 10
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 2360


Step,Training Loss,Validation Loss


In [None]:
# SET A
# Predict on set A, pair each prediction with its rating, and show the
# correlation matrix between the two columns.
raw_pred, _, _ = trainer.predict(valA_dataset)
pred = raw_pred.squeeze(axis=1)
coefA = pd.DataFrame(dict(pred=pred, truth=y_valA))
coefA.corr()

In [None]:
# SET B
# Predict on set B, pair each prediction with its rating, and show the
# correlation matrix between the two columns.
raw_pred, _, _ = trainer.predict(valB_dataset)
pred = raw_pred.squeeze(axis=1)
coefB = pd.DataFrame(dict(pred=pred, truth=y_valB))
coefB.corr()

In [None]:
# SET C
# Predict on set C, pair each prediction with its rating, and show the
# correlation matrix between the two columns.
raw_pred, _, _ = trainer.predict(valC_dataset)
pred = raw_pred.squeeze(axis=1)
coefC = pd.DataFrame(dict(pred=pred, truth=y_valC))
coefC.corr()

In [None]:
# Persist the prediction/truth table of each dev split to its own CSV,
# tagged with the held-out group name.
nameA = f'coefA{group_name}.csv'
coefA.to_csv(nameA)
nameB = f'coefB{group_name}.csv'
coefB.to_csv(nameB)
nameC = f'coefC{group_name}.csv'
# Write set C's own table; the set B table used to be written here a second
# time, so the set C results were never saved.
coefC.to_csv(nameC)