In [None]:
from sklearn.model_selection import train_test_split
import logging
from tqdm.auto import tqdm, trange
import os
import argparse
import random
import numpy as np
from glob import glob
from typing import List, Dict
import pandas as pd


In [None]:
from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast, AutoTokenizer
from transformers import BertForMaskedLM, BertConfig, PreTrainedModel, AutoModel, AutoModelForSequenceClassification
from transformers import DataCollatorForLanguageModeling, DataCollatorWithPadding
from transformers import Trainer, TrainingArguments
from transformers import IntervalStrategy
import torch
import wandb
from datasets import Dataset
from sklearn import preprocessing
import evaluate
import transformers
from sklearn.metrics import mean_squared_error, accuracy_score, precision_recall_fscore_support
import datasets

In [None]:
transformers.__version__

'4.18.0'

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [None]:
device

device(type='cuda')

# data preprocessing

In [None]:
df_ccat = pd.read_csv('../../data/CCAT50/processed/CCAT50_train.csv')

In [None]:
df_ccat_val = pd.read_csv('../../data/CCAT50/processed/CCAT50_AA_val.csv')

In [None]:
def create_dataset(df_ccat, num_authors_to_pick = None, picked_author_ids = None, num_sent_per_text = None, save_folder = None):
    unique_authors = list(df_ccat['author_id'].unique())
    if not picked_author_ids:
        picked_author_ids = sorted(np.random.choice(unique_authors, replace=False, size=num_authors_to_pick).tolist())
    authors = []
    texts = []
    for author in picked_author_ids:
        df_temp = df_ccat[df_ccat['author_id'] == author]
        for i_doc in range(len(df_temp)):
            doc = df_temp['text'].iloc[i_doc].split('\n')
            for i in range(len(doc)):
                doc[i] = doc[i].strip()
            doc.remove('')
            for i in range(len(doc)-num_sent_per_text):
                authors.append(author)
                texts.append(' '.join(doc[i:i+num_sent_per_text]))
    df = pd.DataFrame({'author':authors, 'text':texts})
    if save_folder:
        str_author = ','.join(map(str, picked_author_ids))
        file_name = f'author_{str_author}_sent_{num_sent_per_text}.csv'
        df.to_csv(f"{save_folder}/{file_name}", index=False)
        return df, file_name
    return df

In [None]:
# # num_authors_to_pick = 2 # useless when specifying picked authors
# picked_author_ids = [0,1]
# num_sent_per_text = 1
# save_folder = '../../data/CCAT50/processed/'
# df, file_name = create_dataset(df_ccat, picked_author_ids = picked_author_ids, num_sent_per_text = num_sent_per_text, save_folder = save_folder)

# model and train

In [None]:
def nested_to(dic, device):
    for k,v in dic.items():
        dic[k] = v.to(device)
    return dic

In [None]:
def tokenize(examples):
    labels = examples['author']
    out = tokenizer(examples['text'], padding=True, truncation=True, max_length=128)
    out.update({'labels':le.transform(labels)})
    return out

In [None]:
tokenizer = AutoTokenizer.from_pretrained('/scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_8/checkpoint-155000/', local_files_only=True)

In [None]:
def freeze_model(model, freeze_bert):
    '''
    if freeze_bert == True, freeze all layer. 
    if freeze_bert is a positive integer, freeze the bottom {freeze_bert} attention layers
    negative integer should also work
    '''
    if freeze_bert==True:
        for param in model.bert.parameters():
            param.requires_grad = False
    elif isinstance(freeze_bert, (int, np.int32, np.int64, torch.int32, torch.int64)):
        for param in model.bert.embeddings.parameters():
            param.requires_grad = False  
        for layer in model.bert.encoder.layer[:freeze_bert]: 
            for param in layer.parameters():
                param.requires_grad = False  
    return model

In [None]:
data_collator = DataCollatorWithPadding(tokenizer, padding = True, return_tensors = 'pt')


In [None]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='macro')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [None]:
epochs = 50
batchsize = 128
num_labels = 2
freeze_bert = 3
save_folder = '../../data/CCAT50/processed/'

# num_authors_to_pick = 2 # useless when specifying picked authors
NUM_SENT = [1,2,3]
PICKED_AUTHORS = ['0,1', '2,3', '4,5']
LR = [3e-5, 5e-5, 8e-5]

NUM_SENT, PICKED_AUTHORS, LR = np.meshgrid(NUM_SENT, PICKED_AUTHORS, LR)
NUM_SENT, PICKED_AUTHORS, LR = NUM_SENT.flatten(), PICKED_AUTHORS.flatten(), LR.flatten()

num_runs = len(NUM_SENT)

for i_run in trange(num_runs):
    
    
    num_sent_per_text = NUM_SENT[i_run]
    picked_author_ids = PICKED_AUTHORS[i_run]
    picked_author_ids = list(map(int,picked_author_ids.split(',')))
    lr = LR[i_run]
    
    df, file_name = create_dataset(df_ccat, picked_author_ids = picked_author_ids, num_sent_per_text = num_sent_per_text, save_folder = save_folder)
    
    le = preprocessing.LabelEncoder()
    le.fit(df['author'])
    train_dataset = Dataset.from_csv(f'{save_folder}/{file_name}').shuffle()
    train_dataset = train_dataset.map(tokenize, batched=True)
    train_dataset = train_dataset.remove_columns(['text'])
    train_dataset = train_dataset.remove_columns(['author'])
    
    df, file_name = create_dataset(df_ccat_val, picked_author_ids = picked_author_ids, num_sent_per_text = num_sent_per_text, save_folder = save_folder)
    test_dataset = Dataset.from_csv(f'{save_folder}/{file_name}').shuffle()
    test_dataset = test_dataset.map(tokenize, batched=True)
    test_dataset = test_dataset.remove_columns(['text'])
    test_dataset = test_dataset.remove_columns(['author'])
    
    model = AutoModelForSequenceClassification.from_pretrained('/scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_8/checkpoint-155000/', local_files_only=True, num_labels=num_labels)
    model = freeze_model(model, freeze_bert)
    
    # trainer config
    str_author = ','.join(map(str, picked_author_ids))
    training_args = TrainingArguments(
        learning_rate=lr,
        output_dir= f"/scratch/data_jz17d/result/POS_CCAT50/author_{str_author}_run_{i_run}",
        overwrite_output_dir=True,
        num_train_epochs=epochs,
        per_device_train_batch_size=batchsize,
        per_device_eval_batch_size=batchsize,
        evaluation_strategy=IntervalStrategy.EPOCH,
        logging_strategy=IntervalStrategy.EPOCH,
        save_strategy=IntervalStrategy.EPOCH,
#         save_steps=control_steps,
#         logging_steps=control_steps,
#         eval_steps=control_steps,
#         metric_for_best_model='accuracy',
        save_total_limit=2,
        prediction_loss_only=True,
#         remove_unused_columns=False,
#         report_to='wandb',
        )
    
    # wandb config
    wconfig = {}
    wconfig['num_sent_per_text'] = num_sent_per_text
    wconfig['picked_author_ids'] = str_author
    wconfig['lr'] = lr
    run = wandb.init(project="POS CCAT50", 
                     entity="fsu-dsc-cil", 
                     dir='/scratch/data_jz17d/wandb_tmp/', 
                     config=wconfig,
                     name=f'author_{str_author}_run_{i_run}',
                     reinit=True)

    trainer = Trainer(
        model=model,
        args=training_args,
        tokenizer=tokenizer,
        data_collator=data_collator,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        compute_metrics=compute_metrics,
    )
    trainer.train()
    
    # end of training evaluation
    metric = datasets.load_metric('accuracy')
    for x in trainer.get_eval_dataloader(test_dataset):
        labels = x['labels']
        x = nested_to(x, device)
        model_predictions = model(**x)
        metric.add_batch(predictions=model_predictions.logits.argmax(axis=-1).cpu().detach().numpy(), references=labels)
    accuracy = metric.compute()
    wandb.log({'accuracy':accuracy})  
    
    run.finish()

  0%|          | 0/27 [00:00<?, ?it/s]

Using custom data configuration default-87aa798d24125176


Downloading and preparing dataset csv/default to /scratch/data_jz17d/hf_datasets_cache/csv/default-87aa798d24125176/0.0.0...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

Dataset csv downloaded and prepared to /scratch/data_jz17d/hf_datasets_cache/csv/default-87aa798d24125176/0.0.0. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?ba/s]

Using custom data configuration default-13d1bb30dc2271c4


Downloading and preparing dataset csv/default to /scratch/data_jz17d/hf_datasets_cache/csv/default-13d1bb30dc2271c4/0.0.0...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

Dataset csv downloaded and prepared to /scratch/data_jz17d/hf_datasets_cache/csv/default-13d1bb30dc2271c4/0.0.0. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?ba/s]

loading configuration file /scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_8/checkpoint-155000/config.json
Model config BertConfig {
  "_name_or_path": "/scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_8/checkpoint-155000/",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 32,
  "initializer_range": 0.02,
  "intermediate_size": 128,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 128,
  "model_type": "bert",
  "num_attention_heads": 4,
  "num_hidden_layers": 4,
  "pad_token_id": 3,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.18.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 54
}

loading weights file /scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_8/checkpoint-155000/pytorch_model.bin
Some weights of the model checkpoint at /scratch/data_jz17d/result/pos_mlm_core

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max‚Ä¶

0,1
eval/loss,‚ñà‚ñà‚ñá‚ñá‚ñÜ‚ñÜ‚ñÜ‚ñÖ‚ñÖ‚ñÖ‚ñÖ‚ñÑ‚ñÑ‚ñÑ‚ñÑ‚ñÉ‚ñÉ‚ñÉ‚ñÉ‚ñÉ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ
eval/runtime,‚ñÇ‚ñà‚ñÑ‚ñÅ‚ñÑ‚ñÑ‚ñÜ‚ñÖ‚ñÖ‚ñÖ‚ñÑ‚ñÑ‚ñÖ‚ñÑ‚ñÖ‚ñÑ‚ñÖ‚ñÇ‚ñÑ‚ñÇ‚ñÖ‚ñÜ‚ñÉ‚ñÑ‚ñÑ‚ñÑ‚ñÖ‚ñÇ‚ñÇ‚ñÖ‚ñÑ‚ñÇ‚ñÇ‚ñÜ‚ñà‚ñÜ‚ñÅ‚ñÑ‚ñÖ‚ñÉ
eval/samples_per_second,‚ñá‚ñÅ‚ñÖ‚ñà‚ñÑ‚ñÖ‚ñÉ‚ñÑ‚ñÑ‚ñÑ‚ñÖ‚ñÑ‚ñÑ‚ñÖ‚ñÉ‚ñÑ‚ñÑ‚ñá‚ñÑ‚ñá‚ñÑ‚ñÉ‚ñÜ‚ñÑ‚ñÑ‚ñÖ‚ñÑ‚ñá‚ñá‚ñÑ‚ñÖ‚ñÜ‚ñá‚ñÉ‚ñÅ‚ñÉ‚ñá‚ñÖ‚ñÉ‚ñÖ
eval/steps_per_second,‚ñá‚ñÅ‚ñÖ‚ñà‚ñÑ‚ñÖ‚ñÉ‚ñÑ‚ñÑ‚ñÑ‚ñÖ‚ñÑ‚ñÑ‚ñÖ‚ñÉ‚ñÑ‚ñÑ‚ñá‚ñÑ‚ñá‚ñÑ‚ñÉ‚ñÜ‚ñÑ‚ñÑ‚ñÖ‚ñÑ‚ñá‚ñá‚ñÑ‚ñÖ‚ñÜ‚ñá‚ñÉ‚ñÅ‚ñÉ‚ñá‚ñÖ‚ñÉ‚ñÖ
train/epoch,‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÉ‚ñÉ‚ñÉ‚ñÉ‚ñÉ‚ñÉ‚ñÑ‚ñÑ‚ñÑ‚ñÑ‚ñÑ‚ñÖ‚ñÖ‚ñÖ‚ñÖ‚ñÖ‚ñÜ‚ñÜ‚ñÜ‚ñÜ‚ñÜ‚ñÜ‚ñá‚ñá‚ñá‚ñá‚ñá‚ñà‚ñà‚ñà‚ñà
train/global_step,‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÉ‚ñÉ‚ñÉ‚ñÉ‚ñÉ‚ñÉ‚ñÑ‚ñÑ‚ñÑ‚ñÑ‚ñÑ‚ñÖ‚ñÖ‚ñÖ‚ñÖ‚ñÖ‚ñÜ‚ñÜ‚ñÜ‚ñÜ‚ñÜ‚ñÜ‚ñá‚ñá‚ñá‚ñá‚ñá‚ñà‚ñà‚ñà‚ñà
train/learning_rate,‚ñà‚ñà‚ñà‚ñà‚ñá‚ñá‚ñá‚ñá‚ñá‚ñÜ‚ñÜ‚ñÜ‚ñÜ‚ñÜ‚ñÜ‚ñÖ‚ñÖ‚ñÖ‚ñÖ‚ñÖ‚ñÑ‚ñÑ‚ñÑ‚ñÑ‚ñÑ‚ñÑ‚ñÉ‚ñÉ‚ñÉ‚ñÉ‚ñÉ‚ñÉ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÅ‚ñÅ‚ñÅ
train/loss,‚ñà‚ñá‚ñá‚ñá‚ñÜ‚ñÜ‚ñÖ‚ñÖ‚ñÖ‚ñÖ‚ñÑ‚ñÑ‚ñÉ‚ñÑ‚ñÑ‚ñÉ‚ñÉ‚ñÉ‚ñÉ‚ñÉ‚ñÉ‚ñÉ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÅ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÅ‚ñÇ‚ñÇ‚ñÇ‚ñÇ
train/total_flos,‚ñÅ
train/train_loss,‚ñÅ

0,1
eval/loss,0.68738
eval/runtime,0.1313
eval/samples_per_second,2620.65
eval/steps_per_second,22.855
train/epoch,50.0
train/global_step,550.0
train/learning_rate,0.0
train/loss,0.6905
train/total_flos,2108296285200.0
train/train_loss,0.69107


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.03334208329518636, max=1.0)‚Ä¶

***** Running training *****
  Num examples = 1379
  Num Epochs = 50
  Instantaneous batch size per device = 128
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 1
  Total optimization steps = 550
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Epoch,Training Loss,Validation Loss
1,0.6928,0.692404
2,0.6926,0.69209
3,0.6926,0.691767
4,0.6924,0.691521
5,0.6922,0.69128
6,0.6921,0.691037
7,0.692,0.690827
8,0.6916,0.69057
9,0.6917,0.690347
10,0.6915,0.690112


***** Running Evaluation *****
  Num examples = 344
  Batch size = 128
Saving model checkpoint to /scratch/data_jz17d/result/POS_CCAT50/author_0,1_run_0/checkpoint-11
Configuration saved in /scratch/data_jz17d/result/POS_CCAT50/author_0,1_run_0/checkpoint-11/config.json
Model weights saved in /scratch/data_jz17d/result/POS_CCAT50/author_0,1_run_0/checkpoint-11/pytorch_model.bin
tokenizer config file saved in /scratch/data_jz17d/result/POS_CCAT50/author_0,1_run_0/checkpoint-11/tokenizer_config.json
Special tokens file saved in /scratch/data_jz17d/result/POS_CCAT50/author_0,1_run_0/checkpoint-11/special_tokens_map.json
Deleting older checkpoint [/scratch/data_jz17d/result/POS_CCAT50/author_0,1_run_0/checkpoint-539] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 344
  Batch size = 128
Saving model checkpoint to /scratch/data_jz17d/result/POS_CCAT50/author_0,1_run_0/checkpoint-22
Configuration saved in /scratch/data_jz17d/result/POS_CCAT50/author_0,1_run_0/che

***** Running Evaluation *****
  Num examples = 344
  Batch size = 128
Saving model checkpoint to /scratch/data_jz17d/result/POS_CCAT50/author_0,1_run_0/checkpoint-132
Configuration saved in /scratch/data_jz17d/result/POS_CCAT50/author_0,1_run_0/checkpoint-132/config.json
Model weights saved in /scratch/data_jz17d/result/POS_CCAT50/author_0,1_run_0/checkpoint-132/pytorch_model.bin
tokenizer config file saved in /scratch/data_jz17d/result/POS_CCAT50/author_0,1_run_0/checkpoint-132/tokenizer_config.json
Special tokens file saved in /scratch/data_jz17d/result/POS_CCAT50/author_0,1_run_0/checkpoint-132/special_tokens_map.json
Deleting older checkpoint [/scratch/data_jz17d/result/POS_CCAT50/author_0,1_run_0/checkpoint-110] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 344
  Batch size = 128
Saving model checkpoint to /scratch/data_jz17d/result/POS_CCAT50/author_0,1_run_0/checkpoint-143
Configuration saved in /scratch/data_jz17d/result/POS_CCAT50/author_0,1_run

Deleting older checkpoint [/scratch/data_jz17d/result/POS_CCAT50/author_0,1_run_0/checkpoint-220] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 344
  Batch size = 128
Saving model checkpoint to /scratch/data_jz17d/result/POS_CCAT50/author_0,1_run_0/checkpoint-253
Configuration saved in /scratch/data_jz17d/result/POS_CCAT50/author_0,1_run_0/checkpoint-253/config.json
Model weights saved in /scratch/data_jz17d/result/POS_CCAT50/author_0,1_run_0/checkpoint-253/pytorch_model.bin
tokenizer config file saved in /scratch/data_jz17d/result/POS_CCAT50/author_0,1_run_0/checkpoint-253/tokenizer_config.json
Special tokens file saved in /scratch/data_jz17d/result/POS_CCAT50/author_0,1_run_0/checkpoint-253/special_tokens_map.json
Deleting older checkpoint [/scratch/data_jz17d/result/POS_CCAT50/author_0,1_run_0/checkpoint-231] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 344
  Batch size = 128
Saving model checkpoint to /scratch/data_jz17

Special tokens file saved in /scratch/data_jz17d/result/POS_CCAT50/author_0,1_run_0/checkpoint-363/special_tokens_map.json
Deleting older checkpoint [/scratch/data_jz17d/result/POS_CCAT50/author_0,1_run_0/checkpoint-341] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 344
  Batch size = 128
Saving model checkpoint to /scratch/data_jz17d/result/POS_CCAT50/author_0,1_run_0/checkpoint-374
Configuration saved in /scratch/data_jz17d/result/POS_CCAT50/author_0,1_run_0/checkpoint-374/config.json
Model weights saved in /scratch/data_jz17d/result/POS_CCAT50/author_0,1_run_0/checkpoint-374/pytorch_model.bin
tokenizer config file saved in /scratch/data_jz17d/result/POS_CCAT50/author_0,1_run_0/checkpoint-374/tokenizer_config.json
Special tokens file saved in /scratch/data_jz17d/result/POS_CCAT50/author_0,1_run_0/checkpoint-374/special_tokens_map.json
Deleting older checkpoint [/scratch/data_jz17d/result/POS_CCAT50/author_0,1_run_0/checkpoint-352] due to args.save_total

tokenizer config file saved in /scratch/data_jz17d/result/POS_CCAT50/author_0,1_run_0/checkpoint-484/tokenizer_config.json
Special tokens file saved in /scratch/data_jz17d/result/POS_CCAT50/author_0,1_run_0/checkpoint-484/special_tokens_map.json
Deleting older checkpoint [/scratch/data_jz17d/result/POS_CCAT50/author_0,1_run_0/checkpoint-462] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 344
  Batch size = 128
Saving model checkpoint to /scratch/data_jz17d/result/POS_CCAT50/author_0,1_run_0/checkpoint-495
Configuration saved in /scratch/data_jz17d/result/POS_CCAT50/author_0,1_run_0/checkpoint-495/config.json
Model weights saved in /scratch/data_jz17d/result/POS_CCAT50/author_0,1_run_0/checkpoint-495/pytorch_model.bin
tokenizer config file saved in /scratch/data_jz17d/result/POS_CCAT50/author_0,1_run_0/checkpoint-495/tokenizer_config.json
Special tokens file saved in /scratch/data_jz17d/result/POS_CCAT50/author_0,1_run_0/checkpoint-495/special_tokens_map.js

NameError: name 'device' is not defined