<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#train-each-task-and-get-prediction" data-toc-modified-id="train-each-task-and-get-prediction-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>train each task and get prediction</a></span><ul class="toc-item"><li><span><a href="#country" data-toc-modified-id="country-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>country</a></span></li><li><span><a href="#politics" data-toc-modified-id="politics-1.2"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>politics</a></span></li><li><span><a href="#tod" data-toc-modified-id="tod-1.3"><span class="toc-item-num">1.3&nbsp;&nbsp;</span>tod</a></span></li><li><span><a href="#age" data-toc-modified-id="age-1.4"><span class="toc-item-num">1.4&nbsp;&nbsp;</span>age</a></span></li><li><span><a href="#education" data-toc-modified-id="education-1.5"><span class="toc-item-num">1.5&nbsp;&nbsp;</span>education</a></span></li><li><span><a href="#ethnic" data-toc-modified-id="ethnic-1.6"><span class="toc-item-num">1.6&nbsp;&nbsp;</span>ethnic</a></span></li><li><span><a href="#gender" data-toc-modified-id="gender-1.7"><span class="toc-item-num">1.7&nbsp;&nbsp;</span>gender</a></span></li></ul></li></ul></div>

In [None]:
%run ./PASTEL_MTL_training_utils.ipynb

In [None]:
def predict_logits_labels(trainer, dataset, task_name):
    """Run ``trainer`` over ``dataset`` and attach the predictions to a copy of its frame.

    Args:
        trainer: object exposing ``predict(dataset)``; the returned value must
            have a ``predictions`` attribute that can be indexed by task name.
        dataset: object exposing a pandas DataFrame as ``dataset.df``.
        task_name: key used to pick this task's logits out of the predictions;
            also used as the suffix of the predicted-label column.

    Returns:
        A copy of ``dataset.df`` (the original frame is not modified) with two
        extra columns: ``logits`` (one list of raw scores per row) and
        ``predicted_<task_name>`` (argmax of the logits over the last axis).
    """
    # presumably shaped (num_examples, num_classes) -- TODO confirm against the model head
    task_logits = trainer.predict(dataset).predictions[task_name]

    annotated = dataset.df.copy()
    annotated['logits'] = task_logits.tolist()
    annotated[f'predicted_{task_name}'] = task_logits.argmax(-1).tolist()
    return annotated

# train each task and get prediction

## country

In [None]:
# ============================== settings ==============================
# Attribute this cell trains a single-task model for, and the `p` value that
# selects which pre-split files are loaded and where predictions are written.
selected_tasks = ['country']
p = 0.8
# ======================================================================

# Project-side run configuration (MyTrainingArgs is presumably defined by the
# training-utils notebook that is %run at the top of this notebook).
my_training_args = MyTrainingArgs(
    selected_tasks=selected_tasks,
    model_name=f'PASTEL single task {selected_tasks[0]}',
    base_model_name='bert-base-uncased',
    freeze_bert=False,
    use_pooler=False,
    num_epoch=5,
    # data_limit=30000,  # currently disabled
)

# Hugging Face trainer configuration: evaluate and checkpoint once per epoch,
# then reload the checkpoint with the lowest eval_loss when training ends.
hg_training_args = TrainingArguments(
    output_dir=my_training_args.model_folder,
    num_train_epochs=my_training_args.num_epoch,
    per_device_train_batch_size=my_training_args.batch_size,
    per_device_eval_batch_size=my_training_args.batch_size,
    warmup_steps=my_training_args.num_warmup_steps,
    weight_decay=0.01,
    logging_dir=f"{my_training_args.model_folder}/logs",
    logging_first_step=True,
    evaluation_strategy="epoch",
    save_strategy='epoch',
    save_total_limit=1,
    load_best_model_at_end=True,
    metric_for_best_model='eval_loss',
    greater_is_better=False,
)

# Build the model, then apply the freeze flag from the run configuration.
model = init_model(my_training_args)
freeze_model(model, my_training_args.freeze_bert)

# The "unmasked" split is trained on; the "masked" split is used for the
# per-epoch evaluation and is also predicted on below.
train_dataset = MyDataset(my_training_args, f'p={p}_unmasked_train')
val_dataset = MyDataset(my_training_args, f'p={p}_masked_train')

trainer = MyTrainer(
    model=model,
    args=hg_training_args,
    tokenizer=model.tokenizer,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()

# Predict on both splits and merge them into a single frame.
df_train = predict_logits_labels(trainer, train_dataset, selected_tasks[0])
df_val = predict_logits_labels(trainer, val_dataset, selected_tasks[0])
# 'Unnamed: 0' is presumably the row index carried over from the source CSV,
# so sorting on it would restore the original row order -- TODO confirm.
df_combined = pd.concat([df_train, df_val], axis=0).sort_values('Unnamed: 0')

# One CSV per task, grouped under a folder named after the p value.
output_path = f'{data_folder}/pastel/processed/p={p}_predicted'
Path(output_path).mkdir(parents=True, exist_ok=True)
df_combined.to_csv(f'{output_path}/{selected_tasks[0]}.csv', index=False)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
***** Running training *****
  Num examples = 6385
  Num Epochs = 5
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distr

Epoch,Training Loss,Validation Loss


***** Running Evaluation *****
  Num examples = 25421
  Batch size = 32


## politics

In [None]:
# ============================== settings ==============================
# Attribute this cell trains a single-task model for, and the `p` value that
# selects which pre-split files are loaded and where predictions are written.
selected_tasks = ['politics']
p = 0.8
# ======================================================================

# Project-side run configuration (MyTrainingArgs is presumably defined by the
# training-utils notebook that is %run at the top of this notebook).
my_training_args = MyTrainingArgs(
    selected_tasks=selected_tasks,
    model_name=f'PASTEL single task {selected_tasks[0]}',
    base_model_name='bert-base-uncased',
    freeze_bert=False,
    use_pooler=False,
    num_epoch=5,
    # data_limit=30000,  # currently disabled
)

# Hugging Face trainer configuration: evaluate and checkpoint once per epoch,
# then reload the checkpoint with the lowest eval_loss when training ends.
hg_training_args = TrainingArguments(
    output_dir=my_training_args.model_folder,
    num_train_epochs=my_training_args.num_epoch,
    per_device_train_batch_size=my_training_args.batch_size,
    per_device_eval_batch_size=my_training_args.batch_size,
    warmup_steps=my_training_args.num_warmup_steps,
    weight_decay=0.01,
    logging_dir=f"{my_training_args.model_folder}/logs",
    logging_first_step=True,
    evaluation_strategy="epoch",
    save_strategy='epoch',
    save_total_limit=1,
    load_best_model_at_end=True,
    metric_for_best_model='eval_loss',
    greater_is_better=False,
)

# Build the model, then apply the freeze flag from the run configuration.
model = init_model(my_training_args)
freeze_model(model, my_training_args.freeze_bert)

# The "unmasked" split is trained on; the "masked" split is used for the
# per-epoch evaluation and is also predicted on below.
train_dataset = MyDataset(my_training_args, f'p={p}_unmasked_train')
val_dataset = MyDataset(my_training_args, f'p={p}_masked_train')

trainer = MyTrainer(
    model=model,
    args=hg_training_args,
    tokenizer=model.tokenizer,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()

# Predict on both splits and merge them into a single frame.
df_train = predict_logits_labels(trainer, train_dataset, selected_tasks[0])
df_val = predict_logits_labels(trainer, val_dataset, selected_tasks[0])
# 'Unnamed: 0' is presumably the row index carried over from the source CSV,
# so sorting on it would restore the original row order -- TODO confirm.
df_combined = pd.concat([df_train, df_val], axis=0).sort_values('Unnamed: 0')

# One CSV per task, grouped under a folder named after the p value.
output_path = f'{data_folder}/pastel/processed/p={p}_predicted'
Path(output_path).mkdir(parents=True, exist_ok=True)
df_combined.to_csv(f'{output_path}/{selected_tasks[0]}.csv', index=False)

## tod

In [None]:
# ============================== settings ==============================
# Attribute this cell trains a single-task model for, and the `p` value that
# selects which pre-split files are loaded and where predictions are written.
selected_tasks = ['tod']
p = 0.8
# ======================================================================

# Project-side run configuration (MyTrainingArgs is presumably defined by the
# training-utils notebook that is %run at the top of this notebook).
my_training_args = MyTrainingArgs(
    selected_tasks=selected_tasks,
    model_name=f'PASTEL single task {selected_tasks[0]}',
    base_model_name='bert-base-uncased',
    freeze_bert=False,
    use_pooler=False,
    num_epoch=5,
    # data_limit=30000,  # currently disabled
)

# Hugging Face trainer configuration: evaluate and checkpoint once per epoch,
# then reload the checkpoint with the lowest eval_loss when training ends.
hg_training_args = TrainingArguments(
    output_dir=my_training_args.model_folder,
    num_train_epochs=my_training_args.num_epoch,
    per_device_train_batch_size=my_training_args.batch_size,
    per_device_eval_batch_size=my_training_args.batch_size,
    warmup_steps=my_training_args.num_warmup_steps,
    weight_decay=0.01,
    logging_dir=f"{my_training_args.model_folder}/logs",
    logging_first_step=True,
    evaluation_strategy="epoch",
    save_strategy='epoch',
    save_total_limit=1,
    load_best_model_at_end=True,
    metric_for_best_model='eval_loss',
    greater_is_better=False,
)

# Build the model, then apply the freeze flag from the run configuration.
model = init_model(my_training_args)
freeze_model(model, my_training_args.freeze_bert)

# The "unmasked" split is trained on; the "masked" split is used for the
# per-epoch evaluation and is also predicted on below.
train_dataset = MyDataset(my_training_args, f'p={p}_unmasked_train')
val_dataset = MyDataset(my_training_args, f'p={p}_masked_train')

trainer = MyTrainer(
    model=model,
    args=hg_training_args,
    tokenizer=model.tokenizer,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()

# Predict on both splits and merge them into a single frame.
df_train = predict_logits_labels(trainer, train_dataset, selected_tasks[0])
df_val = predict_logits_labels(trainer, val_dataset, selected_tasks[0])
# 'Unnamed: 0' is presumably the row index carried over from the source CSV,
# so sorting on it would restore the original row order -- TODO confirm.
df_combined = pd.concat([df_train, df_val], axis=0).sort_values('Unnamed: 0')

# One CSV per task, grouped under a folder named after the p value.
output_path = f'{data_folder}/pastel/processed/p={p}_predicted'
Path(output_path).mkdir(parents=True, exist_ok=True)
df_combined.to_csv(f'{output_path}/{selected_tasks[0]}.csv', index=False)

## age

In [None]:
# ============================== settings ==============================
# Attribute this cell trains a single-task model for, and the `p` value that
# selects which pre-split files are loaded and where predictions are written.
selected_tasks = ['age']
p = 0.8
# ======================================================================

# Project-side run configuration (MyTrainingArgs is presumably defined by the
# training-utils notebook that is %run at the top of this notebook).
my_training_args = MyTrainingArgs(
    selected_tasks=selected_tasks,
    model_name=f'PASTEL single task {selected_tasks[0]}',
    base_model_name='bert-base-uncased',
    freeze_bert=False,
    use_pooler=False,
    num_epoch=5,
    # data_limit=30000,  # currently disabled
)

# Hugging Face trainer configuration: evaluate and checkpoint once per epoch,
# then reload the checkpoint with the lowest eval_loss when training ends.
hg_training_args = TrainingArguments(
    output_dir=my_training_args.model_folder,
    num_train_epochs=my_training_args.num_epoch,
    per_device_train_batch_size=my_training_args.batch_size,
    per_device_eval_batch_size=my_training_args.batch_size,
    warmup_steps=my_training_args.num_warmup_steps,
    weight_decay=0.01,
    logging_dir=f"{my_training_args.model_folder}/logs",
    logging_first_step=True,
    evaluation_strategy="epoch",
    save_strategy='epoch',
    save_total_limit=1,
    load_best_model_at_end=True,
    metric_for_best_model='eval_loss',
    greater_is_better=False,
)

# Build the model, then apply the freeze flag from the run configuration.
model = init_model(my_training_args)
freeze_model(model, my_training_args.freeze_bert)

# The "unmasked" split is trained on; the "masked" split is used for the
# per-epoch evaluation and is also predicted on below.
train_dataset = MyDataset(my_training_args, f'p={p}_unmasked_train')
val_dataset = MyDataset(my_training_args, f'p={p}_masked_train')

trainer = MyTrainer(
    model=model,
    args=hg_training_args,
    tokenizer=model.tokenizer,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()

# Predict on both splits and merge them into a single frame.
df_train = predict_logits_labels(trainer, train_dataset, selected_tasks[0])
df_val = predict_logits_labels(trainer, val_dataset, selected_tasks[0])
# 'Unnamed: 0' is presumably the row index carried over from the source CSV,
# so sorting on it would restore the original row order -- TODO confirm.
df_combined = pd.concat([df_train, df_val], axis=0).sort_values('Unnamed: 0')

# One CSV per task, grouped under a folder named after the p value.
output_path = f'{data_folder}/pastel/processed/p={p}_predicted'
Path(output_path).mkdir(parents=True, exist_ok=True)
df_combined.to_csv(f'{output_path}/{selected_tasks[0]}.csv', index=False)

## education

In [None]:
# ============================== settings ==============================
# Attribute this cell trains a single-task model for, and the `p` value that
# selects which pre-split files are loaded and where predictions are written.
selected_tasks = ['education']
p = 0.8
# ======================================================================

# Project-side run configuration (MyTrainingArgs is presumably defined by the
# training-utils notebook that is %run at the top of this notebook).
my_training_args = MyTrainingArgs(
    selected_tasks=selected_tasks,
    model_name=f'PASTEL single task {selected_tasks[0]}',
    base_model_name='bert-base-uncased',
    freeze_bert=False,
    use_pooler=False,
    num_epoch=5,
    # data_limit=30000,  # currently disabled
)

# Hugging Face trainer configuration: evaluate and checkpoint once per epoch,
# then reload the checkpoint with the lowest eval_loss when training ends.
hg_training_args = TrainingArguments(
    output_dir=my_training_args.model_folder,
    num_train_epochs=my_training_args.num_epoch,
    per_device_train_batch_size=my_training_args.batch_size,
    per_device_eval_batch_size=my_training_args.batch_size,
    warmup_steps=my_training_args.num_warmup_steps,
    weight_decay=0.01,
    logging_dir=f"{my_training_args.model_folder}/logs",
    logging_first_step=True,
    evaluation_strategy="epoch",
    save_strategy='epoch',
    save_total_limit=1,
    load_best_model_at_end=True,
    metric_for_best_model='eval_loss',
    greater_is_better=False,
)

# Build the model, then apply the freeze flag from the run configuration.
model = init_model(my_training_args)
freeze_model(model, my_training_args.freeze_bert)

# The "unmasked" split is trained on; the "masked" split is used for the
# per-epoch evaluation and is also predicted on below.
train_dataset = MyDataset(my_training_args, f'p={p}_unmasked_train')
val_dataset = MyDataset(my_training_args, f'p={p}_masked_train')

trainer = MyTrainer(
    model=model,
    args=hg_training_args,
    tokenizer=model.tokenizer,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()

# Predict on both splits and merge them into a single frame.
df_train = predict_logits_labels(trainer, train_dataset, selected_tasks[0])
df_val = predict_logits_labels(trainer, val_dataset, selected_tasks[0])
# 'Unnamed: 0' is presumably the row index carried over from the source CSV,
# so sorting on it would restore the original row order -- TODO confirm.
df_combined = pd.concat([df_train, df_val], axis=0).sort_values('Unnamed: 0')

# One CSV per task, grouped under a folder named after the p value.
output_path = f'{data_folder}/pastel/processed/p={p}_predicted'
Path(output_path).mkdir(parents=True, exist_ok=True)
df_combined.to_csv(f'{output_path}/{selected_tasks[0]}.csv', index=False)

## ethnic

In [None]:
# ============================== settings ==============================
# Attribute this cell trains a single-task model for, and the `p` value that
# selects which pre-split files are loaded and where predictions are written.
selected_tasks = ['ethnic']
p = 0.8
# ======================================================================

# Project-side run configuration (MyTrainingArgs is presumably defined by the
# training-utils notebook that is %run at the top of this notebook).
my_training_args = MyTrainingArgs(
    selected_tasks=selected_tasks,
    model_name=f'PASTEL single task {selected_tasks[0]}',
    base_model_name='bert-base-uncased',
    freeze_bert=False,
    use_pooler=False,
    num_epoch=5,
    # data_limit=30000,  # currently disabled
)

# Hugging Face trainer configuration: evaluate and checkpoint once per epoch,
# then reload the checkpoint with the lowest eval_loss when training ends.
hg_training_args = TrainingArguments(
    output_dir=my_training_args.model_folder,
    num_train_epochs=my_training_args.num_epoch,
    per_device_train_batch_size=my_training_args.batch_size,
    per_device_eval_batch_size=my_training_args.batch_size,
    warmup_steps=my_training_args.num_warmup_steps,
    weight_decay=0.01,
    logging_dir=f"{my_training_args.model_folder}/logs",
    logging_first_step=True,
    evaluation_strategy="epoch",
    save_strategy='epoch',
    save_total_limit=1,
    load_best_model_at_end=True,
    metric_for_best_model='eval_loss',
    greater_is_better=False,
)

# Build the model, then apply the freeze flag from the run configuration.
model = init_model(my_training_args)
freeze_model(model, my_training_args.freeze_bert)

# The "unmasked" split is trained on; the "masked" split is used for the
# per-epoch evaluation and is also predicted on below.
train_dataset = MyDataset(my_training_args, f'p={p}_unmasked_train')
val_dataset = MyDataset(my_training_args, f'p={p}_masked_train')

trainer = MyTrainer(
    model=model,
    args=hg_training_args,
    tokenizer=model.tokenizer,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()

# Predict on both splits and merge them into a single frame.
df_train = predict_logits_labels(trainer, train_dataset, selected_tasks[0])
df_val = predict_logits_labels(trainer, val_dataset, selected_tasks[0])
# 'Unnamed: 0' is presumably the row index carried over from the source CSV,
# so sorting on it would restore the original row order -- TODO confirm.
df_combined = pd.concat([df_train, df_val], axis=0).sort_values('Unnamed: 0')

# One CSV per task, grouped under a folder named after the p value.
output_path = f'{data_folder}/pastel/processed/p={p}_predicted'
Path(output_path).mkdir(parents=True, exist_ok=True)
df_combined.to_csv(f'{output_path}/{selected_tasks[0]}.csv', index=False)

## gender

In [None]:
# ============================== settings ==============================
# Attribute this cell trains a single-task model for, and the `p` value that
# selects which pre-split files are loaded and where predictions are written.
selected_tasks = ['gender']
p = 0.8
# ======================================================================

# Project-side run configuration (MyTrainingArgs is presumably defined by the
# training-utils notebook that is %run at the top of this notebook).
my_training_args = MyTrainingArgs(
    selected_tasks=selected_tasks,
    model_name=f'PASTEL single task {selected_tasks[0]}',
    base_model_name='bert-base-uncased',
    freeze_bert=False,
    use_pooler=False,
    num_epoch=5,
    # data_limit=30000,  # currently disabled
)

# Hugging Face trainer configuration: evaluate and checkpoint once per epoch,
# then reload the checkpoint with the lowest eval_loss when training ends.
hg_training_args = TrainingArguments(
    output_dir=my_training_args.model_folder,
    num_train_epochs=my_training_args.num_epoch,
    per_device_train_batch_size=my_training_args.batch_size,
    per_device_eval_batch_size=my_training_args.batch_size,
    warmup_steps=my_training_args.num_warmup_steps,
    weight_decay=0.01,
    logging_dir=f"{my_training_args.model_folder}/logs",
    logging_first_step=True,
    evaluation_strategy="epoch",
    save_strategy='epoch',
    save_total_limit=1,
    load_best_model_at_end=True,
    metric_for_best_model='eval_loss',
    greater_is_better=False,
)

# Build the model, then apply the freeze flag from the run configuration.
model = init_model(my_training_args)
freeze_model(model, my_training_args.freeze_bert)

# The "unmasked" split is trained on; the "masked" split is used for the
# per-epoch evaluation and is also predicted on below.
train_dataset = MyDataset(my_training_args, f'p={p}_unmasked_train')
val_dataset = MyDataset(my_training_args, f'p={p}_masked_train')

trainer = MyTrainer(
    model=model,
    args=hg_training_args,
    tokenizer=model.tokenizer,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()

# Predict on both splits and merge them into a single frame.
df_train = predict_logits_labels(trainer, train_dataset, selected_tasks[0])
df_val = predict_logits_labels(trainer, val_dataset, selected_tasks[0])
# 'Unnamed: 0' is presumably the row index carried over from the source CSV,
# so sorting on it would restore the original row order -- TODO confirm.
df_combined = pd.concat([df_train, df_val], axis=0).sort_values('Unnamed: 0')

# One CSV per task, grouped under a folder named after the p value.
output_path = f'{data_folder}/pastel/processed/p={p}_predicted'
Path(output_path).mkdir(parents=True, exist_ok=True)
df_combined.to_csv(f'{output_path}/{selected_tasks[0]}.csv', index=False)