# Retrain model, retune hyperparameters or predict with saved model


Originally called: Rallypoint_Multimodal_model_notebook(final).ipynb

Author: Noah Jones

## Load library requirements

In [None]:
from google.colab import drive
import os

# Mount Google Drive at /content/drive (Colab-only API) so that later cells can
# chdir into the project folder stored under /content/drive/MyDrive.
drive.mount('/content/drive')


In [None]:
import os
# Work from the Multimodal-Toolkit folder on Drive: the relative paths used in
# this notebook (requirements.txt, datasets/rp, main.py) are resolved from here.
os.chdir("/content/drive/MyDrive/Rallypoint Milestone 6 Code/Model notebook/Multimodal-Toolkit")
# Install the packages listed in the toolkit's requirements.txt.
!pip install -r requirements.txt

## Setup WANDB Logging for Hyperparameter Tuning (Optional)

In [None]:
# Log the wandb CLI in (it may prompt for an API key). This section is optional:
# skip the cell if you are not logging to Weights & Biases.
!wandb login

##### As an additional note, it is better to use comet.ml instead with this version of transformers (3.1.0). Use the environment variables from the comet.ml documentation and make sure COMET_MODE in the main.py file is set to ENABLED in order to use comet.ml, as shown below:

```
os.environ['COMET_MODE'] = 'ENABLED'
```



## Format data

#### *only required if not previously formatted (automatically done in notebooks from "Data Notebook")

### Encodes categorical labels, imputes missing data, and normalizes numerical data

In [None]:
from sklearn.impute import SimpleImputer
import pandas as pd
import numpy as np

def encode_labels(df,ordinal=True):
  """Replace the string categories in `label` (and optionally `type`) with integer codes.

  A column is only rewritten while it still has object dtype; columns with any
  other dtype are left as they are. The frame is modified in place and the same
  object is returned.

  Input: df with `label` and `type` columns; ordinal: whether `type` is encoded too
  Return: df
  """
  label_codes = {'SITB_ABSENT' : 0, 'SITB_PRESENT' : 1}
  type_codes = {'comment':0, 'Question': 1, 'StatusUpdate':2}

  # (column, value -> code mapping, whether the mapping is switched on)
  encoding_plan = (
    ('label', label_codes, True),
    ('type', type_codes, ordinal),
  )
  for column, codes, enabled in encoding_plan:
    if df[column].dtype == 'O' and enabled:
      df[column] = df[column].replace(codes)
  return df

def fill_in_miss_numeric_and_column(df,ordinal=True,strategy='constant'):
  """Impute the numeric metadata columns and resolve missing values in `type`.

  `reputation` and `contact_size` are imputed with a scikit-learn SimpleImputer
  using `strategy` (no fill value is passed, so the imputer's own default
  applies for 'constant'). Then:
    * ordinal=True  -> missing `type` entries are filled with the code 4;
    * ordinal=False -> `type` is one-hot encoded with an extra NaN indicator
      column, and a reminder to update the rp config is printed.
  The resulting column index is printed before returning.

  Input: df, ordinal, strategy
  Return: df (a new frame when ordinal is False, otherwise the one passed in)
  """
  numeric_columns = ['reputation','contact_size']
  imputer = SimpleImputer(missing_values=np.nan, strategy=strategy)
  imputed_values = imputer.fit_transform(df[numeric_columns])
  df.loc[:, numeric_columns] = imputed_values

  if not ordinal:
    df = pd.get_dummies(df,columns=['type'],dummy_na=True)
    print("REMEMBER TO CHANGE rp.config")
  else:
    df['type'] = df.loc[:, 'type'].fillna(4)

  print(df.columns)
  return df

def format_df(df, ordinal, imputation_strategy):
  """Run the two formatting steps in order: label encoding, then imputation.

  Input: df, ordinal (forwarded to both steps), imputation_strategy
         (forwarded to the imputation step as `strategy`)
  Return: the formatted df
  """
  return fill_in_miss_numeric_and_column(
    encode_labels(df, ordinal=ordinal),
    ordinal=ordinal,
    strategy=imputation_strategy,
  )

# Place the train, test and validation datasets in the RP directory (datasets/rp).
# By default these are the files from September, and the validation and test sets are the same.
# Read the raw splits, format them, and print shapes/previews as a sanity check.
train = pd.read_csv('datasets/rp/train.csv')
test = pd.read_csv('datasets/rp/test.csv')
print(train.shape, test.shape)
train = format_df(train,True,'constant')
print(train.head())
test = format_df(test,True,'constant')
print(test.head())

# Make sure the rp config lists the same columns as the formatted frames written below.

train_path = 'datasets/rp/train.csv'
val_path = 'datasets/rp/val.csv'
test_path = 'datasets/rp/test.csv'

# The formatted frames overwrite the CSVs that were read above, and the
# formatted test frame is written to both val.csv and test.csv.
train.to_csv(train_path,index=False)
test.to_csv(val_path,index=False)
test.to_csv(test_path,index=False)
print(train.shape, test.shape)

## Show Dataset Shapes

In [None]:
import pandas as pd
import os
# Switch to the toolkit folder on Drive when it exists; if the folder is missing
# the error is ignored and the current working directory is left unchanged.
try:
  os.chdir("/content/drive/MyDrive/Rallypoint Milestone 6 Code/Model notebook/Multimodal-Toolkit")
except FileNotFoundError:
  pass
# Re-read the three splits from disk.
train = pd.read_csv("datasets/rp/train.csv")
val = pd.read_csv("datasets/rp/val.csv")
test = pd.read_csv("datasets/rp/test.csv")

# Display the (rows, columns) shape of the training split.
train.shape

In [None]:
# Display the (rows, columns) shape of the validation split.
val.shape

In [None]:
# Display the (rows, columns) shape of the test split.
test.shape

## Train

[I 2021-07-22 11:01:33,437] Trial 2 finished with value: 710.392753783129 and parameters: {'learning_rate': 4.2655252122250964e-05, 'num_train_epochs': 2, 'weight_decay': 2, 'per_device_train_batch_size': 16}. Best is trial 2 with value: 710.392753783129.


In [None]:

# Single training run: executes main.py with --mode=train on roberta-base, using
# --combine_feat_method=text_only for one epoch, with output under ./logs/save_model_test.
# The exact effect of each flag is defined by main.py, which is not shown in this notebook.
# The chdir is skipped (error ignored) when the Drive folder does not exist.
import os
try:
  os.chdir("/content/drive/MyDrive/Rallypoint Milestone 6 Code/Model notebook/Multimodal-Toolkit")
except FileNotFoundError:
  pass
%run main.py \
    --output_dir=./logs/save_model_test \
    --task=classification \
    --combine_feat_method=text_only \
    --categorical_encode_type=none\
    --learning_rate=5.4e-5 \
    --seed=42 \
    --mode=train \
    --model_name_or_path=roberta-base \
    --data_path=./datasets/rp \
    --num_train_epochs=1\
    --column_info_path=./datasets/rp/rp_config(meta).json \
    --overwrite_output_dir \
    --per_device_train_batch_size 16 \
    --eval_steps=420 \

## Hyperparameter Tune

By default, tuning runs for 10 trials with a batch size of 16 and searches over weight decay (0, 2), train epochs (1, 4), and learning rate (1e-5, 9e-5).

### Pretrained RoBERTa Model with Text and Numeric Metadata

In [None]:
# The hyperparameter search space can be changed by editing the search-space
# function in main.py (it is named my_hp_space in the excerpt at the end of this notebook).
# This cell executes main.py with --mode=hyp_tune and --do_predict.
# NOTE(review): --model_name_or_path=roberta-pretrained is presumably a local
# checkpoint directory or a name resolved by main.py — confirm it exists before running.
import os
try:
  os.chdir("/content/drive/MyDrive/Rallypoint Milestone 6 Code/Model notebook/Multimodal-Toolkit")
except FileNotFoundError:
  pass
%run main.py \
    --output_dir=./logs/roberta_pretrained_meta_text_num(final) \
    --task=classification \
    --combine_feat_method=attention_on_cat_and_numerical_feats \
    --categorical_encode_type=none\
    --learning_rate=5.4e-5 \
    --seed=42 \
    --mode=hyp_tune \
    --model_name_or_path=roberta-pretrained \
    --data_path=./datasets/rp \
    --num_train_epochs=3\
    --column_info_path=./datasets/rp/rp_config(meta).json \
    --overwrite_output_dir \
    --do_predict \
    --per_device_train_batch_size 16 \
    --eval_steps=200 \
    --save_steps=10000 \

## Predict single time with saved model

#### Below is the cell to predict with the best model found from hyperparameter tuning. First overwrite the old test set ("test.csv") in datasets/rp with a new formatted file with the same name.  Then run the code below. Output can be found in output_dir below

In [None]:
# Inference run: executes main.py with --mode=infer and --do_predict, loading the
# model from ./logs/roberta_meta_text_num(final)/pred_seed_2 and writing to
# ./logs/roberta_meta_text_num(final)/prediction_from_best.
# NOTE(review): the tuning and multi-seed cells in this notebook write under
# ./logs/roberta_pretrained_meta_text_num(final); confirm that
# roberta_meta_text_num(final) is the intended model directory.
import os
try:
  os.chdir("/content/drive/MyDrive/Rallypoint Milestone 6 Code/Model notebook/Multimodal-Toolkit")
except FileNotFoundError:
  pass
%run main.py \
    --output_dir=./logs/roberta_meta_text_num(final)/prediction_from_best \
    --task=classification \
    --combine_feat_method=attention_on_cat_and_numerical_feats \
    --categorical_encode_type=none\
    --model_name_or_path=./logs/roberta_meta_text_num(final)/pred_seed_2 \
    --data_path=./datasets/rp \
    --mode=infer \
    --do_predict \
    --num_train_epochs=3\
    --column_info_path=./datasets/rp/rp_config(meta).json \
    --overwrite_output_dir \
    --disable_tqdm \
    --per_device_train_batch_size 16 \
    --eval_steps=75 \
    --save_steps=160 \

## Predicting multiple times with best models and varying seeds

### RoBERTa Pretrain (Meta + Num)

In [None]:
#can change hyperparameter arguments by modifying my_hyp_space function in main.py
import os
try:
  os.chdir("/content/drive/MyDrive/Rallypoint Milestone 6 Code/Model notebook/Multimodal-Toolkit")
except FileNotFoundError:
  pass
%run multiple_prediction.py \
    --output_dir=./logs/roberta_pretrained_meta_text_num(final)/predictions \
    --task=classification \
    --combine_feat_method=attention_on_cat_and_numerical_feats \
    --categorical_encode_type=none\
    --learning_rate=5.4e-5 \
    --seed=41 \
    --mode=train \
    --model_name_or_path=roberta-pretrained \
    --data_path=./datasets/rp \
    --num_train_epochs=3\
    --column_info_path=./datasets/rp/rp_config(meta).json \
    --overwrite_output_dir \
    --do_predict \
    --per_device_train_batch_size 16 \
    --eval_steps=200 \
    --save_steps=10000 \
    --num_seeds = 4 \

## Additional Changes (don't need to run)

#### Log model metrics (Optional)

In [None]:
# To log data relevant to a model uncomment this code and load wandb.  Must be done before training
# %env WANDB_NAME = <YOUR_NAME_HERE>
# !wandb login

### Code changes that I added to base multimodal data library

In [None]:
#applied oversampling with proportion to the classes
import torch
from torch.utils.data import WeightedRandomSampler, DataLoader
from transformers import Trainer
class CW_Trainer(Trainer):
    """Trainer variant whose training batches are drawn with class-balanced oversampling."""

    def get_train_dataloader(self):
        """
        Returns the training :class:`~torch.utils.data.DataLoader`.

        Instead of the default sampler, every training example is given a weight equal to the inverse of its
        class frequency and batches are drawn with a :class:`~torch.utils.data.WeightedRandomSampler`
        (sampling with replacement, the sampler's default), so each class is drawn with roughly equal
        probability. One epoch still draws ``len(train_dataset)`` examples.

        Raises:
            ValueError: if the trainer has no ``train_dataset``.
        """
        # Local import so the class does not depend on an earlier notebook cell having imported numpy.
        import numpy as np

        if self.train_dataset is None:
            raise ValueError("Trainer: training requires a train_dataset.")

        # Read the labels from the trainer's own dataset, not from a module-level `train_dataset`
        # global. Assumes the dataset exposes its targets as a 1-D `.labels` attribute.
        target = np.array(self.train_dataset.labels)
        print('target train 0/1: {}/{}'.format(
            int(np.count_nonzero(target == 0)), int(np.count_nonzero(target == 1))))

        # `class_index` maps each example to the position of its class in the sorted unique labels,
        # so the weight lookup is correct even when labels are not exactly 0..K-1.
        _, class_index, class_sample_count = np.unique(
            target, return_inverse=True, return_counts=True)
        weight = 1. / class_sample_count
        samples_weight = torch.from_numpy(weight[class_index.ravel()]).double()
        train_sampler = WeightedRandomSampler(samples_weight, len(samples_weight))

        return DataLoader(
            self.train_dataset,
            batch_size=self.args.train_batch_size,
            sampler=train_sampler,
            collate_fn=self.data_collator,
            drop_last=self.args.dataloader_drop_last,
        )
    
#added hyperparameter tuning
import torch
from torch.utils.data import WeightedRandomSampler, DataLoader
from transformers import Trainer

        def model_init():
          """Load and return a new AutoModelWithTabular on every call.

          The weights are looked up under `model_args.config_name` when it is set,
          otherwise under `model_args.model_name_or_path`. Presumably handed to the
          trainer so each hyperparameter trial starts from a fresh model; confirm
          where the trainer is constructed.
          """
          pretrained_source = model_args.config_name or model_args.model_name_or_path
          return AutoModelWithTabular.from_pretrained(
            pretrained_source,
            config=config,
            cache_dir=model_args.cache_dir,
          )

        def my_hp_space(trial) -> Dict[str, float]:
          """Sample one hyperparameter configuration from an Optuna trial.

          Draws, in this order: learning rate (log scale, 1e-6 to 1e-4), number of
          training epochs (integer, 1 to 4), weight decay (log scale, 1e-10 to 1e-3)
          and the per-device train batch size (single choice: 16).

          Returns a dict keyed by the training-argument names.
          """
          # An earlier search space (learning rate between 9e-5 and 1e-5, epochs 1-4,
          # weight decay chosen from [0, 2]) was replaced by the ranges used below.
          learning_rate = trial.suggest_float("learning_rate", 1e-6, 1e-4, log=True)
          num_train_epochs = trial.suggest_int("num_train_epochs", 1, 4, step=1)
          weight_decay = trial.suggest_float("weight_decay", 1e-10, 1e-3, log=True)
          # Only one choice is offered, so the batch size is effectively fixed.
          batch_size = trial.suggest_categorical("per_device_train_batch_size", [16])
          return {
            "learning_rate": learning_rate,
            "num_train_epochs": num_train_epochs,
            "weight_decay": weight_decay,
            "per_device_train_batch_size": batch_size,
          }

        # Log the tabular config and a freshly built model once, when i == 0.
        # NOTE(review): `i` is defined outside this excerpt (presumably an enclosing loop index) — confirm.
        if i == 0:
            logger.info(tabular_config)
            logger.info(model_init())


        if training_args.do_train:
            # Plain training call, left commented out in favour of the search below.
            # trainer.train(
            #     model_path=model_args.model_name_or_path if os.path.isdir(model_args.model_name_or_path) else None
            # )
            # Optuna-backed search over the space returned by my_hp_space; the objective is maximized.
            # NOTE(review): n_trials=2 here, while the notebook text describes 10 trials by default — confirm which is current.
            best_trial = trainer.hyperparameter_search(
            direction="maximize", 
            backend="optuna",
            hp_space=my_hp_space, 
            n_trials=2)
            print(best_trial)
            # Save the trainer's model after the search.
            # NOTE(review): whether this is the best trial's model depends on Trainer behaviour — confirm.
            trainer.save_model()