# Training workflow

## Install necessary libraries for loading repo


In [None]:
%%capture
!pip install dvc fastds
import os

## Set all credentials and download all necessary files/data for training

### Set up local repo and branch

In [None]:
# Clone the repo with your personal token (DagsHub: Settings -> Tokens -> Default Access Token).
# Replace {your_token} in the URL below with the actual token before running.
!git clone https://{your_token}@dagshub.com/Omdena/NYU.git
%cd NYU

# Create a local `cross-validation` branch from origin/cross-validation and switch to it
# (change the branch names here to work on a different branch).
!git fetch origin
!git checkout -b cross-validation origin/cross-validation

# Move into the training workflow directory; the relative paths used in later cells
# (e.g. ../data/...) are written relative to this location.
%cd tasks/task-4-language-transformer-models/workflow

### Set up DVC and git

In [None]:
%%bash
# Register the DagsHub DVC storage as a remote named `origin` and configure basic auth for it.
# Replace {your_username} and {your_token} with your own credentials before running.
# Every setting is passed with --local, i.e. it is written to DVC's local configuration.
dvc remote add origin --local https://dagshub.com/Omdena/NYU.dvc
dvc remote modify --local origin auth basic
dvc remote modify --local origin user {your_username}
dvc remote modify --local origin password {your_token}

In [None]:
%%bash
# Set the global git identity (author email and name) used for commits made from this notebook.
# NOTE(review): the values below look like placeholders - replace them with your own.
git config --global user.email "user@gmail.com"
git config --global user.name "username"

### Set mlflow credentials as env variables

In [None]:
# Export the MLflow credentials and the tracking server URI as environment variables.
# Replace {your_username} and {your_token} with your own values before running.
%env MLFLOW_TRACKING_USERNAME={your_username}
%env MLFLOW_TRACKING_PASSWORD={your_token}
%env MLFLOW_TRACKING_URI=https://dagshub.com/Omdena/NYU.mlflow

### Pull training data

In [None]:
# Download the DVC-tracked files under ../data/hate-speech-homophobia from the `origin` remote
# (-R pulls the directory recursively).
!dvc pull -r origin -R ../data/hate-speech-homophobia

### Install dependencies for training

In [None]:
%%capture
!pip install -r requirements.txt

## Making changes to code only

In [None]:
# Review which files were modified before staging anything.
!git status

In [None]:
# Stage the k-fold training script only.
!git add train_k_fold.py

In [None]:
# Commit whatever is currently staged.
!git commit -m "Update training code"

In [None]:
# Publish the local commits of the current branch to the remote.
!git push

## Data processing pipeline

## Create cross validation pipeline (*This should be run only if there's a change in the pipeline that we want to update in DagsHub*)

Takes all the data and creates k folds to train the model and give a confidence interval on the performance metrics. No trained model is saved, since cross-validation is intended only for picking the best configuration.

In [None]:
# Create branch

!git checkout -b cross-validation

In [None]:
# Define the `train_k_fold` stage in dvc.yaml and execute `python train_k_fold.py`:
#   -d  dependencies (the training data and the training script)
#   -p  `params.yaml:` with nothing after the colon tracks every parameter in params.yaml
#   -m  metrics file of the stage
#   -o  outputs tracked by DVC
#   --force overwrites an existing stage with the same name;
#   --no-run-cache executes the command even if an identical run was cached before.
# NOTE(review): `dvc run` is deprecated in newer DVC releases in favour of
# `dvc stage add` + `dvc repro` - confirm against the DVC version installed here.
!dvc run -n train_k_fold \
-d ../data/hate-speech-homophobia/train_data.csv \
-d train_k_fold.py \
-p params.yaml: \
-m model_artifacts/cv_results.json \
-o model_artifacts/args.pt \
-o model_artifacts/logfile.log \
--force \
--no-run-cache \
python train_k_fold.py

In [None]:
%%bash
# Commit the pipeline definition (dvc.yaml) and its lock file (dvc.lock), push the commit
# to git, then upload the DVC-tracked files to the `origin` DVC remote.
git status
git add dvc.yaml dvc.lock
git commit -m "Create k stratified cross validation training pipeline"
git push
dvc push -r origin

## Create pipeline for train-dev-test procedure (*This should be run only if there's a change in the pipeline that we want to update in DagsHub*)

The best model on the dev set is saved, and its metrics are then computed on the test set.

In [None]:
!dvc run -n train_eval \
-d ../data/hate-speech-homophobia/train_data.csv \
-d ../data/hate-speech-homophobia/dev_data.csv \
-d ../data/hate-speech-homophobia/test_data.csv \
-p params.yaml \
-d train_eval.py \
-o model_artifacts/model_best.pt \
-o model_artifacts/args.pt \
-o model_artifacts/test_labels.txt \
-o model_artifacts/test_labels_gold.txt \
-o model_artifacts/test_labels_pred.txt \
-o model_artifacts/test_labels_prob_pred.txt \
-o model_artifacts/test_pr_values.txt \
--force \
--no-run-cache \
python train_eval.py

In [None]:
%%bash
# Commit the pipeline definition (dvc.yaml) and its lock file (dvc.lock), push the commit
# to git, then upload the DVC-tracked files to the `origin` DVC remote.
# The commit message names the train-dev-test pipeline created in this section, so the
# history does not confuse it with the cross-validation pipeline.
git status
git add dvc.yaml dvc.lock
git commit -m "Create train-dev-test training pipeline"
git push
dvc push -r origin

## Run workflow

Modify `params.yaml` file to tune hyperparams and training arguments as needed.

In [None]:
# Reproduce the pipeline defined in dvc.yaml (run after editing params.yaml as described above).
!dvc repro

Save changes

In [None]:
%%bash
# Stage the pipeline definition and its lock file, then commit and push them.
# (A duplicated `add` argument would be treated as a path named "add" and make git
# refuse to stage anything, so only the two real files are listed.)
git add dvc.yaml dvc.lock
git commit -m "Update pipeline for binary classification"
git push

In [None]:
# Upload the DVC-tracked files to the `origin` DVC remote.
!dvc push -r origin

## Deleting pipeline

In [None]:
%%bash
# Remove the `train_k_fold` stage with DVC, then commit and push the files staged below
# (.gitignore, dvc.lock and dvc.yaml).
dvc remove train_k_fold
git add .gitignore dvc.lock dvc.yaml
git commit -m "Remove cross-validation pipeline"
git push