# Training workflow

## Install necessary libraries for loading repo


In [None]:
%%capture
!pip install dvc fastds
import os
from getpass import getpass
import urllib

## Set all credentials and download all necessary files/data for training

### Set up local repo and branch

In [None]:
# Read the DagsHub access token without echoing it, and percent-encode it in
# the same step because it is embedded in the clone URL further down.
your_token = urllib.parse.quote(getpass('dagshub access token: '))

# The username is reused for the DVC remote, the git author name and MLflow;
# the email address is only used for the git author identity.
your_username = input('dagshub username: ')
your_email = input('email address: ')

In [None]:
# Clone repo with personal token (Settings -> Tokens -> Default Access Token)
cmd_string = 'git clone https://{0}@dagshub.com/Omdena/NYU.git'.format(your_token)
os.system(cmd_string)
%cd NYU

# Switch to branch you want to work with and sync with remote branch (if necessary)
!git fetch origin
#!git checkout -b cross-validation origin/cross-validation

# Change directory to training workflow
%cd tasks/task-4-language-transformer-models/workflow

### Set up DVC and git

In [None]:
# Configure the DVC remote named `origin` for HTTP basic auth with the DagsHub
# username and token entered earlier ({name} is interpolated by IPython).
# `--local` stores these settings in DVC's local, non-committed config so the
# credentials are not pushed with the repository.
# NOTE(review): your_token is the percent-encoded value produced by
# urllib.parse.quote — confirm tokens never contain characters that encoding alters.
!dvc remote modify --local origin auth basic
!dvc remote modify --local origin user '{your_username}'
!dvc remote modify --local origin password '{your_token}'

[0m[0m[0m

In [None]:
# Set the git author identity used by the commits made later in this notebook.
# NOTE: --global changes the git configuration of the whole user account,
# not only of this repository.
!git config --global user.email '{your_email}'
!git config --global user.name '{your_username}'

### Set mlflow credentials as env variables

In [None]:
# Expose the DagsHub MLflow tracking URI and the credentials entered earlier
# as environment variables for the processes started from this notebook.
os.environ.update({
    'MLFLOW_TRACKING_URI': 'https://dagshub.com/Omdena/NYU.mlflow',
    'MLFLOW_TRACKING_USERNAME': your_username,
    'MLFLOW_TRACKING_PASSWORD': your_token,
})

### Pull training data

In [None]:
# Download the DVC-tracked train and test CSV files from the `origin` remote.
!dvc pull -r origin -R ../data/train.csv ../data/test.csv

### Install dependencies for training

In [None]:
%%capture
!pip install -r requirements.txt

## Create pipeline for train-dev procedure (*This should be run only if there's a change in the pipeline that we want to update in DagsHub*)

The best model on the dev set is saved, and the metrics of this model are computed on the test set.

In [None]:
# Define (and execute) the `train_eval` stage:
#   -n  stage name
#   -d  dependencies: the training data and the training script
#   -p  parameters file (presumably `params.yaml:` with no names tracks every
#       parameter in the file — confirm against the installed DVC version)
#   -o  outputs written under model_artifacts/
#   -m  metrics file
#   --force overwrites an existing stage with the same name;
#   --no-run-cache makes DVC execute the command instead of reusing a cached run.
# NOTE(review): dvc is installed without a version pin, and recent DVC releases
# replaced `dvc run` with `dvc stage add` + `dvc repro` — confirm this command
# exists in the version that gets installed.
!dvc run -n train_eval \
-d ../data/train.csv \
-p params.yaml: \
-d train_eval.py \
-o model_artifacts/model_best.pt \
-o model_artifacts/args.pt \
-o model_artifacts/test_labels.txt \
-o model_artifacts/test_labels_gold.txt \
-o model_artifacts/test_labels_pred.txt \
-o model_artifacts/test_labels_prob_pred.txt \
-m model_artifacts/test_pr_values.csv \
--force \
--no-run-cache \
python train_eval.py

Update the repo if you want to overwrite the model artifacts with the ones you just trained.

In [None]:
%%bash
# Show what changed before committing.
git status
# dvc.yaml holds the stage definition written by `dvc run`, so it is committed
# together with the parameters and the lock file; otherwise a changed pipeline
# definition would never reach the remote repository.
git add params.yaml dvc.yaml dvc.lock
git commit -m "Update training pipeline"
git push
# Upload the DVC-cached outputs to the `origin` DVC remote.
dvc push -r origin

## Create pipeline for test procedure (This should be run only if there's a change in the pipeline that we want to update in DagsHub)

In [None]:
# Define (and execute) the `test` stage:
#   -n  stage name
#   -d  dependencies: the test data, the test script and the model artifacts
#       produced by the train_eval stage
#   -m  metrics file written under test_results/
#   --force overwrites an existing stage with the same name;
#   --no-run-cache makes DVC execute the command instead of reusing a cached run.
# NOTE(review): dvc is installed without a version pin, and recent DVC releases
# replaced `dvc run` with `dvc stage add` + `dvc repro` — confirm this command
# exists in the version that gets installed.
!dvc run -n test \
-d ../data/test.csv \
-d test.py \
-d model_artifacts/model_best.pt \
-d model_artifacts/args.pt \
-m test_results/test_metrics.json \
--force \
--no-run-cache \
python test.py

In [None]:
# Inspect which files changed in the working tree before staging anything.
!git status

In [None]:
%%bash
# Show what changed before committing.
git status
# Stage the pipeline definition, its lock file and the .gitignore that DVC
# maintains next to the metrics file in test_results/.
# (The path previously read `est_results/.gitignore`, which does not match any
# file and makes `git add` abort without staging anything.)
git add dvc.lock dvc.yaml test_results/.gitignore
git commit -m "Update test pipeline"
git push
# Upload the DVC-cached outputs to the `origin` DVC remote.
dvc push -r origin

## Run workflow

This will run both stages of the workflow:


*   train_eval: Train model and get evaluation metrics
*   test: Get test metrics on model trained in previous stage



Modify `params.yaml` file to tune hyperparams and training arguments as needed.

In [None]:
# Reproduce the pipeline defined in dvc.yaml (the train_eval and test stages).
!dvc repro

Save changes

In [None]:
%%bash
# Stage the data pointer file one directory up, the pipeline lock file and the
# (possibly tuned) parameters, then commit and push them to the git remote.
git add ../data.dvc dvc.lock params.yaml
git commit -m "Update pipeline for binary classification"
git push

In [None]:
# Upload the DVC-cached outputs of the run to the `origin` DVC remote.
!dvc push -r origin

## Deleting pipeline

Run only if you want to delete a stage in the pipeline

In [None]:
%%bash
# Remove the train_eval stage from the pipeline, then commit and push the
# resulting changes to the git remote.
# NOTE(review): the train_eval outputs live under model_artifacts/, so the
# .gitignore that changes may be model_artifacts/.gitignore rather than the one
# in this directory — confirm before running.
# NOTE(review): the commit message mentions "cross-validation" while the stage
# removed here is train_eval — confirm the message is the intended one.
dvc remove train_eval
git add .gitignore dvc.lock dvc.yaml
git commit -m "Remove cross-validation pipeline"
git push