# Training workflow

## Install necessary libraries for loading repo


In [None]:
%%capture
# Install the tools needed to fetch the repo data (DVC and fastds).
# %%capture hides pip's output for this cell.
!pip install dvc fastds
import os

## Set all credentials and download all necessary files/data for training

### Set up local repo and branch

In [None]:
# Clone repo with personal token (Settings -> Tokens -> Default Access Token)
# NOTE: {token} is a placeholder for your own DagsHub access token. It becomes
# part of the clone URL, so do not share the notebook with a real value filled in.
!git clone https://{token}@dagshub.com/Omdena/NYU.git
%cd NYU

# Switch to branch you want to work with and sync with remote branch (if necessary)
!git fetch origin
# Uncomment to create a local `cross-validation` branch from origin/cross-validation:
#!git checkout -b cross-validation origin/cross-validation

# Change directory to training workflow
%cd tasks/task-4-language-transformer-models/workflow

### Set up DVC and git

In [None]:
%%bash
# Register the DagsHub DVC storage as remote `origin` and configure basic auth.
# {user} and {token} are placeholders: substitute your DagsHub username and
# access token before running.
# --local is passed on every command so the settings (including the password)
# go into DVC's local config rather than the shared one.
dvc remote add origin --local https://dagshub.com/Omdena/NYU.dvc
dvc remote modify --local origin auth basic
dvc remote modify --local origin user {user}
dvc remote modify --local origin password {token}

In [None]:
%%bash
# Set the author identity used for the commits made later in this notebook.
# {user} is a placeholder; the e-mail is built as {user}@gmail.com, so edit it
# if your address differs.
git config --global user.email "{user}@gmail.com"
git config --global user.name "{user}"

### Set mlflow credentials as env variables

In [None]:
# Export the MLflow credentials and the DagsHub tracking URI as environment
# variables. {user} and {token} are placeholders for your DagsHub username
# and access token.
%env MLFLOW_TRACKING_USERNAME={user}
%env MLFLOW_TRACKING_PASSWORD={token}
%env MLFLOW_TRACKING_URI=https://dagshub.com/Omdena/NYU.mlflow

### Pull training data

In [None]:
# Pull everything tracked by DVC from the `origin` remote configured above.
!dvc pull -r origin

In [None]:
# Alternatively, pull only the train/test CSVs.
!dvc pull -r origin -R ../data/train.csv ../data/test.csv

### Install dependencies for training

In [None]:
%%capture
# Install the training dependencies listed in the workflow's requirements.txt.
# %%capture hides pip's output for this cell.
!pip install -r requirements.txt

## Making changes to code only

In [None]:
# Review what changed before staging anything.
!git status

In [None]:
# Stage only the training script; adjust the path if you edited other files.
!git add train_k_fold.py

In [None]:
!git commit -m "Update training code"

In [None]:
# Publish the commit to the remote branch.
!git push

## Data processing pipeline

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [None]:
# Dreamspace-annotated positive samples, delivered as several CSV exports.
file_names = [
    "/content/1-100-positives-og.csv",
    "/content/2-100-positives-og.csv",
    "/content/3-100-positives-og.csv",
    "/content/4-561-positives.csv"
]
df_data = [
    pd.read_csv(csv_path, index_col=None, header=0, sep=",")
    for csv_path in file_names
]

# Stack the exports, align the column names with the other datasets, drop rows
# with any missing value, and label every remaining row as positive.
data_positive = (
    pd.concat(df_data, axis=0, ignore_index=True)
    .rename(columns={'sample': 'text', 'category': 'label'})
    .dropna(axis=0, how="any")
    .assign(label='Hate-Speech')
)

print(f"Dreamspace annotated positives: {data_positive.shape[0]}")

In [None]:
# Dreamspace-annotated negative samples: same clean-up as the positives
# (rename columns, drop rows with any missing value), then label every
# remaining row as negative.
data_negative = (
    pd.read_csv("/content/1-366-negatives.csv", index_col=None, header=0)
    .rename(columns={'sample': 'text', 'category': 'label'})
    .dropna(axis=0, how="any")
    .assign(label='Non-Hate-Speech')
)

print(f"Dreamspace annotated negatives: {data_negative.shape[0]}")

In [None]:
# Load the provided validation set and align it with the other datasets:
# `sample` becomes `text`, and labels are mapped onto the two class names.
data_validation = pd.read_csv("/content/validation-set.csv", index_col=None, header=0)
data_validation.rename(columns = {'sample':'text'}, inplace = True)
# Any value other than the exact string 'positive' (including missing values)
# is mapped to 'Non-Hate-Speech'.
data_validation['label'] = data_validation['label'].map(lambda x: 'Hate-Speech' if x == 'positive' else 'Non-Hate-Speech')

# Look each class up by name. value_counts() is ordered by frequency, so
# positional access ([0]/[1]) returns "the larger/smaller class", not a fixed
# class. .get() also copes with a class that is absent altogether.
validation_counts = data_validation['label'].value_counts()
validation_shares = data_validation['label'].value_counts(normalize=True)

print(f"Dreamspace valdation positives: {validation_counts.get('Hate-Speech', 0)} ({validation_shares.get('Hate-Speech', 0.0)*100:.2f}%)")
print(f"Dreamspace valdation negatives: {validation_counts.get('Non-Hate-Speech', 0)} ({validation_shares.get('Non-Hate-Speech', 0.0)*100:.2f}%)")

In [None]:
# Load the Task 2 (doccano) annotations; the first CSV column is the index.
data_docano = pd.read_csv("/content/doccano_annotated.csv", index_col=0, header=0)
# Any value other than the exact string 'Positive' (including missing values)
# is mapped to 'Non-Hate-Speech'.
data_docano['label'] = data_docano['label'].map(lambda x: 'Hate-Speech' if x == 'Positive' else 'Non-Hate-Speech')

# Look each class up by name. value_counts() is ordered by frequency, so
# positional access ([0]/[1]) returns "the larger/smaller class", not a fixed
# class. .get() also copes with a class that is absent altogether.
docano_counts = data_docano['label'].value_counts()
docano_shares = data_docano['label'].value_counts(normalize=True)

print(f"Task 2 annotated positives: {docano_counts.get('Hate-Speech', 0)} ({docano_shares.get('Hate-Speech', 0.0)*100:.2f}%)")
print(f"Task 2 annotated negatives: {docano_counts.get('Non-Hate-Speech', 0)} ({docano_shares.get('Non-Hate-Speech', 0.0)*100:.2f}%)")

In [None]:
# Load the homophobia hate-speech dataset; the first CSV column is the index.
data_homophobia = pd.read_csv("/content/hate-speech-homophobia.csv", index_col=0, header=0)

# Class totals and class shares, computed once and reused in both lines below.
homophobia_totals = data_homophobia.label.value_counts()
homophobia_shares = data_homophobia.label.value_counts(normalize=True)

# NOTE(review): the [1]/[0] lookups only mean "positives"/"negatives" if the
# labels in this file are the integers 1/0. If they are strings, the lookup is
# positional and follows frequency order instead -- confirm against the CSV.
print(f"homophobia dataset positives: {homophobia_totals[1]} ({homophobia_shares[1]*100 :.2f}%)")
print(f"homophobia dataset negatives: {homophobia_totals[0]} ({homophobia_shares[0]*100 :.2f}%)")

For the test set, we combine the provided validation set, all of the Dreamspace annotated negatives, and a small held-out sample (5–10%) of each of the other datasets, given that the different datasets focus on different topics.

In [None]:
# Hold out a small part of each dataset for the test set. All three splits
# share one seed so they are reproducible.
SPLIT_SEED = 1

# 5% of the homophobia dataset, preserving its label proportions.
train_homophobia, test_homophobia = train_test_split(
    data_homophobia,
    stratify=data_homophobia['label'],
    test_size=0.05,
    random_state=SPLIT_SEED,
)

# 5% of the Task 2 (doccano) annotations, preserving their label proportions.
train_docano, test_docano = train_test_split(
    data_docano,
    stratify=data_docano['label'],
    test_size=0.05,
    random_state=SPLIT_SEED,
)

# 10% of the Dreamspace positives; no stratification is requested here.
train_positive, test_positive = train_test_split(
    data_positive,
    test_size=0.1,
    random_state=SPLIT_SEED,
)

In [None]:
# Test set: the held-out slices plus all Dreamspace negatives and the whole
# validation set.
all_test = pd.concat([test_homophobia, test_docano, test_positive, data_negative, data_validation], axis=0, ignore_index=True)
test_counts = all_test.label.value_counts(normalize=True)  # class shares
test_totals = all_test.label.value_counts()                # absolute class counts

# Look each class up by name. value_counts() is ordered by frequency, so
# positional access ([0]/[1]) would report the larger/smaller class instead of
# a fixed one. The absolute counts are printed directly rather than rebuilt as
# rows * share, which yields floats with rounding artefacts.
print(f"Test positives: {test_totals.get('Hate-Speech', 0)} ({test_counts.get('Hate-Speech', 0.0)*100:.2f})")
print(f"Test negatives: {test_totals.get('Non-Hate-Speech', 0)} ({test_counts.get('Non-Hate-Speech', 0.0)*100:.2f})")

In [None]:
# Training set: the remaining part of each split dataset.
all_train = pd.concat([train_homophobia, train_docano, train_positive], axis=0, ignore_index=True)
train_counts = all_train.label.value_counts(normalize=True)  # class shares
train_totals = all_train.label.value_counts()                # absolute class counts

# Look each class up by name. value_counts() is ordered by frequency, so
# positional access ([0]/[1]) would swap the two lines whenever positives are
# the majority. The absolute counts are printed directly rather than rebuilt
# as rows * share, which yields floats with rounding artefacts.
print(f"Train positives: {train_totals.get('Hate-Speech', 0)} ({train_counts.get('Hate-Speech', 0.0)*100:.2f})")
print(f"Train negatives: {train_totals.get('Non-Hate-Speech', 0)} ({train_counts.get('Non-Hate-Speech', 0.0)*100:.2f})")

In [None]:
# Write both splits into the repo's DVC-tracked data directory.
# NOTE(review): the DataFrame index is written as the first CSV column
# (to_csv default); confirm the training scripts read it back accordingly.
for split_frame, csv_path in (
    (all_test, "/content/NYU/tasks/task-4-language-transformer-models/data/test.csv"),
    (all_train, "/content/NYU/tasks/task-4-language-transformer-models/data/train.csv"),
):
    split_frame.to_csv(csv_path)

In [None]:
# Check what DVC considers changed after the new CSVs were written.
!dvc status

In [None]:
# Track the data directory with DVC.
!dvc add ../data

In [None]:
!git status

In [None]:
# Stage the ../data.dvc file for git.
!git add ../data.dvc

In [None]:
!git commit -m "Update datasets"
!git push

In [None]:
# Push the DVC-tracked files to the `origin` DVC remote.
!dvc push -r origin

## Create cross validation pipeline (*This should be run only if there's a change in the pipeline that we want to update in DagsHub*)

Takes all the data, creates k folds to train the model, and gives confidence intervals on the performance metrics. No trained model is saved, since cross-validation is intended only for picking the best configuration.

In [None]:
# Create branch
# Create a local `cross-validation` branch starting at origin/cross-validation
# and switch to it.
!git checkout -b cross-validation origin/cross-validation

In [None]:
# Define and run the `train_k_fold` stage:
#   -n  stage name
#   -d  dependencies (training data and the training script)
#   -p  parameters file (params.yaml)
#   -m  metrics file written by the stage
#   -o  other outputs tracked by DVC
#   --force / --no-run-cache  overwrite an existing stage definition and
#       execute the command instead of reusing a previously cached run
# NOTE(review): `dvc run` may be unavailable in newer DVC releases
# (`dvc stage add` + `dvc repro` is the replacement) -- confirm the installed
# DVC version.
!dvc run -n train_k_fold \
-d ../data/train.csv \
-d train_k_fold.py \
-p params.yaml: \
-m model_artifacts/cv_results.json \
-o model_artifacts/args.pt \
-o model_artifacts/logfile.log \
--force \
--no-run-cache \
python train_k_fold.py

In [None]:
# Review what the stage run changed.
!git status

In [None]:
%%bash
# Commit the updated lock file, then push code and DVC-tracked outputs.
# NOTE(review): only dvc.lock is staged here, whereas the test-pipeline cell
# also stages dvc.yaml -- confirm whether dvc.yaml should be added as well.
git status
git add dvc.lock
git commit -m "Update k stratified cross validation training pipeline"
git push
dvc push -r origin

## Create pipeline for train-dev procedure (*This should be run only if there's a change in the pipeline that we want to update in DagsHub*)

In [None]:
# Return to the workflow directory; the relative paths in the cells below are
# written against it.
%cd /content/NYU/tasks/task-4-language-transformer-models/workflow

The best model on the dev set is saved, and the metrics of this model are computed on the test set.

In [None]:
# Define and run the `train_eval` stage:
#   -n  stage name
#   -d  dependencies (training data and the training script)
#   -p  parameters file (params.yaml)
#   -o  outputs tracked by DVC (best model, args, test label/prediction files)
#   -m  metrics file written by the stage
#   --force / --no-run-cache  overwrite an existing stage definition and
#       execute the command instead of reusing a previously cached run
# NOTE(review): `dvc run` may be unavailable in newer DVC releases
# (`dvc stage add` + `dvc repro` is the replacement) -- confirm the installed
# DVC version.
!dvc run -n train_eval \
-d ../data/train.csv \
-p params.yaml: \
-d train_eval.py \
-o model_artifacts/model_best.pt \
-o model_artifacts/args.pt \
-o model_artifacts/test_labels.txt \
-o model_artifacts/test_labels_gold.txt \
-o model_artifacts/test_labels_pred.txt \
-o model_artifacts/test_labels_prob_pred.txt \
-m model_artifacts/test_pr_values.csv \
--force \
--no-run-cache \
python train_eval.py

In [None]:
# Delete the rwlock file in the repo's .dvc/tmp directory.
# NOTE(review): presumably a workaround for a stale lock left by an
# interrupted DVC command -- only run this when no DVC command is in progress.
!rm ./../../../.dvc/tmp/rwlock

In [None]:
%%bash
# Commit the training script, parameters and lock file, then push code and
# DVC-tracked outputs.
# NOTE(review): dvc.yaml is not staged here, whereas the test-pipeline cell
# stages it -- confirm whether it should be added as well.
git status
git add train_eval.py params.yaml dvc.lock
git commit -m "Update training pipeline"
git push
dvc push -r origin

## Create pipeline for test procedure (*This should be run only if there's a change in the pipeline that we want to update in DagsHub*)

In [None]:
# Define and run the `test` stage:
#   -n  stage name
#   -d  dependencies (test data, the test script, the saved model and args)
#   -m  metrics file written by the stage
#   --force / --no-run-cache  overwrite an existing stage definition and
#       execute the command instead of reusing a previously cached run
# NOTE(review): `dvc run` may be unavailable in newer DVC releases
# (`dvc stage add` + `dvc repro` is the replacement) -- confirm the installed
# DVC version.
!dvc run -n test \
-d ../data/test.csv \
-d test.py \
-d model_artifacts/model_best.pt \
-d model_artifacts/args.pt \
-m test_results/test_metrics.json \
--force \
--no-run-cache \
python test.py

In [None]:
# Review what the stage run changed before committing.
!git status

In [None]:
%%bash
# Commit the test pipeline definition, then push code and DVC-tracked outputs.
git status
# The ignore file lives in test_results/, the directory of the `test` stage's
# metrics file. The previous path `est_results/.gitignore` was a typo that made
# `git add` abort without staging anything.
git add dvc.lock dvc.yaml test_results/.gitignore
# The message names the test pipeline; it used to repeat the training one.
git commit -m "Update test pipeline"
git push
dvc push -r origin

## Run workflow

Modify `params.yaml` file to tune hyperparams and training arguments as needed.

In [None]:
# Reproduce the pipeline stages defined for this workflow.
!dvc repro

In [None]:
# Review what the run changed.
!git status

Save changes

In [None]:
%%bash
# Commit the data pointer file, lock file and parameters, then push to git.
git add ../data.dvc dvc.lock params.yaml
git commit -m "Update pipeline for binary classification"
git push

In [None]:
# Push the DVC-tracked files to the `origin` DVC remote.
!dvc push -r origin

## Deleting pipeline

In [None]:
%%bash
# Remove the `train_k_fold` stage, then commit and push the files staged
# below (.gitignore, dvc.lock, dvc.yaml).
dvc remove train_k_fold
git add .gitignore dvc.lock dvc.yaml
git commit -m "Remove cross-validation pipeline"
git push