# Build datasets workflow

## Install necessary libraries for loading repo


In [None]:
%%capture
# The %%capture cell magic above suppresses this cell's output.
# Install the dvc and fastds packages into the notebook environment.
!pip install dvc fastds
import os
from getpass import getpass
import urllib

## Set all credentials and download all necessary files/data for training

### Set up local repo and branch

In [None]:
# Collect the DagsHub credentials interactively.
# `urllib.parse` is imported explicitly here: a bare `import urllib` does not
# guarantee that the `parse` submodule is loaded, in which case
# `urllib.parse.quote` raises AttributeError.
import urllib.parse

# getpass() reads the token without echoing it.
your_token = getpass('dagshub access token: ')
# Percent-encode the token so it can be embedded in the clone URL.
your_token = urllib.parse.quote(your_token)

your_username = input('dagshub username: ')
your_email = input('email address: ')

In [None]:
# Clone repo with personal token (Settings -> Tokens -> Default Access Token)
cmd_string = 'git clone https://{0}@dagshub.com/Omdena/NYU.git'.format(your_token)
os.system(cmd_string)
%cd NYU

# Switch to branch you want to work with and sync with remote branch (if necessary)
!git fetch origin
#!git checkout -b cross-validation origin/cross-validation

# Change directory to training workflow
%cd tasks/task-4-language-transformer-models/workflow

### Set up DVC and git

In [None]:
# Configure basic auth for the DVC remote named "origin". The --local flag
# stores these settings in DVC's local (untracked) config.
# NOTE(review): your_token was percent-encoded with urllib.parse.quote for the
# clone URL; if the raw token contains characters that quote() escapes, the
# password stored here differs from the real token - confirm.
!dvc remote modify --local origin auth basic
!dvc remote modify --local origin user '{your_username}'
!dvc remote modify --local origin password '{your_token}'

In [None]:
# Set the global git author identity from the values entered above; it is used
# by the `git commit` issued later in this notebook.
!git config --global user.email '{your_email}'
!git config --global user.name '{your_username}'

### Pull training data

In [None]:
# Download the DVC-tracked data from the remote named "origin".
!dvc pull -r origin

## Data processing pipeline

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

DreamSpace increments for positive examples

In [None]:
# Load every DreamSpace "positives" increment CSV and merge them into one frame.
path_incremental_data = '/content/NYU/DS-data/100-increments'
path_incremental_positives = os.path.join(path_incremental_data, 'positives')

# Sort the listing: os.listdir() returns entries in arbitrary order, and the
# row order of the concatenated frame decides which rows the seeded
# train_test_split later in this notebook puts into the test set.
# Match on '.csv' so only files with the CSV extension are picked up.
file_names = sorted(
    file for file in os.listdir(path_incremental_positives) if file.endswith('.csv')
)
print(*file_names, sep='\n')

df_data = []

for filename in file_names:
    df = pd.read_csv(os.path.join(path_incremental_positives, filename), index_col=None, header=0, sep=",")
    # Column names are not consistent across the increment files, so they are
    # overwritten; this requires every file to have exactly two columns.
    df.columns = ['text', 'label']
    df_data.append(df)

data_positive = pd.concat(df_data, axis=0, ignore_index=True)
# Drop rows with any missing value.
data_positive.dropna(axis=0, how="any", inplace=True)
# Every row from the positives folder is labelled as hate speech, whatever
# value the file itself carried in its second column.
data_positive['label'] = 'Hate-Speech'

print(f"Dreamspace annotated positives: {data_positive.shape[0]}")

DreamSpace increments for negative examples

In [None]:
# Load every DreamSpace "negatives" increment CSV and merge them into one frame.
path_incremental_negatives = os.path.join(path_incremental_data, 'negatives')

# Sort the listing: os.listdir() returns entries in arbitrary order, which
# would make the row order of the merged frame differ between runs.
# Match on '.csv' so only files with the CSV extension are picked up.
file_names = sorted(
    file for file in os.listdir(path_incremental_negatives) if file.endswith('.csv')
)
print(*file_names, sep='\n')

df_data = []

for filename in file_names:
    df = pd.read_csv(os.path.join(path_incremental_negatives, filename), index_col=None, header=0, sep=",")
    # Column names are not consistent across the increment files, so they are
    # overwritten; this requires every file to have exactly two columns.
    df.columns = ['text', 'label']
    df_data.append(df)

data_negative = pd.concat(df_data, axis=0, ignore_index=True)
# Drop rows with any missing value.
data_negative.dropna(axis=0, how="any", inplace=True)
# Every row from the negatives folder is labelled as non-hate speech, whatever
# value the file itself carried in its second column.
data_negative['label'] = 'Non-Hate-Speech'

print(f"Dreamspace annotated negatives: {data_negative.shape[0]}")

Validation CSV file

In [None]:
# Load the provided validation set and align its schema with the other datasets.
data_validation = pd.read_csv("/content/NYU/tasks/task-4-language-transformer-models/data/validation-set.csv", index_col=None, header=0)
data_validation.rename(columns = {'sample':'text'}, inplace = True)
# Any value other than the exact string 'positive' becomes 'Non-Hate-Speech'.
data_validation['label'] = data_validation['label'].map(lambda x: 'Hate-Speech' if x == 'positive' else 'Non-Hate-Speech')

# Look the statistics up by label. value_counts() orders its result by
# frequency, so the integer positions [0]/[1] only mean negative/positive while
# negatives are the majority class, and integer keys on a string-labelled
# Series are deprecated in pandas. .get(..., 0) reports 0 for an absent class.
validation_counts = data_validation['label'].value_counts()
validation_shares = data_validation['label'].value_counts(normalize=True)

print(f"Dreamspace valdation positives: {validation_counts.get('Hate-Speech', 0)} ({validation_shares.get('Hate-Speech', 0)*100 :.2f}%)")
print(f"Dreamspace valdation negatives: {validation_counts.get('Non-Hate-Speech', 0)} ({validation_shares.get('Non-Hate-Speech', 0)*100 :.2f}%)")

Doccano data

In [None]:
# Load the Doccano annotations; the first CSV column is used as the index.
doccano_annotated_file = '/content/NYU/tasks/task-2-data-annotation/data/doccano_annotated.csv'
data_docano = pd.read_csv(doccano_annotated_file, index_col=0, header=0)
# Any value other than the exact string 'Positive' becomes 'Non-Hate-Speech'.
data_docano['label'] = data_docano['label'].map(lambda x: 'Hate-Speech' if x == 'Positive' else 'Non-Hate-Speech')

# Look the statistics up by label. value_counts() orders its result by
# frequency, so the integer positions [0]/[1] only mean negative/positive while
# negatives are the majority class, and integer keys on a string-labelled
# Series are deprecated in pandas. .get(..., 0) reports 0 for an absent class.
docano_counts = data_docano['label'].value_counts()
docano_shares = data_docano['label'].value_counts(normalize=True)

print(f"Task 2 annotated positives: {docano_counts.get('Hate-Speech', 0)} ({docano_shares.get('Hate-Speech', 0)*100 :.2f}%)")
print(f"Task 2 annotated negatives: {docano_counts.get('Non-Hate-Speech', 0)} ({docano_shares.get('Non-Hate-Speech', 0)*100 :.2f}%)")

Homophobia dataset

In [None]:
# Load the homophobia hate-speech dataset; the first CSV column is used as the index.
homophobia_csv = "/content/NYU/tasks/task-4-language-transformer-models/data/hate-speech-homophobia/hate-speech-homophobia.csv"
data_homophobia = pd.read_csv(homophobia_csv, index_col=0, header=0)

# Absolute and relative label frequencies, computed once and reused below.
# NOTE(review): the label values of this file are not visible here. If they are
# the integers 0/1 the keys below select by label; if they are strings the keys
# act as positions in the frequency-sorted result - confirm which applies.
homophobia_counts = data_homophobia.label.value_counts()
homophobia_shares = data_homophobia.label.value_counts(normalize=True)

print(f"homophobia dataset positives: {homophobia_counts[1]} ({homophobia_shares[1]*100 :.2f}%)")
print(f"homophobia dataset negatives: {homophobia_counts[0]} ({homophobia_shares[0]*100 :.2f}%)")

For the test set, we combine the provided validation set, all DreamSpace negative examples, and a small held-out sample (5–10%) of each of the remaining datasets, because the different datasets focus on different topics.

In [None]:
# Hold out a small test portion of each dataset. One shared seed is used for
# all three calls.
SPLIT_SEED = 1

# 5% of the homophobia data, stratified on the label column.
train_homophobia, test_homophobia = train_test_split(
    data_homophobia,
    test_size=0.05,
    random_state=SPLIT_SEED,
    stratify=data_homophobia['label'],
)

# 5% of the Doccano data, stratified on the label column.
train_docano, test_docano = train_test_split(
    data_docano,
    test_size=0.05,
    random_state=SPLIT_SEED,
    stratify=data_docano['label'],
)

# 10% of the DreamSpace positives; no stratification is requested here.
train_positive, test_positive = train_test_split(
    data_positive,
    test_size=0.1,
    random_state=SPLIT_SEED,
)

In [None]:
# Assemble the test set: the held-out splits, every DreamSpace negative and
# the whole validation set.
all_test = pd.concat([test_homophobia, test_docano, test_positive, data_negative, data_validation], axis=0, ignore_index=True)

# Report the statistics by label. value_counts() orders its result by
# frequency, so integer positions do not reliably identify a class, and integer
# keys on a string-labelled Series are deprecated in pandas. The absolute
# counts are read directly instead of being rebuilt as rows * fraction, which
# printed a float.
# NOTE(review): assumes the homophobia dataset uses the same
# 'Hate-Speech' / 'Non-Hate-Speech' label strings as the other frames - confirm.
test_totals = all_test['label'].value_counts()
test_counts = all_test['label'].value_counts(normalize=True)
print(f"Test positives: {test_totals.get('Hate-Speech', 0)} ({test_counts.get('Hate-Speech', 0)*100 :.2f})")
print(f"Test negatives: {test_totals.get('Non-Hate-Speech', 0)} ({test_counts.get('Non-Hate-Speech', 0)*100 :.2f})")

In [None]:
# Assemble the training set from the training portions of the three split datasets.
all_train = pd.concat([train_homophobia, train_docano, train_positive], axis=0, ignore_index=True)

# Report the statistics by label. value_counts() orders its result by
# frequency, so integer positions do not reliably identify a class, and integer
# keys on a string-labelled Series are deprecated in pandas. The absolute
# counts are read directly instead of being rebuilt as rows * fraction, which
# printed a float.
# NOTE(review): assumes the homophobia dataset uses the same
# 'Hate-Speech' / 'Non-Hate-Speech' label strings as the other frames - confirm.
train_totals = all_train['label'].value_counts()
train_counts = all_train['label'].value_counts(normalize=True)
print(f"Train positives: {train_totals.get('Hate-Speech', 0)} ({train_counts.get('Hate-Speech', 0)*100 :.2f})")
print(f"Train negatives: {train_totals.get('Non-Hate-Speech', 0)} ({train_counts.get('Non-Hate-Speech', 0)*100 :.2f})")

In [None]:
# Write both splits into the task's data directory. to_csv() is called with
# its defaults, so the row index is written as the first column.
for split_frame, split_path in (
    (all_test, "/content/NYU/tasks/task-4-language-transformer-models/data/test.csv"),
    (all_train, "/content/NYU/tasks/task-4-language-transformer-models/data/train.csv"),
):
    split_frame.to_csv(split_path)

In [None]:
# Show which DVC-tracked data changed after writing the new CSV files.
!dvc status

In [None]:
# Track the updated data directory (one level above the workflow directory) with DVC.
!dvc add ../data

In [None]:
# Inspect the git working tree before staging anything.
!git status

In [None]:
# Stage the DVC pointer file for the data directory.
!git add ../data.dvc

In [None]:
# Commit the staged changes and push the commit to the git remote.
!git commit -m "Update datasets"
!git push

In [None]:
# Upload the DVC-tracked data to the DVC remote named "origin".
!dvc push -r origin