<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Preprocessing-all-datasets" data-toc-modified-id="Preprocessing-all-datasets-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Preprocessing all datasets</a></span></li><li><span><a href="#PASTEL-sentences" data-toc-modified-id="PASTEL-sentences-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>PASTEL sentences</a></span></li><li><span><a href="#PASTEL-sentences-mask-label" data-toc-modified-id="PASTEL-sentences-mask-label-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>PASTEL sentences mask label</a></span></li></ul></div>

# Preprocessing all datasets 
The goal is to give every file exactly the same columns: "index", "text", and "label".  
A dictionary named `tasks` is also generated, specifying the number of labels for each task. If the number of labels is one, the task is a regression task.

Labels of classification tasks are encoded as 0-indexed integers, while targets of regression tasks are rescaled with MinMaxScaler.

In [None]:
import os 
import pandas as pd
import numpy as np
import json
from sklearn import preprocessing
from pathlib import Path
from tqdm.auto import tqdm, trange
import matplotlib.pyplot as plt

In [None]:
# Root directory (a path relative to the current working directory) under which
# the cells below read the raw PASTEL files and write the processed outputs.
data_folder = '../data'

# PASTEL sentences

In [None]:
# Convert the per-example PASTEL JSON files of each split into two CSVs:
#   * {data_folder}/pastel/raw/raw_{split}_sentences.csv  - selected columns, original label values
#   * {data_folder}/pastel/processed/{split}/pastel.csv   - same rows, labels integer-encoded
# The label encoders are fitted on the train split only and reused for test/valid.
pastel_tasks = {}  # task (column) name -> number of distinct labels in the train split
les = {}           # task (column) name -> LabelEncoder fitted on the train split
split = ['train', 'test', 'valid']  # 'train' must stay first: the other splits reuse its encoders
cols = ['output.sentences', 'country', 'politics', 'tod', 'age', 'education', 'ethnic', 'gender']
for s in split:
    input_data_path = f'{data_folder}/pastel/raw/v2/sentences/{s}'
    output_data_path = f'{data_folder}/pastel/processed/{s}'
    Path(output_data_path).mkdir(parents=True, exist_ok=True)

    # Collect all records first and build the frame once. DataFrame.append was
    # removed in pandas 2.0 and copied the whole frame on every call.
    # Sorting the file names makes the row order reproducible across machines.
    records = []
    for file in sorted(os.listdir(input_data_path)):
        with open(f'{input_data_path}/{file}', encoding='utf-8') as f:
            record = json.load(f)

        # Flatten the nested 'persona' dict into top-level keys.
        record.update(record['persona'])
        del record['persona']
        records.append(record)

    # `columns=cols` keeps only the needed fields and still yields the right
    # header when the split directory is empty.
    df = pd.DataFrame(records, columns=cols)
    # Written once per split (previously rewritten after every single input file).
    df.to_csv(f'{data_folder}/pastel/raw/raw_{s}_sentences.csv', index=False)

    label_cols = cols[1:]  # every column except the sentence text is a classification task
    if s == 'train':
        for col in label_cols:
            pastel_tasks[col] = len(df[col].unique())
            le = preprocessing.LabelEncoder()
            df[col] = le.fit_transform(df[col])
            les[col] = le
    else:
        for col in label_cols:
            # NOTE(review): LabelEncoder.transform is expected to raise on a label
            # value that never occurred in the train split - confirm this is desired.
            df[col] = les[col].transform(df[col])

    df.to_csv(os.path.join(output_data_path, 'pastel.csv'), index=False)


In [None]:
# Persist the task -> number-of-labels mapping next to the PASTEL data as JSON.
tasks_json_path = os.path.join(f'{data_folder}/pastel', 'pastel_tasks2labels.json')
with open(tasks_json_path, 'w') as out_file:
    out_file.write(json.dumps(pastel_tasks))

# PASTEL sentences mask label

In [None]:
# Split the processed train set, per task, into a "masked" and an "unmasked" part.
# Every label cell is masked independently, so a row can be masked for one task
# and unmasked for another.
p = 0.8  # probability to mask the label as -1

label_cols = ['country','politics','tod','age','education','ethnic','gender']

df = pd.read_csv(f'{data_folder}/pastel/processed/train/pastel.csv')
df_copy = df.copy()  # keeps the true labels; the exported CSVs are sliced from this copy

masked_output_path = f'{data_folder}/pastel/processed/p={p}_masked_train'
unmasked_output_path = f'{data_folder}/pastel/processed/p={p}_unmasked_train'
for out_dir in (masked_output_path, unmasked_output_path):
    Path(out_dir).mkdir(parents=True, exist_ok=True)

# One uniform draw per (row, task) cell; cells whose draw falls below p are masked.
mask = np.random.rand(len(df), len(pastel_tasks)) < p
labels = df[label_cols].values
labels[mask] = -1
df[label_cols] = labels

# For each task, save a masked csv and an unmasked one. The unmasked one is used
# to train a classifier; that classifier is then used to generate labels for the
# masked one.
for task in pastel_tasks:
    text_and_label = df_copy[['output.sentences', task]]
    is_masked = df[task] == -1
    text_and_label[is_masked].to_csv(f'{masked_output_path}/masked_{task}.csv', index=False)
    text_and_label[~is_masked].to_csv(f'{unmasked_output_path}/unmasked_{task}.csv', index=False)
    