# Preprocessing all datasets 
The purpose is making all files have exactly the same columns "index", "text", and "label".  
A dictionary named tasks is also generated, specifying number of labels for each task. If number of labels is one, means it's a regression task

Classification tasks are transformed into 0-indexed labels, while regression tasks are transformed with MinMaxScaler

In [1]:
import os 
import pandas as pd
import json
from sklearn import preprocessing
from pathlib import Path

In [2]:
datasets = [
        'CrowdFlower',  # not used in xslue 
        'DailyDialog',
        'EmoBank',
        #  'GYAFC', # no data provided
        'HateOffensive',
        'PASTEL',
        'SARC',
        'SarcasmGhosh',
        'SentiTreeBank',
        'ShortHumor',
        'ShortJokeKaggle',
        'ShortRomance',
        'StanfordPoliteness',
        'TroFi',
        'VUA',
]

In [3]:
split = ['train', 'test', 'dev']
in_dir = '../data/xslue/raw'
out_dir = '../data/xslue/processed'
Path(out_dir).mkdir(parents=True, exist_ok=True)
tasks = {}

In [4]:
# CrowdFlower
dataset = datasets[0]
for s in split:
    df = pd.read_csv(os.path.join(in_dir, dataset, s+'.tsv'), sep = '\t', header=None, usecols = [0,1,2])
    df.columns = ['index', 'label', 'text']
    
    if s == 'train':
        num_label = len(df['label'].unique())
        if num_label < 20: # classification task
            tasks[dataset] = num_label 
            le = preprocessing.LabelEncoder()
            df['label'] = le.fit_transform(df['label'])
        else: # regression task
            tasks[dataset] = 1 
            scaler = preprocessing.MinMaxScaler()
            df['label'] = scaler.fit_transform(df['label'].values.reshape(-1,1))
    else:
        if num_label < 20: # classification task
            df['label'] = le.transform(df['label'])
        else: # regression task
            df['label'] = scaler.transform(df['label'].values.reshape(-1,1))
    
    df[['index', 'text', 'label']].to_csv(os.path.join(out_dir, s, dataset+'.tsv'), sep = '\t', index=False)


In [5]:
# DailyDialog
dataset = datasets[1]
for s in split:
    df = pd.read_csv(os.path.join(in_dir, dataset, s+'.tsv'), sep = '\t', header=None)
    df = df.reset_index()
    df.columns = ['index', 'text', 'label']
    
    if s == 'train':
        num_label = len(df['label'].unique())
        if num_label < 20: # classification task
            tasks[dataset] = num_label 
            le = preprocessing.LabelEncoder()
            df['label'] = le.fit_transform(df['label'])
        else: # regression task
            tasks[dataset] = 1 
            scaler = preprocessing.MinMaxScaler()
            df['label'] = scaler.fit_transform(df['label'].values.reshape(-1,1))
    else:
        if num_label < 20: # classification task
            df['label'] = le.transform(df['label'])
        else: # regression task
            df['label'] = scaler.transform(df['label'].values.reshape(-1,1))
            
    df.to_csv(os.path.join(out_dir, s, dataset+'.tsv'), sep = '\t', index=False)


In [6]:
# EmoBank
dataset = datasets[2]
attributes = ['Valence', 'Arousal', 'Dominance']
for attr in attributes:
    for s in split:
        df = pd.read_csv(os.path.join(in_dir, dataset, s+'.tsv'), sep = '\t', header=None)
        df.columns = ['index', 'Valence', 'Arousal', 'Dominance', 'text']
        df_temp = df[['index', 'text']]
        df_temp['label'] = df[attr]
        
        if s == 'train':
            num_label = len(df_temp['label'].unique())
            if num_label < 20: # classification task
                tasks[dataset+'_'+attr] = num_label 
                le = preprocessing.LabelEncoder()
                df_temp['label'] = le.fit_transform(df_temp['label'])
            else: # regression task
                tasks[dataset+'_'+attr] = 1 
                scaler = preprocessing.MinMaxScaler()
                df_temp['label'] = scaler.fit_transform(df_temp['label'].values.reshape(-1,1))
        else:
            if num_label < 20: # classification task
                df_temp['label'] = le.transform(df_temp['label'])
            else: # regression task
                df_temp['label'] = scaler.transform(df_temp['label'].values.reshape(-1,1))
        
        df_temp.to_csv(os.path.join(out_dir, s, dataset+'_'+attr+'.tsv'), sep = '\t', index=False)
        

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_temp['label'] = df[attr]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_temp['label'] = scaler.fit_transform(df_temp['label'].values.reshape(-1,1))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_temp['label'] = df[attr]
A value is trying to be set on a copy of a slice from a DataFrame.
Try

In [7]:
# HateOffensive
dataset = datasets[3]
for s in split:
    df = pd.read_csv(os.path.join(in_dir, dataset, s+'.tsv'), sep = '\t', header=None)
    df.columns = ['index', 'label', 'text']
    
    if s == 'train':
        num_label = len(df['label'].unique())
        if num_label < 20: # classification task
            tasks[dataset] = num_label 
            le = preprocessing.LabelEncoder()
            df['label'] = le.fit_transform(df['label'])
        else: # regression task
            tasks[dataset] = 1 
            scaler = preprocessing.MinMaxScaler()
            df['label'] = scaler.fit_transform(df['label'].values.reshape(-1,1))
    else:
        if num_label < 20: # classification task
            df['label'] = le.transform(df['label'])
        else: # regression task
            df['label'] = scaler.transform(df['label'].values.reshape(-1,1))
    
    df[['index', 'text', 'label']].to_csv(os.path.join(out_dir, s, dataset+'.tsv'), sep = '\t', index=False)
   

In [8]:
# PASTEL
dataset = datasets[4]
attributes = ['age', 'country', 'education', 'ethnic', 'gender', 'politics', 'tod']
for attr in attributes:
    for s in split:
        df = pd.read_csv(os.path.join(in_dir, dataset, s+'_'+attr+'.tsv'), sep = '\t', header=None, usecols=[0,1,2])
        df.columns = ['index', 'label', 'text']
        
        if s == 'train':
            num_label = len(df['label'].unique())
            if num_label < 20: # classification task
                tasks[dataset+'_'+attr] = num_label 
                le = preprocessing.LabelEncoder()
                df['label'] = le.fit_transform(df['label'])
            else: # regression task
                tasks[dataset+'_'+attr] = 1 
                scaler = preprocessing.MinMaxScaler()
                df['label'] = scaler.fit_transform(df['label'].values.reshape(-1,1))
        else:
            if num_label < 20: # classification task
                df['label'] = le.transform(df['label'])
            else: # regression task
                df['label'] = scaler.transform(df['label'].values.reshape(-1,1))
        
        df[['index', 'text', 'label']].to_csv(os.path.join(out_dir, s, dataset+'_'+attr+'.tsv'), sep = '\t', index=False)
       

In [9]:
# SARC
dataset = datasets[5]
for s in split:
    df = pd.read_csv(os.path.join(in_dir, dataset, s+'.tsv'), sep = '\t', header=None)
    df.columns = ['index', 'label', 'text']
    
    if s == 'train':
        num_label = len(df['label'].unique())
        if num_label < 20: # classification task
            tasks[dataset] = num_label 
            le = preprocessing.LabelEncoder()
            df['label'] = le.fit_transform(df['label'])
        else: # regression task
            tasks[dataset] = 1 
            scaler = preprocessing.MinMaxScaler()
            df['label'] = scaler.fit_transform(df['label'].values.reshape(-1,1))
    else:
        if num_label < 20: # classification task
            df['label'] = le.transform(df['label'])
        else: # regression task
            df['label'] = scaler.transform(df['label'].values.reshape(-1,1))
    
    df[['index', 'text', 'label']].to_csv(os.path.join(out_dir, s, dataset+'.tsv'), sep = '\t', index=False)


In [10]:
# SarcasmGhosh
dataset = datasets[6]
for s in split:
    df = pd.read_csv(os.path.join(in_dir, dataset, s+'.tsv'), sep = '\t', header=None)
    df.columns = ['index', 'label', 'text']
    
    if s == 'train':
        num_label = len(df['label'].unique())
        if num_label < 20: # classification task
            tasks[dataset] = num_label 
            le = preprocessing.LabelEncoder()
            df['label'] = le.fit_transform(df['label'])
        else: # regression task
            tasks[dataset] = 1 
            scaler = preprocessing.MinMaxScaler()
            df['label'] = scaler.fit_transform(df['label'].values.reshape(-1,1))
    else:
        if num_label < 20: # classification task
            df['label'] = le.transform(df['label'])
        else: # regression task
            df['label'] = scaler.transform(df['label'].values.reshape(-1,1))
    
    df[['index', 'text', 'label']].to_csv(os.path.join(out_dir, s, dataset+'.tsv'), sep = '\t', index=False)
   

In [11]:
# SentiTreeBank
# I only choose to use the sentiment col. The fine and coarse columns are infered from the sentiment col. 
# In xslue, sentiment col is transformed into a binary classification
dataset = datasets[7]
for s in split:
    df = pd.read_csv(os.path.join(in_dir, dataset, s+'.tsv'), sep = '\t')
    df = df[['id', 'phrase', 'sentiment']].rename(columns={'id':'index', 'phrase':'text', 'sentiment':'label'})
    
    if s == 'train':
        num_label = len(df['label'].unique())
        if num_label < 20: # classification task
            tasks[dataset] = num_label 
            le = preprocessing.LabelEncoder()
            df['label'] = le.fit_transform(df['label'])
        else: # regression task
            tasks[dataset] = 1 
            scaler = preprocessing.MinMaxScaler()
            df['label'] = scaler.fit_transform(df['label'].values.reshape(-1,1))
    else:
        if num_label < 20: # classification task
            df['label'] = le.transform(df['label'])
        else: # regression task
            df['label'] = scaler.transform(df['label'].values.reshape(-1,1))
                
    df.to_csv(os.path.join(out_dir, s, dataset+'.tsv'), sep = '\t', index=False)


In [12]:
# ShortHumor
# The source (I named it) may be used as labels too. The xslue paper only use the binary label.
dataset = datasets[8]
for s in split:
    if s == 'dev':
        df = pd.read_csv(os.path.join(in_dir, dataset, 'dev.tsv'), error_bad_lines=False, engine="python", sep = '\t', header=None)
    else:    
        df = pd.read_csv(os.path.join(in_dir, dataset, s+'.tsv'), sep = '\t', header=None)
    df.columns = ['index', 'source', 'label', 'text']
    
    if s == 'train':
        num_label = len(df['label'].unique())
        if num_label < 20: # classification task
            tasks[dataset] = num_label 
            le = preprocessing.LabelEncoder()
            df['label'] = le.fit_transform(df['label'])
        else: # regression task
            tasks[dataset] = 1 
            scaler = preprocessing.MinMaxScaler()
            df['label'] = scaler.fit_transform(df['label'].values.reshape(-1,1))
    else:
        if num_label < 20: # classification task
            df['label'] = le.transform(df['label'])
        else: # regression task
            df['label'] = scaler.transform(df['label'].values.reshape(-1,1))
    
    df[['index', 'text', 'label']].to_csv(os.path.join(out_dir, s, dataset+'.tsv'), sep = '\t', index=False)




  exec(code_obj, self.user_global_ns, self.user_ns)
Skipping line 185: '	' expected after '"'
Skipping line 1118: '	' expected after '"'
Skipping line 1223: '	' expected after '"'
Skipping line 1854: '	' expected after '"'
Skipping line 1866: '	' expected after '"'
Skipping line 1989: unexpected end of data


In [13]:
# ShortJokeKaggle
# The source (I named it) may be used as labels too. The xslue paper only use the binary label.
dataset = datasets[9]
for s in split:
    df = pd.read_csv(os.path.join(in_dir, dataset, s+'.tsv'), sep = '\t', header=None)
    df.columns = ['index', 'source', 'label', 'text']
    
    if s == 'train':
        num_label = len(df['label'].unique())
        if num_label < 20: # classification task
            tasks[dataset] = num_label 
            le = preprocessing.LabelEncoder()
            df['label'] = le.fit_transform(df['label'])
        else: # regression task
            tasks[dataset] = 1 
            scaler = preprocessing.MinMaxScaler()
            df['label'] = scaler.fit_transform(df['label'].values.reshape(-1,1))
    else:
        if num_label < 20: # classification task
            df['label'] = le.transform(df['label'])
        else: # regression task
            df['label'] = scaler.transform(df['label'].values.reshape(-1,1))
            
    df[['index', 'text', 'label']].to_csv(os.path.join(out_dir, s, dataset+'.tsv'), sep = '\t', index=False)
    

In [14]:
# ShortRomance
# The source (I named it) may be used as labels too. The xslue paper only use the binary label.
dataset = datasets[10]
for s in split:
    df = pd.read_csv(os.path.join(in_dir, dataset, s+'.tsv'), sep = '\t', header=None)
    df.columns = ['index', 'source', 'label', 'text']
    
    if s == 'train':
        num_label = len(df['label'].unique())
        if num_label < 20: # classification task
            tasks[dataset] = num_label 
            le = preprocessing.LabelEncoder()
            df['label'] = le.fit_transform(df['label'])
        else: # regression task
            tasks[dataset] = 1 
            scaler = preprocessing.MinMaxScaler()
            df['label'] = scaler.fit_transform(df['label'].values.reshape(-1,1))
    else:
        if num_label < 20: # classification task
            df['label'] = le.transform(df['label'])
        else: # regression task
            df['label'] = scaler.transform(df['label'].values.reshape(-1,1))
            
    df[['index', 'text', 'label']].to_csv(os.path.join(out_dir, s, dataset+'.tsv'), sep = '\t', index=False)


In [15]:
# StanfordPoliteness
# I use the float score. In the xslue paper, the scores are transformed into a binary category variable
dataset = datasets[11]
for s in split:
    df = pd.read_csv(os.path.join(in_dir, dataset, s+'.tsv'), sep = '\t', header=None)
    df.columns = ['source', 'index', 'text', 'label']
    
    if s == 'train':
        num_label = len(df['label'].unique())
        if num_label < 20: # classification task
            tasks[dataset] = num_label 
            le = preprocessing.LabelEncoder()
            df['label'] = le.fit_transform(df['label'])
        else: # regression task
            tasks[dataset] = 1 
            scaler = preprocessing.MinMaxScaler()
            df['label'] = scaler.fit_transform(df['label'].values.reshape(-1,1))
    else:
        if num_label < 20: # classification task
            df['label'] = le.transform(df['label'])
        else: # regression task
            df['label'] = scaler.transform(df['label'].values.reshape(-1,1))
            
    df[['index', 'text', 'label']].to_csv(os.path.join(out_dir, s, dataset+'.tsv'), sep = '\t', index=False)


In [16]:
# TroFi
dataset = datasets[12]
for s in split:
    df = pd.read_csv(os.path.join(in_dir, dataset, s+'.tsv'), sep = '\t', header=None)
    df.columns = ['index', 'text', 'label']
    
    if s == 'train':
        num_label = len(df['label'].unique())
        if num_label < 20: # classification task
            tasks[dataset] = num_label 
            le = preprocessing.LabelEncoder()
            df['label'] = le.fit_transform(df['label'])
        else: # regression task
            tasks[dataset] = 1 
            scaler = preprocessing.MinMaxScaler()
            df['label'] = scaler.fit_transform(df['label'].values.reshape(-1,1))
    else:
        if num_label < 20: # classification task
            df['label'] = le.transform(df['label'])
        else: # regression task
            df['label'] = scaler.transform(df['label'].values.reshape(-1,1))
    
    df.to_csv(os.path.join(out_dir, s, dataset+'.tsv'), sep = '\t', index=False)


In [17]:
# VUA
dataset = datasets[13]
for s in split:
    df = pd.read_csv(os.path.join(in_dir, dataset, s+'.tsv'), sep = '\t', header=None)
    df.columns = ['index', 'text', 'label']
    
    if s == 'train':
        num_label = len(df['label'].unique())
        if num_label < 20: # classification task
            tasks[dataset] = num_label 
            le = preprocessing.LabelEncoder()
            df['label'] = le.fit_transform(df['label'])
        else: # regression task
            tasks[dataset] = 1 
            scaler = preprocessing.MinMaxScaler()
            df['label'] = scaler.fit_transform(df['label'].values.reshape(-1,1))
    else:
        if num_label < 20: # classification task
            df['label'] = le.transform(df['label'])
        else: # regression task
            df['label'] = scaler.transform(df['label'].values.reshape(-1,1))
    
    df.to_csv(os.path.join(out_dir, s, dataset+'.tsv'), sep = '\t', index=False)


In [18]:
tasks

{'CrowdFlower': 13,
 'DailyDialog': 7,
 'EmoBank_Valence': 1,
 'EmoBank_Arousal': 1,
 'EmoBank_Dominance': 1,
 'HateOffensive': 3,
 'PASTEL_age': 8,
 'PASTEL_country': 2,
 'PASTEL_education': 10,
 'PASTEL_ethnic': 10,
 'PASTEL_gender': 3,
 'PASTEL_politics': 3,
 'PASTEL_tod': 5,
 'SARC': 2,
 'SarcasmGhosh': 2,
 'SentiTreeBank': 1,
 'ShortHumor': 2,
 'ShortJokeKaggle': 2,
 'ShortRomance': 2,
 'StanfordPoliteness': 1,
 'TroFi': 2,
 'VUA': 2}

In [19]:
with open(os.path.join('../data/xslue', 'tasks.json'), 'w') as fp:
    json.dump(tasks, fp)

In [174]:
# select dataset one by one
dataset = datasets[13]
print(dataset)
df = pd.read_csv(os.path.join(in_dir, dataset, 'train.tsv'), sep = '\t', header=None)
# df = pd.read_csv(os.path.join(in_dir, dataset, 'test_age.tsv'), sep = '\t', header=None)
df.columns = ['index', 'text', 'label']
df

VUA


Unnamed: 0,index,text,label
0,a1k-fragment02,Ca n't fail to be entertaining . fail,0
1,cdb-fragment04,How much was he going to tell her ? go,0
2,ac2-fragment06,"Up until that news hit the Committee , Don had...",0
3,kbc-fragment13,Could go on to the rugby and go with them coul...,0
4,ahb-fragment51,"Finally , we went to the office and they gave ...",0
...,...,...,...
15152,a1n-fragment09,EACH new indignity in the heap visited on Wels...,1
15153,fef-fragment03,Substituting the above equation into eqn ( 3.1...,1
15154,amm-fragment02,"A fern-like plant , beautifully preserved in a...",0
15155,ahf-fragment24,And there were never more than a few dozen rin...,0
