# Creating a custom dataset

In [39]:
import torch
from torchtext import data

# use_vocab=False together with preprocessing=float: values are converted with
# float() and no vocabulary is built for these fields.
A = data.LabelField(dtype=torch.float, use_vocab=False, preprocessing=float)
B = data.LabelField(dtype=torch.float, use_vocab=False, preprocessing=float)

# Raw rows are kept under their own name so that `examples` only ever refers
# to the list of torchtext Example objects.
raw_examples = [
    {"a": 1, "b": 2},
    {"a": 2, "b": 3},
]

# Example.fromdict takes {input key: (attribute name, Field)}.
fields = {"a": ('a', A), "b": ('b', B)}
examples = [data.Example.fromdict(row, fields=fields) for row in raw_examples]

# Dataset, unlike Example.fromdict, takes (attribute name, Field) pairs.
# Passing the dict itself would leave tuples (not Fields) in dataset.fields.
dataset = data.Dataset(examples, fields=list(fields.values()))

### Create a Dataset from multiple CSV/TSV files

First, create the fields and the tokenizer.

In [110]:
from transformers import BertTokenizer
from offenseval.nn import Tokenizer

bert_tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

# Ids of the tokenizer's special tokens; they are handed to TEXT below so the
# field works directly with ids instead of building its own vocabulary.
init_token_idx, eos_token_idx, pad_token_idx, unk_token_idx = (
    bert_tokenizer.cls_token_id,
    bert_tokenizer.sep_token_id,
    bert_tokenizer.pad_token_id,
    bert_tokenizer.unk_token_id,
)

# NOTE(review): a leftover note at this spot mentioned cutting something down
# to check memory usage; nothing in this cell shows what it referred to.

tokenizer = Tokenizer(bert_tokenizer)

ID = data.Field(use_vocab=False, sequential=False)

# AVG and STD are really floats, hence the extra arguments.
# See https://github.com/pytorch/text/issues/78#issuecomment-541203609
_float_label_options = dict(dtype=torch.float, use_vocab=False, preprocessing=float)
AVG = data.LabelField(**_float_label_options)
STD = data.LabelField(**_float_label_options)
SUBTASK_A = data.LabelField()

# Text is tokenized and mapped to ids by `tokenizer`, so no vocab is used here.
TEXT = data.Field(
    use_vocab=False,
    batch_first=True,
    include_lengths=True,
    tokenize=tokenizer.tokenize,
    preprocessing=tokenizer.convert_tokens_to_ids,
    init_token=init_token_idx,
    eos_token=eos_token_idx,
    pad_token=pad_token_idx,
    unk_token=unk_token_idx,
)

Let's load the distant dataset

In [112]:
import pandas as pd

# Read the TSV and add a boolean `subtask_a` column that is True where the
# `average` column exceeds 0.5.
df = pd.read_table("../../data/English/task_a_distant.xsmall.tsv").assign(
    subtask_a=lambda frame: frame["average"] > 0.5
)

df

Unnamed: 0,id,text,average,std,subtask_a
0,1158636827068239873,I’m so happy everything is happening the way i...,0.145000,0.201947,False
1,1186293241857093632,@USER It started? Did you just say Evodia isn...,0.190505,0.187738,False
2,1162448233165017089,interstellar was amazing but it blew my mind t...,0.523429,0.087635,True
3,1186861360216465408,@USER When i come back home and i see that you...,0.348687,0.173647,False
4,1157489832106401794,"An Ass Is But An Ass, Though Laden With Gold",0.845292,0.186436,True
...,...,...,...,...,...
903,1160428332137242624,If you wher born in 1972 or 1983 you most like...,0.454989,0.084214,False
904,1157923899801313280,"if a woman asks you a question, it’s better to...",0.270029,0.196148,False
905,1186016484943958016,@USER He’s a cracker 🐶,0.407586,0.132179,False
906,1187946296390410240,@USER The playlists should have more than 2 so...,0.171523,0.178548,False


In [113]:
# Full text of the second row (the table view truncates it).
df["text"].iloc[1]

'@USER It started?  Did you just say Evodia isnt part of the cast? yoh, then i have no reason to watch😭'

In [118]:
# Map each input column to the (attribute name, Field) pair that handles it;
# the attribute simply keeps the column's own name.
fields = {
    column: (column, field)
    for column, field in (("id", ID), ("text", TEXT), ("subtask_a", SUBTASK_A))
}

# Convert a single row (position 150) into an Example.
ex = df.iloc[150]
example = data.Example.fromdict(ex.to_dict(), fields=fields)


In [119]:
# One Example per dataframe row.
examples = [data.Example.fromdict(row.to_dict(), fields=fields) for _, row in df.iterrows()]

# Dataset takes (attribute name, Field) pairs, whereas `fields` maps
# column -> pair for Example.fromdict; pass the pairs so dataset.fields
# holds Field objects rather than tuples.
dataset = data.Dataset(examples, fields=list(fields.values()))

Putting it all together in one function:

In [126]:
def build_examples(path, fields, threshold=0.5):
    """Read a tab-separated file and turn each row into a torchtext ``Example``.

    Column handling before the rows are converted:

    * if an ``average`` column is present, ``subtask_a`` is (re)written with
      ``"OFF"`` where ``average > threshold`` and ``"NOT"`` everywhere else
      (rows with a missing average therefore also become ``"NOT"``);
    * if a ``tweet`` column is present, it is copied to ``text``.

    Args:
        path: location of the file, passed to ``pandas.read_table``.
        fields: mapping handed unchanged to ``data.Example.fromdict``.
        threshold: cut-off on ``average`` above which a row is labelled
            ``"OFF"``. Defaults to 0.5.

    Returns:
        A list with one ``data.Example`` per row, in file order.
    """
    df = pd.read_table(path)
    # Printed so the column layout of every loaded file shows in the output.
    print(df.columns)
    if "average" in df.columns:
        above_threshold = df["average"] > threshold
        df["subtask_a"] = above_threshold.map({True: "OFF", False: "NOT"})
    if "tweet" in df.columns:
        df["text"] = df["tweet"]
    return [data.Example.fromdict(row.to_dict(), fields=fields) for _, row in df.iterrows()]


In [130]:
# Build the examples of every file, in this order, into one flat list.
examples = [
    example
    for path in (
        "../../data/English/task_a_distant.xsmall.tsv",
        "../../data/Danish/train.tsv",
        "../../data/Greek/train.tsv",
        "../../data/Arabic/offenseval-ar-training-v1.tsv",
    )
    for example in build_examples(path, fields)
]
 

Index(['id', 'text', 'average', 'std'], dtype='object')
Index(['id', 'tweet', 'subtask_a'], dtype='object')
Index(['id', 'tweet', 'subtask_a'], dtype='object')
Index(['id', 'tweet', 'subtask_a'], dtype='object')


In [131]:
# Dataset takes (attribute name, Field) pairs; `fields` maps column -> pair
# (the shape Example.fromdict wants), so pass its values instead of the dict.
data.Dataset(examples, fields=list(fields.values()))

<torchtext.data.dataset.Dataset at 0x7f05f3adfda0>