Source: https://github.com/bentrevett/pytorch-sentiment-analysis/blob/master/A%20-%20Using%20TorchText%20with%20Your%20Own%20Datasets.ipynb

# Reading JSON

Starting with JSON: your data must be in the JSON Lines format, i.e. one complete JSON object per line, something like:

{"name": "John", "location": "United Kingdom", "age": 42, "quote": ["i", "love", "the", "united kingdom"]}
{"name": "Mary", "location": "United States", "age": 36, "quote": ["i", "want", "more", "telescopes"]}

In [0]:

from torchtext import data
from torchtext import datasets

# One Field per column we want to keep, each created with torchtext's
# default arguments. Listed in the same order as the columns in the data.
NAME = data.Field()
PLACE = data.Field()
SAYING = data.Field()

In [0]:
# Map each JSON key to an (attribute name, Field) pair. The attribute name is
# what the value is stored under on every loaded example. Keys that are not
# listed here (e.g. "age") are ignored by TabularDataset.
fields = {
    'name': ('n', NAME),
    'location': ('p', PLACE),
    'quote': ('s', SAYING),
}

In [0]:

# Load the JSON train/test files found under the 'data' directory, using the
# key -> (attribute, Field) mapping in `fields`. No validation file is given,
# so only two datasets are returned.
train_data, test_data = data.TabularDataset.splits(
    path='data',
    train='train.json',
    test='test.json',
    format='json',
    fields=fields,
)

In [0]:

# Show the first training example as a plain attribute dictionary, to check
# that the fields were loaded under the expected names.
first_example = train_data[0]
print(vars(first_example))

# Reading CSV/TSV

In [0]:
# For TSV, `fields` is a list with one (attribute name, Field) entry per
# column, in column order. (None, None) marks a column to skip (here the
# third one).
fields = [
    ('n', NAME),
    ('p', PLACE),
    (None, None),
    ('s', SAYING),
]

In [0]:
# Load the tab-separated train/validation/test files from the 'data'
# directory. skip_header=True drops the first row of each file so the column
# titles are not read as an example.
train_data, valid_data, test_data = data.TabularDataset.splits(
    path='data',
    train='train.tsv',
    validation='valid.tsv',
    test='test.tsv',
    format='tsv',
    fields=fields,
    skip_header=True,
)

In [0]:
# Show the first example loaded from the TSV training file as a plain
# attribute dictionary.
first_example = train_data[0]
print(vars(first_example))

In [0]:
# For CSV, `fields` is a list with one (attribute name, Field) entry per
# column, in column order. (None, None) marks a column to skip (here the
# third one).
fields = [
    ('n', NAME),
    ('p', PLACE),
    (None, None),
    ('s', SAYING),
]

In [0]:
# Load the comma-separated train/validation/test files from the 'data'
# directory. skip_header=True drops the first row of each file so the column
# titles are not read as an example.
train_data, valid_data, test_data = data.TabularDataset.splits(
    path='data',
    train='train.csv',
    validation='valid.csv',
    test='test.csv',
    format='csv',
    fields=fields,
    skip_header=True,
)

In [0]:
# Show the first example loaded from the CSV training file as a plain
# attribute dictionary.
first_example = train_data[0]
print(vars(first_example))

In [0]:
# Build the vocabulary of every field from the training split only, so that
# validation/test data does not contribute to the vocabulary.
for field in (NAME, SAYING, PLACE):
    field.build_vocab(train_data)

In [0]:
import torch

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

BATCH_SIZE = 1

# Two alternative ways of building the iterators are shown below. Both bind
# the same three names, so only the iterators from the second call are the
# ones actually iterated over further down.

# Alternative 1: turn sorting off, so no sort key is needed for the
# validation/test data.
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    sort=False,
    batch_size=BATCH_SIZE,
    device=device,
)

# Alternative 2: tell the iterators what to sort by -- here the `s`
# attribute (the quote) of each example.
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    sort_key=lambda example: example.s,
    batch_size=BATCH_SIZE,
    device=device,
)

# Print every batch of each split, under a heading naming the split.
for heading, iterator in (
    ('Train:', train_iterator),
    ('Valid:', valid_iterator),
    ('Test:', test_iterator),
):
    print(heading)
    for batch in iterator:
        print(batch)