In [1]:
import csv
import os
from typing import Dict, Iterable, List

import torch
from allennlp.data import DatasetReader, Instance
from allennlp.data.fields import LabelField, TextField
from allennlp.data.token_indexers import TokenIndexer, SingleIdTokenIndexer
from allennlp.data.tokenizers import Token, Tokenizer, SpacyTokenizer



In [32]:
@DatasetReader.register('ag-news-csv', exist_ok=True)
class AGNewsCSVReader(DatasetReader):
    """Dataset reader for three-column CSV files laid out as ``label, title, text``.

    Each data row becomes one ``Instance`` with a ``text`` TextField, a
    ``label`` LabelField and a ``title`` TextField.

    Parameters
    ----------
    tokenizer : Tokenizer, optional
        Tokenizer applied to both the title and the text column.
        Defaults to ``SpacyTokenizer()``.
    token_indexers : Dict[str, TokenIndexer], optional
        Indexers attached to both text fields.
        Defaults to ``{'tokens': SingleIdTokenIndexer()}``.
    skip_header : bool, optional
        When true (the default) the first row of the file is discarded
        as a header line.
    **kwargs
        Forwarded unchanged to ``DatasetReader.__init__``.
    """

    def __init__(self,
                 tokenizer: Tokenizer = None,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 skip_header: bool = True,
                 **kwargs) -> None:
        super().__init__(**kwargs)
        # Explicit ``is None`` checks so that a caller-supplied (even empty)
        # value is never silently replaced by the default.
        if tokenizer is None:
            tokenizer = SpacyTokenizer()
        if token_indexers is None:
            token_indexers = {'tokens': SingleIdTokenIndexer()}
        self.tokenizer = tokenizer
        self.token_indexers = token_indexers
        self.skip_header = skip_header

    def text_to_instance(self, title: str, text: str, label: str = None) -> Instance:
        """Build a single ``Instance`` from raw strings.

        Parameters
        ----------
        title : str
            Raw title string; tokenized into the ``title`` field.
        text : str
            Raw body string; tokenized into the ``text`` field.
        label : str, optional
            Label string for the ``label`` field. When ``None`` the
            instance is built without a ``label`` field.

        Returns
        -------
        Instance
            Fields are inserted in the order ``text``, ``label``, ``title``.
        """
        fields = {
            'text': TextField(self.tokenizer.tokenize(text), self.token_indexers),
        }
        if label is not None:
            fields['label'] = LabelField(label)
        fields['title'] = TextField(self.tokenizer.tokenize(title),
                                    self.token_indexers)
        return Instance(fields)

    def _read(self, file_path: str) -> Iterable[Instance]:
        """Yield one ``Instance`` per non-empty data row of ``file_path``.

        Raises
        ------
        ValueError
            If a non-empty row does not contain exactly three columns.
        """
        # 'utf-8-sig' drops a leading byte-order mark if the file has one;
        # newline='' is what the csv module asks for so that quoted line
        # breaks inside a cell are parsed correctly.
        with open(file_path, newline='', encoding='utf-8-sig') as csvfile:
            reader = csv.reader(csvfile)
            if self.skip_header:
                next(reader, None)
            for row in reader:
                if not row:
                    # csv.reader returns [] for a blank line; nothing to parse.
                    continue
                if len(row) != 3:
                    raise ValueError(
                        f"{file_path}, line {reader.line_num}: expected 3 columns "
                        f"(label, title, text) but found {len(row)}"
                    )
                label, title, text = row
                yield self.text_to_instance(title, text, label)

In [33]:
# Path of the small training CSV fed to AGNewsCSVReader. Set the
# AG_NEWS_SMALL_PATH environment variable to point at a local copy of the
# file; when it is unset the original hard-coded location is used.
SMALL_PATH = os.environ.get(
    "AG_NEWS_SMALL_PATH",
    "/Users/weicognite/Documents/self learning/NLU_SCPD_XCS224U/Project/Efficient-Hotflip/data/small_train.csv",
)

In [34]:
# Instantiate the CSV reader and load every instance found at SMALL_PATH.
reader = AGNewsCSVReader()
dataset = reader.read(file_path=SMALL_PATH)

HBox(children=(HTML(value='reading instances'), FloatProgress(value=1.0, bar_style='info', layout=Layout(width…




In [35]:
# Sanity check: print the first five instances so the parsed
# text / label / title fields can be inspected by eye.
for instance in dataset[:5]:
    print(instance)

Instance with fields:
 	 text: TextField of length 20 with text: 
 		[Reuters, -, Short, -, sellers, ,, Wall, Street, 's, dwindling\band, of, ultra, -, cynics, ,, are,
		seeing, green, again, .]
 		and TokenIndexers : {'tokens': 'SingleIdTokenIndexer'} 
 	 label: LabelField with label: 3 in namespace: 'labels'.' 
 	 title: TextField of length 11 with text: 
 		[Wall, St., Bears, Claw, Back, Into, the, Black, (, Reuters, )]
 		and TokenIndexers : {'tokens': 'SingleIdTokenIndexer'} 

Instance with fields:
 	 text: TextField of length 34 with text: 
 		[Reuters, -, Private, investment, firm, Carlyle, Group,\which, has, a, reputation, for, making,
		well, -, timed, and, occasionally\controversial, plays, in, the, defense, industry, ,, has, quietly,
		placed\its, bets, on, another, part, of, the, market, .]
 		and TokenIndexers : {'tokens': 'SingleIdTokenIndexer'} 
 	 label: LabelField with label: 3 in namespace: 'labels'.' 
 	 title: TextField of length 8 with text: 
 		[Carlyle, Looks, To