In [81]:
import torchtext.data as data

# One independent lower-casing Field per column: id, text and keys.
id_TEXT, text_TEXT, keys_TEXT = (data.Field(lower=True) for _ in range(3))

# (column name, Field) pairs for 'id', 'text' and 'keys' -- the format
# expected by the `fields` argument of Dataset.
fields_list = list(zip(('id', 'text', 'keys'),
                       (id_TEXT, text_TEXT, keys_TEXT)))

In [82]:
# Dataset.splits offers a basic template for classes that inherit from Dataset.
# `path` is the directory that is joined with each of the three file names.
# NOTE(review): the `print(tr.examples)` output further down is just the joined
# file path, so the base class does not appear to parse the files here --
# confirm whether a file-reading subclass was intended.
tr, va, te = data.Dataset.splits(
    fields=fields_list,
    path='test_text',
    test="test.txt",
    validation="validation.txt",
    train="train.txt",
)

In [83]:
# Counterpart of the splits() call above. When each dataset needs its own
# parameters, build them one at a time as done here.
tr_ = data.Dataset(fields=fields_list, examples='test_text/train.txt')
va_ = data.Dataset(fields=fields_list, examples='test_text/validation.txt')
te_ = data.Dataset(fields=fields_list, examples='test_text/test.txt')

In [84]:
# Show the three Dataset objects, one per line.
print(tr, va, te, sep='\n')

<torchtext.data.dataset.Dataset object at 0x00000252C3401100>
<torchtext.data.dataset.Dataset object at 0x00000252C3401040>
<torchtext.data.dataset.Dataset object at 0x00000252C3401910>


In [85]:
import random

import numpy as np

# Split tr into train (80%) and var (20%).
#
# Dataset.split expects `random_state` to be a value returned by
# random.getstate() (Python's `random` module, not NumPy).
# np.random.seed(3) returns None, so passing it as `random_state` left the
# split unseeded. Seed `random` and hand over its state instead.
np.random.seed(3)  # kept so NumPy's global RNG is still seeded as before
random.seed(3)
train, var = tr.split(split_ratio=0.8, random_state=random.getstate())
print('train:', train)
print('var', var)

train: <torchtext.data.dataset.Dataset object at 0x00000252C3494340>
var <torchtext.data.dataset.Dataset object at 0x00000252C34947C0>


In [86]:
# Show what each dataset holds in its `examples` attribute, one per line.
print(tr.examples, va.examples, te.examples, sep='\n')

test_text\train.txt
test_text\validation.txt
test_text\test.txt


In [87]:
# The three datasets share the same fields mapping.
print(tr.fields, va.fields, te.fields, sep='\n')

{'id': <torchtext.data.field.Field object at 0x00000252C3762670>, 'text': <torchtext.data.field.Field object at 0x00000252C3762F10>, 'keys': <torchtext.data.field.Field object at 0x00000252C37621F0>}
{'id': <torchtext.data.field.Field object at 0x00000252C3762670>, 'text': <torchtext.data.field.Field object at 0x00000252C3762F10>, 'keys': <torchtext.data.field.Field object at 0x00000252C37621F0>}
{'id': <torchtext.data.field.Field object at 0x00000252C3762670>, 'text': <torchtext.data.field.Field object at 0x00000252C3762F10>, 'keys': <torchtext.data.field.Field object at 0x00000252C37621F0>}


### 重写Dataset举例

In [88]:
class DataFrameDataset(data.Dataset):
    """A Dataset built from a pandas DataFrame (modelled on the Dataset source code).

    Every row becomes one Example made from the row's ``text`` value and,
    unless the frame is a test set, its ``label`` value.
    """

    def __init__(self, df, fields, is_test=False, **kwargs):
        """Turn each row of ``df`` into an Example and initialise the Dataset.

        Args:
            df: DataFrame with a ``text`` column, plus a ``label`` column
                when ``is_test`` is false.
            fields: ``(name, Field)`` pairs; row values are supplied to them
                in the order ``[text, label]``.
            is_test: When true, ``label`` is not read from the row and None
                is used in its place.
            **kwargs: Forwarded unchanged to ``data.Dataset.__init__``.
        """
        def to_example(row):
            # Test rows carry no label, so a None placeholder is used.
            target = None if is_test else row.label
            return data.Example.fromlist([row.text, target], fields)

        examples = [to_example(row) for _, row in df.iterrows()]
        super().__init__(examples, fields, **kwargs)

    @classmethod
    def splits(cls, train_df=None, val_df=None, test_df=None, **kwargs):
        """Build one dataset per DataFrame that was supplied.

        Args:
            train_df: Optional training DataFrame.
            val_df: Optional validation DataFrame.
            test_df: Optional test DataFrame; built with ``is_test=True``.
            **kwargs: Forwarded to the constructor (for example ``fields``).

        Returns:
            A tuple holding the datasets for the frames that are not None,
            in train, validation, test order.
        """
        built = []
        if train_df is not None:
            built.append(cls(train_df.copy(), **kwargs))
        if val_df is not None:
            built.append(cls(val_df.copy(), **kwargs))
        if test_df is not None:
            built.append(cls(test_df.copy(), is_test=True, **kwargs))
        return tuple(built)

In [89]:
import pandas as pd

# Small labelled training frame: one sentence per row with a 0/1 label.
tr_df = pd.DataFrame({
    'text': ['Duan Chao does not study hard now',
             'There is something wrong with Duan Chaos mentality',
             'Duan Chao is not tired of learning'],
    'label': [1, 1, 0],
})

tr_df

Unnamed: 0,text,label
0,Duan Chao does not study hard now,1
1,There is something wrong with Duan Chaos menta...,1
2,Duan Chao is not tired of learning,0


In [90]:
# Small unlabelled test frame: only a 'text' column.
te_df = pd.DataFrame({
    'text': ['A little addicted to cell phones',
             'A little bit of a fantasy',
             'Do not want to face difficulties'],
})
te_df

Unnamed: 0,text
0,A little addicted to cell phones
1,A little bit of a fantasy
2,Do not want to face difficulties


In [91]:
# Sentences are tokenized and lower-cased.
TEXT = data.Field(lower=True)

# Labels are single integer class ids (0/1), not token sequences. The Field
# defaults (sequential=True, use_vocab=True) would treat each label as a
# sequence of tokens to look up in a vocabulary, so both are switched off and
# the labels stay plain numbers.
LABEL = data.Field(sequential=False, use_vocab=False)

# (name, Field) pairs, in the same [text, label] order that DataFrameDataset
# passes values to Example.fromlist.
all_fields = [('text', TEXT),
              ('label', LABEL)]

In [92]:
# No val_df is passed, so splits() returns exactly two datasets: train, test.
my_df_tr, my_df_te = DataFrameDataset.splits(
    fields=all_fields,
    train_df=tr_df,
    test_df=te_df,
)

In [93]:
# Display the training split; its repr shows it is a DataFrameDataset instance.
my_df_tr

<__main__.DataFrameDataset at 0x252c349c5b0>

In [94]:
# Display the list of Example objects built from the rows of tr_df.
my_df_tr.examples

[<torchtext.data.example.Example at 0x252c3791700>,
 <torchtext.data.example.Example at 0x252c3791160>,
 <torchtext.data.example.Example at 0x252c3791e50>]

In [95]:
# Dump the attribute dict of each of the three training examples, one per line.
print(*(vars(my_df_tr.examples[pos]) for pos in range(3)), sep='\n')

{'text': ['duan', 'chao', 'does', 'not', 'study', 'hard', 'now'], 'label': 1}
{'text': ['there', 'is', 'something', 'wrong', 'with', 'duan', 'chaos', 'mentality'], 'label': 1}
{'text': ['duan', 'chao', 'is', 'not', 'tired', 'of', 'learning'], 'label': 0}


In [96]:
# Dump the attribute dict of each of the three test examples, one per line.
print(*(vars(my_df_te.examples[pos]) for pos in range(3)), sep='\n')

{'text': ['a', 'little', 'addicted', 'to', 'cell', 'phones'], 'label': None}
{'text': ['a', 'little', 'bit', 'of', 'a', 'fantasy'], 'label': None}
{'text': ['do', 'not', 'want', 'to', 'face', 'difficulties'], 'label': None}
