# fastNLP 1分钟上手教程

## step 1
读取数据集

In [1]:
from fastNLP import DataSet
           
data_path = "./sample_data/tutorial_sample_dataset.csv"
ds = DataSet.read_csv(data_path, headers=('raw_sentence', 'label'), sep='\t')

In [2]:
ds[1]

{'raw_sentence': This quiet , introspective and entertaining independent is worth seeking . type=str,
'label': 4 type=str}

## step 2
数据预处理
1. 类型转换
2. 切分验证集
3. 构建词典

In [3]:
# 将所有数字转为小写
ds.apply(lambda x: x['raw_sentence'].lower(), new_field_name='raw_sentence')
# label转int
ds.apply(lambda x: int(x['label']), new_field_name='label_seq', is_target=True)

def split_sent(ins):
    return ins['raw_sentence'].split()
ds.apply(split_sent, new_field_name='words', is_input=True)


[['a',
  'series',
  'of',
  'escapades',
  'demonstrating',
  'the',
  'adage',
  'that',
  'what',
  'is',
  'good',
  'for',
  'the',
  'goose',
  'is',
  'also',
  'good',
  'for',
  'the',
  'gander',
  ',',
  'some',
  'of',
  'which',
  'occasionally',
  'amuses',
  'but',
  'none',
  'of',
  'which',
  'amounts',
  'to',
  'much',
  'of',
  'a',
  'story',
  '.'],
 ['this',
  'quiet',
  ',',
  'introspective',
  'and',
  'entertaining',
  'independent',
  'is',
  'worth',
  'seeking',
  '.'],
 ['even',
  'fans',
  'of',
  'ismail',
  'merchant',
  "'s",
  'work',
  ',',
  'i',
  'suspect',
  ',',
  'would',
  'have',
  'a',
  'hard',
  'time',
  'sitting',
  'through',
  'this',
  'one',
  '.'],
 ['a',
  'positively',
  'thrilling',
  'combination',
  'of',
  'ethnography',
  'and',
  'all',
  'the',
  'intrigue',
  ',',
  'betrayal',
  ',',
  'deceit',
  'and',
  'murder',
  'of',
  'a',
  'shakespearean',
  'tragedy',
  'or',
  'a',
  'juicy',
  'soap',
  'opera',
  '.'],
 ['

In [4]:
# 分割训练集/验证集
train_data, dev_data = ds.split(0.3)
print("Train size: ", len(train_data))
print("Test size: ", len(dev_data))

Train size:  54
Test size:  23


In [5]:
from fastNLP import Vocabulary
vocab = Vocabulary(min_freq=2)
train_data.apply(lambda x: [vocab.add(word) for word in x['words']])

# index句子, Vocabulary.to_index(word)
train_data.apply(lambda x: [vocab.to_index(word) for word in x['words']], new_field_name='word_seq', is_input=True)
dev_data.apply(lambda x: [vocab.to_index(word) for word in x['words']], new_field_name='word_seq', is_input=True)


[[120, 121, 6, 2, 122, 5, 72, 123, 3],
 [14,
  4,
  152,
  153,
  154,
  155,
  8,
  156,
  157,
  9,
  16,
  2,
  158,
  21,
  159,
  30,
  98,
  57,
  4,
  160,
  161,
  13,
  162,
  163,
  164,
  165,
  3],
 [4,
  112,
  113,
  15,
  114,
  35,
  10,
  68,
  115,
  69,
  8,
  23,
  116,
  5,
  18,
  36,
  11,
  4,
  70,
  7,
  117,
  7,
  118,
  119,
  71,
  3],
 [4, 1, 1, 5, 138, 14, 2, 1, 1, 1, 12],
 [2, 27, 11, 139, 140, 141, 15, 142, 8, 143, 3],
 [12, 9, 14, 32, 8, 4, 59, 60, 7, 61, 2, 62, 63, 64, 65, 4, 66, 67, 3],
 [97, 145, 14, 146, 147, 5, 148, 149, 23, 150, 3],
 [4, 1, 1, 5, 138, 14, 2, 1, 1, 1, 12],
 [4, 1, 1, 5, 138, 14, 2, 1, 1, 1, 12],
 [14,
  4,
  152,
  153,
  154,
  155,
  8,
  156,
  157,
  9,
  16,
  2,
  158,
  21,
  159,
  30,
  98,
  57,
  4,
  160,
  161,
  13,
  162,
  163,
  164,
  165,
  3],
 [10,
  2,
  82,
  83,
  84,
  85,
  86,
  87,
  88,
  89,
  90,
  91,
  92,
  93,
  11,
  4,
  28,
  94,
  6,
  95,
  96,
  2,
  17,
  11,
  3],
 [12, 73, 20, 33, 74, 7

## step 3
 定义模型

In [6]:
from fastNLP.models import CNNText
model = CNNText((len(vocab),50), num_classes=5, padding=2, dropout=0.1)


## step 4
开始训练

In [7]:
from fastNLP import Trainer, CrossEntropyLoss, AccuracyMetric
trainer = Trainer(model=model, 
                  train_data=train_data, 
                  dev_data=dev_data,
                  loss=CrossEntropyLoss(),
                  metrics=AccuracyMetric()
                  )
trainer.train()
print('Train finished!')


input fields after batch(if batch size is 2):
	words: (1)type:numpy.ndarray (2)dtype:object, (3)shape:(2,) 
	word_seq: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2, 11]) 
target fields after batch(if batch size is 2):
	label_seq: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2]) 



AttributeError: 'numpy.ndarray' object has no attribute 'contiguous'

### 本教程结束。更多操作请参考进阶教程。