# RNN实现意图识别

## ATIS数据集预处理

### 加载数据集

In [2]:
import pandas as pd

In [3]:
local_file_path = 'E:\\files\\datasets\\atis\\atis_intents.csv'
kaggle_file_path = '../input/atisdatset/atis_intents.csv'
atis_intents_path = kaggle_file_path
atis_intents = pd.read_csv(atis_intents_path, header=None)

In [5]:
atis_intents

Unnamed: 0,0,1
0,atis_flight,i want to fly from boston at 838 am and arriv...
1,atis_flight,what flights are available from pittsburgh to...
2,atis_flight_time,what is the arrival time in san francisco for...
3,atis_airfare,cheapest airfare from tacoma to orlando
4,atis_airfare,round trip fares from pittsburgh to philadelp...
...,...,...
4973,atis_airfare,what is the airfare for flights from denver t...
4974,atis_flight,do you have any flights from denver to baltim...
4975,atis_airline,which airlines fly into and out of denver
4976,atis_flight,does continental fly from boston to san franc...


In [10]:
atis_intents[0][0]

'atis_flight'

In [9]:
atis_intents[1][0]

' i want to fly from boston at 838 am and arrive in denver at 1110 in the morning'

### 构造词典

In [4]:
import io
from torchtext.vocab import build_vocab_from_iterator

In [20]:
def yield_tokens(file_path):
    with io.open(file_path) as f:
        for line in f:
           yield line.split(',')[1].strip().split()

In [21]:
vocab = build_vocab_from_iterator(yield_tokens(atis_intents_path), specials=["<UNK>"])

In [22]:
len(vocab)

899

In [25]:
vocab.lookup_token(1)

'to'

In [28]:
vocab.lookup_indices(atis_intents[1][0].strip().split())

[18, 70, 1, 38, 2, 9, 68, 407, 86, 16, 79, 15, 12, 68, 533, 15, 4, 36]

In [5]:
class AtisIntentVocabulary():
    def __init__(self, file_path, specials=None):
        self.specials = specials
        self.file_path = file_path
        self.vocal = build_vocab_from_iterator(self.yield_tokens(self.file_path), specials=self.specials)
        self.vocal.set_default_index(0)
        
    def yield_tokens(self, file_path):
        with io.open(file_path) as f:
            for line in f:
               yield line.split(',')[1].strip().split()
    
    def lookup_indices(self, tokens_list):
        return self.vocal.lookup_indices(tokens_list)
    
    def __len__(self):
        return len(self.vocal)

In [11]:
atis_intent_vocab = AtisIntentVocabulary(atis_intents_path, specials=["<UNK>"])

In [11]:
len(atis_intent_vocab)

899

In [45]:
atis_intent_vocab.lookup_indices(atis_intents[1][0].strip().split())

[18, 70, 1, 38, 2, 9, 68, 407, 86, 16, 79, 15, 12, 68, 533, 15, 4, 36]

### 构造数据集

In [6]:
from torch.utils.data import Dataset
from torchtext.functional import to_tensor
from sklearn.preprocessing import LabelEncoder

In [7]:
label_encoder = LabelEncoder()
label_encoder.fit(atis_intents[0])

LabelEncoder()

In [51]:
atis_intents

Unnamed: 0,0,1
0,atis_flight,i want to fly from boston at 838 am and arriv...
1,atis_flight,what flights are available from pittsburgh to...
2,atis_flight_time,what is the arrival time in san francisco for...
3,atis_airfare,cheapest airfare from tacoma to orlando
4,atis_airfare,round trip fares from pittsburgh to philadelp...
...,...,...
4973,atis_airfare,what is the airfare for flights from denver t...
4974,atis_flight,do you have any flights from denver to baltim...
4975,atis_airline,which airlines fly into and out of denver
4976,atis_flight,does continental fly from boston to san franc...


In [8]:
label_encoder.classes_

array(['atis_abbreviation', 'atis_aircraft',
       'atis_aircraft#atis_flight#atis_flight_no', 'atis_airfare',
       'atis_airfare#atis_flight_time', 'atis_airline',
       'atis_airline#atis_flight_no', 'atis_airport', 'atis_capacity',
       'atis_cheapest', 'atis_city', 'atis_distance', 'atis_flight',
       'atis_flight#atis_airfare', 'atis_flight_no', 'atis_flight_time',
       'atis_ground_fare', 'atis_ground_service',
       'atis_ground_service#atis_ground_fare', 'atis_meal',
       'atis_quantity', 'atis_restriction'], dtype=object)

In [52]:
atis_intents['label'] = label_encoder.transform(atis_intents[0]) 

In [53]:
atis_intents

Unnamed: 0,0,1,label
0,atis_flight,i want to fly from boston at 838 am and arriv...,12
1,atis_flight,what flights are available from pittsburgh to...,12
2,atis_flight_time,what is the arrival time in san francisco for...,15
3,atis_airfare,cheapest airfare from tacoma to orlando,3
4,atis_airfare,round trip fares from pittsburgh to philadelp...,3
...,...,...,...
4973,atis_airfare,what is the airfare for flights from denver t...,3
4974,atis_flight,do you have any flights from denver to baltim...,12
4975,atis_airline,which airlines fly into and out of denver,5
4976,atis_flight,does continental fly from boston to san franc...,12


In [9]:
class CustomAtisIntentDataset(Dataset):
    def __init__(self, file_path, label_encoder, vocab):
        self.file_path = file_path
        self.label_encoder = label_encoder
        self.vocab = vocab
        self.atis_intent_dataset = pd.read_csv(self.file_path, header=None)
        
    def __len__(self):
        return len(self.atis_intent_dataset)
        
    def __getitem__(self, idx):
        self.atis_intent_dataset['label'] = self.label_encoder.transform(self.atis_intent_dataset[0])
        X = []
        for text in self.atis_intent_dataset[1]:
            token_ids = self.vocab.lookup_indices(text.strip().split())
            X.append(token_ids)
        X = to_tensor(X, padding_value=0)
        y = self.atis_intent_dataset['label']
        return X[idx], y[idx]

In [12]:
train_atis_intent_dataset = CustomAtisIntentDataset(atis_intents_path, label_encoder, atis_intent_vocab)

In [13]:
train_atis_intent_dataset[2]

(tensor([  7,  20,   4, 409, 183,  15,  11,  14,  37,   4, 550,  86,   8,  33,
          32,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0]),
 15)

In [14]:
len(train_atis_intent_dataset)

4978

In [15]:
validate_local_file_path = 'E:\\files\\datasets\\atis\\atis_intents_val.csv'
test_local_file_path = 'E:\\files\\datasets\\atis\\atis_intents_test.csv'
validate_kaggle_file_path = '../input/atisdatset/atis_intents_val.csv'
test_kaggle_file_path = '../input/atisdatset/atis_intents_test.csv'

validata_atis_intent_dataset_path = validate_kaggle_file_path
test_atis_intent_dataset_path = test_kaggle_file_path

In [16]:
validate_atis_intent_dataset = CustomAtisIntentDataset(validata_atis_intent_dataset_path, 
                                                       label_encoder, 
                                                       atis_intent_vocab)
test_atis_intent_dataset = CustomAtisIntentDataset(test_atis_intent_dataset_path, 
                                                       label_encoder, 
                                                       atis_intent_vocab)

In [17]:
len(validate_atis_intent_dataset), len(test_atis_intent_dataset)

(4834, 800)

In [18]:
from torch.utils.data import DataLoader

In [18]:
train = DataLoader(train_atis_intent_dataset, batch_size=10, shuffle=True)

In [21]:
for X, y in train:
    print(X, y.type(torch.long))
    break

tensor([[ 10,   6,  23,   3,   2,  22,   1,  21,   0,   0,   0,   0,   0,   0,

           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,

           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,

           0,   0,   0,   0],

        [ 10,   6,  67,  71,   3,   2,  89,   1, 117,  45,   0,   0,   0,   0,

           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,

           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,

           0,   0,   0,   0],

        [ 50, 122, 119,  61,  13,   8,   2, 110,   1, 142, 141,  56,  77, 155,

         352,  41,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,

           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,

           0,   0,   0,   0],

        [  7, 172,  29, 135,  20, 208,   5,  65, 114,   8, 682,   0,   0,   0,

           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,

           0,   0,   0,   0

In [89]:
next(iter(train))

[tensor([[ 47,  25,   1,  72,   2,   9,   5,  83,  16,  47,  25,   1,  72, 428,
           15,   4,  36,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
            0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
            0,   0,   0,   0],
         [  7,  26, 346, 486,  29,   3,   1, 159,   2, 102,   1, 171,  57, 152,
          173,  35,  15, 171,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
            0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
            0,   0,   0,   0],
         [ 45,  36,   3,  31,  22,  16,  99,   0,   0,   0,   0,   0,   0,   0,
            0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
            0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
            0,   0,   0,   0],
         [  7,   3,  84,   2,  17,   1,  32,  74,   5,  76,   0,   0,   0,   0,
            0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
            0,   0,   0,   

## 构造模型

In [19]:
import torch
import torch.nn as nn

In [115]:
with torch.no_grad():
    embed = nn.Embedding(len(atis_intent_vocab), 16)
    input = torch.LongTensor([[0,1,2,3],[4,5,6,7]])
    print(input)
    x = embed(input)
    print(x)

tensor([[0, 1, 2, 3],

        [4, 5, 6, 7]])

tensor([[[-0.5838,  2.4045,  1.4510, -0.1649,  0.1741, -0.7844,  1.8591,

           0.9358,  0.6031,  0.6675,  0.6015, -1.0938,  0.7716,  1.1108,

          -0.9600,  0.2014],

         [ 0.5307, -1.3925,  1.1198,  0.9192, -0.0643, -0.2046,  0.9014,

          -0.6693,  1.0462, -0.1761,  0.2789,  0.0260,  0.6260, -1.3169,

           0.6439, -1.4934],

         [-1.4369, -0.5872,  0.1698,  0.1747,  0.5161, -0.4540, -0.4658,

           0.4795, -1.0159, -1.0690, -0.4362,  0.7642,  1.5331,  1.1989,

           0.6605,  0.9056],

         [ 0.2967, -0.9220,  1.2872,  0.2350, -0.2862, -0.4634, -0.3571,

           1.1692,  0.4095,  0.2361, -0.0262,  0.0174,  0.5646, -0.1995,

          -0.1050, -1.7378]],



        [[ 1.3155, -2.9393, -0.2742,  2.0667,  1.0325, -0.3022,  0.0683,

          -1.1511,  0.2522, -1.7599,  0.7475, -1.0811, -1.0894,  0.8687,

           0.5581,  0.4331],

         [-0.1851,  0.0301, -0.3609,  0.5217, -0.2710, -1.49

In [116]:
with torch.no_grad():
    lstm = nn.LSTM(16, 16, batch_first=True)
    output, (h, c) = lstm(x)
    print(output)

tensor([[[ 0.0813,  0.0423, -0.1138, -0.0590, -0.0604, -0.0619,  0.0385,

           0.1602,  0.0028,  0.1270,  0.0557, -0.0297, -0.2081, -0.1041,

           0.1716, -0.0030],

         [ 0.1735,  0.0578,  0.0719,  0.2337,  0.0661,  0.1109,  0.1190,

           0.2434, -0.1574, -0.1799,  0.0524, -0.0741, -0.0382, -0.1216,

           0.1193, -0.0302],

         [ 0.1539,  0.0241, -0.0891,  0.1933, -0.0417,  0.1043,  0.1229,

           0.0824, -0.0710, -0.0593,  0.1498, -0.0214,  0.0102,  0.0080,

           0.1847,  0.0588],

         [ 0.1997, -0.1644,  0.0158,  0.3748,  0.0579,  0.0772,  0.1803,

           0.1149, -0.1618, -0.1974,  0.0221, -0.0722, -0.0130, -0.1916,

           0.0111,  0.0616]],



        [[-0.0503, -0.1483,  0.0595,  0.3257, -0.0450,  0.1643,  0.1543,

           0.2109,  0.2114, -0.0660,  0.0037,  0.1841,  0.1833,  0.0080,

           0.0149, -0.1426],

         [ 0.0854,  0.1774,  0.0306, -0.0813,  0.1392,  0.4039,  0.0938,

           0.4020,  0.2018,  0.13

In [128]:
output[:, -1]

tensor([[ 0.1997, -0.1644,  0.0158,  0.3748,  0.0579,  0.0772,  0.1803,  0.1149,
         -0.1618, -0.1974,  0.0221, -0.0722, -0.0130, -0.1916,  0.0111,  0.0616],
        [ 0.2303, -0.1644,  0.1375,  0.0884,  0.0327,  0.1038,  0.0907,  0.0874,
          0.0086, -0.0736,  0.1433, -0.0268,  0.0328,  0.0367, -0.2424,  0.1462]])

In [129]:
with torch.no_grad():
    linear = nn.Linear(16, 10)
    softmax = nn.Softmax(dim=1)
    lin = linear(output[:, -1])
    pred = softmax(lin)
    print(pred)

tensor([[0.1147, 0.1287, 0.0721, 0.0916, 0.0737, 0.1205, 0.0897, 0.0942, 0.0851,

         0.1298],

        [0.1085, 0.1133, 0.0820, 0.1006, 0.0737, 0.1187, 0.0851, 0.1011, 0.0962,

         0.1208]])


In [132]:
pred.sum(dim=1, keepdim=True)

tensor([[1.0000],
        [1.0000]])

In [43]:
class RNNForAtisIntentClassification(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_size, num_layers, num_labels):
        super(RNNForAtisIntentClassification, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_size, num_layers=num_layers, batch_first=True)
        self.linear = nn.Linear(hidden_size, num_labels)
        self.softmax = nn.Softmax(dim=1)
        
    def forward(self, x):
        x = self.embedding(x)
        output, (h,c) = self.lstm(x)
        x = self.linear(output[:, -1])
        x = self.softmax(x)
        return x

In [135]:
with torch.no_grad():
    rnn = RNNForAtisIntentClassification(len(atis_intent_vocab),
                                         16,
                                         16,
                                         10)
    prediction = rnn(input)
    print(prediction)

tensor([[0.0871, 0.1081, 0.1100, 0.1058, 0.0753, 0.0826, 0.1194, 0.0987, 0.1012,

         0.1118],

        [0.0883, 0.1071, 0.1135, 0.1035, 0.0869, 0.0813, 0.1090, 0.1170, 0.0845,

         0.1089]])


In [136]:
prediction.sum(dim=1, keepdim=True)

tensor([[1.],
        [1.]])

In [137]:
prediction.argmax(dim=1)

tensor([6, 7])

## 模型训练

In [21]:
from sklearn.metrics import f1_score, accuracy_score
from tqdm import tqdm

In [22]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [26]:
import numpy as np

In [46]:
batch_size = 2
vocab_size = len(atis_intent_vocab)
embed_dim = 16
hiddem_size = 20
num_labels = len(label_encoder.classes_)
num_epochs = 3
num_layers = 3
lr = 0.03

train_set = DataLoader(train_atis_intent_dataset, batch_size=batch_size, shuffle=True)
validate_set = DataLoader(validate_atis_intent_dataset, batch_size=batch_size, shuffle=True)

model = RNNForAtisIntentClassification(vocab_size=vocab_size,
                                       embed_dim=embed_dim,
                                       hidden_size=hiddem_size,
                                       num_layers=num_layers,
                                       num_labels=num_labels).to(device)

loss = nn.CrossEntropyLoss().to(device)

optimer = torch.optim.SGD(model.parameters(), lr=lr)

for epoch in range(num_epochs):
    train_l = []
    validate_l = []
    validate_acc = []
    validate_f1 = []
    with tqdm(train_set, unit="batch") as tepoch:
        for X, y in tepoch:
            tepoch.set_description(f"Epoch {epoch} train")
            model.train()
            X = X.to(device)
            y = y.type(torch.long).to(device)
            y_hat = model(X)
            l = loss(y_hat, y)
            optimer.zero_grad()
            l.backward()
            optimer.step()
            train_l.append(l.item())

    with tqdm(validate_set, unit="batch") as tepoch:
        with torch.no_grad():
            for X, y in tepoch:
                tepoch.set_description(f"Epoch {epoch} validate")
                model.eval()
                X = X.to(device)
                y = y.type(torch.long).to(device)
                y_hat = model(X)
                l = loss(y_hat, y)
                validate_l.append(l.item())
                acc = accuracy_score(y.cpu().numpy(), y_hat.argmax(dim=1).cpu().numpy())
                validate_acc.append(acc)
                f1 = f1_score(y.cpu().numpy(), y_hat.argmax(dim=1).cpu().numpy(), average='weighted')
                validate_f1.append(f1)
                
    print(f'epoch {epoch}, train_loss {np.mean(train_l)}, validate_loss {np.mean(validate_l)}, accuracy_val {np.mean(validate_acc)}, f1_val {np.mean(validate_f1)}')

Epoch 0 train: 100%|██████████| 2489/2489 [05:51<00:00,  7.09batch/s]
Epoch 0 validate: 100%|██████████| 2417/2417 [05:29<00:00,  7.33batch/s]


epoch 0, train_loss 2.6372830341886258, validate_loss 2.4099286187518763, accuracy_val 0.7583781547372777, f1_val 0.6960419252516893


Epoch 1 train: 100%|██████████| 2489/2489 [06:04<00:00,  6.82batch/s]
Epoch 1 validate: 100%|██████████| 2417/2417 [05:36<00:00,  7.18batch/s]


epoch 1, train_loss 2.4309316436274075, validate_loss 2.408577159854761, accuracy_val 0.7583781547372777, f1_val 0.6981106054337333


Epoch 2 train: 100%|██████████| 2489/2489 [06:03<00:00,  6.85batch/s]
Epoch 2 validate: 100%|██████████| 2417/2417 [05:32<00:00,  7.27batch/s]

epoch 2, train_loss 2.4303252709443526, validate_loss 2.40828372152768, accuracy_val 0.7583781547372777, f1_val 0.6972831333609156





In [141]:
len(train)

498

In [26]:
model

RNNForAtisIntentClassification(
  (embedding): Embedding(899, 16)
  (lstm): LSTM(16, 16, batch_first=True)
  (linear): Linear(in_features=16, out_features=22, bias=True)
  (softmax): Softmax(dim=1)
)

In [26]:
text = 'i would like to find a flight from charlotte to las vegas that makes a stop in st. louis'
label = 'atis_flight'

In [27]:
text_token_id = atis_intent_vocab.lookup_indices(text.split())

In [28]:
text_token_id

[18, 39, 25, 1, 90, 13, 8, 2, 102, 1, 98, 96, 30, 344, 13, 109, 15, 87, 179]

In [29]:
text_tensor = torch.tensor([text_token_id]).to(device)

In [30]:
text_tensor

tensor([[ 18,  39,  25,   1,  90,  13,   8,   2, 102,   1,  98,  96,  30, 344,
          13, 109,  15,  87, 179]])

In [31]:
pedict = model(text_tensor)

In [32]:
pedict.argmax(dim=1)

tensor([12])

In [33]:
text2 = 'is there ground transportation available at the indianapolis airport'
text2_label = 'atis_ground_service'

In [34]:
text2_token_id = atis_intent_vocab.lookup_indices(text2.split())

In [35]:
text2_token_id

[20, 43, 46, 51, 58, 68, 4, 126, 81]

In [36]:
text2_tensor = torch.tensor([text2_token_id]).to(device)

In [37]:
pedict2 = model(text2_tensor)

In [38]:
pedict2

tensor([[0.0299, 0.0269, 0.0331, 0.0420, 0.0290, 0.0345, 0.0332, 0.0394, 0.0324,
         0.0358, 0.0283, 0.0325, 0.3127, 0.0289, 0.0335, 0.0327, 0.0281, 0.0337,
         0.0341, 0.0362, 0.0327, 0.0304]], grad_fn=<SoftmaxBackward0>)

In [39]:
pedict2.argmax(dim=1)

tensor([12])

In [29]:
a = [1,2,3,4]
print(np.mean(a))

2.5
