In [1]:
import torch
from torch import nn
import torch.nn.functional as F
from torch.autograd import Variable
from torchtext.data import Field,BucketIterator,TabularDataset
from tqdm import tqdm
import matplotlib.pyplot as plt
import random
import pandas as pd
from sklearn.model_selection import train_test_split

In [4]:
data = pd.read_csv('Reviews.csv',usecols=['Text','Summary'])

In [7]:
#移除空標籤
data = data.dropna()

In [9]:
train,test = train_test_split(data)

In [10]:
train.to_csv('train.csv',index=False)
test.to_csv('test.csv',index=False)

In [2]:
TEXT = Field(lower=True,
             init_token = '<sos>', 
             eos_token = '<eos>'
             )
LABEL = Field(lower=True,
              init_token = '<sos>', 
              eos_token = '<eos>')
fields = {'Text': ('text',TEXT),'Summary': ('label',LABEL)}

In [3]:
train, test = TabularDataset.splits(path='./',
                                    train='train.csv',
                                    test='test.csv',
                                    format='csv',
                                    fields=fields)

In [4]:
TEXT.build_vocab(train)
LABEL.build_vocab(train)

In [5]:
train_iter, test_iter = BucketIterator.splits(
    (train, test), batch_size=32, device=0,sort=False)

The `device` argument should be set by using `torch.device` or passing a string as an argument. This behavior will be deprecated soon and currently defaults to cpu.
The `device` argument should be set by using `torch.device` or passing a string as an argument. This behavior will be deprecated soon and currently defaults to cpu.


In [6]:
class Encoder(nn.Module):
    
    def __init__(self, embed_dim = 128):
        super(Encoder,self).__init__()
        self.embed = nn.Embedding(len(TEXT.vocab),embed_dim)
        self.rnn = nn.LSTM(embed_dim,embed_dim,1)
    
    def forward(self, x):
        tmp = self.embed(x)
        tmp, (ht, ct) = self.rnn(tmp)
        return ht, ct

In [7]:
class Decoder(nn.Module):
    
    def __init__(self, embed_dim = 128):
        super(Decoder,self).__init__()
        self.output_dim = len(LABEL.vocab) # for simplicity, we set output dim equals to embed dim
        
        self.embed = nn.Embedding(len(LABEL.vocab),embed_dim)
        self.rnn = nn.LSTM(embed_dim,embed_dim,1)
        self.fc = nn.Linear(embed_dim,len(LABEL.vocab))
    
    def forward(self, x, h, c):
        tmp = x.unsqueeze(0)
        tmp = self.embed(tmp)
        tmp, (h, c) = self.rnn(tmp, (h, c))
        tmp = tmp.squeeze(0)
        tmp = self.fc(tmp)
        return tmp, h, c

In [8]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder):
        super().__init__()
        
        self.encoder = encoder
        self.decoder = decoder
        
    def forward(self, src, trg, teacher_forcing_ratio = 0.5):
        
        trg_len, batch_size = trg.shape[0], trg.shape[1]
        trg_vocab_size = self.decoder.output_dim
        
        #tensor to store decoder outputs
        outputs = torch.zeros(trg_len, batch_size, trg_vocab_size).cuda()
        
        #last hidden state of the encoder is used as the initial hidden state of the decoder
        hidden, cell = self.encoder(src)
        
        #first input to the decoder is the <sos> tokens
        input = trg[0,:]
        
        for t in range(1, trg_len):
            
            #insert input token embedding, previous hidden and previous cell states
            #receive output tensor (predictions) and new hidden and cell states
            output, hidden, cell = self.decoder(input, hidden, cell)
            
            #place predictions in a tensor holding predictions for each token
            outputs[t] = output
            
            #decide if we are going to use teacher forcing or not
            teacher_force = random.random() < teacher_forcing_ratio
            
            #get the highest predicted token from our predictions
            top1 = output.argmax(1) 
            
            #if teacher forcing, use actual next token as next input
            #if not, use predicted token
            input = trg[t] if teacher_force else top1
        
        return outputs

In [9]:
enc = Encoder().cuda()
dec = Decoder().cuda()
model = Seq2Seq(enc,dec).cuda()

In [85]:
## 測試 forward
for i in train_iter:
    text = i.text
    label = i.label
    h,c = enc(text)
#     out, h, c = dec(label, h, c)
    break

In [87]:
## 測試 forward
for i in train_iter:
    text = i.text
    label = i.label
    output = model(text,label)
    print(output.shape)
    break

torch.Size([12, 32, 66741])


In [10]:
criterion = nn.CrossEntropyLoss(ignore_index = LABEL.vocab.stoi[LABEL.pad_token])
optim = torch.optim.Adam(model.parameters())

In [11]:
epoches = 5

In [12]:
model.train()
training_loss = []
for e in range(epoches):
    running_loss = 0.0
    for data in tqdm(train_iter):
        text = data.text.cuda()
        label = data.label.cuda()
        
        output = model(text,label)
        
        output_dim = output.shape[-1]
        output = output[1:].view(-1, output_dim)
        label = label[1:].view(-1)
        
        optim.zero_grad()
        loss = criterion(output,label)
        loss.backward()
        optim.step()
        
        output = output.detach().cpu()
        text = text.detach().cpu()
        label = label.detach().cpu()
        
        running_loss += loss.item()
    training_loss.append(running_loss/len(train))
    print(f'epoch {e+1} : {running_loss/len(train)}')

  0%|                                                                              | 8/13323 [00:03<1:28:07,  2.52it/s]


RuntimeError: CUDA out of memory. Tried to allocate 262.00 MiB (GPU 0; 3.00 GiB total capacity; 1.95 GiB already allocated; 39.86 MiB free; 44.52 MiB cached)

In [None]:
plt.plot(training_loss)

In [None]:
model.eval()
with torch.no_grad():
    running_hit = 0.0
    for data in tqdm(train_iter):
        text = data.text[0].cuda()
        label = (data.label.cuda()-1)
        
        output = model(text)
        
        pred = output.argmax(dim=1)
        pred = torch.sum(pred == label)
        running_hit += pred.item()
    print(running_hit/len(train))

In [208]:
torch.save(model.state_dict(),'textcnn.pth')