In [1]:
!pip install -U torchtext==0.10.0

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting torchtext==0.10.0
  Downloading torchtext-0.10.0-cp37-cp37m-manylinux1_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 26.3 MB/s 
Collecting torch==1.9.0
  Downloading torch-1.9.0-cp37-cp37m-manylinux1_x86_64.whl (831.4 MB)
[K     |████████████████████████████████| 831.4 MB 2.6 kB/s 
Installing collected packages: torch, torchtext
  Attempting uninstall: torch
    Found existing installation: torch 1.12.1+cu113
    Uninstalling torch-1.12.1+cu113:
      Successfully uninstalled torch-1.12.1+cu113
  Attempting uninstall: torchtext
    Found existing installation: torchtext 0.13.1
    Uninstalling torchtext-0.13.1:
      Successfully uninstalled torchtext-0.13.1
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torchvision 0.13.1

In [2]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F

from torchtext.legacy import data
import torchtext.datasets as datasets

In [4]:
class RNN_Text(nn.Module):
  def __init__(self, embed_num, class_num):
    super(RNN_Text, self).__init__()

    #V : 단어 사전 크기
    #C : 분류하고자 하는 클래스 개수
    #H : 히든 사이즈
    #D : 단어벡터 차원

    V = embed_num
    C = class_num
    H = 256
    D = 100
    
    self.embed = nn.Embedding(V,D)
    self.rnn = nn.LSTM(D,H,bidirectional = True)
    self.out = nn.Linear(H*2,C)
  
  def forward(self, x):
    x = self.embed(x)
    x,_ = self.rnn(x,(self.h,self.c))
    logit = self.out(x[-1])

    return logit
  
  def inithidden(self,b):
    self.h = torch.randn(2,b,256)
    self.c = torch.randn(2,b,256)


In [5]:
class mydataset(data.Dataset):
  @staticmethod
  def sort_key(ex):
    return len(ex.text)
  def __init__(self, text_field, label_field, path=None, examples=None, **kwargs):
    fields = [('text',text_field),('label',label_field)]
    if examples is None:
      path = self.dirname if path is None else path
      examples = []
      for i,line in enumerate(open(path,'r',encoding='utf-8')):
        if i == 0:
          continue
        line = line.strip().split('\t')
        txt = line[1].split(' ')

        examples += [data.Example.fromlist([txt,line[2]],fields)]
    super(mydataset, self).__init__(examples, fields, **kwargs)

In [6]:
text_field = data.Field(fix_length=30)
label_field = data.Field(sequential=False, batch_first = True, unk_token = None)

train_data = mydataset(text_field, label_field,path='/content/gdrive/My Drive/Colab Notebooks/aivle/data/nsm/small_ratings_train_tok.txt')
test_data = mydataset(text_field, label_field,path='/content/gdrive/My Drive/Colab Notebooks/aivle/data/nsm/small_ratings_test_tok.txt')

text_field.build_vocab(train_data)
label_field.build_vocab(train_data)

train_iter,test_iter = data.Iterator.splits(
    (train_data,test_data),
    batch_sizes=(100,1), repeat=False
)


In [7]:
rnn = RNN_Text(len(text_field.vocab),2)
optimizer = torch.optim.Adam(rnn.parameters())
rnn.train()

RNN_Text(
  (embed): Embedding(21893, 100)
  (rnn): LSTM(100, 256, bidirectional=True)
  (out): Linear(in_features=512, out_features=2, bias=True)
)

In [8]:
%%time
for epoch in range(10):
  totalloss = 0
  for batch in train_iter:
    optimizer.zero_grad()

    txt=batch.text
    label=batch.label

    rnn.inithidden(txt.size(1))
    pred = rnn(txt)

    loss = F.cross_entropy(pred,label)
    totalloss += loss.data

    loss.backward()
    optimizer.step()
  
  print(epoch,'epoch')
  print('loss : {:.3f}'.format(totalloss.numpy()))

0 epoch
loss : 69.768
1 epoch
loss : 69.269
2 epoch
loss : 65.924
3 epoch
loss : 53.854
4 epoch
loss : 41.693
5 epoch
loss : 31.672
6 epoch
loss : 24.346
7 epoch
loss : 17.879
8 epoch
loss : 13.014
9 epoch
loss : 9.869
CPU times: user 6min 23s, sys: 13.5 s, total: 6min 37s
Wall time: 6min 53s


In [10]:
%%time

from sklearn.metrics import classification_report
correct = 0
incorrect = 0
rnn.eval()
y_test =[]
prediction =[]

for batch in test_iter:
  txt = batch.text
  label = batch.label
  y_test.append(label.data[0])
  
  rnn.inithidden(txt.size(1))

  pred = rnn(txt)

  _, ans = torch.max(pred,dim=1)
  prediction.append(ans.data[0])

  if ans.data[0] == label.data[0]:
    correct +=1
  else:
    incorrect +=1


print('correct : ',correct)
print('incorrect : ',incorrect)
print(classification_report(
    torch.tensor(y_test),
    torch.tensor(prediction),
    digits=4,
    target_names=['negative','positive']
))

correct :  84
incorrect :  16
              precision    recall  f1-score   support

    negative     0.7742    0.9600    0.8571        50
    positive     0.9474    0.7200    0.8182        50

    accuracy                         0.8400       100
   macro avg     0.8608    0.8400    0.8377       100
weighted avg     0.8608    0.8400    0.8377       100

CPU times: user 781 ms, sys: 3.97 ms, total: 785 ms
Wall time: 778 ms
