In [1]:
max_length = 256

## 1. 데이터 불러오기

In [3]:
import pandas as pd
df = pd.read_csv('sms.tsv', sep='\t',)
print(df.columns)
print(df.shape)

Index(['label', 'sms'], dtype='object')
(5572, 2)


In [4]:
df.head()

Unnamed: 0,label,sms
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


## 1.2 데이터 전처리

In [5]:
#클래스 파악 
classes = sorted(set(df['label']))
class_to_index = {}

for i, c in enumerate(classes): #모든 클래스에 대해
    class_to_index.update({c: i})

nclass = len(classes)

print("# of classes: %d" %nclass)
print(classes)
print(class_to_index)

# of classes: 2
['ham', 'spam']
{'ham': 0, 'spam': 1}


## 2. 새로운 DataFrame
### 1) 'label, sms'만 남기기
### 2) 최대 텍스트 길이 만큼 자르기 #pandas.Series.str.slice

In [12]:
new_df = pd.DataFrame({'label': df['label'],
                      'sms': df['sms'].str.slice(
                      start = 0 , stop= max_length)})

In [13]:
len(new_df)

5572

### 3) 중복제거

In [14]:
new_df = pd.DataFrame(new_df.drop_duplicates())

In [15]:
len(new_df)

5169

### 4) 셔플

In [16]:
df_shuffled = new_df.sample(frac = 1).reset_index(drop = True)
df_shuffled.head()

Unnamed: 0,label,sms
0,ham,Are you staying in town ?
1,ham,Get me out of this dump heap. My mom decided t...
2,ham,Okie ü wan meet at bishan? Cos me at bishan no...
3,ham,NO GIFTS!! You trying to get me to throw mysel...
4,spam,REMINDER FROM O2: To get 2.50 pounds free call...


### 5) train, test 나누기

In [18]:
train_ratio = 0.9

#train dataset
s,e = 0, int(df_shuffled.shape[0] * train_ratio)
df_train = pd.DataFrame({'label': df_shuffled['label'][s:e],
                        'sms': df_shuffled['sms'][s:e]})
print("index for train: %d ~ %d" %(s,e))

#test dataset
s,e = e, e+int(df_shuffled.shape[0] * (1.0 - train_ratio))
print("index for test: %d~%d" %(s,e))

df_test = pd.DataFrame({'label':df_shuffled['label'][s:e],
                       'sms': df_shuffled['sms'][s:e]})

index for train: 0 ~ 4652
index for test: 4652~5168


In [19]:
print(df_train.shape)
print(df_test.shape)

(4652, 2)
(516, 2)


### 6) 저장

In [20]:
df_train.to_csv('./sms.maxlen.uiq.shuf.train.tsv', header = False, index = False, sep ='\t')
df_test.to_csv('./sms.maxlen.uniq.shuf.test.tsv', header = False, index = False, sep= '\t')

## 데이터 로더

In [21]:
import torch
print(torch.__version__)

1.4.0


In [22]:
!pip install torchtext==0.4.0

Collecting torchtext==0.4.0
  Downloading torchtext-0.4.0-py3-none-any.whl (53 kB)
[K     |████████████████████████████████| 53 kB 2.2 MB/s  eta 0:00:01
Installing collected packages: torchtext
Successfully installed torchtext-0.4.0
You should consider upgrading via the '/home/ec2-user/anaconda3/envs/pytorch_p36/bin/python -m pip install --upgrade pip' command.[0m


In [23]:
import torchtext
import numpy as np

## RNN + SMS 구현

### 01. 라이브러리 임포트

In [26]:
import torch
import torch.nn as nn
import torchvision.datasets as dset
import torchvision.transforms as transforms
import matplotlib.pyplot as plt
from torch.autograd import Variable
import numpy as np

### 02. 하이퍼파라미터 셋팅

In [27]:
#Hyper-parameters
batch_size = 128
num_epochs = 10

word_vec_size = 256
dropout_p = 0.3

hidden_size = 512
num_layers = 4

learning_rate =0.001

In [28]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

### 1. SMS train, test dataset 가져오기

In [30]:
from data_loader import DataLoader

In [34]:
loaders = DataLoader(
    train_fn = './sms.maxlen.uiq.shuf.train.tsv',
    batch_size = batch_size,
    valid_ratio = .2,
    device = -1,
    max_vocab = 999999,
    min_freq = 5
) #80%가 train, 20%가 validation

In [39]:
test_loaders = DataLoader(
    train_fn = './sms.maxlen.uniq.shuf.test.tsv',
    batch_size = batch_size,
    valid_ratio = .01,
    device = -1,
    max_vocab = 999999,
    min_freq = 5
) #test의 0값을 받아들이지 않으므로, 0.01

### 2. 대략적인 데이터 형태

In [41]:
print("|train| = ", len(loaders.train_loader.dataset),
     "|test| = ", len(loaders.valid_loader.dataset))

vocab_size = len(loaders.text.vocab)
num_classes = len(loaders.label.vocab)
print("|vocab| = ", vocab_size, "|classes| =", num_classes)

|train| =  3722 |test| =  930
|vocab| =  1566 |classes| = 2


### 3. RNN+SMS 구현

In [42]:
# 데이터 로드 함수 이해하기
n = 3
for i, data in enumerate(loaders.train_loader):
    labels = data.label
    texts = data.text
    
    if i>n:
        break
    print("[%d]" %i)
    print("한번에 로드되는 데이터 크기:", len(labels))
    
    #출력
    for j in range(n):
        label = labels[j].numpy()
        text= texts[j].numpy()
        print("label:", label)
        print("text:", text.shape)

[0]
한번에 로드되는 데이터 크기: 128
label: 0
text: (12,)
label: 0
text: (12,)
label: 0
text: (12,)
[1]
한번에 로드되는 데이터 크기: 10
label: 0
text: (60,)
label: 0
text: (60,)
label: 0
text: (60,)
[2]
한번에 로드되는 데이터 크기: 128
label: 0
text: (10,)
label: 0
text: (10,)
label: 0
text: (10,)
[3]
한번에 로드되는 데이터 크기: 128
label: 0
text: (8,)
label: 0
text: (8,)
label: 0
text: (8,)


### 4. 모델 선언

In [43]:
class RNN(nn.Module):
    def __init__(self, input_size, word_vec_size, hidden_size, n_classes, num_layers=4, dropout_p =0.3):
        super(RNN, self).__init__()
        
        self.input_size = input_size
        self.word_vec_size = word_vec_size #이부분 추가함
        self.hidden_size = hidden_size
        self.n_classes = n_classes
        self.num_layers = num_layers
        self.dropout_p = dropout_p
        
        #입력 차원(vocab_size), 출력 차원(word_vec_size)
        self.emb = nn.Embedding(input_size, word_vec_size)
        
        self.lstm = nn.LSTM(input_size = word_vec_size,
                           hidden_size = hidden_size,
                           num_layers = num_layers,
                           dropout = dropout_p,
                           batch_first = True,
                           bidirectional = True)
        self.fc = nn.Linear(hidden_size*2, num_classes)
        self.activation = nn.LogSoftmax(dim = -1)
        
    def forward(self, x):
        x = self.emb(x)
        
        x, _= self.lstm(x)
        
        out = self.activation(self.fc(x[:,-1]))
        
        return out

In [44]:
model = RNN(input_size = vocab_size,
           word_vec_size = word_vec_size,
           hidden_size = hidden_size,
           n_classes = num_classes,
           num_layers = num_layers,
           dropout_p= dropout_p)

In [49]:
def ComputeAccr(dloader, imodel):
    correct = 0
    total = 0
    
    model.eval()
    for i, data in enumerate(dloader):
        texts = data.text.to(device)
        labels = data.label.to(device)
        
        #forward prop.
        output = model(texts)
        _, output_index = torch.max(output,1)
        
        total += labels.size(0)
        correct += (output_index == labels).sum().float()
    #print("Accuracy of Test Data:{}".format(100 *correct /total))
    return(100*correct/total).numpy() #tensor 2 numpy

In [50]:
print("Accuracy of Test Data: %.2f" %ComputeAccr(loaders.valid_loader, model))

Accuracy of Test Data: 14.62


### 5. loss, optimizer

In [51]:
loss_func = nn.NLLLoss()
optimizer = torch.optim.Adam(model.parameters(), lr = learning_rate)

### 6. 학습

In [53]:
total_step = len(loaders.train_loader)
for epoch in range(num_epochs):
    for i, data in enumerate(loaders.train_loader):
        texts = data.text.to(device)
        labels = data.label.to(device)
        
        print("[%d]" %i)
        
        #forward prop
        outputs =model(texts)
        loss = loss_func(outputs, labels)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        if (i+1) % 10 == 0:
            print("Epoch[{}/{}], Step[{}/{}], Loss{:.4f}, Accr{:.2f}"
                 .format(epoch+1, num_epochs, i+1, total_step, 
                        loss.item(), ComputeAccr(loaders.valid_loader, model)))

[0]
[1]
[2]
[3]
[4]
[5]
[6]
[7]
[8]
[9]
Epoch[1/10], Step[10/30], Loss0.2794, Accr88.06
[10]
[11]
[12]
[13]
[14]
[15]
[16]
[17]
[18]
[19]
Epoch[1/10], Step[20/30], Loss0.1816, Accr88.06
[20]
[21]
[22]
[23]
[24]
[25]
[26]
[27]
[28]
[29]
Epoch[1/10], Step[30/30], Loss0.2084, Accr88.06
[0]
[1]
[2]
[3]
[4]
[5]
[6]
[7]
[8]
[9]
Epoch[2/10], Step[10/30], Loss0.9573, Accr88.06
[10]
[11]
[12]
[13]
[14]
[15]
[16]
[17]
[18]
[19]
Epoch[2/10], Step[20/30], Loss0.7982, Accr88.17
[20]
[21]
[22]
[23]
[24]
[25]
[26]
[27]
[28]
[29]
Epoch[2/10], Step[30/30], Loss0.4675, Accr87.96
[0]
[1]
[2]
[3]
[4]
[5]
[6]
[7]
[8]
[9]
Epoch[3/10], Step[10/30], Loss0.1546, Accr88.06
[10]
[11]
[12]
[13]
[14]
[15]
[16]
[17]
[18]
[19]
Epoch[3/10], Step[20/30], Loss0.5452, Accr88.06
[20]
[21]
[22]
[23]
[24]
[25]
[26]
[27]
[28]
[29]
Epoch[3/10], Step[30/30], Loss1.0574, Accr88.06
[0]
[1]
[2]
[3]
[4]
[5]
[6]
[7]
[8]
[9]
Epoch[4/10], Step[10/30], Loss0.1592, Accr89.35
[10]
[11]
[12]
[13]
[14]
[15]
[16]
[17]
[18]
[19]
Epoch[4/10

## 7. 테스트

In [54]:
print("Accuracy of Valid Data: %.2f" %ComputeAccr(loaders.valid_loader, model))

Accuracy of Valid Data: 96.24


## 8. 학습된 파라미터 저장

In [55]:
netname = './nets/rnn_weight.pkl'
torch.save(model, netname,)

  "type " + obj.__name__ + ". It won't be checked "


## 9. 학습된 파라미터 로드

In [56]:
netname = './nets/rnn_weight.pkl'
model = torch.load(netname)

In [57]:
print("accuracy of Valid Data: %.2f" %ComputeAccr(loaders.valid_loader, model))

accuracy of Valid Data: 96.24
