In [2]:
import ast
import json
import os
import pickle
import re
import sys
import urllib.request
from collections import Counter

import konlpy
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn
import torch
import torch.nn as nn
import torch.optim as optim
import tqdm
from konlpy.tag import Okt
from sklearn import model_selection
from sklearn.metrics import accuracy_score
from torch.optim import lr_scheduler
from torch.utils.data import Dataset, DataLoader
from torchtext.data.utils import get_tokenizer

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


ModuleNotFoundError: No module named 'konlpy'

In [None]:
# Fetch the Naver shopping review corpus; each row is "<rating>\t<review text>".
corpus_url = "https://raw.githubusercontent.com/bab2min/corpus/master/sentiment/naver_shopping.txt"
urllib.request.urlretrieve(corpus_url, filename="ratings_total.txt")

# Load, drop repeated review texts, and renumber the rows from 0.
reviews_df = (
    pd.read_table('ratings_total.txt', names=['ratings', 'reviews'])
    .drop_duplicates('reviews')
    .reset_index(drop=True)
)
# Binary target: ratings above 3 become 1, everything else 0.
reviews_df['label'] = (reviews_df['ratings'] > 3).astype('int64')


In [None]:
def tokenize_and_stemming(sentence, tokenizer):
    """Clean a sentence, POS-tag it, and return the stemmed tokens worth keeping.

    Args:
        sentence: Raw text of one review.
        tokenizer: Object exposing ``pos(text, stem=True)`` that returns
            ``(token, pos_tag)`` pairs (an ``Okt`` instance in this notebook).

    Returns:
        list[str]: Tokens longer than one character whose tag is in the
        allow-list below, in their original order.
    """
    # Keep whitespace, digits, Latin letters, Hangul consonant jamo (ㄱ-ㅎ),
    # Hangul vowel jamo (ㅏ-ㅣ) and Hangul syllables (가-힣); strip the rest.
    # The vowel range previously ended in an ASCII '|' instead of 'ㅣ', which
    # made `re` raise "bad character range" on every call.
    sentence = re.sub(r"[^\s0-9a-zA-Zㄱ-ㅎㅏ-ㅣ가-힣]", "", sentence)
    raw_pos_tagged = tokenizer.pos(sentence, stem=True)

    # "Conjunction" was misspelled "Conjuction", so conjunctions never matched.
    keep_pos = {"Noun", "VerbPrefix", "Verb", "Adverb", "Adjective",
                "Conjunction", "KoreanParticle"}

    return [
        token
        for token, pos in raw_pos_tagged
        if len(token) != 1 and pos in keep_pos
    ]

In [None]:
okt = Okt()
tokenized_sentences = []

for sentence in tqdm.tqdm(reviews_df['reviews']):
    try:
        tokenized_sentences.append(tokenize_and_stemming(sentence, okt))
    except Exception as exc:
        # Best effort: keep one entry per row so the list stays aligned with
        # reviews_df, but report *why* the row failed. A bare `except:` here
        # would also swallow KeyboardInterrupt and hide systematic failures.
        print("Error occured at :", sentence, "->", repr(exc))
        tokenized_sentences.append([])

# Replace the raw review text with its token list.
reviews_df['reviews'] = tokenized_sentences

In [None]:
data_path = '/data/ephemeral/temp_data/'

# exist_ok=True removes the separate existence check.
os.makedirs(data_path, exist_ok=True)

reviews_df.to_csv(data_path + 'data_review_tokenized.csv', encoding='utf-8', index=False)
reviews_df = pd.read_csv(data_path + 'data_review_tokenized.csv', encoding='utf-8')
# The CSV round trip turns each token list into its string repr. Parse it back
# with ast.literal_eval, which only accepts Python literals, instead of eval,
# which would execute arbitrary expressions found in the file.
reviews_df['reviews'] = reviews_df['reviews'].apply(ast.literal_eval)

# First split (70/30); its training part is what the token-frequency
# statistics in the next cells are computed from.
train_x, test_x, train_y, test_y = model_selection.train_test_split(
    reviews_df['reviews'], reviews_df['label'], test_size=0.3, random_state=44
)


In [None]:
# Stopword handling: gather every token of the training split and count them.
tokens = []

for token_lst in train_x:
    tokens.extend(token_lst)

# Counter maps each distinct token to its number of occurrences.
tokens_cnted = Counter(tokens)

# Number of distinct tokens. This line used `==` (a comparison), which raised
# NameError because n_all_tkn had never been assigned.
n_all_tkn = len(tokens_cnted)
# Number of tokens that occur exactly once.
n_rare_tkn = sum(1 for count in tokens_cnted.values() if count == 1)


In [None]:

# Report vocabulary size and how much of it consists of tokens seen only once.
print(f"중복 제거 전체 형태소 개수: {n_all_tkn}")
print(f"빈도 1인 형태소 수: {n_rare_tkn}")
rare_pct_of_unique = (n_rare_tkn / n_all_tkn) * 100
print(f"중복 제거 전체 형태소 중 빈도 1인 형태소 비율: {rare_pct_of_unique:.3f}")
rare_pct_of_all = (n_rare_tkn / len(tokens)) * 100
print(f"중복 포함 전체 형태소 중 빈도 1인 형태소 비율: {rare_pct_of_all:.3f}")


빈도가 엄청 높은데 이걸 제외해서 과적합이 나는 거 아닙니까?

In [None]:
def stopwords(text, counts=None):
    """Remove tokens that occur exactly once in the frequency table.

    Args:
        text: Iterable of tokens for one review.
        counts: Mapping of token -> frequency. Defaults to the notebook-level
            ``tokens_cnted`` so existing ``.apply(stopwords)`` calls still work.

    Returns:
        list: The tokens of ``text`` whose frequency is not 1. Tokens missing
        from the table are kept.
    """
    if counts is None:
        counts = tokens_cnted
    # One dictionary lookup per token. The earlier version rebuilt the full
    # list of frequency-1 tokens on every call and scanned it linearly.
    return [token for token in text if counts.get(token) != 1]

pickle로 저장하면 좋겠는데요?!!

In [None]:
# Store the stopword-filtered tokens in a new 'token' column, then persist the frame.
fin_csv_path = data_path + 'data_reviews_fin.csv'
reviews_df['token'] = reviews_df['reviews'].apply(stopwords)
reviews_df.to_csv(fin_csv_path, index=False, encoding='utf-8')

In [None]:
data_path = '/data/ephemeral/temp_data/'
# The tokenized frame was written as 'data_review_tokenized.csv'; the name read
# here had an extra 's' and pointed at a file that is never created.
reviews_df = pd.read_csv(data_path + 'data_review_tokenized.csv', encoding='utf-8')
# Before parsing: the 'reviews' cells are strings holding a list repr.
print(reviews_df.head(1))
# literal_eval only accepts Python literals, unlike eval which runs any expression.
reviews_df['reviews'] = reviews_df['reviews'].apply(ast.literal_eval)
# After parsing: the 'reviews' cells are real lists again.
print(reviews_df.head(1))

In [None]:
# 'token' column = reviews with the stopwords (frequency-1 tokens) removed.
reviews_df = pd.read_csv(data_path + 'data_reviews_fin.csv', encoding='utf-8')
# The CSV round trip stringifies both list columns, so parse them back.
# literal_eval only accepts Python literals, unlike eval which runs any expression.
reviews_df['token'] = reviews_df['token'].apply(ast.literal_eval)
reviews_df['reviews'] = reviews_df['reviews'].apply(ast.literal_eval)
reviews_df.head()

In [None]:
def getbow(corpus):
    """Build a token -> integer-id vocabulary from an iterable of token lists.

    Ids 0 and 1 are reserved for '<PAD>' and '<UNK>'; every other token gets
    the next free id in order of first appearance.
    """
    vocab = {'<PAD>': 0, '<UNK>': 1}

    for sentence in corpus:
        for tok in sentence:
            # setdefault only inserts when the token is new, so ids stay dense.
            vocab.setdefault(tok, len(vocab))
    return vocab

In [None]:
print(f"데이터 전체 수: {len(reviews_df)}")
# Rows whose token list is identical before and after stopword removal.
# The filtered column is named 'token'; 'toke' raised KeyError.
print(f"불용어 처리된 길이 수 : {len(reviews_df[reviews_df['reviews'] == reviews_df['token']])}")
# NOTE(review): these two numbers are hard-coded, presumably copied from an
# earlier run's output of the two lines above — confirm or compute them.
print("불용어 수", 199908-183586)

In [None]:

# Final 80/20 split on the stopword-filtered tokens.
# The filtered column is named 'token'; 'toke' raised KeyError.
train_x, test_x, train_y, test_y = model_selection.train_test_split(reviews_df['token'], reviews_df['label'],
                                                                    test_size=0.2, random_state=44)

In [None]:
# The vocabulary (token -> id) is built from the training split only.
corpus = train_x
korbow = getbow(corpus)

In [None]:
# Reverse lookup: id -> token.
inkorbow = dict((idx, word) for word, idx in korbow.items())
len(inkorbow)

In [None]:
# Token count of every training review, plus the longest one observed.
length_lst = [len(review_tokens) for review_tokens in train_x]
max_length = max(length_lst, default=0)

# The observed maximum is then replaced by a fixed sequence length.
max_length = 50 # > 43

In [None]:
class MyDataset(Dataset):
    """Dataset that turns token lists into fixed-length arrays of vocabulary ids.

    Args:
        data_x: pandas Series of token lists, one per sample.
        data_y: pandas Series of labels; stored as its ``.values`` array.
        korbow: Mapping token -> id; must contain '<PAD>' and '<UNK>'.
        max_length: Length every sample is padded or truncated to.
    """

    def __init__(self, data_x, data_y, korbow, max_length):
        self.x = data_x
        self.y = data_y.values
        self.korbow = korbow
        self.max_length = max_length

    def padding(self, x):
        """Return a new list of exactly ``max_length`` tokens.

        Longer inputs are truncated and shorter ones are right-padded with
        '<PAD>'. The input list is not modified; the earlier ``+=`` appended
        pad tokens to the caller's own lists and left long reviews ragged.
        """
        clipped = list(x[:self.max_length])
        return clipped + ['<PAD>'] * (self.max_length - len(clipped))

    def __len__(self):
        return self.x.shape[0]

    def __getitem__(self, idx):
        # Index by position when the container offers .iloc (pandas Series),
        # so a Series without a 0..n-1 index still works.
        rows = getattr(self.x, 'iloc', self.x)
        tokens = self.padding(rows[idx])

        # Use the vocabulary given to this instance, not a module-level global.
        unk_id = self.korbow['<UNK>']
        x = np.array([self.korbow.get(word, unk_id) for word in tokens], dtype=np.int64)
        y = self.y[idx]

        return x, y

In [None]:
def prepare_loader(data_x, label_y, idx_num, batch_size, train=True):
    """Wrap token/label Series in DataLoaders.

    Uses the notebook-level ``korbow`` and ``max_length`` when building the
    datasets.

    Args:
        data_x: pandas Series of token lists.
        label_y: pandas Series of labels aligned with ``data_x``.
        idx_num: With ``train=True``, the first ``idx_num`` rows form the
            training set and the rest the validation set. Ignored otherwise.
        batch_size: Batch size of every returned loader.
        train: True -> return ``(train_loader, valid_loader)``;
            False -> return one unshuffled loader over all rows.
    """
    if not train:
        test_df = data_x.reset_index(drop=True)
        test_lab = label_y.reset_index(drop=True)

        test_ds = MyDataset(test_df, test_lab, korbow, max_length)
        return DataLoader(test_ds, batch_size, shuffle=False)

    # Positional split; reset_index gives each part its own 0..n-1 index.
    train_df = data_x.iloc[:idx_num].reset_index(drop=True)
    train_y = label_y.iloc[:idx_num].reset_index(drop=True)
    valid_df = data_x.iloc[idx_num:].reset_index(drop=True)
    valid_y = label_y.iloc[idx_num:].reset_index(drop=True)

    train_dataset = MyDataset(train_df, train_y, korbow, max_length)
    valid_dataset = MyDataset(valid_df, valid_y, korbow, max_length)

    train_dataloader = DataLoader(train_dataset, batch_size, shuffle=True)
    # Keyword was misspelled `shuflle`, which made DataLoader raise TypeError.
    valid_loader = DataLoader(valid_dataset, batch_size, shuffle=False)

    return train_dataloader, valid_loader


In [None]:
# Use the first CUDA device when one is available, otherwise the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
device

In [None]:
class LSTM_CLS(nn.Module):
    """Embedding -> LSTM -> linear classifier over the LSTM's final hidden state."""

    def __init__(self, vocab_size, emb_dim, hidden_dim, ouput_dim):
        super().__init__()
        self.emb = nn.Embedding(vocab_size, emb_dim)
        # batch_first=True: the LSTM takes and returns (batch, seq, feature)
        # tensors instead of the default (seq, batch, feature).
        self.lstm = nn.LSTM(emb_dim, hidden_dim, batch_first=True)
        self.fclayer = nn.Linear(hidden_dim, ouput_dim)

    def forward(self, x):
        """Map a batch of token-id sequences to per-class scores."""
        # Only the final hidden state h_n is used; the per-step outputs are discarded.
        _, (h_n, _) = self.lstm(self.emb(x))
        # Drop h_n's leading dimension (size 1 for this LSTM) before the linear layer.
        return self.fclayer(h_n.squeeze(0))

In [None]:
# Model dimensions.
VOCAB_SIZE = len(korbow)   # one embedding row per vocabulary entry
EMB_DIM = 100
HIDDEN_DIM = 128
OUTPUT_DIM = 2

# Optimisation settings.
LEARNING_RATE = 0.001
NUM_EPOCHS = 100

In [None]:
# Build the classifier and move its parameters to the selected device.
model = LSTM_CLS(
    vocab_size=VOCAB_SIZE,
    emb_dim=EMB_DIM,
    hidden_dim=HIDDEN_DIM,
    ouput_dim=OUTPUT_DIM,
).to(device)
model

In [None]:
# Cross-entropy over the class scores produced by the model.
criterion = nn.CrossEntropyLoss()
# NOTE: this rebinds `optim`, hiding the `torch.optim as optim` module alias
# from the import cell; the training loop relies on the name `optim`.
optim = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

In [None]:
def accuracy(output, labels):
    """Return the fraction of rows whose arg-max over dim 1 equals the label."""
    # .item() converts the one-element tensor into a plain Python number.
    n_correct = (output.argmax(dim=1) == labels).sum().item()
    return n_correct / labels.size(0)

In [None]:
# Rows before index 127940 of the training split are used for training and the
# remaining rows for validation.
# NOTE(review): the original note estimated ~28,000 validation rows and a
# 0.5:0.2:0.3 train:valid:test ratio but was itself unsure — re-check the
# hard-coded index against len(train_x).
train_loader, valid_loader = prepare_loader(
    data_x=train_x, label_y=train_y, idx_num=127940, batch_size=128
)
test_loader = prepare_loader(
    data_x=test_x, label_y=test_y, idx_num=0, batch_size=64, train=False
)


In [None]:
def train():
    """Train the notebook-level ``model`` with early stopping on validation loss.

    Reads the globals ``model``, ``optim``, ``criterion``, ``device``,
    ``train_loader``, ``valid_loader`` and ``NUM_EPOCHS``. Writes per-epoch
    history to the globals ``train_p``, ``valid_p``, ``train_accs`` and
    ``valid_accs``, and saves the weights with the lowest validation loss to
    'best_model_checkpoint.pth'.
    """
    global train_p, valid_p, train_accs, valid_accs

    best_loss = float('inf')
    early_stop = 30               # epochs without improvement before stopping
    low_epoch = float('inf')      # epoch index of the best validation loss so far

    train_p, valid_p = [], []
    train_accs, valid_accs = [], []
    for epoch in range(NUM_EPOCHS):
        # evaluate() switches the model to eval mode, so restore training mode
        # at the start of every epoch.
        model.train()
        train_loss, train_correct, train_total = 0, 0, 0
        for X, Y in train_loader:
            # Batches come from the loader on the CPU; the model lives on `device`.
            X, Y = X.to(device), Y.to(device)
            out = model(X)
            loss = criterion(out, Y)
            optim.zero_grad()
            loss.backward()
            optim.step()

            # .item() extracts a Python float so no tensor is kept alive.
            train_loss += loss.item()
            train_correct += accuracy(out, Y) * Y.size(0)
            train_total += Y.size(0)

        train_acc = train_correct / train_total
        train_loss /= len(train_loader)

        valid_loss, valid_acc = evaluate(model, valid_loader, criterion, device)
        # Store plain floats in the history even if evaluate() hands back a tensor.
        valid_loss = float(valid_loss)

        train_p.append(train_loss)
        valid_p.append(valid_loss)
        train_accs.append(train_acc)
        valid_accs.append(valid_acc)

        print(f'Epoch {epoch + 1}/{NUM_EPOCHS}')
        print(f"Train Loss: {train_loss:.4f},    Train_Acc: {train_acc:.4f}")
        print(f"Validation loss {valid_loss:.4f},    Validation Acc: {valid_acc:.4f}")

        if valid_loss < best_loss:
            print(f"최고 값 loss {best_loss:.4f} 에서 {valid_loss:.4f} 로 변경.")
            best_loss = valid_loss
            low_epoch = epoch
            torch.save(model.state_dict(), 'best_model_checkpoint.pth')
        else:
            # Stop once `early_stop` epochs have passed since the best epoch.
            if early_stop > 0 and low_epoch + early_stop < epoch + 1:
                print("Early Stop")
                break





In [None]:
def evaluate(model, valid_loader, criterion, device):
    """Compute mean batch loss and accuracy of ``model`` over a loader.

    Puts the model in eval mode and leaves it there.

    Args:
        model: Module mapping an input batch to per-class scores.
        valid_loader: Sized iterable of ``(X, Y)`` tensor batches.
        criterion: Loss callable taking ``(scores, Y)``.
        device: Device the batches are moved to before the forward pass.

    Returns:
        tuple[float, float]: ``(mean loss per batch, fraction of correct rows)``.
    """
    val_loss, val_corr, val_tota = 0.0, 0, 0
    model.eval()

    # Was `torch.no.grad()`, which raises AttributeError.
    with torch.no_grad():
        for X, Y in valid_loader:
            X, Y = X.to(device), Y.to(device)
            predict = model(X)
            loss = criterion(predict, Y)

            # .item() keeps the running total a Python float rather than a
            # tensor, so the result can be formatted and plotted directly.
            val_loss += loss.item()
            # Count correct rows directly (arg-max over dim 1 vs. label).
            val_corr += (predict.argmax(dim=1) == Y).sum().item()
            val_tota += Y.size(0)

    val_acc = val_corr / val_tota
    val_loss /= len(valid_loader)

    return val_loss, val_acc


In [None]:
# Run the training loop; it fills the global history lists and writes
# 'best_model_checkpoint.pth' whenever the validation loss improves.
train()

In [None]:

# Restore the weights with the lowest validation loss. map_location remaps the
# stored tensors onto the current device, so a checkpoint written on a GPU can
# still be loaded on a CPU-only machine.
model.load_state_dict(torch.load('best_model_checkpoint.pth', map_location=device))
model.to(device)

val_loss, val_acc = evaluate(model, valid_loader, criterion, device)

print(f'Best valid loss: {val_loss:.4f}')
print(f'Best valid acc: {val_acc:.4f}')

In [None]:
# Score the current model on the held-out test loader.
test_metrics = evaluate(model, test_loader, criterion, device)
test_loss, test_acc = test_metrics

print(f'Best test loss: {test_loss:.4f}')
print(f'Best test acc: {test_acc:.4f}')

In [None]:
# Collect the per-epoch training history under display-friendly keys.
result = {
    "Train Loss": train_p,
    "Valid Loss": valid_p,
    "Train Acc": train_accs,
    "Valid Acc": valid_accs,
}

In [None]:
## Train/Valid History

plot_from = 0   # first epoch index to include in the plot
plt.figure(figsize=(20, 10))
plt.title("Train/Valid Loss History", fontsize = 20)

# One curve per loss series, with the x-axis counted from `plot_from`.
for series_name in ('Train Loss', 'Valid Loss'):
    history = result[series_name][plot_from:]
    plt.plot(range(len(history)), history, label=series_name)

plt.legend()
plt.yscale('log')
plt.grid(True)
plt.show()

In [None]:
# Class id -> label shown to the user ('부정' = negative, '긍정' = positive).
check = {0: '부정', 1:'긍정'}
okt = Okt()

def predict(text, model, korbow, check):
    """Classify one raw sentence and return its human-readable label.

    Uses the notebook-level ``okt`` tokenizer, ``tokenize_and_stemming`` and
    ``device``.

    Args:
        text: Raw input sentence.
        model: Trained classifier returning per-class scores.
        korbow: Mapping token -> id used during training.
        check: Mapping predicted class id -> label string.

    Returns:
        The entry of ``check`` for the predicted class.
    """
    model.eval()

    # Tokenize the sentence the same way the training data was tokenized.
    tokens = tokenize_and_stemming(text, okt)
    # Look up the unknown-token id instead of hard-coding 1.
    unk_id = korbow.get('<UNK>', 1)
    token_ids = [korbow.get(token, unk_id) for token in tokens]
    if not token_ids:
        # Every token can be filtered out (short or symbol-only input); a
        # zero-length sequence cannot be fed to the LSTM, so use one <UNK>.
        token_ids = [unk_id]
    batch = torch.tensor([token_ids], dtype=torch.long).to(device)

    with torch.no_grad():
        output = model(batch)

    predicted_class = torch.argmax(output, dim=1)
    return check[predicted_class.item()]

In [None]:
# Demo: classify a sample review.
text = '아 진짜 이 옷 별로네요'

predict(text=text, model=model, korbow=korbow, check=check)

In [None]:
# Demo: classify a sample review.
text = '너무 잘 산 것 같아요 감사합니다 잘 쓸게요.'
predict(text=text, model=model, korbow=korbow, check=check)

In [None]:
# Demo: classify a sample review.
text = '똥똥 잘 맞추는 것 같아 다행이네요.'
predict(text=text, model=model, korbow=korbow, check=check)

In [None]:
# Demo: classify a sample review.
text = '아 진짜 괜히 삼. 님들은 돈 낭비하지 마셈.'
predict(text=text, model=model, korbow=korbow, check=check)