## **`Mecab` Install on Colab**
출처 :  `https://colab.research.google.com/drive/1tL2WjfE0v_es4YJCLGoEJM5NXs_O_ytW#scrollTo=Z7PCBmGrsR4Y`

In [0]:
# Install KoNLPy plus a headless OpenJDK 8 and the JPype1-py3 package.
!pip3 install konlpy       # Python 3.x
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!pip3 install JPype1-py3

import os
# Download mecab-ko 0.996-ko-0.9.1 into /tmp, then configure, build, check and install it.
os.chdir('/tmp/')
!curl -LO https://bitbucket.org/eunjeon/mecab-ko/downloads/mecab-0.996-ko-0.9.1.tar.gz
!tar zxfv mecab-0.996-ko-0.9.1.tar.gz
os.chdir('/tmp/mecab-0.996-ko-0.9.1')
!./configure
!make
!make check
!make install

# Build and install GNU m4 1.4.9 from source.
# NOTE(review): presumably needed by the autoconf/automake builds further down - confirm.
os.chdir('/tmp/') 
!wget -O m4-1.4.9.tar.gz http://ftp.gnu.org/gnu/m4/m4-1.4.9.tar.gz
!tar -zvxf m4-1.4.9.tar.gz
os.chdir('/tmp/m4-1.4.9')
!./configure
!make
!make install

os.chdir('/tmp')
!curl -OL http://ftp.gnu.org/gnu/autoconf/autoconf-2.69.tar.gz
!tar xzf autoconf-2.69.tar.gz
os.chdir('/tmp/autoconf-2.69')
!./configure --prefix=/usr/local
!make
!make install
!export PATH=/usr/local/bin

# Build and install GNU automake 1.11 from source.
os.chdir('/tmp')
!curl -LO http://ftp.gnu.org/gnu/automake/automake-1.11.tar.gz
!tar -zxvf automake-1.11.tar.gz
os.chdir('/tmp/automake-1.11')
!./configure
!make
!make install

# Download the mecab-ko-dic 2.0.1 dictionary, refresh the linker cache, then
# generate the build files, build and install the dictionary.
os.chdir('/tmp')
!curl -LO https://bitbucket.org/eunjeon/mecab-ko-dic/downloads/mecab-ko-dic-2.0.1-20150920.tar.gz
!tar -zxvf mecab-ko-dic-2.0.1-20150920.tar.gz
os.chdir('/tmp/mecab-ko-dic-2.0.1-20150920')
!ldconfig
!ldconfig -p | grep /usr/local/lib
!./autogen.sh
!./configure
!make
# Disabled step: would write the dictionary directory into /usr/local/etc/mecabrc.
# !sh -c 'echo "dicdir=/usr/local/lib/mecab/dic/mecab-ko-dic" > /usr/local/etc/mecabrc'
!make install

# Clone, build and install the mecab Python binding from /content.
os.chdir('/content')
!git clone https://bitbucket.org/eunjeon/mecab-python-0.996.git
os.chdir('/content/mecab-python-0.996')
!python3 setup.py build
!python3 setup.py install

# Smoke test: constructing the tagger fails if any of the steps above went wrong.
from konlpy.tag import Mecab
m = Mecab()


## **Functions for preprocessing**
> **`import_normal_dset`** : returns normal news dset _(600000+)_<br>
> **`import_bad_dset`** : returns violation-of-law-interest news dset _(5465)_<br>
> **`tokenize_title_and_remove_stopword`** : returns tokenized and stopword-removed title list<br>
> **`testdata_tag`** : tags 20% of the data (bad and normal rows separately) as test data

In [0]:
import numpy as np, pandas as pd, seaborn as sns, matplotlib.pyplot as plt
import konlpy
from konlpy.tag import Mecab
import nltk
import json
import sklearn 
from pprint import pprint as print
import torch
import pickle, os, re, glob, ast
from matplotlib import font_manager, rc
import random

# Category list; the original note says it adopts the same scheme as Naver News.
category = ['정치', '세계', 'IT과학', '경제', '사회', '생활문화']

def import_normal_dset():
    """Load every normal-news CSV and return them as one labelled frame.

    Reads all files matching ``./newsdata/*_raw.csv`` (in sorted order, so the
    row order is reproducible), drops the saved ``Unnamed: 0`` index column,
    and renames the remaining four columns to Year, Category, Media, Title.
    Every row is labelled Reason='없음' and Result='정상'.

    Returns:
        pandas.DataFrame with columns Year, Category, Media, Title, Reason,
        Result and a fresh 0..n-1 index.

    Raises:
        FileNotFoundError: if no file matches the pattern (previously this
            surfaced as an AttributeError on an empty list).
    """
    pattern = u'./newsdata/*_raw.csv'
    paths = sorted(glob.glob(pattern))
    if not paths:
        raise FileNotFoundError('no normal-news CSV matches ' + pattern)

    # Concatenate once; growing a frame inside the loop copies it every time.
    dlist = pd.concat([pd.read_csv(path) for path in paths], axis=0)
    dlist = dlist.drop(['Unnamed: 0'], axis=1).reset_index(drop=True)
    dlist.columns = ['Year', 'Category', 'Media', 'Title']
    dlist['Reason'] = u'없음'
    dlist['Result'] = u'정상'
    return dlist

def import_bad_dset(state = 'res'):
    """Load and normalise the violation ("bad") news CSV ``./newsdata/<state>.csv``.

    Steps, in order:
      1. drop the ``Unnamed: 0`` and ``2`` columns and name the remaining six
         Year, Media, Space, Title, Reason, Result;
      2. fill missing Reason/Result with 'n', drop rows with any other missing
         value, and renumber the index;
      3. replace Reason/Result cells of length <= 1 (ditto marks in the raw
         data, or the 'n' filler) with the value of the row above;
      4. replace each Year cell by the integer 2012..2019 it mentions;
      5. derive Category from keywords found in Space; rows without a match
         inherit a category from their neighbours.

    Args:
        state: file stem under ``./newsdata/`` (default 'res').

    Returns:
        pandas.DataFrame with columns Year, Media, Space, Title, Reason,
        Result, Category.

    All writes go through ``.at`` / ``.loc`` / ``.iat``: the chained form
    ``data[col][row] = value`` is not guaranteed to write into ``data``.
    """
    DataPath = u'./newsdata/'+state+'.csv'
    data = pd.read_csv(DataPath)
    data = data.drop(['Unnamed: 0', '2'], axis='columns')
    data.columns = ['Year', 'Media', 'Space', 'Title', 'Reason', 'Result']
    data['Result'] = data['Result'].fillna('n')
    data['Reason'] = data['Reason'].fillna('n')
    data = data.dropna().reset_index(drop=True)

    # Restore ditto marks: a cell of length <= 1 repeats the row above. Going
    # top to bottom lets a restored value propagate through a run of marks.
    for column in ('Reason', 'Result'):
        for row in range(1, len(data)):
            if len(data.at[row, column]) <= 1:
                data.at[row, column] = data.at[row - 1, column]

    # Year formatting: compute every mask on the raw text first, then
    # overwrite the matching cells with the plain integer year.
    year_masks = {year: data['Year'].str.find(str(year)) != -1 for year in range(2012, 2020)}
    data['Year'] = data['Year'].astype(object)  # the column will hold ints next to text
    for year, mask in year_masks.items():
        data.loc[mask, 'Year'] = year

    # The raw data has no clean category field; map Space to one of six
    # categories by keyword and mark everything else 'Others'. Later entries
    # win when several keywords match.
    category_keywords = {
        '정치':['정치', '여의도', '국회', '시사', '자치'],
        '경제':['경제', 'Business', '산업'],
        '사회':['사회', '사고', '법원', '지역'],
        '생활문화':['문화', '연예', '스타', '라이프', '엔터', '방송', '엔터테인먼트'],
        '세계':['국제', '외교', '해외'],
        'IT과학':['IT', '과학', '의료']
    }

    data['Category'] = 'Others'
    for cat, keywords in category_keywords.items():
        for keyword in keywords:
            data.loc[data['Space'].str.find(keyword) != -1, 'Category'] = cat

    cat_col = data.columns.get_loc('Category')
    # The first five rows are hard-coded to '사회' (the reason is not visible
    # in this file); this also guarantees row 0 is never 'Others' below.
    data.iloc[0:5, cat_col] = '사회'

    # Rows are stored grouped by similar category, so an unmatched row takes
    # the largest category string among itself and its direct neighbours.
    for row in range(len(data)):
        if data.iat[row, cat_col] == 'Others':
            # max(row - 1, 0) keeps the window start from going negative.
            data.iat[row, cat_col] = data['Category'].iloc[max(row - 1, 0):row + 2].max()
    return data

def tokenize_title_and_remove_stopword(dat):
    """Tokenise ``dat['Title']`` with Mecab and filter the tokens.

    A token is kept when it is not in the stopword list and begins with at
    least one Hangul syllable (``re.match`` anchors only at the start of the
    token). A title left with no tokens becomes ten '0' padding tokens.

    Args:
        dat: DataFrame with a 'Title' column of strings.

    Returns:
        pandas.Series of token lists, indexed like ``dat['Title']``.

    The filtering runs through ``Series.apply`` rather than an integer loop
    over ``tdat[i]``, which was a label lookup and so only worked on a clean
    0..n-1 index.
    """
    tokenizer = Mecab()
    # Hand-picked stopwords, kept minimal because titles are short.
    stopwords = frozenset(['의','가','이','은','들','는','잘','과','도','를','로','으로','자','에','와','한','하다','면','어','다'])
    starts_with_hangul = re.compile('[가-힣]+').match

    def _filter_tokens(tokens):
        kept = [word for word in tokens if word not in stopwords and starts_with_hangul(word)]
        # Empty titles are padded so downstream code always has tokens to embed.
        return kept if kept else ['0']*10

    print("Tokenizing")
    tdat = dat['Title'].apply(tokenizer.morphs)
    print("Tokenize Done")
    tdat = tdat.apply(_filter_tokens)
    print("Remove Stopwords Done")
    return tdat


def testdata_tag(all_dset_token): # 20% 데이터에 대해 테스팅용임을 태깅 : bad, normal data 각각에 대하여.
    all_dset_token['Test'] = False
    all_dset_token['Test'][all_dset_token[all_dset_token['없음'] == 1].sample(frac = 0.2).index] = True
    all_dset_token['Test'][all_dset_token[all_dset_token['없음'] != 1].sample(frac = 0.2).index] = True
    return all_dset_token


## **Load Data**
> **`all_dset`** : (611347*5) sampled newstitle dataset from 2012/9/ to 2019/9<br>
> **`all_dset_token`** : all_dset title _tokenized_ and _stopword_removed_<br>
> **`new_b_dset`** : bad newstitle dataset from 2019/10/ to 2019/11/<br>
> **`new_b_dset_token`** : new_b_dset title _tokenized_ and _stopword_removed_

In [0]:
os.chdir('/content')

# Keyword lists used below to decide whether a detailed violation reason
# belongs to the private ('priv') or the social ('socl') group.
priv_key = ['신원공개', '명예훼손', '사생활', '고소고발', '기사제목', '아동']
socl_key = ['보도윤리', '차별금지', '범죄묘사', '범죄사건', '범죄수법', '충격', '자살', '폭력', '여론조사', '기사형', '마약', '성관련', '국가적', '재난']


# new data importing
new_b_dset = import_bad_dset('newres').drop(['Space', 'Media'], axis = 1)
new_b_dset['Reason'] = new_b_dset['Reason'].map(lambda x : x.replace(' ', ''))

# Collapse each detailed reason into 'Private' or 'Social' (a private keyword
# wins when both kinds match); reasons matching neither list stay unchanged.
# One .map replaces the per-row chained assignment, which is not guaranteed
# to write into the frame.
new_b_dset['Reason'] = new_b_dset['Reason'].map(
    lambda reason: 'Private' if any(key in reason for key in priv_key)
    else 'Social' if any(key in reason for key in socl_key)
    else reason
)

new_b_dset_token = new_b_dset.copy()
new_b_dset_token['Title'] = tokenize_title_and_remove_stopword(new_b_dset)
# NOTE(review): the label columns here are whatever get_dummies finds in the
# new data, plus '없음' appended last - confirm they line up with the label
# columns of the training frame.
new_b_dset_token = pd.concat([new_b_dset_token.drop(['Reason', 'Result'], axis = 1), pd.get_dummies(new_b_dset['Reason'])], axis = 1)
new_b_dset_token['없음']=0
new_b_dset_token['Test']=True  # every new row is treated as test data
print('Saving new_b_dset')
new_b_dset.to_csv('./newsdata/new_b_dset.csv')


# bad data importing
b_dset = import_bad_dset().drop(['Space', 'Media'], axis = 1)
all_dset = pd.merge(b_dset, import_normal_dset(), how = 'outer').drop(['Media'], axis = 1)
all_dset['Reason'] = all_dset['Reason'].map(lambda x : x.replace(' ', ''))

# Collapse each detailed reason into 'Private' or 'Social' (a private keyword
# wins when both kinds match); other reasons, including '없음', stay unchanged.
# One .map replaces a Python loop of chained assignments over every row,
# which was slow and not guaranteed to write into the frame.
all_dset['Reason'] = all_dset['Reason'].map(
    lambda reason: 'Private' if any(key in reason for key in priv_key)
    else 'Social' if any(key in reason for key in socl_key)
    else reason
)

all_dset_token = all_dset.copy()
all_dset_token['Title'] = tokenize_title_and_remove_stopword(all_dset_token)
# Replace Reason/Result with one indicator column per distinct Reason value.
all_dset_token = pd.concat([all_dset_token.drop(['Reason', 'Result'], axis = 1), pd.get_dummies(all_dset['Reason'])], axis = 1)
all_dset_token = testdata_tag(all_dset_token)
print('Saving all_dset')
all_dset.to_csv('./newsdata/all_dset.csv')
print('Saving all_dset_token')
all_dset_token.to_csv('./newsdata/all_dset_token.csv')

## **Embed**
> **`w2v_param`** : Word2Vec parameter set<br>
> **`model_construct`** : load if exists, or make Word2Vec model with given all_title_token(train)<br>
> **`get_sample`** : sample normal dset and return with bad dset<br>
> **`get_tsample`** : return test-tagged dset<br>
> **`fitsize`** : make every token list exactly 10 tokens long before embedding (repeat short lists, truncate long ones)<br>
> **`embed_sample`** : get data from get_sample and embed by constructed model<br>
> **`embed_tsample`** : get data from get_tsample and embed by constructed model<br>
> **`returnwv`** : return vocab vector if exists, or return dummy vector for OOV vocabs<br>

In [0]:
from sklearn.cluster import KMeans
from gensim.models.word2vec import Word2Vec
import logging, time, multiprocessing
from time import time

# Log at INFO level with a timestamped format.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# CPU count; used as the worker count for Word2Vec and the DataLoaders.
cores = multiprocessing.cpu_count()

class w2v_param:
    """Plain holder for the Word2Vec settings consumed by ``model_construct``.

    vs -> vector_size, w -> window, es -> epochs (passes per ``train`` call),
    e -> epoch (number of ``train`` calls). ``min_count`` is always 1.
    """

    def __init__(self, vs, w, es, e):
        self.min_count = 1
        self.vector_size, self.window = vs, w
        self.epochs, self.epoch = es, e

# Embedding settings object. This rebinds the name `w2v_param` from the class
# to this instance, so the class cannot be instantiated again afterwards.
w2v_param = w2v_param(32, 3, 10, 4)

def model_construct(param_dic = w2v_param, all_title_token = all_dset_token['Title'], mode = 'word', load = True):
    """Load a cached Word2Vec model, or train one and cache it.

    The cache file lives under ``./modeldata/`` and its name encodes ``mode``
    and every setting in ``param_dic``.

    Args:
        param_dic: settings object with vector_size, window, epochs, epoch
            and min_count attributes.
        all_title_token: iterable of token lists used to build the vocabulary
            and to train.
        mode: only used as part of the cache file name.
        load: when True and the cache file exists, load it instead of training.

    Returns:
        The loaded or newly trained Word2Vec model.

    NOTE(review): ``size=``, ``iter=`` and ``model.iter`` look like the
    gensim 3.x names (later releases use vector_size / epochs) - confirm the
    pinned gensim version.
    """
    model_name = './modeldata/Word2Vec({}_dset, E{}-vs{}w{}e{}mc{})'.format(
        mode, param_dic.epoch, param_dic.vector_size, param_dic.window,
        param_dic.epochs, param_dic.min_count)
    if load and os.path.exists(model_name):
        return Word2Vec.load(model_name)

    model = Word2Vec(size = param_dic.vector_size, window = param_dic.window, iter = param_dic.epochs, min_count = param_dic.min_count, workers = cores)
    model.build_vocab(all_title_token)
    start = time()
    # Train repeatedly, lowering the learning rate by a fixed step each pass.
    for _ in range(param_dic.epoch):
        model.train(all_title_token, total_examples = model.corpus_count, epochs = model.iter)
        model.alpha -= 0.002
        model.min_alpha = model.alpha
    end = time()
    print("Time : {}s".format(end-start))
    # Saving used to fail when ./modeldata had not been created beforehand.
    os.makedirs(os.path.dirname(model_name), exist_ok=True)
    model.save(model_name)
    return model

def get_sample(all_dset_token = all_dset_token):
    """Draw a training sample: every non-test bad row plus sampled normal rows.

    Normal rows (``없음 == 1``) are sampled so that their Year x Category
    distribution follows that of the bad rows, with a total of about half the
    number of bad rows (bad : normal = 2 : 1). When no normal row exists for a
    Year/Category cell, the quota for that cell is drawn from all normal rows.

    Args:
        all_dset_token: frame with '없음', 'Test', 'Year', 'Category' columns.

    Returns:
        DataFrame of sampled normal rows followed by the bad rows, with a
        fresh index and the Category, Year and Test columns removed.

    Samples are collected in a list and concatenated once. The old code
    called ``reset_index`` on a plain list whenever the first category had a
    zero quota everywhere, which raised AttributeError.
    """
    b_dset = all_dset_token[all_dset_token['없음'] != 1]
    b_dset = b_dset[b_dset['Test'] == False]  # non-test rows only
    n_dset = all_dset_token[all_dset_token['없음'] == 1]
    n_dset = n_dset[n_dset['Test'] == False]  # non-test rows only

    sample_rate = pd.crosstab(b_dset.Year, b_dset.Category)
    total_sample_num = b_dset.shape[0]//2
    sample_num = sample_rate/sample_rate.sum().sum() * total_sample_num
    sample_num = sample_num.round(0).astype(int)

    pieces = []
    for cat in sample_num.columns:
        for year in sample_num.index:
            quota = int(sample_num.loc[year, cat])
            if quota == 0:
                continue
            pool = n_dset[(n_dset['Year'] == year) & (n_dset['Category'] == cat)]
            if len(pool) == 0:
                pool = n_dset  # no matching normal rows: fall back to all of them
            pieces.append(pool.sample(n = quota))

    combined = pd.concat(pieces + [b_dset], axis = 0)
    return combined.reset_index().drop(['index', 'Category', 'Year', 'Test'], axis = 1)

def get_tsample(all_dset_token = all_dset_token):
    """Return the rows flagged Test == True, re-indexed, without the
    Category, Year and Test columns."""
    is_test = all_dset_token['Test'] == True
    test_rows = all_dset_token[is_test].reset_index()
    return test_rows.drop(['index', 'Category', 'Year', 'Test'], axis = 1)

def fitsize(target, targetlen):
    """Return copies of the token lists, each exactly ``targetlen`` long.

    * an empty list becomes ``targetlen`` padding tokens ('0');
    * a short list is repeated cyclically until it is long enough;
    * a long list is truncated.

    Args:
        target: iterable of token lists.
        targetlen: required length of every returned list.

    Returns:
        A new list of new lists. The inputs are never modified: the old
        ``token += token`` grew the caller's lists in place, and those list
        objects are shared with the frame the titles came from.
    """
    processed_dset = []
    for token in target:
        if len(token) == 0:
            # Padding follows targetlen instead of a hard-coded 10.
            processed_dset.append(['0'] * targetlen)
            continue
        repeats = -(-targetlen // len(token))  # ceiling division
        processed_dset.append((list(token) * repeats)[:targetlen])
    return processed_dset

def embed_sample(param_dic = w2v_param, token = all_dset_token, mode = 'word'):
    """Draw a fresh training sample and embed its titles.

    Titles are resized to 10 tokens, then every token is replaced by its
    vector from the Word2Vec model built from ``token.Title``. ``mode`` is
    accepted but not used here.

    Returns:
        (train_data, train_label): a tensor of embedded titles and a tensor
        of the remaining (label) columns.
    """
    sample = get_sample(token)
    sample.Title = fitsize(sample.Title, 10)
    model = model_construct(param_dic, token.Title)

    def _lookup(title_tokens):
        # Direct lookup: a token missing from the vocabulary raises here.
        return [model.wv[tok] for tok in title_tokens]

    train_data = torch.tensor(sample.Title.apply(_lookup))
    label_values = np.array(sample.drop('Title', axis = 1))
    train_label = torch.tensor(label_values)
    return train_data, train_label

def returnwv(model, y):
    """Return the vector of token ``y``, or the padding vector when unknown.

    Args:
        model: object whose ``wv`` attribute maps tokens to vectors.
        y: token to look up.

    Returns:
        ``model.wv[y]``, or ``model.wv['0']`` (the padding token) when ``y``
        is out of vocabulary.

    Only KeyError is treated as "out of vocabulary"; the previous bare
    ``except`` also hid unrelated failures such as a wrong argument type.
    """
    try:
        return model.wv[y]
    except KeyError:
        return model.wv['0']

def embed_tsample(param_dic = w2v_param, token = all_dset_token, mode = 'word'):
    """Embed the titles of the rows flagged as test data.

    Titles are resized to 10 tokens and looked up through ``returnwv``, so a
    token unknown to the model gets the padding vector. ``mode`` is accepted
    but not used here.

    Returns:
        (test_data, test_label): a tensor of embedded titles and a tensor of
        the remaining (label) columns.
    """
    sample = get_tsample(token)
    sample.Title = fitsize(sample.Title, 10)
    model = model_construct(param_dic, token.Title)

    def _lookup(title_tokens):
        return [returnwv(model, tok) for tok in title_tokens]

    test_data = torch.tensor(sample.Title.apply(_lookup))
    label_values = np.array(sample.drop('Title', axis = 1))
    test_label = torch.tensor(label_values)
    return test_data, test_label



## **Training `dset_vector` by FNN**


In [0]:
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader, Dataset
from torch.autograd import Variable

# Mini-batch size for the training DataLoader; also written into the saved model's file name.
BATCH_SIZE = 240

class Model(nn.Module):
    """Feed-forward classifier: 320 inputs -> 256 hidden units -> 3 class probabilities.

    Layer positions inside ``fnn_module`` are unchanged, so previously saved
    state dicts still load.

    NOTE(review): the training code feeds this softmax output to
    nn.CrossEntropyLoss, which applies log-softmax itself - confirm that the
    double softmax is intended.
    """

    def __init__(self):
        super().__init__()
        linear1 = nn.Linear(320, 256)
        linear3 = nn.Linear(256, 3)
        nn.init.xavier_uniform_(linear1.weight)
        nn.init.xavier_uniform_(linear3.weight)
        self.fnn_module = nn.Sequential(
            linear1,
            nn.LeakyReLU(),
            nn.Dropout(0.5),
            linear3,
            # Explicit axis: same result as the deprecated implicit choice for
            # 1-D and 2-D input, without the deprecation warning.
            nn.Softmax(dim=-1)
        )
        if torch.cuda.is_available():
            self.fnn_module = self.fnn_module.cuda()

    def forward(self, x):
        """Return class probabilities reshaped to (-1, 3)."""
        return self.fnn_module(x).reshape(-1,3)

class NewsDataset(Dataset):
    """Dataset pairing flattened feature vectors with their label rows.

    ``data_vector`` is reshaped from (n, a, b) to (n, a*b) on construction.
    """

    def __init__(self, data_vector, label_vector):
        dims = data_vector.shape
        flat_width = dims[1]*dims[2]
        self.len = dims[0]
        self.tr_Y = label_vector
        self.tr_X = data_vector.reshape(dims[0], flat_width)

    def __len__(self):
        return self.len

    def __getitem__(self, index):
        features = self.tr_X[index]
        labels = self.tr_Y[index]
        return features, labels

# Embed the held-out rows once; they are reused for every evaluation below.
test_vector, test_label = embed_tsample()
test_dataset = NewsDataset(test_vector, test_label)

LEARNING_RATE, EP = 2e-3, 10

# Drop leftovers from a previously run training cell. Each name is removed
# independently: `del a, b, ...` stops at the first undefined name, and the
# bare `except` around it hid every other kind of error as well.
for _stale in ('model', 'tr_X', 'tr_Y', 'ts_X', 'ts_Y'):
    globals().pop(_stale, None)

total_loss = []
test_loss = []
model = Model()
if torch.cuda.is_available():
    model = model.cuda()

# Follow whatever device the model already lives on, so the loop also runs on
# CPU-only hosts (the held-out tensors used to be sent to CUDA unconditionally).
device = next(model.parameters()).device

for sampling_round in range(10):
    # Re-sample on every round (embed_sample triggers a fresh get_sample).
    train_vector, train_label = embed_sample()
    train_dataset = NewsDataset(train_vector, train_label)
    train_loader = DataLoader(dataset = train_dataset, batch_size = BATCH_SIZE, shuffle = True, num_workers = cores, drop_last = False)

    model.train()
    crit = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr = LEARNING_RATE)

    # The held-out tensors never change: transfer them once per round, not once per batch.
    ts_X = test_dataset.tr_X.to(device)
    ts_Y = test_dataset.tr_Y.long().to(device)
    ts_target = torch.max(ts_Y, 1)[1]

    for epoch in range(EP):
        train_loss = 0.0

        # `i` stays the batch index on purpose: the code after this loop reads
        # its final value (and that of `epoch`) to build the output file name.
        for i, data in enumerate(train_loader):
            # Train
            tr_X, tr_Y = data
            tr_X = tr_X.to(device)
            tr_Y = tr_Y.long().to(device)
            optimizer.zero_grad()
            y_pred = model(tr_X)
            # Labels are one-hot rows; argmax turns them into class indices.
            loss = crit(y_pred, torch.max(tr_Y,1)[1])
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
            total_loss.append(loss.item())
            # Test: full held-out loss after every batch, without building an autograd graph.
            model.eval()
            with torch.no_grad():
                test_loss_val = crit(model(ts_X), ts_target).item()
            test_loss.append(test_loss_val)
            model.train()
            del loss, y_pred
        print("LR = {:.4f} : epoch = {:3d} : loss = {:.4f} : testloss = {:.4f}".format(LEARNING_RATE, epoch, train_loss/(i+1), test_loss_val))

    plt.plot(total_loss)
    plt.plot(test_loss, color = 'RED')
    plt.show()

# Make sure the output directory exists; both writes below fail otherwise.
os.makedirs('./modeldata', exist_ok=True)
# The name encodes the last epoch index, the layer sizes, the batch size and
# the last batch index left behind by the training loop.
model_name = './modeldata/FNN_E{}320_256_3_B{}_{}'.format(epoch, BATCH_SIZE, i)
lossdat = pd.DataFrame({'train':total_loss, 'test':test_loss})
lossdat.to_csv(model_name+"_loss.csv")
torch.save(model.state_dict(), model_name)

# evaluation for training, test, new data
model.eval()
# Use the model's own device instead of a hard-coded "cuda", so this also runs on CPU.
eval_device = next(model.parameters()).device

with torch.no_grad():  # inference only: no autograd graph for the full-set forward passes
    ans = torch.max(test_label, 1)[1].to(eval_device)
    pred = torch.max(model(test_dataset.tr_X.to(eval_device)), 1)[1]
    print("Test Accuracy : {}".format((pred == ans).sum()/(pred.shape[0]*1.0)))
    # Train accuracy is measured on the most recently drawn training sample only.
    ans = torch.max(train_label, 1)[1].to(eval_device)
    pred = torch.max(model(train_dataset.tr_X.to(eval_device)), 1)[1]
    print("Train Accuracy : {}".format((pred == ans).sum()/(pred.shape[0]*1.0)))
    new_bad_vector, new_bad_label = embed_tsample(token = new_b_dset_token)
    new_bad_dataset = NewsDataset(new_bad_vector, new_bad_label)
    ans = torch.max(new_bad_label, 1)[1].to(eval_device)
    pred = torch.max(model(new_bad_dataset.tr_X.to(eval_device)), 1)[1]
    print("Newdat Accuracy : {}".format((pred == ans).sum()/(pred.shape[0]*1.0)))

## **Training `dset_vector` by CNN**

In [0]:
# Release cached CUDA memory left over from earlier cells.
torch.cuda.empty_cache()
# Mini-batch size for the training DataLoader; also written into the saved model's file name.
BATCH_SIZE = 240

class Model(nn.Module):
    """Two-branch CNN classifier producing 3 class probabilities.

    The input is reshaped to a single-channel image. One branch starts with a
    4x4 convolution and the other with a 5x5 convolution; their flattened
    outputs are concatenated and passed to Dropout -> Linear(369, 3) ->
    Softmax. Layer positions inside each Sequential are unchanged, so
    previously saved state dicts still load.

    NOTE(review): Linear(369, 3) fixes the flattened size; that works out to
    369 for 10 x 32 inputs - confirm the embedding shape.
    NOTE(review): the training code feeds this softmax output to
    nn.CrossEntropyLoss, which applies log-softmax itself - confirm that the
    double softmax is intended.
    """

    def __init__(self):
        super().__init__()
        conv1 = nn.Conv2d(1, 8, 4, stride = 1, padding_mode='zeros')
        conv2 = nn.Conv2d(1, 8, 5, stride = 1, padding_mode='zeros')
        conv_f1 = nn.Conv2d(8, 5, 1, stride = 1)
        conv_f2 = nn.Conv2d(8, 4, 1, stride = 1)
        conv1r = nn.Conv2d(5, 8, 3, stride = 1, padding_mode='zeros')
        conv2r = nn.Conv2d(4, 8, 4, stride = 1, padding_mode='zeros')
        conv_f1f = nn.Conv2d(8, 4, 1, stride = 1)
        conv_f2f = nn.Conv2d(8, 3, 1, stride = 1)

        linear = nn.Linear(369, 3)
        # Xavier-initialise every weight, in the same order as before.
        for layer in (conv1, conv2, conv_f1, conv_f2, conv1r, conv2r, conv_f1f, conv_f2f, linear):
            nn.init.xavier_uniform_(layer.weight)

        self.cnn1_module = nn.Sequential(
            conv1,
            nn.LeakyReLU(),
            nn.MaxPool2d(2, 1),
            conv_f1,
            conv1r,
            nn.LeakyReLU(),
            nn.MaxPool2d(2, 1),
            conv_f1f,
        )
        self.cnn2_module = nn.Sequential(
            conv2,
            nn.LeakyReLU(),
            nn.MaxPool2d(2, 1),
            conv_f2,
            conv2r,
            nn.LeakyReLU(),
            nn.MaxPool2d(2, 1),
            conv_f2f,
        )
        self.fnn_module = nn.Sequential(
            nn.Dropout(0.5),
            linear,
            # The head always receives (batch, features); naming the axis
            # replaces the deprecated implicit choice with the same result.
            nn.Softmax(dim=1)
        )

        if torch.cuda.is_available():
            self.cnn1_module = self.cnn1_module.cuda()
            self.cnn2_module = self.cnn2_module.cuda()
            self.fnn_module = self.fnn_module.cuda()

    def forward(self, x):
        """Return class probabilities of shape (batch, 3) for a 3-D input."""
        batch = x.shape[0]  # renamed from `len`, which shadowed the builtin
        x = x.reshape(batch, 1, x.shape[1], x.shape[2])
        cnn1 = self.cnn1_module(x).reshape(batch, -1)
        cnn2 = self.cnn2_module(x).reshape(batch, -1)
        cnn = torch.cat((cnn1, cnn2), dim=1)
        return self.fnn_module(cnn)

class NewsDataset(Dataset):
    """Dataset pairing feature tensors (kept in their original shape) with label rows."""

    def __init__(self, data_vector, label_vector):
        self.tr_X, self.tr_Y = data_vector, label_vector
        self.len = data_vector.shape[0]

    def __len__(self):
        return self.len

    def __getitem__(self, index):
        features = self.tr_X[index]
        labels = self.tr_Y[index]
        return features, labels

# Embed the held-out rows once; they are reused for every evaluation below.
test_vector, test_label = embed_tsample()
test_dataset = NewsDataset(test_vector, test_label)

LEARNING_RATE, EP = 2e-3, 10

# Drop leftovers from a previously run training cell. Each name is removed
# independently: `del a, b, ...` stops at the first undefined name, and the
# bare `except` around it hid every other kind of error as well.
for _stale in ('model', 'tr_X', 'tr_Y', 'ts_X', 'ts_Y'):
    globals().pop(_stale, None)

total_loss = []
test_loss = []
model = Model()
if torch.cuda.is_available():
    model = model.cuda()

# Follow whatever device the model already lives on, so the loop also runs on
# CPU-only hosts (the held-out tensors used to be sent to CUDA unconditionally).
device = next(model.parameters()).device

for sampling_round in range(10):
    # Re-sample on every round (embed_sample triggers a fresh get_sample).
    train_vector, train_label = embed_sample()
    train_dataset = NewsDataset(train_vector, train_label)
    train_loader = DataLoader(dataset = train_dataset, batch_size = BATCH_SIZE, shuffle = True, num_workers = cores, drop_last = False)

    model.train()
    crit = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr = LEARNING_RATE)

    # The held-out tensors never change: transfer them once per round, not once per batch.
    ts_X = test_dataset.tr_X.to(device)
    ts_Y = test_dataset.tr_Y.long().to(device)
    ts_target = torch.max(ts_Y, 1)[1]

    for epoch in range(EP):
        train_loss = 0.0

        # `i` stays the batch index on purpose: the code after this loop reads
        # its final value (and that of `epoch`) to build the output file name.
        for i, data in enumerate(train_loader):
            # Train
            tr_X, tr_Y = data
            tr_X = tr_X.to(device)
            tr_Y = tr_Y.long().to(device)
            optimizer.zero_grad()
            y_pred = model(tr_X)
            # Labels are one-hot rows; argmax turns them into class indices.
            loss = crit(y_pred, torch.max(tr_Y,1)[1])
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
            total_loss.append(loss.item())
            # Test: full held-out loss after every batch, without building an autograd graph.
            model.eval()
            with torch.no_grad():
                test_loss_val = crit(model(ts_X), ts_target).item()
            test_loss.append(test_loss_val)
            model.train()
            del loss, y_pred
        print("LR = {:.4f} : epoch = {:3d} : loss = {:.4f} : testloss = {:.4f}".format(LEARNING_RATE, epoch, train_loss/(i+1), test_loss_val))

    plt.plot(total_loss)
    plt.plot(test_loss, color = 'RED')
    plt.show()
# Make sure the output directory exists; both writes below fail otherwise.
os.makedirs('./modeldata', exist_ok=True)
# The name encodes the last epoch index, a layer-size tag, the batch size and
# the last batch index left behind by the training loop.
# NOTE(review): '*' is not a legal file-name character on Windows.
model_name = './modeldata/CNN_E{}50_4*8*5*8*4_5*8*4*8*3_B{}_{}'.format(epoch, BATCH_SIZE, i)
lossdat = pd.DataFrame({'train':total_loss, 'test':test_loss})
lossdat.to_csv(model_name+"_loss.csv")
torch.save(model.state_dict(), model_name)

# evaluation for training, test, new data
model.eval()
# Use the model's own device instead of a hard-coded "cuda", so this also runs on CPU.
eval_device = next(model.parameters()).device

with torch.no_grad():  # inference only: no autograd graph for the full-set forward passes
    ans = torch.max(test_label, 1)[1].to(eval_device)
    pred = torch.max(model(test_dataset.tr_X.to(eval_device)), 1)[1]
    print("Test Accuracy : {}".format((pred == ans).sum()/(pred.shape[0]*1.0)))
    # Train accuracy is measured on the most recently drawn training sample only.
    ans = torch.max(train_label, 1)[1].to(eval_device)
    pred = torch.max(model(train_dataset.tr_X.to(eval_device)), 1)[1]
    print("Train Accuracy : {}".format((pred == ans).sum()/(pred.shape[0]*1.0)))
    new_bad_vector, new_bad_label = embed_tsample(token = new_b_dset_token)
    new_bad_dataset = NewsDataset(new_bad_vector, new_bad_label)
    ans = torch.max(new_bad_label, 1)[1].to(eval_device)
    pred = torch.max(model(new_bad_dataset.tr_X.to(eval_device)), 1)[1]
    print("Newdat Accuracy : {}".format((pred == ans).sum()/(pred.shape[0]*1.0)))

## **Training `doc_vector` by `GRU`**

In [0]:
# Release cached CUDA memory left over from earlier cells.
torch.cuda.empty_cache()

# Mini-batch size for the training DataLoader; also written into the saved model's file name.
BATCH_SIZE = 240

class Model(nn.Module):
    """Bidirectional 3-layer GRU followed by a linear head with 3 class probabilities.

    The GRU (input 32, hidden 24, batch_first) emits 48 features per step;
    all steps are flattened and passed through LeakyReLU -> Dropout ->
    Linear(480, 3) -> Softmax. Layer positions inside each Sequential are
    unchanged, so previously saved state dicts still load.

    NOTE(review): Linear(480, 3) implies sequences of 10 steps - confirm.
    NOTE(review): the training code feeds this softmax output to
    nn.CrossEntropyLoss, which applies log-softmax itself - confirm that the
    double softmax is intended.
    """

    def __init__(self):
        super().__init__()
        gru = nn.GRU(input_size = 32, hidden_size = 24, num_layers = 3, dropout = 0.5, bidirectional = True, batch_first = True)
        linear = nn.Linear(480, 3)
        nn.init.xavier_uniform_(linear.weight)
        self.gru_module = nn.Sequential(
            gru,
        )
        self.linear_module = nn.Sequential(
            nn.LeakyReLU(),
            nn.Dropout(0.5),
            linear,
            # The head always receives (batch, features); naming the axis
            # replaces the deprecated implicit choice with the same result.
            nn.Softmax(dim=1)
        )
        if torch.cuda.is_available():
            self.gru_module = self.gru_module.cuda()
            # Move the head too; it used to stay on the CPU until the caller
            # moved the whole model.
            self.linear_module = self.linear_module.cuda()

    def forward(self, x):
        """Return class probabilities of shape (batch, 3)."""
        # nn.GRU returns (output, h_n); only the per-step outputs are used.
        outputs = self.gru_module(x)[0]
        return self.linear_module(outputs.reshape(outputs.shape[0], -1))

class NewsDataset(Dataset):
    """Dataset pairing feature tensors (kept in their original shape) with label rows."""

    def __init__(self, data_vector, label_vector):
        self.tr_X, self.tr_Y = data_vector, label_vector
        self.len = data_vector.shape[0]

    def __len__(self):
        return self.len

    def __getitem__(self, index):
        features = self.tr_X[index]
        labels = self.tr_Y[index]
        return features, labels

# Embed the held-out rows once; they are reused for every evaluation below.
test_vector, test_label = embed_tsample()
test_dataset = NewsDataset(test_vector, test_label)

LEARNING_RATE, EP = 5e-3, 10

# Drop leftovers from a previously run training cell. Each name is removed
# independently: `del a, b, ...` stops at the first undefined name, and the
# bare `except` around it hid every other kind of error as well.
for _stale in ('model', 'tr_X', 'tr_Y', 'ts_X', 'ts_Y'):
    globals().pop(_stale, None)

total_loss = []
test_loss = []
model = Model()
if torch.cuda.is_available():
    model = model.cuda()

# Follow whatever device the model already lives on, so the loop also runs on
# CPU-only hosts (the held-out tensors used to be sent to CUDA unconditionally).
device = next(model.parameters()).device

for sampling_round in range(10):
    # Re-sample on every round (embed_sample triggers a fresh get_sample).
    train_vector, train_label = embed_sample()
    train_dataset = NewsDataset(train_vector, train_label)
    train_loader = DataLoader(dataset = train_dataset, batch_size = BATCH_SIZE, shuffle = True, num_workers = cores, drop_last = False)

    model.train()
    crit = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr = LEARNING_RATE)

    # The held-out tensors never change: transfer them once per round, not once per batch.
    ts_X = test_dataset.tr_X.to(device)
    ts_Y = test_dataset.tr_Y.long().to(device)
    ts_target = torch.max(ts_Y, 1)[1]

    for epoch in range(EP):
        train_loss = 0.0

        # `i` stays the batch index on purpose: the code after this loop reads
        # its final value (and that of `epoch`) to build the output file name.
        for i, data in enumerate(train_loader):
            # Train
            tr_X, tr_Y = data
            tr_X = tr_X.to(device)
            tr_Y = tr_Y.long().to(device)
            optimizer.zero_grad()
            y_pred = model(tr_X)
            # Labels are one-hot rows; argmax turns them into class indices.
            loss = crit(y_pred, torch.max(tr_Y,1)[1])
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
            total_loss.append(loss.item())
            # Test: full held-out loss after every batch, without building an autograd graph.
            model.eval()
            with torch.no_grad():
                test_loss_val = crit(model(ts_X), ts_target).item()
            test_loss.append(test_loss_val)
            model.train()
            del loss, y_pred
        print("LR = {:.4f} : epoch = {:3d} : loss = {:.4f} : testloss = {:.4f}".format(LEARNING_RATE, epoch, train_loss/(i+1), test_loss_val))

    plt.plot(total_loss)
    plt.plot(test_loss, color = 'RED')
    plt.show()
# Make sure the output directory exists; both writes below fail otherwise.
os.makedirs('./modeldata', exist_ok=True)
# The name encodes the last epoch index, a size tag, the batch size and the
# last batch index left behind by the training loop.
model_name = './modeldata/GRU_E{}240_3_B{}_{}'.format(epoch, BATCH_SIZE, i)
lossdat = pd.DataFrame({'train':total_loss, 'test':test_loss})
lossdat.to_csv(model_name+"_loss.csv")
torch.save(model.state_dict(), model_name)

# evaluation for training, test, new data
model.eval()
# Use the model's own device instead of a hard-coded "cuda", so this also runs on CPU.
eval_device = next(model.parameters()).device

with torch.no_grad():  # inference only: no autograd graph for the full-set forward passes
    ans = torch.max(test_label, 1)[1].to(eval_device)
    pred = torch.max(model(test_dataset.tr_X.to(eval_device)), 1)[1]
    print("Test Accuracy : {}".format((pred == ans).sum()/(pred.shape[0]*1.0)))
    # Train accuracy is measured on the most recently drawn training sample only.
    ans = torch.max(train_label, 1)[1].to(eval_device)
    pred = torch.max(model(train_dataset.tr_X.to(eval_device)), 1)[1]
    print("Train Accuracy : {}".format((pred == ans).sum()/(pred.shape[0]*1.0)))
    new_bad_vector, new_bad_label = embed_tsample(token = new_b_dset_token)
    new_bad_dataset = NewsDataset(new_bad_vector, new_bad_label)
    ans = torch.max(new_bad_label, 1)[1].to(eval_device)
    pred = torch.max(model(new_bad_dataset.tr_X.to(eval_device)), 1)[1]
    print("Newdat Accuracy : {}".format((pred == ans).sum()/(pred.shape[0]*1.0)))

## **Training `doc_vector` by GRU and CNN**

In [0]:
# Release cached CUDA memory left over from earlier cells.
torch.cuda.empty_cache()

# Mini-batch size for the training DataLoader; also written into the saved model's file name.
BATCH_SIZE = 240

class Model(nn.Module):
    """Bidirectional GRU -> two 4x4 convolutions -> linear head with 3 class probabilities.

    The GRU (input 32, hidden 18, 2 layers, batch_first) emits 36 features
    per step. Its output is treated as a single-channel image, passed through
    the convolution stack, flattened, and fed to LeakyReLU -> Dropout ->
    Linear(360, 3) -> Softmax. Layer positions inside each Sequential are
    unchanged, so previously saved state dicts still load.

    NOTE(review): Linear(360, 3) implies sequences of 10 steps - confirm.
    NOTE(review): the training code feeds this softmax output to
    nn.CrossEntropyLoss, which applies log-softmax itself - confirm that the
    double softmax is intended.
    """

    def __init__(self):
        super().__init__()
        gru = nn.GRU(input_size = 32, hidden_size = 18, num_layers = 2, dropout = 0.5, bidirectional = True, batch_first = True)
        conv1 = nn.Conv2d(1, 8, 4, stride = 1, padding_mode='zeros')
        conv2 = nn.Conv2d(8, 3, 4, stride = 1, padding_mode='zeros')
        linear = nn.Linear(360, 3)
        for layer in (conv1, conv2, linear):
            nn.init.xavier_uniform_(layer.weight)
        self.gru_module = nn.Sequential(
            gru,
        )
        self.cnn_module = nn.Sequential(
            nn.LeakyReLU(),
            conv1,
            nn.LeakyReLU(),
            conv2,
            nn.LeakyReLU(),
        )
        self.linear_module = nn.Sequential(
            nn.LeakyReLU(),
            nn.Dropout(0.5),
            linear,
            # The head always receives (batch, features); naming the axis
            # replaces the deprecated implicit choice with the same result.
            nn.Softmax(dim=1)
        )
        if torch.cuda.is_available():
            self.gru_module = self.gru_module.cuda()
            self.cnn_module = self.cnn_module.cuda()
            self.linear_module = self.linear_module.cuda()

    def forward(self, x):
        """Return class probabilities of shape (batch, 3)."""
        # nn.GRU returns (output, h_n); only the per-step outputs are used.
        outputs = self.gru_module(x)[0]
        batch = outputs.shape[0]  # renamed from `len`, which shadowed the builtin
        image = outputs.reshape(batch, 1, outputs.shape[1], outputs.shape[2])
        features = self.cnn_module(image).reshape(batch, -1)
        return self.linear_module(features)

class NewsDataset(Dataset):
    """Dataset pairing feature tensors (kept in their original shape) with label rows."""

    def __init__(self, data_vector, label_vector):
        self.tr_X, self.tr_Y = data_vector, label_vector
        self.len = data_vector.shape[0]

    def __len__(self):
        return self.len

    def __getitem__(self, index):
        features = self.tr_X[index]
        labels = self.tr_Y[index]
        return features, labels

# Embed the held-out rows once; they are reused for every evaluation below.
test_vector, test_label = embed_tsample()
test_dataset = NewsDataset(test_vector, test_label)

LEARNING_RATE, EP = 5e-3, 10

# Drop leftovers from a previously run training cell. Each name is removed
# independently: `del a, b, ...` stops at the first undefined name, and the
# bare `except` around it hid every other kind of error as well.
for _stale in ('model', 'tr_X', 'tr_Y', 'ts_X', 'ts_Y'):
    globals().pop(_stale, None)

total_loss = []
test_loss = []
model = Model()
if torch.cuda.is_available():
    model = model.cuda()

# Follow whatever device the model already lives on, so the loop also runs on
# CPU-only hosts (the held-out tensors used to be sent to CUDA unconditionally).
device = next(model.parameters()).device

for sampling_round in range(10):
    # Re-sample on every round (embed_sample triggers a fresh get_sample).
    train_vector, train_label = embed_sample()
    train_dataset = NewsDataset(train_vector, train_label)
    train_loader = DataLoader(dataset = train_dataset, batch_size = BATCH_SIZE, shuffle = True, num_workers = cores, drop_last = False)

    model.train()
    crit = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr = LEARNING_RATE)

    # The held-out tensors never change: transfer them once per round, not once per batch.
    ts_X = test_dataset.tr_X.to(device)
    ts_Y = test_dataset.tr_Y.long().to(device)
    ts_target = torch.max(ts_Y, 1)[1]

    for epoch in range(EP):
        train_loss = 0.0

        # `i` stays the batch index on purpose: the code after this loop reads
        # its final value (and that of `epoch`) to build the output file name.
        for i, data in enumerate(train_loader):
            # Train
            tr_X, tr_Y = data
            tr_X = tr_X.to(device)
            tr_Y = tr_Y.long().to(device)
            optimizer.zero_grad()
            y_pred = model(tr_X)
            # Labels are one-hot rows; argmax turns them into class indices.
            loss = crit(y_pred, torch.max(tr_Y,1)[1])
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
            total_loss.append(loss.item())
            # Test: full held-out loss after every batch, without building an autograd graph.
            model.eval()
            with torch.no_grad():
                test_loss_val = crit(model(ts_X), ts_target).item()
            test_loss.append(test_loss_val)
            model.train()
            del loss, y_pred
        print("LR = {:.4f} : epoch = {:3d} : loss = {:.4f} : testloss = {:.4f}".format(LEARNING_RATE, epoch, train_loss/(i+1), test_loss_val))

    plt.plot(total_loss)
    plt.plot(test_loss, color = 'RED')
    plt.show()
# Make sure the output directory exists; both writes below fail otherwise.
os.makedirs('./modeldata', exist_ok=True)
# The name encodes the last epoch index, a size tag, the batch size and the
# last batch index left behind by the training loop.
model_name = './modeldata/GRU_CNNE{}240_3_B{}_{}'.format(epoch, BATCH_SIZE, i)
lossdat = pd.DataFrame({'train':total_loss, 'test':test_loss})
lossdat.to_csv(model_name+"_loss.csv")
torch.save(model.state_dict(), model_name)

# evaluation for training, test, new data
model.eval()
# Use the model's own device instead of a hard-coded "cuda", so this also runs on CPU.
eval_device = next(model.parameters()).device

with torch.no_grad():  # inference only: no autograd graph for the full-set forward passes
    ans = torch.max(test_label, 1)[1].to(eval_device)
    pred = torch.max(model(test_dataset.tr_X.to(eval_device)), 1)[1]
    print("Test Accuracy : {}".format((pred == ans).sum()/(pred.shape[0]*1.0)))
    # Train accuracy is measured on the most recently drawn training sample only.
    ans = torch.max(train_label, 1)[1].to(eval_device)
    pred = torch.max(model(train_dataset.tr_X.to(eval_device)), 1)[1]
    print("Train Accuracy : {}".format((pred == ans).sum()/(pred.shape[0]*1.0)))
    new_bad_vector, new_bad_label = embed_tsample(token = new_b_dset_token)
    new_bad_dataset = NewsDataset(new_bad_vector, new_bad_label)
    ans = torch.max(new_bad_label, 1)[1].to(eval_device)
    pred = torch.max(model(new_bad_dataset.tr_X.to(eval_device)), 1)[1]
    print("Newdat Accuracy : {}".format((pred == ans).sum()/(pred.shape[0]*1.0)))