In [1]:
import pandas as pd
import torch
from tqdm import tqdm, tqdm_notebook
from transformers import PreTrainedTokenizerFast
from transformers import BartForConditionalGeneration

import FinanceDataReader as fdr
from sklearn.metrics import f1_score

In [2]:
start_date = '2022-01-01'
end_date = '2022-12-31'

stocks = {'apple':'AAPL',
          'tesla':'TSLA',
          'google':'GOOG',
          'kakao':'035720',
          'samsung':'005930'}

In [3]:
def get_close(code:str, start_date:str, end_date:str):
    prev_close = fdr.DataReader(code, '2021-12-31', '2021-12-31')
    prev_close = prev_close['Close'][0]
    
    change_proportions = []
    
    data = fdr.DataReader(code, start_date, end_date)
    change_proportions.append(
        (data['Close'][0] - prev_close) / data['Close'][0]
    )
    for i in range(len(data)-1):
        r = (data['Close'][i+1] - data['Close'][i]) / data['Close'][i+1]
        change_proportions.append(r)
        
    data['Change'] = change_proportions
    data.reset_index(inplace=True)
    return data

In [4]:
apple = get_close(stocks['apple'], start_date, end_date)
tesla = get_close(stocks['tesla'], start_date, end_date)
google = get_close(stocks['google'], start_date, end_date)
samsung = fdr.DataReader(stocks['samsung'], start_date, end_date).reset_index()
kakao = fdr.DataReader(stocks['kakao'], start_date, end_date).reset_index()

In [5]:
def get_label(df):
    t = []
    t_2 = []
    for change in df['Change']:
        if change > 0.01:
            t.append(0)
            t_2.append('up')
        elif change < -0.01:
            t.append(1)
            t_2.append('down')
        else:
            t.append(2)
            t_2.append('same')
            
    df['label_desc'] = t_2
    df['label'] = t
    
    return df

In [6]:
apple = get_label(apple)
tesla = get_label(tesla)
google = get_label(google)
samsung = get_label(samsung)
kakao = get_label(kakao)

In [7]:
columns = ['Date', 'Close', 'label_desc', 'label']
apple = apple[columns].astype({'Date':str})
tesla = tesla[columns].astype({'Date':str})
google = google[columns].astype({'Date':str})
samsung = samsung[columns].astype({'Date':str})
kakao = kakao[columns].astype({'Date':str})

In [8]:
df = pd.read_pickle('./data/data_11_12.p')

In [9]:
tmp = []
for date in df['date']:
    tmp.append(date.split(' ')[0])
    
df['date'] = tmp

In [10]:
news_apple = df[df['name']=='애플'].rename(columns={'date':'Date'})
news_tesla = df[df['name']=='테슬라'].rename(columns={'date':'Date'})
news_google = df[df['name']=='구글'].rename(columns={'date':'Date'})
news_samsung = df[df['name']=='삼성'].rename(columns={'date':'Date'})
news_kakao = df[df['name']=='카카오'].rename(columns={'date':'Date'})

In [11]:
news_apple = news_apple[['Date', 'content']]
news_tesla = news_tesla[['Date', 'content']]
news_google = news_google[['Date', 'content']]
news_samsung = news_samsung[['Date', 'content']]
news_kakao = news_kakao[['Date', 'content']]

In [12]:
apple_join = pd.merge(apple, news_apple, on='Date')
tesla_join = pd.merge(tesla, news_tesla, on='Date')
google_join = pd.merge(google, news_google, on='Date')
samsung_join = pd.merge(samsung, news_samsung, on='Date')
kakao_join = pd.merge(kakao, news_kakao, on='Date')

In [13]:
df_list = [apple_join, tesla_join, google_join, samsung_join, kakao_join]
df_final = pd.concat(df_list, ignore_index=True)
df_final

Unnamed: 0,Date,Close,label_desc,label,content
0,2022-11-01,150.649994,down,1,[윤구 대표 돌연 사임 후 1년 만에 공석 채워美 해군사관학교 출신 반도체 장비 전...
1,2022-11-01,150.649994,down,1,[온라인 스토어·정보시스템 부문 담당 부사장 잇따라 사임최근 한 달새 애플의 고위 ...
2,2022-11-01,150.649994,down,1,[ *재판매 및 DB 금지[서울=뉴시스] 남정현 기자 = 카카오페이 측이 애플페이의...
3,2022-11-01,150.649994,down,1,[중국 허난성 정저우시 폭스콘 노동자들로 추정되는 이들이 공장을 떠나 고속도로를 걸...
4,2022-11-01,150.649994,down,1,[공장 노동자들이 폐쇄된 출입문을 뚫고 쏟아져 나갑니다.일부는 외곽 철조망을 넘습니...
...,...,...,...,...,...
976,2022-12-16,54400.000000,down,1,[싱가포르의 테슬라 서비스 센터 전경./사진=로이터 미국 전기차 기업 테슬라 주가가...
977,2022-12-16,54400.000000,down,1,[캐시 우드 아크인베스트먼트 최고경영자(CEO)가 미국 전기차업체 테슬라 주식을 집...
978,2022-12-16,54400.000000,down,1,[ 7일(현지시간) 태국 방콕의 테슬라 공식 출시 행사장에 테슬라 모델 Y가 검은 ...
979,2022-12-16,54400.000000,down,1,[일론 머스크 잇단 주식 매각에 주가 ‘반 토막’“수익성‧성장성 변화 없어…사업영역...


In [14]:
summ_tokenizer = PreTrainedTokenizerFast.from_pretrained('digit82/kobart-summarization')
summ_model = BartForConditionalGeneration.from_pretrained('digit82/kobart-summarization')

You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.


In [15]:
def get_summary(df):
    summaries = []
    for content in tqdm(df['content']):
        txt = content.replace('[', '').replace(']', '').replace('=', '')
        raw_input_ids = summ_tokenizer.encode(txt)
        
        input_ids = [summ_tokenizer.bos_token_id] + raw_input_ids + [summ_tokenizer.eos_token_id]
        # input_ids = raw_input_ids + [summ_tokenizer.eos_token_id]
        # print(input_ids)
        if len(input_ids) > 800:
            input_ids = input_ids[:800]
        
        
        with torch.no_grad():
            summ_ids = summ_model.generate(torch.tensor([input_ids]), num_beams=4, max_length=1024, eos_token_id=1)
            summ = summ_tokenizer.decode(summ_ids.squeeze().tolist(), skip_special_tokens=True)
        summaries.append(summ)
        
    df['summary'] = summaries
    return df

In [16]:
df_summ = get_summary(df_final)

100%|██████████| 981/981 [41:23<00:00,  2.53s/it]  


In [17]:
df_summ['summary'][7]

'경기 불황으로 부품사들이 줄줄이 3분기 어닝쇼크를 기록했지만 애플을 최대 고객사로 두고 있는 삼성디스플레이와 LG이노텍은 오히려 최대 분기 실적을 기록하며 애플에 OLED(유기발광다이오드)디스플레이 패널을, LG이노텍은 카메라모듈을 공급했다.'

In [18]:
df_summ['content'][7]

'[전세계를 덮친 경기 불황으로 부품사들이 줄줄이 3분기 어닝쇼크를 기록했지만 애플이 깔아둔 \'꽃길\'은 견고했다. 애플을 최대 고객사로 두고 있는 삼성디스플레이와 LG이노텍은 오히려 최대 분기 실적을 기록했다. 삼성디스플레이는 애플에 OLED(유기발광다이오드)디스플레이 패널을, LG이노텍은 카메라모듈을 공급한다.1일 업계에 따르면 삼성디스플레이와 LG이노텍 모두 올해 3분기 영업이익이 지난해 같은 기간과 비교해 30% 넘게 증가했다. 삼성디스플레이의 영업이익은 32.9% 늘어난 1조9800억원, LG이노텍은 32.5% 늘어 3357억원을 기록했다.  애플이 지난 9월 출시한 스마트폰 신제품인 아이폰 14 시리즈가 두 회사의 실적 호조를 견인했다. 경기 침체와 물가 상승에 따라 IT(정보기술)제품 수요가 특히 줄었지만 애플은 예외였다. 소비가 줄어들면서 미들엔드 스마트폰 시장은 직격탄을 받았지만 하이엔드 스마트폰 인기는 유지됐다. 특히 애플 스마트폰은 단순한 전자기기를 뛰어넘어 일종의 명품으로 자리잡으면서 경기 불황에 영향을 크게 받지 않았다는 분석이다. 자국 브랜드를 애용하는 중국 스마트폰 시장에서도 애플 점유율은 15%가 넘는다.삼성디스플레이는 아이폰 14와 프로, 맥스, 프로맥스 등 모든 14시리즈에 OLED 패널을 공급한다. LG디스플레이와 중국의 BOE도 아이폰용 OLED 패널을 공급하지만 삼성디스플레이 비중이 70%에 달한다. 특히 LG디스플레이가 아이폰 14시리즈에 공급하는 LTPO(저온다결정산화물)기술을 적용한 디스플레이가 다소 지연되면서 출시 초기 공급 모델에서 삼성디스플레이가 LTPO 디스플레이를 대부분 공급한 점도 호재로 작용했다. 애플은 고급 모델에 LPTO OLED를 사용하는데, 기존 OLED보다 화면이 더 부드럽고 배터리 성능이 더 효율적이지만 가격이 더 비싸다. 삼성전자가 폴더블폰 신제품인 플립4와 폴드4를 내놓은 것도 삼성디스플레이에 긍정적 영향을 줬다. 삼성디스플레이는 "스마트폰 신제품 출시에 따른 수요 증가에 더불어 고객사 신

In [69]:
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import gluonnlp as nlp
import numpy as np

import os

from kobert.utils import get_tokenizer
from kobert.pytorch_kobert import get_pytorch_kobert_model

from transformers import AdamW
from transformers.optimization import get_cosine_schedule_with_warmup

from sklearn.model_selection import train_test_split

In [90]:
device = torch.device('cpu')

In [91]:
bertmodel, vocab = get_pytorch_kobert_model()
tokenizer = get_tokenizer()
tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)

using cached model. /home/hoon/classification/.cache/kobert_v1.zip
using cached model. /home/hoon/classification/.cache/kobert_news_wiki_ko_cased-1087f8699e.spiece
using cached model. /home/hoon/classification/.cache/kobert_news_wiki_ko_cased-1087f8699e.spiece


In [92]:
'''
maximum length of contents = 6303
'''

config = {
    'max_len':64,
    'batch_size':64,
    'warmup_ratio':0.1,
    'num_epochs':25,
    'max_grad_norm':1,
    'log_interval':200,
    'dr_rate':0.1,
    'lr':5e-5
}

In [93]:
class BERTDataset(Dataset):
    def __init__(self, dataset, sent_idx, label_idx, bert_tokenizer, max_len, pad, pair):
        transform = nlp.data.BERTSentenceTransform(
            bert_tokenizer, max_seq_length=max_len, pad=pad, pair=pair
        )
        
        self.sentences = [transform([i[sent_idx]]) for i in dataset]
        self.labels = [np.int32(i[label_idx]) for i in dataset]
        
    def __getitem__(self, i):
        return (self.sentences[i] + (self.labels[i], ))
    
    def __len__(self):
        return (len(self.labels))

In [111]:
df_train = df_summ[['summary', 'label']].values

In [112]:
df_train, df_test = train_test_split(df_train, test_size=0.2, shuffle=True)

In [113]:
train_dataset = BERTDataset(df_train, 0, 1, tok, config['max_len'], True, False)
test_dataset = BERTDataset(df_test, 0, 1, tok, config['max_len'], True, False)

In [114]:
train_dataset[0]

(array([   2, 3969, 5436,  693,  264,  311,  307,  693,  264,  311,  307,
        7495, 7495, 6964, 3332, 6078,  839, 7147, 4675, 7706, 6040, 6664,
        5026,  712, 3647, 7753, 4640,  517,  498,  517, 5330, 4304,  533,
        7028,  543, 7126, 2724, 3963, 3538, 2476, 5468,  843, 2632, 1050,
        6079, 4451, 7018, 7248, 7075,  529, 5712, 1946, 4092, 6286, 5808,
        2482, 6079, 1734, 6572, 5782,  517,   54,    3,    1], dtype=int32),
 array(63, dtype=int32),
 array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       dtype=int32),
 2)

In [115]:
train_dataloader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=config['batch_size'],
                                               num_workers=4,
                                               shuffle=True)
test_dataloader = torch.utils.data.DataLoader(test_dataset,
                                              batch_size=config['batch_size'],
                                              num_workers=4,
                                              shuffle=True)

In [116]:
train_dataloader

<torch.utils.data.dataloader.DataLoader at 0x7f23db285d90>

In [117]:
class BERT_Classifier(nn.Module):
    def __init__(self, bert_model, hidden_size=768, num_classes=3, dr_rate=config['dr_rate'], params=None):
        super(BERT_Classifier, self).__init__()
        self.bert_model = bert_model
        self.dr_rate = dr_rate
        
        self.classifier = nn.Linear(hidden_size, num_classes)
        self.dropout = nn.Dropout(p=dr_rate)
        
    def gen_attn_mask(self, token_ids, valid_len):
        attention_mask = torch.zeros_like(token_ids)
        for i, v in enumerate(valid_len):
            attention_mask[i][:v] = 1
        return attention_mask.float()
    
    def forward(self, token_ids, valid_len, segment_ids):
        attention_mask = self.gen_attn_mask(token_ids, valid_len)
        _, pooler = self.bert_model(input_ids=token_ids,
                                    token_type_ids=segment_ids.long(),
                                    attention_mask=attention_mask.float().to(token_ids.device),
                                    return_dict=False)
        
        out = self.dropout(pooler)
        return self.classifier(out)

In [118]:
model = BERT_Classifier(bert_model=bertmodel).to(device)

In [119]:
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

In [120]:
optimizer = AdamW(optimizer_grouped_parameters, lr=config['lr'])
loss_fn = nn.CrossEntropyLoss()



In [121]:
t_total = len(train_dataloader) * config['num_epochs']
warmup_step = int(t_total * config['warmup_ratio'])

In [122]:
scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=warmup_step, num_training_steps=t_total)

In [123]:
def calc_accuracy(X,Y):
    max_vals, max_indices = torch.max(X, 1)
    train_acc = (max_indices == Y).sum().data.cpu().numpy()/max_indices.size()[0]
    return train_acc

In [124]:
def calc_f1(X, Y):
    f1 = f1_score(Y, X)
    return f1

In [125]:
for epoch in range(config['num_epochs']):
    train_acc = 0.0
    test_acc = 0.0
    train_f1 = 0.0
    test_f1 = 0.0
    
    model.train()
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(train_dataloader):
        optimizer.zero_grad()
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        valid_length= valid_length
        label = label.long().to(device)
        out = model(token_ids, valid_length, segment_ids)
        loss = loss_fn(out, label)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), config['max_grad_norm'])
        optimizer.step()
        scheduler.step()  # Update learning rate schedule
        train_acc += calc_accuracy(out, label)
        # train_f1 += calc_f1(out, label)
        print(f'epoch: {epoch} | train_acc: {train_acc / (batch_id+1)}')
        
    model.eval()
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        valid_length= valid_length
        label = label.long().to(device)
        out = model(token_ids, valid_length, segment_ids)
        test_acc += calc_accuracy(out, label)
        # test_f1 +=calc_f1(out, label)
    print(f'epoch: {epoch} | test_acc: {test_acc / (batch_id+1)}')

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):


  0%|          | 0/4 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

  0%|          | 0/4 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

  0%|          | 0/4 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

  0%|          | 0/4 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

  0%|          | 0/4 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

  0%|          | 0/4 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

  0%|          | 0/4 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

  0%|          | 0/4 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

  0%|          | 0/4 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

  0%|          | 0/4 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

  0%|          | 0/4 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

  0%|          | 0/4 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

  0%|          | 0/4 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

  0%|          | 0/4 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

  0%|          | 0/4 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

  0%|          | 0/4 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

  0%|          | 0/4 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

  0%|          | 0/4 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

  0%|          | 0/4 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

  0%|          | 0/4 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

  0%|          | 0/4 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

  0%|          | 0/4 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

  0%|          | 0/4 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

  0%|          | 0/4 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

  0%|          | 0/4 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [127]:
torch.save(model.state_dict(), "news.pt")
modelload = BERT_Classifier(bertmodel,  dr_rate=0.1).to(device)
modelload.load_state_dict(torch.load("news.pt", device))

<All keys matched successfully>

In [128]:
def testModel(model, seq):
    cate = ['up', 'down', 'same']
    tmp = [seq]
    transform = nlp.data.BERTSentenceTransform(tok, config['max_len'], pad=True, pair=False)
    tokenized = transform(tmp)

    modelload.eval()
    result = model(torch.tensor([tokenized[0]]).to(device), [tokenized[1]], torch.tensor(tokenized[2]).to(device))
    idx = result.argmax().cpu().item()
    print("뉴스의 카테고리는:", cate[idx])

In [178]:
txt = df_summ['summary'][306]
txt

"'돈나무 언니'로 알려진 캐시 우드(Cathie Wood) 아크 인베스트먼트 최고경영자(CEO)는 지난 9일(현지시간) 아크 이노베이션 ETF(ARKK)에 테슬라 주식 약 2만 7,594주를 추가하고 코인베이스 주식은 지난 8일(현지시간) 42만 주, 9일(현지시간) 20만 7,527주 매수한 것으로 확인됐다."

In [179]:
testModel(model, txt)

뉴스의 카테고리는: up


In [155]:
for i in range(5):
    print(df_summ['summary'][i])

애플이 한국코리아의 신임 사장으로 마크 리(Mark lee) 전 어플라이드 머티어리얼즈 코리아 대표를 선임한 것으로 밝혀졌다.
한 달새 애플의 고위 임원 3명이 회사를 떠난 것으로 알려지면서 그 배경을 두고 관심이 쏠리고 있다.
백승준 카카오페이 사업총괄 리더는 NFC 방식(의 카드단말기)를 쓰고 있는데 국내 오프라인 시장은 대부분 MTS 방식이어서 결제 디바이스를 NFC 방식으로 전환해야 하는 어려움이 있고 전환을 위해 상당한 비용이 소요되는 것으로 알고 있다며 애플페이의 국내 도입에 대해 "예의주시하고 있다"고 밝혔다.
11월 1일 월스트리트저널(WSJ) 등 외신에 따르면 지난 주말 중국 허난성 정저우시 폭스콘 공장에서 탈출하는 것으로 추정되는 근로자의 모습이 담긴 동영상이 공개됐다.
코로나19로 봉쇄된 중국 정저우 아이폰 조립 공장에서 탈출해 집으로 돌아가는 길에는 쓰레기가 산더미처럼 쌓여있고, 먹을 것조차 제대로 배급되지 않아 탈출극이 벌어졌다.


In [156]:
for i in range(5):
    print(df_summ['content'][i])

[윤구 대표 돌연 사임 후 1년 만에 공석 채워美 해군사관학교 출신 반도체 장비 전문가삼성전자, 영업 총괄 경험…韓 IT 이해 밝아애플코리아, 과다수수료 논란 등 숙제 ‘산더미’마크 리 신임 애플코리아 한국 사업 총괄 겸 사장. /링크드인 캡처        애플코리아가 한국 사업을 총괄하는 신임 사장으로 마크 리(Mark lee) 전 어플라이드 머티어리얼즈 코리아 대표를 선임한 것으로 확인됐다. 지난해 11월 윤구(브랜든 윤) 대표가 돌연 사임한 뒤 1년 만 공석이 채워지는 셈이다.1일 정보통신기술(ICT)업계에 따르면, 애플은 최근 마크 리 전 대표를 한국 영업을 총괄하는 국내 영업 총괄(사장)로 선임했다. 2018년 이후 현재까지 애플코리아의 법적 대표이사는 피터 덴우드다. 하지만 애플은 한국에서 실질적인 활동을 위해 국내 영업 총괄 사장이 제너럴 매니저(총괄적인 책임을 맡은 사람) 역할을 수행해왔다. 마크 리 대표는 애플코리아의 대표 역할을 수행한다.앞서 지난해 11월 삼성전자 출신의 윤구 대표가 일신상의 이유로 돌연 사임한 바 있다. 이후 애플코리아의 사장 자리는 1년간 공석이었다. 정치권에서는 인앱결제 이슈 등 국정감사나 정부의 조사 과정에서 책임을 지지 않기 위해 일부러 사장 자리를 공석으로 두는 게 아니냐는 지적도 나왔다.IT업계 관계자는 “마크 리는 오랜 기간 반도체 장비 업체에서 근무하면서 고객사인 삼성전자의 영업 총괄을 맡아 왔다”며 “누구보다 삼성과 한국에 대한 이해도가 높다”고 했다. 그는 “주한미국상공회의소(암참)에서도 활발히 활동해 국내 IT업계의 사정이나 인맥을 구축한 인물이다”라고 덧붙였다.마크 리 신임 사장은 반도체 장비 전문가다. 그는 1995년 미국 해군사관학교에서 기계공학 학사학위를 받고, 1995년부터 2000년까지 장교로 복무했다. 이후 2000년 미국 반도체 장비 노벨러스시스템즈(현재 램리서치와 합병)에 입사한 뒤, 2003년 어플라이드 머티어리얼즈로 이직했다. 이후 본사에서 전략적 사업 개발, 운영, 글로벌 책임자로 