# 📰 어린이 뉴스 Summarization

## import

In [1]:
import os
import numpy as np
import pandas as pd
import re
import io
import boto3

import warnings
warnings.filterwarnings(action='ignore')

## Load data from Amazon S3 into pandas

In [2]:
# S3 location of the JSON files to load.
bucket_name = 'nft-newsdb'
prefix = 'kid_news/'
# The key file is parsed as headerless `name=value` rows, so column 1 holds the values.
_key = pd.read_csv('/notebooks/rootkey.csv', header=None, sep='=')

In [3]:
# Build the S3 client used by the rest of the notebook.
def s3_connection():
    """Create a boto3 S3 client for the ``ap-northeast-2`` region.

    The access key id and the secret access key are read from rows 0 and 1 of
    column 1 of the module-level ``_key`` frame.

    Returns:
        The boto3 S3 client object.

    Raises:
        Exception: Whatever client construction raised. The error is printed
            first and then re-raised, so the caller never receives ``None``.
    """
    try:
        client = boto3.client(
            service_name="s3",
            region_name="ap-northeast-2",
            aws_access_key_id=_key[1][0],
            aws_secret_access_key=_key[1][1],
        )
    except Exception as e:
        print(e)
        # Falling through here used to return None implicitly; the failure then
        # only showed up later as an AttributeError on the first S3 call.
        raise
    # NOTE(review): this message only reflects that the client object was
    # built; confirm whether any request has reached S3 at this point.
    print("s3 bucket connected!")
    return client

In [4]:
# Module-level client; s3_get_all_keys and read_json_from_s3 read this global.
s3 = s3_connection()

s3 bucket connected!


In [5]:
# List every object key in S3 that matches the given listing arguments.
def s3_get_all_keys(**args):
    """Collect the keys of all objects returned by a paginated ``list_objects_v2`` call.

    Uses the module-level ``s3`` client.

    Args:
        **args: Keyword arguments forwarded unchanged to ``paginate`` (the
            notebook passes ``Bucket`` and ``Prefix``).

    Returns:
        list: Object keys in the order the pages return them. Paging stops at
        the first page that has no ``Contents`` entry.
    """
    paginator = s3.get_paginator("list_objects_v2")
    collected = []
    for page in paginator.paginate(**args):
        # A page without 'Contents' ends the listing.
        if 'Contents' not in page:
            break
        collected.extend(entry['Key'] for entry in page['Contents'])
    return collected

In [6]:
def read_json_from_s3(filename):
    """Download one object from the configured bucket into an in-memory buffer.

    Uses the module-level ``s3`` client and ``bucket_name``.

    Args:
        filename: Object key to fetch.

    Returns:
        io.BytesIO: Buffer holding the full body of the object.
    """
    response = s3.get_object(Bucket=bucket_name, Key=filename)
    payload = response["Body"].read()
    return io.BytesIO(payload)

In [7]:
# List every object key under the prefix, then download each object into memory.
json_file_list = s3_get_all_keys(Bucket=bucket_name, Prefix=prefix)
json_file = [read_json_from_s3(key) for key in json_file_list]

In [8]:
# Parse every downloaded buffer first and concatenate once: calling pd.concat
# inside the loop re-copies the accumulated frame on every iteration.
frames = [pd.read_json(buffer) for buffer in json_file]
# pd.concat raises on an empty list, so keep the empty-frame result for that case.
# ignore_index=True yields the same fresh 0..n-1 index as reset_index(drop=True).
df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

---

## Data Preprocessing

In [9]:
# Character count of each article body, used by the length filter below.
df['news_len'] = df['news_article'].apply(len)

In [10]:
# Drop articles whose body is 300 characters or shorter.
# Goal: keep articles with more than three sentences so there is enough to summarise.
is_long_enough = df['news_len'] > 300
df = df[is_long_enough].reset_index(drop=True)
df.head(2)

Unnamed: 0,news_url,news_title,news_subtitle,news_writer,news_date,news_article,news_img,news_source,news_len
0,http://kid.chosun.com/site/data/html_dir/2022/...,"[아는 것이 힘이다!] 아메리카 인디언의 벌새 전설, 기생충과 박테리아",,현기성 기자,2022-04-19 00:01,"아메리카 인디언의 벌새 전설 아주 먼 옛날, 인간이 생기기도 전에 거대한 불길이 느...",http://kid.chosun.com/site/data/img_dir/2022/0...,어린이조선일보,830
1,http://kid.chosun.com/site/data/html_dir/2022/...,"거리 두기 끝… 일상 되찾는 학교, 확진돼도 기말고사 치를 수 있다",‘5월 이후 정상 등교’ 여부 검토 중,진현경 기자,2022-04-19 00:01,어제(18일)부터 마스크 착용을 제외한 '사회적 거리 두기'가 전면 해제됐다. 사적...,http://kid.chosun.com/site/data/img_dir/2022/0...,어린이조선일보,838


In [11]:
def clean_text(text):
    """Normalise a text field by stripping markup, punctuation and Chinese characters.

    The steps run in a fixed order:
      1. remove ``[...]`` and ``(...)`` spans together with their contents;
      2. turn ``?`` / ``!`` into ``.`` and the separators ``, · : - _`` into spaces;
      3. lower-case and collapse whitespace runs to a single space;
      4. remove ``<...>`` spans that contain no Hangul syllable;
      5. drop every character that is not Hangul, a word character, ``.`` or a space;
      6. drop characters in the range 一-龥;
      7. collapse repeated whitespace and trim both ends.

    Args:
        text: Value to clean. It is converted with ``str()`` first, so
            non-string input (including ``None``/NaN) is cleaned as its
            string representation.

    Returns:
        str: The cleaned text.
    """
    text = str(text)
    text = re.sub(r'\[.*?\]', '', text)       # [bracketed] tags such as section labels
    text = re.sub(r'\(.*?\)', '', text)       # (parenthetical) asides
    text = re.sub(r'[?!]', '.', text)         # treat ? and ! as sentence-ending periods
    text = re.sub(r'[,·:\-_]', ' ', text)     # separator punctuation becomes a space
    text = text.lower()
    text = re.sub(r'\s+', ' ', text)
    # Excluding '>' keeps a match inside one bracket pair; without it the greedy
    # class ran from the first '<' to the last '>' and deleted the text between
    # two separate spans. Spans containing Hangul are kept here; their brackets
    # are removed by the character filter below.
    text = re.sub(r'<[^가-힣>]+>', '', text)
    # Raw strings: '\w' and '\s' in a plain literal are invalid escape sequences.
    text = re.sub(r'[^가-힣\w. ]', '', text)
    # \w also matches Chinese characters, so they are removed explicitly.
    text = re.sub(r'[一-龥]', '', text)
    text = re.sub(r'\s{2,}', ' ', text)
    # A single final strip covers the leading/trailing whitespace handling.
    return text.strip()

In [12]:
# Clean the title and the article body with the same normaliser.
for text_column in ("news_title", "news_article"):
    df[text_column] = df[text_column].apply(clean_text)

In [13]:
# Keep only the columns needed downstream. The explicit copy makes news_df an
# independent frame, so the later column assignment on it does not write to a
# slice of df (which triggers SettingWithCopyWarning).
news_df = df[['news_source', 'news_date', 'news_url', 'news_title', 'news_img', 'news_article']].copy()

```
+--------------+--------------+------+-----+---------+----------------+
| Field        | Type         | Null | Key | Default | Extra          |
+--------------+--------------+------+-----+---------+----------------+
| id           | bigint       | NO   | PRI | NULL    | auto_increment |
| news_source  | varchar(20)  | NO   |     | NULL    |                |
| news_date    | datetime(6)  | NO   |     | NULL    |                |
| news_url     | varchar(500) | NO   |     | NULL    |                |
| news_title   | varchar(200) | NO   |     | NULL    |                |
| news_image   | varchar(500) | YES  |     | NULL    |                |
| news_article | longtext     | YES  |     | NULL    |                |
+--------------+--------------+------+-----+---------+----------------+
```

In [14]:
# Preview the first row of the selected columns.
news_df.head(1)

Unnamed: 0,news_source,news_date,news_url,news_title,news_img,news_article
0,어린이조선일보,2022-04-19 00:01,http://kid.chosun.com/site/data/html_dir/2022/...,아메리카 인디언의 벌새 전설 기생충과 박테리아,http://kid.chosun.com/site/data/img_dir/2022/0...,아메리카 인디언의 벌새 전설 아주 먼 옛날 인간이 생기기도 전에 거대한 불길이 느닷...


## Summarization

In [15]:
from modeling import summary

In [16]:
# Run the imported `summary` function on every cleaned article body.
# NOTE(review): the recorded output of this cell repeats the "Loading checkpoint"
# log block once per call, which suggests `summary` reloads the model each time.
# Confirm in the modeling module.
news_df['sum_article'] = news_df['news_article'].apply(summary)

[2022-04-23 20:31:41,401 INFO] Loading checkpoint from /notebooks/KorBertSum/models/bert_classifier/backup/model_step_10000.pt
[2022-04-23 20:31:47,017 INFO] loading archive file /notebooks/model/001_bert_morp_pytorch
[2022-04-23 20:31:47,022 INFO] Model config {
  "attention_probs_dropout_prob": 0.1,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "type_vocab_size": 2,
  "vocab_size": 30349
}

[2022-04-23 20:31:50,063 INFO] * number of parameters: 109350145


gpu_rank 0


[2022-04-23 20:31:54,967 INFO] Loading checkpoint from /notebooks/KorBertSum/models/bert_classifier/backup/model_step_10000.pt
[2022-04-23 20:32:00,953 INFO] loading archive file /notebooks/model/001_bert_morp_pytorch
[2022-04-23 20:32:00,958 INFO] Model config {
  "attention_probs_dropout_prob": 0.1,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "type_vocab_size": 2,
  "vocab_size": 30349
}

[2022-04-23 20:32:04,063 INFO] * number of parameters: 109350145


gpu_rank 0


[2022-04-23 20:32:08,982 INFO] Loading checkpoint from /notebooks/KorBertSum/models/bert_classifier/backup/model_step_10000.pt
[2022-04-23 20:32:15,079 INFO] loading archive file /notebooks/model/001_bert_morp_pytorch
[2022-04-23 20:32:15,084 INFO] Model config {
  "attention_probs_dropout_prob": 0.1,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "type_vocab_size": 2,
  "vocab_size": 30349
}

[2022-04-23 20:32:18,236 INFO] * number of parameters: 109350145


gpu_rank 0


[2022-04-23 20:32:22,896 INFO] Loading checkpoint from /notebooks/KorBertSum/models/bert_classifier/backup/model_step_10000.pt
[2022-04-23 20:32:29,021 INFO] loading archive file /notebooks/model/001_bert_morp_pytorch
[2022-04-23 20:32:29,026 INFO] Model config {
  "attention_probs_dropout_prob": 0.1,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "type_vocab_size": 2,
  "vocab_size": 30349
}

[2022-04-23 20:32:32,163 INFO] * number of parameters: 109350145


gpu_rank 0


[2022-04-23 20:32:37,137 INFO] Loading checkpoint from /notebooks/KorBertSum/models/bert_classifier/backup/model_step_10000.pt
[2022-04-23 20:32:42,631 INFO] loading archive file /notebooks/model/001_bert_morp_pytorch
[2022-04-23 20:32:42,635 INFO] Model config {
  "attention_probs_dropout_prob": 0.1,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "type_vocab_size": 2,
  "vocab_size": 30349
}

[2022-04-23 20:32:45,587 INFO] * number of parameters: 109350145


gpu_rank 0


[2022-04-23 20:32:50,161 INFO] Loading checkpoint from /notebooks/KorBertSum/models/bert_classifier/backup/model_step_10000.pt
[2022-04-23 20:32:55,732 INFO] loading archive file /notebooks/model/001_bert_morp_pytorch
[2022-04-23 20:32:55,736 INFO] Model config {
  "attention_probs_dropout_prob": 0.1,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "type_vocab_size": 2,
  "vocab_size": 30349
}

[2022-04-23 20:32:58,657 INFO] * number of parameters: 109350145


gpu_rank 0


[2022-04-23 20:33:03,419 INFO] Loading checkpoint from /notebooks/KorBertSum/models/bert_classifier/backup/model_step_10000.pt
[2022-04-23 20:33:08,881 INFO] loading archive file /notebooks/model/001_bert_morp_pytorch
[2022-04-23 20:33:08,887 INFO] Model config {
  "attention_probs_dropout_prob": 0.1,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "type_vocab_size": 2,
  "vocab_size": 30349
}

[2022-04-23 20:33:11,809 INFO] * number of parameters: 109350145


gpu_rank 0


[2022-04-23 20:33:16,750 INFO] Loading checkpoint from /notebooks/KorBertSum/models/bert_classifier/backup/model_step_10000.pt
[2022-04-23 20:33:22,389 INFO] loading archive file /notebooks/model/001_bert_morp_pytorch
[2022-04-23 20:33:22,394 INFO] Model config {
  "attention_probs_dropout_prob": 0.1,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "type_vocab_size": 2,
  "vocab_size": 30349
}

[2022-04-23 20:33:25,449 INFO] * number of parameters: 109350145


gpu_rank 0


[2022-04-23 20:33:30,386 INFO] Loading checkpoint from /notebooks/KorBertSum/models/bert_classifier/backup/model_step_10000.pt
[2022-04-23 20:33:36,335 INFO] loading archive file /notebooks/model/001_bert_morp_pytorch
[2022-04-23 20:33:36,340 INFO] Model config {
  "attention_probs_dropout_prob": 0.1,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "type_vocab_size": 2,
  "vocab_size": 30349
}

[2022-04-23 20:33:39,478 INFO] * number of parameters: 109350145


gpu_rank 0


[2022-04-23 20:33:44,062 INFO] Loading checkpoint from /notebooks/KorBertSum/models/bert_classifier/backup/model_step_10000.pt
[2022-04-23 20:33:50,097 INFO] loading archive file /notebooks/model/001_bert_morp_pytorch
[2022-04-23 20:33:50,103 INFO] Model config {
  "attention_probs_dropout_prob": 0.1,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "type_vocab_size": 2,
  "vocab_size": 30349
}

[2022-04-23 20:33:53,346 INFO] * number of parameters: 109350145


gpu_rank 0


[2022-04-23 20:33:58,467 INFO] Loading checkpoint from /notebooks/KorBertSum/models/bert_classifier/backup/model_step_10000.pt
[2022-04-23 20:34:04,588 INFO] loading archive file /notebooks/model/001_bert_morp_pytorch
[2022-04-23 20:34:04,594 INFO] Model config {
  "attention_probs_dropout_prob": 0.1,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "type_vocab_size": 2,
  "vocab_size": 30349
}

[2022-04-23 20:34:07,673 INFO] * number of parameters: 109350145


gpu_rank 0


[2022-04-23 20:34:12,229 INFO] Loading checkpoint from /notebooks/KorBertSum/models/bert_classifier/backup/model_step_10000.pt
[2022-04-23 20:34:17,951 INFO] loading archive file /notebooks/model/001_bert_morp_pytorch
[2022-04-23 20:34:17,957 INFO] Model config {
  "attention_probs_dropout_prob": 0.1,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "type_vocab_size": 2,
  "vocab_size": 30349
}

[2022-04-23 20:34:20,912 INFO] * number of parameters: 109350145


gpu_rank 0


[2022-04-23 20:34:25,406 INFO] Loading checkpoint from /notebooks/KorBertSum/models/bert_classifier/backup/model_step_10000.pt
[2022-04-23 20:34:30,835 INFO] loading archive file /notebooks/model/001_bert_morp_pytorch
[2022-04-23 20:34:30,839 INFO] Model config {
  "attention_probs_dropout_prob": 0.1,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "type_vocab_size": 2,
  "vocab_size": 30349
}

[2022-04-23 20:34:33,777 INFO] * number of parameters: 109350145


gpu_rank 0


[2022-04-23 20:34:38,732 INFO] Loading checkpoint from /notebooks/KorBertSum/models/bert_classifier/backup/model_step_10000.pt
[2022-04-23 20:34:44,753 INFO] loading archive file /notebooks/model/001_bert_morp_pytorch
[2022-04-23 20:34:44,757 INFO] Model config {
  "attention_probs_dropout_prob": 0.1,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "type_vocab_size": 2,
  "vocab_size": 30349
}

[2022-04-23 20:34:47,815 INFO] * number of parameters: 109350145


gpu_rank 0


[2022-04-23 20:34:53,162 INFO] Loading checkpoint from /notebooks/KorBertSum/models/bert_classifier/backup/model_step_10000.pt
[2022-04-23 20:34:59,319 INFO] loading archive file /notebooks/model/001_bert_morp_pytorch
[2022-04-23 20:34:59,324 INFO] Model config {
  "attention_probs_dropout_prob": 0.1,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "type_vocab_size": 2,
  "vocab_size": 30349
}

[2022-04-23 20:35:02,422 INFO] * number of parameters: 109350145


gpu_rank 0


[2022-04-23 20:35:07,138 INFO] Loading checkpoint from /notebooks/KorBertSum/models/bert_classifier/backup/model_step_10000.pt
[2022-04-23 20:35:13,202 INFO] loading archive file /notebooks/model/001_bert_morp_pytorch
[2022-04-23 20:35:13,207 INFO] Model config {
  "attention_probs_dropout_prob": 0.1,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "type_vocab_size": 2,
  "vocab_size": 30349
}

[2022-04-23 20:35:16,187 INFO] * number of parameters: 109350145


gpu_rank 0


[2022-04-23 20:35:21,179 INFO] Loading checkpoint from /notebooks/KorBertSum/models/bert_classifier/backup/model_step_10000.pt
[2022-04-23 20:35:27,422 INFO] loading archive file /notebooks/model/001_bert_morp_pytorch
[2022-04-23 20:35:27,426 INFO] Model config {
  "attention_probs_dropout_prob": 0.1,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "type_vocab_size": 2,
  "vocab_size": 30349
}

[2022-04-23 20:35:30,697 INFO] * number of parameters: 109350145


gpu_rank 0


[2022-04-23 20:35:35,670 INFO] Loading checkpoint from /notebooks/KorBertSum/models/bert_classifier/backup/model_step_10000.pt
[2022-04-23 20:35:41,958 INFO] loading archive file /notebooks/model/001_bert_morp_pytorch
[2022-04-23 20:35:41,964 INFO] Model config {
  "attention_probs_dropout_prob": 0.1,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "type_vocab_size": 2,
  "vocab_size": 30349
}

[2022-04-23 20:35:45,075 INFO] * number of parameters: 109350145


gpu_rank 0


[2022-04-23 20:35:49,800 INFO] Loading checkpoint from /notebooks/KorBertSum/models/bert_classifier/backup/model_step_10000.pt
[2022-04-23 20:35:55,643 INFO] loading archive file /notebooks/model/001_bert_morp_pytorch
[2022-04-23 20:35:55,647 INFO] Model config {
  "attention_probs_dropout_prob": 0.1,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "type_vocab_size": 2,
  "vocab_size": 30349
}

[2022-04-23 20:35:58,692 INFO] * number of parameters: 109350145


gpu_rank 0


[2022-04-23 20:36:03,673 INFO] Loading checkpoint from /notebooks/KorBertSum/models/bert_classifier/backup/model_step_10000.pt
[2022-04-23 20:36:09,223 INFO] loading archive file /notebooks/model/001_bert_morp_pytorch
[2022-04-23 20:36:09,228 INFO] Model config {
  "attention_probs_dropout_prob": 0.1,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "type_vocab_size": 2,
  "vocab_size": 30349
}

[2022-04-23 20:36:12,401 INFO] * number of parameters: 109350145


gpu_rank 0


## Load into Amazon RDS

In [None]:
# Select the upload columns and rename them to the `news` table's field names:
# news_img -> news_image, and the summary takes the news_article slot.
news_df_rds = news_df[
    ['news_source', 'news_date', 'news_url', 'news_title', 'news_img', 'sum_article']
].rename(columns={'news_img': 'news_image', 'sum_article': 'news_article'})

In [19]:
# Preview the first row of the upload-ready frame.
news_df_rds.head(1)

Unnamed: 0,news_source,news_date,news_url,news_title,news_image,news_article
0,어린이조선일보,2022-04-19 00:01,http://kid.chosun.com/site/data/html_dir/2022/...,아메리카 인디언의 벌새 전설 기생충과 박테리아,http://kid.chosun.com/site/data/img_dir/2022/0...,기겁한 동물들이 사방으로 흩어져 달아났다\n\n그런데 유독 한 동물만은 자리를 지켰...


In [20]:
# Snapshot the upload-ready frame to disk; the index is written as the first, unnamed column.
news_df_rds.to_csv('/notebooks/news_df_rds.csv', index=True)

In [21]:
# Reload the snapshot and discard the index column that to_csv wrote.
# Note: this rebinds `news_df`, replacing the frame that held news_img / sum_article.
news_df = pd.read_csv('/notebooks/news_df_rds.csv').drop(columns='Unnamed: 0')

In [23]:
# Preview the first row of the frame reloaded from the CSV snapshot.
news_df.head(1)

Unnamed: 0,news_source,news_date,news_url,news_title,news_image,news_article
0,어린이조선일보,2022-04-19 00:01,http://kid.chosun.com/site/data/html_dir/2022/...,아메리카 인디언의 벌새 전설 기생충과 박테리아,http://kid.chosun.com/site/data/img_dir/2022/0...,기겁한 동물들이 사방으로 흩어져 달아났다\n\n그런데 유독 한 동물만은 자리를 지켰...


In [24]:
from sqlalchemy import create_engine
import pymysql
import pandas as pd
# The connection URL is taken from the NEWS_DB_URL environment variable when it
# is set, so credentials and host can be supplied without editing the notebook.
# NOTE(review): the fallback keeps the previous hard-coded URL (including its
# password) for backward compatibility; remove it once NEWS_DB_URL is provided
# everywhere this notebook runs.
db_connection_str = os.environ.get(
    'NEWS_DB_URL',
    'mysql+pymysql://root:1234@172.19.0.2:3306/news-kids',
)
db_connection = create_engine(db_connection_str)
# Opens a connection up front; the upload cell passes the engine itself to to_sql.
conn = db_connection.connect()

In [401]:
# Append the rows to the `news` table without writing the DataFrame index.
# Because of if_exists='append', re-running this cell inserts the same rows again.
news_df_rds.to_sql(
    name='news',
    con=db_connection,
    if_exists='append',
    index=False,
)