In [58]:
from http.server import BaseHTTPRequestHandler, HTTPServer
import logging
import requests
import urllib.request
import json
import numpy as np
from tensorflow.keras.models import load_model
from gensim.models import FastText
from konlpy.tag import Okt
import boto3
import sys
import pandas as pd
from tqdm import tqdm
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from tensorflow.keras.preprocessing.sequence import pad_sequences
import urllib.parse
import urllib.request
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import TfidfVectorizer
import re
import math

MAX_NB_WORDS = 100000
SNU_model_All_rare = load_model('SNU_LSTM_Model_All.h5')

SNU_model_All = load_model('SNU_LSTM_Model_All_balance.h5')
SNU_model_pol = load_model('SNU_LSTM_Model_정치_balance.h5')
SNU_model_eco = load_model('SNU_LSTM_Model_경제_balance.h5')
SNU_model_soc = load_model('SNU_LSTM_Model_사회_balance.h5')
SNU_model_others = load_model('SNU_LSTM_Model_기타_balance.h5')

Naver_Comments_All = load_model('Naver_Comments_Model_All.h5')
Naver_Comments_pol = load_model('Naver_Comments_Model_정치.h5')
Naver_Comments_eco = load_model('Naver_Comments_Model_경제.h5')
Naver_Comments_soc = load_model('Naver_Comments_Model_사회.h5')
Naver_Comments_others = load_model('Naver_Comments_Model_기타.h5')

kakao_model = load_model('kakao_LSTM_model.h5')
# SNU_model_shuffle = load_model('SNU_LSTM_Model_shuffle.h5')
similar_model = FastText.load('similar_keyword_model')

class MyTokenizer:
    def __init__(self, tagger):
        self.tagger = tagger

    def __call__(self, sent):
        pos = self.tagger.pos(sent)
        clean_words = []  # 정제된 단어 리스트
        for word in pos:
            # word[1]은 품사를 의미하며, 여기서는 조사, 문장기호, 접두사, Foreign('\n'을 빼주기 위함)인 것은 제외시킴.
            if word[1] not in ['Josa', 'Punctuation', 'Suffix', 'Foreign']:
                if len(word[0]) >= 2:  # 한 글자인 단어들도 의미가 없는 경우가 많으므로 일단 제외.
                    #if word[0] not in ['있다', '했다', '한다', '없다', '된다']:
                    clean_words.append(word[0])
        return clean_words


my_Tokenizer = MyTokenizer(Okt())
okt = Okt()
stopwords = ['의','가','이','은','들','는','좀','잘','걍','과','도','를','으로','자','에','와','한','하다']

In [59]:
if sys.version_info[0] < 3:
    from io import StringIO # Python 2.x
else:
    from io import StringIO # Python 3.x

# get your credentials from environment variables
aws_id = ''
aws_secret = ''

client = boto3.client('s3', aws_access_key_id=aws_id,
        aws_secret_access_key=aws_secret)

bucket_name = 'snucsv'
label_names = ["label"]

def s3_load(category):
    object_key = category
    csv_obj = client.get_object(Bucket=bucket_name, Key=object_key)
    body = csv_obj['Body']
    csv_string = body.read().decode('cp949')

    # load data
    train_df = pd.read_csv(StringIO(csv_string))
    y_train = train_df[label_names].values

    train_df['doc_len'] = train_df['document'].apply(lambda words: len(words.split(" ")))
    max_seq_len = np.round(train_df['doc_len'].mean() + train_df['doc_len'].std()).astype(int)
    
    return train_df, y_train, max_seq_len

def s3_Comments_load(category):
    object_key = category
    csv_obj = client.get_object(Bucket=bucket_name, Key=object_key)
    body = csv_obj['Body']
    csv_string = body.read().decode('utf-8')

    # load data
    train_df = pd.read_csv(StringIO(csv_string))
    y_train = train_df[label_names].values

    train_df['doc_len'] = train_df['Comments'].apply(lambda words: len(words.split(" ")))
    max_seq_len = np.round(train_df['doc_len'].mean() + train_df['doc_len'].std()).astype(int)
    
    return train_df, y_train, max_seq_len

train_df_All_rare, y_train_All_rare, max_seq_len_rare = s3_load('SNU_All.csv')

train_df_All_, y_train_All_, max_seq_len = s3_load('SNU_All_.csv')
train_df_All_b, y_train_All_b, max_seq_len_All_b = s3_load('SNU_All_b.csv')
train_df_eco, y_train_eco, max_seq_len_eco = s3_load('SNU_경제_b.csv')
train_df_pol, y_train_pol, max_seq_len_pol = s3_load('SNU_정치_b.csv')
train_df_soc, y_train_soc, max_seq_len_soc = s3_load('SNU_사회_b.csv')
train_df_others, y_train_others, max_seq_len_others = s3_load('SNU_기타_b.csv')

train_df_Comments_All, y_train_Comments_All, max_seq_len_Comments = s3_Comments_load('Naver_Comments_All.csv')
train_df_Comments_pol, y_train_Comments_pol, max_seq_len_Comments_pol = s3_Comments_load('Naver_Comments_정치.csv')
train_df_Comments_eco, y_train_Comments_eco, max_seq_len_Comments_eco = s3_Comments_load('Naver_Comments_경제.csv')
train_df_Comments_soc, y_train_Comments_soc, max_seq_len_Comments_soc = s3_Comments_load('Naver_Comments_사회.csv')
train_df_Comments_others, y_train_Comments_All, max_seq_len_Comments_others = s3_Comments_load('Naver_Comments_기타.csv')

In [60]:
train_df_All_rare['label'] = train_df_All_rare['label'].map({'전혀 사실 아님':0, '대체로 사실 아님':0, '사실':1, '대체로 사실':1})
train_df_All_rare.dropna(axis=0, inplace=True)
train_df_All_rare.reset_index(drop=True, inplace=True)
train_df_All_rare['label'].astype(int)
y_train_All_rare = train_df_All_rare['label'].values

In [61]:
def embedding_padding(train_df):
    raw_docs_train = train_df['document'].tolist()
    num_classes = len(label_names)
    print(num_classes)
    processed_docs_train = []

    for doc in tqdm(raw_docs_train):
        tokens = my_Tokenizer(doc)
        processed_docs_train.append(tokens)

    tokenizer = Tokenizer(num_words=MAX_NB_WORDS, lower=True, char_level=False)
    tokenizer.fit_on_texts(processed_docs_train)
    return tokenizer

def Comments_embedding_padding(train_df):
    raw_docs_train = train_df['Comments'].tolist()
    num_classes = len(label_names)
    print(num_classes)
    processed_docs_train = []

    for doc in tqdm(raw_docs_train):
        tokens = my_Tokenizer(doc)
        processed_docs_train.append(tokens)

    tokenizer = Tokenizer(num_words=MAX_NB_WORDS, lower=True, char_level=False)
    tokenizer.fit_on_texts(processed_docs_train)
    return tokenizer

tokenizer_All_rare = embedding_padding(train_df_All_rare)

tokenizer_All = embedding_padding(train_df_All_b)
tokenizer_pol = embedding_padding(train_df_pol)
tokenizer_soc = embedding_padding(train_df_soc)
tokenizer_eco = embedding_padding(train_df_eco)
tokenizer_others = embedding_padding(train_df_others)

tokenizer_Comments_All = Comments_embedding_padding(train_df_Comments_All)
tokenizer_Comments_pol = Comments_embedding_padding(train_df_Comments_pol)
tokenizer_Comments_eco = Comments_embedding_padding(train_df_Comments_eco)
tokenizer_Comments_soc = Comments_embedding_padding(train_df_Comments_soc)
tokenizer_Comments_others = Comments_embedding_padding(train_df_Comments_others)

  1%|▏         | 49/3452 [00:00<00:07, 482.07it/s]

1


100%|██████████| 3452/3452 [00:23<00:00, 145.71it/s]
  4%|▎         | 51/1441 [00:00<00:02, 499.98it/s]

1


100%|██████████| 1441/1441 [00:09<00:00, 145.58it/s]
  0%|          | 0/509 [00:00<?, ?it/s]

1


100%|██████████| 509/509 [00:03<00:00, 133.85it/s]
  8%|▊         | 40/490 [00:00<00:01, 390.82it/s]

1


100%|██████████| 490/490 [00:03<00:00, 148.84it/s]
 19%|█▊        | 37/198 [00:00<00:00, 365.09it/s]

1


100%|██████████| 198/198 [00:01<00:00, 164.56it/s]
  0%|          | 0/257 [00:00<?, ?it/s]

1


100%|██████████| 257/257 [00:01<00:00, 128.75it/s]
  0%|          | 10/9615 [00:00<01:38, 97.47it/s]

1


100%|██████████| 9615/9615 [03:33<00:00, 44.95it/s]
  0%|          | 0/3478 [00:00<?, ?it/s]

1


100%|██████████| 3478/3478 [01:14<00:00, 46.54it/s]
  0%|          | 0/1231 [00:00<?, ?it/s]

1


100%|██████████| 1231/1231 [00:26<00:00, 47.10it/s]
  0%|          | 0/2934 [00:00<?, ?it/s]

1


100%|██████████| 2934/2934 [01:06<00:00, 44.15it/s]
  0%|          | 0/1537 [00:00<?, ?it/s]

1


100%|██████████| 1537/1537 [00:32<00:00, 47.86it/s]


In [62]:
# ===========================================================================================================================
# 단어 토큰화 및 빈도순 정렬 후 추출

def tfidf_Vectorizer(response):
    response_list = []
    response_list.append(response)
    tfidf_Vectorizer = TfidfVectorizer(tokenizer=my_Tokenizer, min_df=1) # df 값(단어가 몇 문장들에서 등장하였는지)을 최소 'min_df' 값으로 설정.
    X = tfidf_Vectorizer.fit_transform(response_list).toarray()
#     print(X.shape)    # X(2차원 배열)의 행,열 수를 출력.
#     print(tfidf_Vectorizer.vocabulary_)   # 각 단어들이 배열 X에서 몇번째 열(인덱스 값)에 해당하는지 출력.


    #pandas를 활용하여 각 단어들의 각 문장에서의 tf-idf 값들을 모두 더하고, 내림차순으로 정렬하여 상위 n개 출력
    count = X.sum(axis=0)    # 2차원 배열 X에서 각 열을 기준으로 합을 구함. (각 단어들의 '최종' tf-idf 값으로 간주.)
    word_count = pd.DataFrame({
        '단어' : tfidf_Vectorizer.get_feature_names(),
        '빈도' : count.flat
    })
    sorted_df = word_count.sort_values('빈도', ascending=False)
#     print(sorted_df.head(10), "\n")

    word_ = list(np.array(sorted_df['단어'].tolist()))
    return(word_)


# ===========================================================================================================================
# 네이버 댓글 분석

def Naver_Comments_score(response, tokenizer, tokenizer_Comments, SNU_Model, Naver_Comments_Model):
    score_list = []
    for i in tqdm(range(len(response))):
        response_ = re.compile("[^A-Za-z0-9ㄱ-ㅎㅏ-ㅣ가-힣 ]+").sub('',response[i])
        response_ = okt.morphs(response_, stem=True) # 토큰화
        response_ = [word for word in response_ if not word in stopwords] # 불용어 제거
        response_ = ' '.join(response_)
        word_ = tfidf_Vectorizer(response_)
        word__ = ""
        if len(word_) <= 10:
            word__ = response_
        else:
            for i in range(1, min(len(word_), 11)):
                word__ += word_[i]
                word__ += " "

    # ================================================================

        tokens_response = []
        SNU_score_list = []

        tokens = my_Tokenizer(response_)
        tokens_response.append(tokens)

        word_seq_response = tokenizer.texts_to_sequences(tokens_response)
        word_seq_response = sequence.pad_sequences(word_seq_response, maxlen=max_seq_len)

        word_seq_response_to_score = word_seq_response.reshape(1,max_seq_len)
        SNU_score_list.append(float(SNU_Model.predict(word_seq_response_to_score)))

        df_qq = []
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36'}

        allComments = []
        def create_soup(url):
            res = requests.get(url, headers=headers)
            res.raise_for_status()
            soup = BeautifulSoup(res.text, 'html.parser')
            return soup

        Comments_score_list = []
        # 네이버 뉴스 url을 입력합니다.
        url1 = "https://search.naver.com/search.naver?where=news&sm=tab_jum&query={}".format(word__)
        soup = create_soup(url1)
        for i in range(1, 15):
            List = []
            try:
                url = soup.select_one("#sp_nws{} > div.news_wrap.api_ani_send > div > div.news_info > div.info_group > a:nth-of-type(2)".format(i))['href']
                oid = url.split("oid=")[1].split("&")[0] #422
                aid = url.split("aid=")[1] #0000430957
                page = 1
                header = {
                    "User-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36",
                    "referer": url,
                }
                # time.sleep(random.randint(1,2))

                while True:
                    c_url = "https://apis.naver.com/commentBox/cbox/web_neo_list_jsonp.json?ticket=news&templateId=default_society&pool=cbox5&_callback=jQuery1707138182064460843_1523512042464&lang=ko&country=&objectId=news" + oid + "%2C" + aid + "&categoryId=&pageSize=20&indexSize=10&groupId=&listType=OBJECT&pageType=more&page=" + str(
                        page) + "&refresh=false&sort=FAVORITE"
                    # 파싱하는 단계입니다.
                    r = requests.get(c_url, headers=header)
                    cont = BeautifulSoup(r.content, "html.parser")
                    total_comm = str(cont).split('comment":')[1].split(",")[0]

                    match = re.findall('"contents":([^\*]*),"userIdNo"', str(cont))
                    # 댓글을 리스트에 중첩합니다.
                    try:
                        for b in range(0,3):
                            List.append(match[b])
                    except:
                        pass
                    break

                tokens_response = []
                score_list_sum = []

                for i in range(len(List)):
                    tokens = my_Tokenizer(List[i])
                    tokens_response.append(tokens)

                word_seq_response2 = tokenizer_Comments.texts_to_sequences(tokens_response)
                word_seq_response2 = sequence.pad_sequences(word_seq_response2, maxlen=max_seq_len_Comments)
                
                for sco in range(len(List)):
                    word_seq_response_to_score2 = word_seq_response2[sco].reshape(1,max_seq_len_Comments)
                    score_list_sum.append(float(Naver_Comments_Model.predict(word_seq_response_to_score2)))

                Comments_score_list.append(np.mean(score_list_sum))

            except:
                pass
            
        Comments_score_list_final = []
        
        Comments_score_list = [x for x in Comments_score_list if math.isnan(x)==False]
        Comments_score_list_final.append(np.mean(Comments_score_list))

        a = [score * 0.6 for score in SNU_score_list]
        b = [score * 0.4 for score in Comments_score_list_final]
        for i in range(len(b)):
            if math.isnan(b[i]):
                b[i] = a[i]

        score_list_sum = list(map(lambda x,y:x+y, a,b))
        score_list.append(score_list_sum[0])

    return score_list

def Naver_Score(response, tokenizer, SNU_Model):
    SNU_score_list = []
    for i in tqdm(range(len(response))):
        response_ = re.compile("[^A-Za-z0-9ㄱ-ㅎㅏ-ㅣ가-힣 ]+").sub('',response[i])
        response_ = okt.morphs(response_, stem=True) # 토큰화
        response_ = [word for word in response_ if not word in stopwords] # 불용어 제거
        response_ = ' '.join(response_)
        word_ = tfidf_Vectorizer(response_)
        word__ = ""
        if len(word_) <= 6:
            word__ = response
        else:
            for i in range(1, min(len(word_), 7)):
                word__ += word_[i]
                word__ += " "

    # ================================================================

        tokens_response = []

        tokens = my_Tokenizer(response_)
        tokens_response.append(tokens)

        word_seq_response = tokenizer.texts_to_sequences(tokens_response)
        word_seq_response = sequence.pad_sequences(word_seq_response, maxlen=max_seq_len)

        word_seq_response_to_score = word_seq_response.reshape(1,max_seq_len)
        SNU_score_list.append(float(SNU_Model.predict(word_seq_response_to_score)))
    return SNU_score_list

In [63]:
train_df_pol_test = train_df_pol.sample(frac=0.1).reset_index(drop=True)
y_train_pol_test = train_df_pol_test['label'].values

In [64]:
test_list = train_df_pol_test['document'].values.tolist()

In [65]:
len(test_list)

51

In [66]:
balance_Comments_score_pol = Naver_Comments_score(test_list, tokenizer_pol, tokenizer_Comments_pol, SNU_model_pol, Naver_Comments_pol)
balance_score_pol = Naver_Score(test_list, tokenizer_pol, SNU_model_pol)
All_score_pol = Naver_Score(test_list, tokenizer_All, SNU_model_All)
All_rare_score_pol = Naver_Score(test_list, tokenizer_All_rare, SNU_model_All_rare)

# balance_Comments_score_eco = Naver_Comments_score(test_list, tokenizer_eco, tokenizer_Comments_eco, SNU_model_eco, Naver_Comments_eco)
# balance_score_eco = Naver_Score(test_list, tokenizer_eco, SNU_model_eco)
# All_score_eco = Naver_Score(test_list, tokenizer_All, SNU_model_All)
# All_rare_score_eco = Naver_Score(test_list, tokenizer_All_rare, SNU_model_All_rare)

# balance_Comments_score_soc = Naver_Comments_score(test_list, tokenizer_soc, tokenizer_Comments_soc, SNU_model_soc, Naver_Comments_soc)
# balance_score_soc = Naver_Score(test_list, tokenizer_soc, SNU_model_soc)
# All_score_soc = Naver_Score(test_list, tokenizer_All, SNU_model_All)
# All_rare_score_soc = Naver_Score(test_list, tokenizer_All_rare, SNU_model_All_rare)

# balance_Comments_score_others = Naver_Comments_score(test_list, tokenizer_others, tokenizer_Comments_others, SNU_model_others, Naver_Comments_others)
# balance_score_others = Naver_Score(test_list, tokenizer_others, SNU_model_others)
# All_score_others = Naver_Score(test_list, tokenizer_All, SNU_model_All)
# All_rare_score_others = Naver_Score(test_list, tokenizer_All_rare, SNU_model_All_rare)

  0%|          | 0/51 [00:00<?, ?it/s]



  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
100%|██████████| 51/51 [01:06<00:00,  1.31s/it]
100%|██████████| 51/51 [00:04<00:00, 11.15it/s]
  0%|          | 0/51 [00:00<?, ?it/s]



100%|██████████| 51/51 [00:06<00:00,  7.66it/s]
100%|██████████| 51/51 [00:07<00:00,  6.98it/s]


In [67]:
from sklearn.metrics import accuracy_score

def data_(test_score):
    for i in range(len(test_score)):
        if test_score[i] >= 0.5:
            test_score[i]=1
        else:
            test_score[i]=0
    return test_score

balance_Comments_score_pol = data_(balance_Comments_score_pol)
balance_score_pol = data_(balance_score_pol)
All_score_pol = data_(All_score_pol)
All_rare_score_pol = data_(All_rare_score_pol)

# balance_Comments_score_eco = data_(balance_Comments_score_eco)
# balance_score_eco = data_(balance_score_eco)
# All_score_eco = data_(All_score_eco)
# All_rare_score_eco = data_(All_rare_score_eco)

# balance_Comments_score_soc = data_(balance_Comments_score_soc)
# balance_score_soc = data_(balance_score_soc)
# All_score_soc = data_(All_score_soc)
# All_rare_score_soc = data_(All_rare_score_soc)

# balance_Comments_score_others = data_(balance_Comments_score_others)
# balance_score_others = data_(balance_score_others)
# All_score_others = data_(All_score_others)
# All_rare_score_others = data_(All_rare_score_others)

print('정치 카테고리 (Under Sampling) with Comments : ', accuracy_score(balance_Comments_score_pol, y_train_pol_test))
print('정치 카테고리 (Under Sampling) : ', accuracy_score(balance_score_pol, y_train_pol_test))
print('All 카테고리 (underSampling) : ', accuracy_score(All_score_pol, y_train_pol_test))
print('All 카테고리 : ', accuracy_score(All_rare_score_pol, y_train_pol_test))

# print('경제 카테고리 (Under Sampling) with Comments : ', accuracy_score(balance_Comments_score_eco, y_train_eco_test))
# print('경제 카테고리 (Under Sampling) : ', accuracy_score(balance_score_eco, y_train_eco_test))
# print('All 카테고리 (underSampling) : ', accuracy_score(All_score_eco, y_train_eco_test))
# print('All 카테고리 : ', accuracy_score(All_rare_score_eco, y_train_eco_test))

# print('사회 카테고리 (Under Sampling) with Comments : ' accuracy_score(balance_Comments_score_soc, y_train_soc_test))
# print('사회 카테고리 (Under Sampling) : ' accuracy_score(balance_score_soc, y_train_soc_test))
# print('All 카테고리 (underSampling) : 'accuracy_score(All_score_soc, y_train_soc_test))
# print('All 카테고리 : 'accuracy_score(All_rare_score_soc, y_train_soc_test))

# print('기타 카테고리 (Under Sampling) with Comments : ' accuracy_score(balance_Comments_score_others, y_train_others_test))
# print('기타 카테고리 (Under Sampling) : ' accuracy_score(balance_score_others, y_train_others_test))
# print('All 카테고리 (underSampling) : 'accuracy_score(All_score_others, y_train_others_test))
# print('All 카테고리 : 'accuracy_score(All_rare_score_others, y_train_others_test))

정치 카테고리 (Under Sampling) with Comments :  0.7843137254901961
정치 카테고리 (Under Sampling) :  0.803921568627451
All 카테고리 (underSampling) :  0.8235294117647058
All 카테고리 :  0.6862745098039216


In [None]:
balanced_model = load_model('SNU_LSTM_Model_경제_balance.h5')
balanced_score = balanced_model.evaluate(word_seq_train, y_train, batch_size=1)
model = load_model('SNU_LSTM_Model_경제.h5')
scores = model.evaluate(word_seq_train, y_train, batch_size=1)
all_model = load_model('SNU_LSTM_Model_All.h5')
all_scores = all_model.evaluate(word_seq_train, y_train, batch_size=1)

In [None]:
print("경제 balanced data LSTM %s: %.2f%%" %(balanced_model.metrics_names[1], scores[1]*100))
print("경제 data LSTM %s: %.2f%%" %(model.metrics_names[1], scores[1]*100))
print("전체 data LSTM %s: %.2f%%" %(all_model.metrics_names[1], all_scores[1]*100))