In [1]:
!pip install newspaper3k
!pip install tqdm



In [2]:
import requests
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm
import json
import requests
import my_settings
import pandas as pd
from datetime import datetime
import urllib
from tqdm import tqdm
from newspaper import Article, ArticleException

In [3]:
def get_news(link):
    text_list = []
    article = Article(link, language='ko')
    try:
        article.download()
        article.parse()
    except ArticleException:
        return []
    title = article.title
    text = article.text
    text = title + '\n' + text
    text = text.replace('\n', '')
    if len(text) >= 2000:
        for i in range(1000,len(text)):
            if text[i] == '.':
                break
            else:
                i += 1
        text_fron = text[0:i]
        text_back = text[i+1:] 
        text_list.append(text_fron)
        text_list.append(text_back)
    else:
        text_list.append(text)
    return text_list

In [4]:
# 텍스트가 들어있는 리스트에 대해 api를 활용해서 요약하는 함수 (5줄)

def summary_news(news_list):
    result_list = []
    summary_url = 'https://naveropenapi.apigw.ntruss.com/text-summary/v1/summarize'
    headers = {
            'Accept': 'application/json;UTF-8',
            'Content-Type': 'application/json;UTF-8',
            'X-NCP-APIGW-API-KEY-ID': client_id,
            'X-NCP-APIGW-API-KEY': client_secret
        }
    for text in news_list:
        # 짧은 기사는 요약 없이 그대로 감정 분석
        if len(text.split('.')) < 5:
            result_list.append(text)
            continue
        else:
            data_summary = {
                "document": {
                    "content": text
                },
                "option": {
                    "language": "ko",
                    "model": "general",
                    "summaryCount": 5
                }
            }
            # API 요청 보내기
            response = requests.post(summary_url, data=json.dumps(data_summary), headers=headers)

            # API 응답 받기
            result = response.json()

            # 오류 걸러내기
            if "error" in result:
                continue
            else:
            # 요약 결과 출력
                text_sum = result["summary"]
                result_list.append(text_sum)
    text = ''.join(result_list)
    return text

        

In [5]:
# 요약된 텍스트에 대해 감정 분석 진행하여 감정 딕셔너리 생성
def sent_news(news):
    sentiments = {'neutral' : 0, 'positive' : 0, 'negative' : 0}
    sentiment_url = 'https://naveropenapi.apigw.ntruss.com/sentiment-analysis/v1/analyze'
    headers = {
            'Accept': 'application/json;UTF-8',
            'Content-Type': 'application/json;UTF-8',
            'X-NCP-APIGW-API-KEY-ID': client_id,
            'X-NCP-APIGW-API-KEY': client_secret
        }
    data_sentiment = {
        "content": news,
        "config": {
            "negativeClassification": True
        }
    }
    # API 요청 보내기
    response_sentiment = requests.post(sentiment_url, data=json.dumps(data_sentiment), headers=headers)

    # API 응답 받기
    result = response_sentiment.json()

    # 긍정, 부정, 중립 확률값 업데이트
    sentiments["neutral"] += result['document']["confidence"]["neutral"]
    sentiments["positive"] += result['document']["confidence"]["positive"]
    sentiments["negative"] += result['document']["confidence"]["negative"]
    return sentiments


In [6]:
# 해당하는 월로 파일 수정
news = pd.read_csv('news2023.csv')
for col in news.columns:
    try:
        pd.to_datetime(col)
    except ValueError:
        del news[col]

In [7]:
news_dict = {}
for col in news.columns:
    news_dict[col] = news[col].dropna().tolist()

In [8]:
sorted_keys = sorted(news_dict.keys())
start_date = sorted_keys[0]
end_date = sorted_keys[-1]

In [9]:
news_dict_int = {int(key): value for key, value in news_dict.items()}

In [10]:
datelist = pd.date_range(start=start_date, end=end_date).tolist()
datelist = [int(date.strftime("%Y%m%d")) for date in datelist]

In [11]:
df = pd.DataFrame(index=datelist, columns = {'pos','neu','neg'})

In [12]:
# 같은 폴더에 my_settings.py 파일 있어야 됨 -> 노션 참고
client_id = my_settings.CLIENT_ID
client_secret = my_settings.CLIENT_SECRET

In [13]:
# 실제 작업 하는 코드 -> 오래 걸림 !!!
avg_links = 67.4689042
for date, links in tqdm(news_dict_int.items()):
    # 뉴스 개수 세기
    num_link = 0
    pos = 0
    neg = 0
    neu = 0
    for link in links:
        news_list = get_news(link)
        if news_list == []:
            continue
        else:
            news_sum = summary_news(news_list)
            news_sent_dic = sent_news(news_sum)
            num_link += 1
            pos += news_sent_dic['positive']
            neg += news_sent_dic['negative']
            neu += news_sent_dic['neutral']
    weight = 1 / avg_links
    df.loc[date]['pos'] = pos * weight * 0.01
    df.loc[date]['neg'] = neg * weight * 0.01
    df.loc[date]['neu'] = neu * weight * 0.01

100%|█████████████████████████████████████████| 77/77 [1:32:33<00:00, 72.12s/it]


In [14]:
sentiment = df.reset_index().rename(columns={'index': 'date'})
sentiment['date'] = pd.to_datetime(sentiment['date'], format='%Y%m%d')

In [15]:
stock = pd.read_csv('stock_macroeco.csv')

In [16]:
# 'date' 열을 날짜 자료형으로 변환
stock['date'] = pd.to_datetime(stock['date'])
sentiment['date'] = pd.to_datetime(sentiment['date'])

# 'date' 열의 값을 문자열 포맷으로 변경
stock['date'] = stock['date'].dt.strftime('%Y-%m-%d')
sentiment['date'] = sentiment['date'].dt.strftime('%Y-%m-%d')

# 두 데이터프레임 합치기
merged_df = pd.merge(stock, sentiment, on='date', how='inner')


In [17]:
merged_df

Unnamed: 0,date,CPI,interest,oil_price,gold_price,exchange_rate(w/d),KOSPI200(open),pos,neu,neg
0,2023-01-02,109.33,4.03,1343,74400,1267.30,292.90,0.35206,0.200951,0.395573
1,2023-01-03,109.36,4.00,1366,75150,1268.90,290.64,0.408955,0.260933,0.263876
2,2023-01-04,109.39,3.94,1364,75610,1272.10,288.34,0.45672,0.422835,0.276532
3,2023-01-05,109.41,3.89,1367,75960,1274.70,297.65,0.488509,0.202006,0.183961
4,2023-01-06,109.44,3.85,1369,75750,1270.30,296.69,0.433137,0.291671,0.223777
...,...,...,...,...,...,...,...,...,...,...
57,2023-03-27,110.54,3.54,1305,82290,1289.70,314.83,0.345938,0.304357,0.29829
58,2023-03-28,110.54,3.53,1287,81740,1300.30,314.15,0.4314,0.282461,0.264368
59,2023-03-29,110.55,3.53,1285,81950,1297.40,315.68,0.401818,0.32393,0.267302
60,2023-03-30,110.55,3.52,1287,81980,1299.30,318.51,0.355095,0.316063,0.381179


In [18]:
# 월 이름 바꿔서 저장해야 중복 X
merged_df.to_csv('2023_stock_macroeco_news.csv')