In [None]:
!pip install konlpy # install KoNLPy (needed when running on Colab)

In [None]:
import pandas as pd
import re
from konlpy.tag import Okt
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from tqdm import tqdm
from google.colab import drive, files

In [None]:
# Mount Google Drive so the CSV files under MyDrive can be read from Colab.
drive.mount('/content/drive')
# News rows; the cells below read the '날짜' and '뉴스제목' columns of this frame.
df = pd.read_csv('/content/drive/MyDrive/stocker/daum_news_samsung.csv')
# Stock rows; cleaned and merged with the sentiment scores in later cells.
df_stock = pd.read_csv('/content/drive/MyDrive/stocker/samsung_stock.csv')
# Parse '날짜' as datetimes and keep only the calendar-date part in 'date'.
df['date'] = pd.to_datetime(df['날짜']).dt.date

# Text-cleaning helper
def clean_text(text):
    """Remove symbols from ``text`` and normalise its whitespace.

    Only Hangul syllables, ASCII letters, digits and whitespace are kept;
    runs of whitespace are collapsed to a single space and the result is
    stripped.

    Args:
        text: Value to clean. Non-string values are converted with ``str``.
            ``None`` and float NaN (pandas' marker for a missing cell) are
            treated as empty text.

    Returns:
        The cleaned string, or ``''`` for a missing value.
    """
    # Without this guard a missing title is stringified to 'None' / 'nan'
    # and would be handled downstream as if it were a real headline.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = re.sub(r'[^가-힣a-zA-Z0-9\s]', '', str(text))
    return re.sub(r'\s+', ' ', text).strip()

df['clean_title'] = df['뉴스제목'].apply(clean_text) # clean the news titles

print(df.head(5))

In [None]:
okt = Okt() # use the Okt analyzer for now
# Tokens that preprocess() discards after morphological analysis.
stopwords = {'은', '는', '이', '가', '하', '의', '에', '을', '를', '도', '과', '와', '으로', '로', '에서', '하다', '고'}

def preprocess(text):
    """Tokenise ``text`` with Okt and return the kept tokens joined by spaces.

    Tokens come from ``okt.morphs(text, stem=True)``. A token is kept only
    when it is longer than one character and is not listed in ``stopwords``.
    """
    kept = []
    for token in okt.morphs(text, stem=True):
        if len(token) > 1 and token not in stopwords:
            kept.append(token)
    return ' '.join(kept)

# Run the Okt-based preprocessing over every cleaned title.
df['processed_title'] = df['clean_title'].apply(preprocess)

print(df.head(5))

In [None]:
# Use the sentiment-classification fine-tune of KR-FinBert. The base
# checkpoint has no trained classification head, so loading it through
# AutoModelForSequenceClassification attaches a randomly initialised head
# whose predictions carry no meaning.
model_name = "snunlp/KR-FinBert-SC"
tokenizer = AutoTokenizer.from_pretrained(model_name)  # matching tokenizer
model = AutoModelForSequenceClassification.from_pretrained(model_name)
model.eval()  # inference only: make sure dropout is disabled

# Class index -> label name used by predict_sentiment().
# NOTE(review): confirm this order against model.config.id2label.
label_map = {0: 'negative', 1: 'neutral', 2: 'positive'}
assert model.config.num_labels == len(label_map), (
    "classification head size does not match label_map"
)

In [None]:
# Sentiment prediction helper
def predict_sentiment(text):
    """Classify ``text`` with the loaded model.

    Args:
        text: The string to classify. It is tokenised with truncation at
            128 tokens.

    Returns:
        A ``(label, confidence)`` tuple: the ``label_map`` name of the
        highest-probability class and that class's softmax probability as
        a Python float.
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=128, padding=True)
    # Pure inference: skip autograd bookkeeping so no graph is built or
    # retained for every headline.
    with torch.no_grad():
        outputs = model(**inputs)
        probs = torch.nn.functional.softmax(outputs.logits, dim=-1)  # logits -> probabilities
    conf, pred_label = torch.max(probs, dim=1)
    return label_map[pred_label.item()], conf.item()

def get_sentiment_score(text):
    """Return a signed sentiment score for ``text``.

    Blank (empty or whitespace-only) text scores 0 without calling the
    model. Otherwise the confidence from ``predict_sentiment`` is returned
    as-is for a 'positive' label, negated for 'negative', and replaced by 0
    for any other label.
    """
    stripped = text.strip()
    if not stripped:
        return 0

    label, conf = predict_sentiment(text)

    # Direction of the score per label; labels not listed here score 0.
    sign = {'positive': 1, 'negative': -1}.get(label)
    if sign is None:
        return 0
    return sign * conf

In [None]:
# progress_apply is only available on pandas objects after tqdm has
# registered itself; without this call the next line raises AttributeError.
tqdm.pandas()
df['sentiment_score'] = df['processed_title'].progress_apply(get_sentiment_score)

# Average the scores per calendar day. Grouping on the raw '날짜' value
# would split one day into several groups whenever the value carries a
# time of day, which later produces duplicate dates in the merge.
news_day = pd.to_datetime(df['날짜']).dt.normalize()
daily_sentiment = (
    df.groupby(news_day)
    .agg(avg_sentiment=('sentiment_score', 'mean'))
    .reset_index()  # the group key comes back as the '날짜' column
)

print(daily_sentiment.head())

In [None]:
print(df_stock.isnull().sum())
df_stock = df_stock.dropna()  # drop rows that contain missing values

df_stock = df_stock[df_stock['거래량'] > 0]  # drop rows with zero trading volume

# The date column is addressed as 'Date' here but as '날짜' after the rename
# performed before the merge; accept either so the cell also works on re-run.
# NOTE(review): confirm the actual column names in samsung_stock.csv.
stock_date_col = 'Date' if 'Date' in df_stock.columns else '날짜'
df_stock = df_stock.sort_values(stock_date_col)
df_stock['Close_Change_Rate'] = df_stock['종가'].pct_change() * 100  # day-over-day change in %

# Preview only columns that exist at this point; selecting a missing label
# (e.g. '날짜' before the rename) raises KeyError.
preview_cols = [
    col
    for col in [stock_date_col, '종가', '거래량', 'Close_Change_Rate', '변동률(%)', '상승 여부']
    if col in df_stock.columns
]
print(df_stock[preview_cols].head(10))

In [None]:
# Drop repeated column labels, keeping the first occurrence of each.
df_stock = df_stock.loc[:, ~df_stock.columns.duplicated()]

# Use '날짜' as the date column name in both frames.
daily_sentiment = daily_sentiment.rename(columns={'Date': '날짜'})
df_stock = df_stock.rename(columns={'Date': '날짜'})

# Parse the dates and normalise them to midnight so the join keys compare
# equal regardless of any time-of-day component.
daily_sentiment['날짜'] = pd.to_datetime(daily_sentiment['날짜']).dt.normalize()
df_stock['날짜'] = pd.to_datetime(df_stock['날짜']).dt.normalize()

# Inner join on the date: only days present in both frames are kept.
# The right-hand frame must be passed explicitly; pd.merge has no default
# for it. Because the key has the same name on both sides it comes back as
# a single '날짜' column, so no suffixed date columns need cleaning up.
df_merged = pd.merge(
    df_stock,
    daily_sentiment,
    on='날짜',
    how='inner',
    suffixes=('_stock', '_sentiment')
)

# Rename the aggregated score column and fill missing scores with 0.
df_merged = df_merged.rename(columns={'avg_sentiment': '감성 점수'})
df_merged['감성 점수'] = df_merged['감성 점수'].fillna(0)

# Recompute the percentage change of the close over the merged rows; the
# first row has no predecessor and is set to 0.
df_merged = df_merged.sort_values('날짜')
df_merged['변동률(%)'] = df_merged['종가'].pct_change() * 100
df_merged['변동률(%)'] = df_merged['변동률(%)'].fillna(0)


In [None]:
# Inspect the first rows and the column labels of the merged frame.
print(df_merged.head())
print(df_merged.columns)

# Write the merged frame to CSV without the index, then hand the file to
# google.colab's download helper.
df_merged.to_csv('data_preprocessing.csv', index=False)
files.download('data_preprocessing.csv')