In [None]:
# Mount Google Drive at /content/drive so later cells can read files stored there.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Install the third-party packages that the import cell below relies on.
!pip install faiss-gpu
!pip install sentence-transformers
!pip install kss
!pip install bertopic



In [None]:
import numpy as np
import faiss
import pandas as pd
from sentence_transformers import SentenceTransformer
import kss
from bertopic import BERTopic
import matplotlib.pyplot as plt

# Similarity search
def find_similar_news(target_title, model, top_k=100):
    """Return the stored news rows most similar to ``target_title``.

    The news table and a table of precomputed embeddings are read from fixed
    Excel paths on the mounted Drive. Every ``summary_embedding`` cell is
    parsed from its bracketed, space-separated text form into a float32
    vector, the vectors are L2-normalised and placed in an inner-product
    FAISS index, and the encoded ``target_title`` is used as the query.

    Args:
        target_title: Text to encode with ``model`` and search for.
        model: Encoder exposing ``encode(..., normalize_embeddings=True,
            convert_to_tensor=True)`` (a SentenceTransformer in this notebook).
        top_k: Maximum number of rows to return. It is capped at the number
            of stored embeddings so FAISS never pads the result with ``-1``.

    Returns:
        pandas.DataFrame: the selected rows of the news table, most similar
        first, with an extra ``similarity`` column holding the FAISS score.
    """
    db_news = pd.read_excel('/content/drive/MyDrive/Colab Notebooks/(찐0613) 뉴스 데이터_요약추가.xlsx')  # adjust to where the file lives
    db_title_embedding = pd.read_excel('/content/drive/MyDrive/Colab Notebooks/(0613) 요약 임베딩.xlsx')  # adjust to where the file lives

    # Each cell is text such as "[0.1 0.2 ...]": drop the surrounding brackets
    # and parse the whitespace-separated numbers.
    db_title_embeddings = np.array(
        [
            np.fromstring(cell[1:-1], dtype=np.float32, sep=' ')
            for cell in db_title_embedding['summary_embedding']
        ],
        dtype=np.float32,
    )

    target_title_embedding = model.encode(
        target_title,
        normalize_embeddings=True,
        convert_to_tensor=True).cpu().numpy()

    # Inner product on L2-normalised vectors; the index dimension is taken
    # from the loaded embeddings instead of being hard-coded.
    index = faiss.IndexFlatIP(db_title_embeddings.shape[1])
    faiss.normalize_L2(db_title_embeddings)
    index.add(db_title_embeddings)

    # Asking for more neighbours than stored vectors makes FAISS return -1
    # labels, which iloc would silently resolve to the last row.
    top_k = min(top_k, index.ntotal)

    distances, indices = index.search(
        np.expand_dims(target_title_embedding, axis=0),
        top_k
    )

    # NOTE(review): this assumes row i of the embedding file describes row i
    # of the news file -- confirm both files share the same row order.
    similar_news = db_news.iloc[indices[0]].copy()  # copy so the new column is not set on a slice
    similar_news['similarity'] = distances[0]

    return similar_news


  from tqdm.autonotebook import tqdm, trange


In [None]:
# Paragraph builder
def split_into_paragraphs(article, sentences_per_paragraph=3):
    """Group the sentences of ``article`` into fixed-size paragraphs.

    The text is split into sentences with ``kss.split_sentences``. Sentences
    of 15 characters or fewer are skipped; the remaining ones are joined with
    a single space, ``sentences_per_paragraph`` at a time.

    Args:
        article: Article text. Anything that is not a non-blank string
            (for example a NaN cell read from Excel) yields an empty list.
        sentences_per_paragraph: Number of kept sentences per paragraph.

    Returns:
        list[str]: the paragraphs in article order. A final group with fewer
        than ``sentences_per_paragraph`` sentences is included as well.
    """
    if not isinstance(article, str) or not article.strip():
        return []

    paragraphs = []
    paragraph = []

    for sentence in kss.split_sentences(article):
        if len(sentence) > 15:
            paragraph.append(sentence)
        if len(paragraph) == sentences_per_paragraph:
            paragraphs.append(" ".join(paragraph))
            paragraph = []

    # Flush the trailing partial group. The previous guard compared the number
    # of sentences in the list against 15, so this tail was always discarded.
    if paragraph:
        paragraphs.append(" ".join(paragraph))

    return paragraphs

In [None]:
# Clustering
def clustering(target_article, similar_news, user_gender=None, user_age=None):
    """Pick an article from ``similar_news`` whose topic is far from the target's.

    The target article and every candidate article are split into paragraphs
    with ``split_into_paragraphs``; all paragraphs are clustered together with
    BERTopic. The target's dominant topic is the most frequent topic among its
    paragraphs. Topics are then ranked by L2 distance between topic embeddings
    (farthest first) and each candidate article is represented by its
    farthest-topic paragraph.

    Args:
        target_article: Text of the article the user read.
        similar_news: DataFrame with at least an ``article`` column; ``gender``
            and ``age`` columns are read when a demographic filter is given.
        user_gender: Optional value matched against ``similar_news['gender']``.
        user_age: Optional value matched against ``similar_news['age']``.

    Returns:
        pandas.Series or None: the chosen row of ``similar_news``. When a
        demographic value is given, the farthest article whose gender OR age
        matches is preferred; otherwise (or if none matches) the farthest
        article overall is returned. ``None`` is returned, after printing a
        message, when no usable topic or candidate exists.
    """
    target_paragraphs = split_into_paragraphs(target_article)
    candidate_paragraphs = similar_news['article'].apply(split_into_paragraphs)

    # One row per paragraph; 'index' is the article's label in similar_news.
    paragraph_data = pd.DataFrame(
        data=[
            [article_label, text]
            for article_label, texts in candidate_paragraphs.items()
            for text in texts
        ],
        columns=['index', 'paragraph']
    )

    # Target paragraphs carry the sentinel article label -1.
    target_paragraph_data = pd.DataFrame(
        data=[[-1, text] for text in target_paragraphs],
        columns=['index', 'paragraph']
    )

    if target_paragraph_data.empty or paragraph_data.empty:
        print('토픽이 없어요ㅠ')
        return None

    # Target paragraphs first, candidates after, so topics can be split back
    # by position below.
    documents = target_paragraph_data['paragraph'].tolist() + paragraph_data['paragraph'].tolist()

    model = BERTopic(embedding_model='bongsoo/kpf-sbert-128d-v1', min_topic_size=5)
    topics, _ = model.fit_transform(documents=documents)
    topics = np.asarray(topics)

    # Attach topics positionally. Merging on the paragraph text instead
    # duplicates rows whenever the same text occurs more than once (e.g. the
    # target article is also present among the candidates).
    n_target = len(target_paragraph_data)
    target_paragraph_data = target_paragraph_data.assign(topic=topics[:n_target])
    paragraph_data = paragraph_data.assign(topic=topics[n_target:])

    # Keep topic ids > 0 only, which drops BERTopic's outlier topic (-1) and topic 0.
    # NOTE(review): topic 0 is an ordinary BERTopic topic -- confirm that
    # excluding it is intended.
    target_paragraph_data = target_paragraph_data[target_paragraph_data['topic'] > 0]
    paragraph_data = paragraph_data[paragraph_data['topic'] > 0]

    if len(target_paragraph_data) == 0:
        print('토픽이 없어요ㅠ')
        return None

    # topic_embeddings_ holds one row per topic in sorted topic order, with the
    # outlier topic first only when it exists; skip that row only in that case
    # so that row t corresponds to topic t.
    outlier_offset = 1 if (topics == -1).any() else 0
    topic_embeddings = np.ascontiguousarray(
        model.topic_embeddings_[outlier_offset:], dtype=np.float32
    )

    target_topic = target_paragraph_data['topic'].value_counts().idxmax()
    target_topic_embedding = topic_embeddings[target_topic]

    index = faiss.IndexFlatL2(topic_embeddings.shape[1])
    index.add(topic_embeddings)

    _, indices = index.search(
        np.expand_dims(target_topic_embedding, axis=0),
        len(topic_embeddings)
    )

    # FAISS returns nearest first; reverse so the farthest topic comes first,
    # then drop topic 0, which was filtered out of the paragraphs above.
    ranked_topics = indices[0][::-1]
    ranked_topics = ranked_topics[ranked_topics != 0]

    paragraph_data = paragraph_data.assign(
        topic=pd.Categorical(paragraph_data['topic'], categories=ranked_topics, ordered=True)
    ).sort_values('topic', kind='stable')

    # Keep each article's farthest-topic paragraph, then discard articles
    # whose remaining paragraph belongs to the target's own topic.
    paragraph_data = paragraph_data.drop_duplicates(subset='index')
    paragraph_data = paragraph_data[paragraph_data['topic'] != target_topic]

    article_index = paragraph_data['index'].unique()  # farthest topic first

    if len(article_index) == 0:
        print('추천할 뉴스가 없어요ㅠ')
        return None

    # Prefer the farthest article that matches the user's gender or age.
    # Walk article_index itself so the topic-distance order is kept.
    if user_gender or user_age:
        demographic_match = (similar_news['gender'] == user_gender) | (similar_news['age'] == user_age)
        matching_labels = set(similar_news.index[demographic_match])
        for article_label in article_index:
            if article_label in matching_labels:
                return similar_news.loc[article_label]

    # Fall back to the article with the farthest topic overall.
    return similar_news.loc[article_index[0]]



In [None]:
def recommend_news(user_id, model):
    """Recommend one news article for the user identified by ``user_id``.

    The news/user table is read from a fixed Excel path. The first row whose
    ``user id`` matches supplies the article the user read and the user's
    gender and age. News similar to that article's title is retrieved with
    ``find_similar_news`` and the final pick is delegated to ``clustering``.

    Args:
        user_id: Identifier to look up; compared as a whitespace-stripped string.
        model: Sentence encoder forwarded to ``find_similar_news``.

    Returns:
        Whatever ``clustering`` returns, or ``None`` (after printing a message)
        when ``user_id`` is not present in the table.
    """
    news_table = pd.read_excel('/content/drive/MyDrive/Colab Notebooks/(찐0613) 뉴스 데이터_요약추가.xlsx')

    # Compare IDs as whitespace-stripped strings on both sides.
    user_id = str(user_id).strip()
    news_table['user id'] = news_table['user id'].astype(str).str.strip()

    user_rows = news_table[news_table['user id'] == user_id]
    if user_rows.empty:
        print(f"User ID {user_id} does not exist.")
        return None

    # The first matching row describes the article the user has read.
    viewed = user_rows.iloc[0]
    viewed_title, viewed_article, gender, age = (
        viewed[column] for column in ('title', 'article', 'gender', 'age')
    )

    # Retrieve similar news, then choose one offering a different perspective.
    candidates = find_similar_news(viewed_title, model, 100)
    return clustering(viewed_article, candidates, gender, age)

# Sentence-embedding model passed to recommend_news in the next cell.
model = SentenceTransformer('bongsoo/kpf-sbert-128d-v1')

In [None]:
#user_id = 'example_user_id'  # enter an example user ID
# Run the recommendation for user '1' and print the returned row.
user_id = '1'
recommended_news = recommend_news(user_id, model)
print(recommended_news)




media                                                       YTN
reporter                               이경국(leekk0428@ytn.co.kr)
title                           이준석 vs 안철수' 신경전 격화...출구 없는 與 내홍
link          https://n.news.naver.com/mnews/article/052/000...
article       [앵커]국민의힘 이준석 대표와 안철수 의원이 연일 충돌하며 거센 신경전을 벌이고 있...
summary       ['여기에 이 대표와 친윤계의 갈등까지 계속되면서 여당 내 갈등은 좀처럼 출구를 찾...
user id                                                     142
gender                                                       남성
age                                                         40s
similarity                                             0.484413
Name: 141, dtype: object
