# 전처리

In [23]:
import json
import os
def save_to_json(data, filename):
    """Write ``data`` as pretty-printed UTF-8 JSON inside the ``data`` folder.

    The destination directory is created on demand, including any
    sub-directory that ``filename`` itself contains.

    :param data: JSON-serializable object to store.
    :param filename: file name (or relative path) below the ``data`` folder.
    :return: None; the written path is printed to stdout.
    """
    # Place the file under the data folder.
    filepath = os.path.join('data', filename)

    # exist_ok=True removes the check-then-create race of the previous
    # os.path.exists() test and also covers nested paths like "sub/x.json".
    os.makedirs(os.path.dirname(filepath), exist_ok=True)

    with open(filepath, 'w', encoding='utf-8') as f:
        # ensure_ascii=False keeps non-ASCII text readable in the file.
        json.dump(data, f, ensure_ascii=False, indent=4)

    print(f"데이터 저장: {filepath}")
    
    # JSON 데이터 로드 함수
def load_json(filepath):
    """Read the UTF-8 encoded JSON file at ``filepath`` and return its content.

    :param filepath: path of the JSON file to read.
    :return: the decoded JSON value (dict, list, str, number, bool or None).
    """
    with open(filepath, mode='r', encoding='utf-8') as source:
        parsed = json.loads(source.read())
    return parsed
    
def merge_values(data):
    """
    Merge each value of ``data`` into a single space-separated string.

    A value is expected to be a list of lists of strings; a list that holds
    plain strings directly is accepted as well. Strings that are empty or
    contain only whitespace are left out. Keys whose value is not a list are
    omitted from the result.

    :param data: dict, key: list of lists (e.g., { "key1": [["s1", "s2"], ["s3"]] })
                 or key: list of strings (e.g., { "key1": ["s1", "s2"] })
    :return: dict, key: merged string (e.g., { "key1": "s1 s2 s3" })
    """
    merged_data = {}
    for key, values in data.items():
        if not isinstance(values, list):  # only list values are merged
            continue

        sentences = []
        for item in values:
            # A bare string is one sentence; iterating over it would split
            # it into single characters.
            group = [item] if isinstance(item, str) else item
            sentences.extend(s for s in group if s.strip())

        merged_data[key] = " ".join(sentences)

    return merged_data

In [None]:
import time
from googleapiclient.discovery import build
import isodate


# Read the API key from the environment; abort early when it is not set.
API_KEY = os.environ.get("YOUTUBE_API_KEY")
if not API_KEY:
    _missing_key_message = "API 키가 설정되지 않았습니다. 환경 변수 'YOUTUBE_API_KEY'를 설정하세요."
    raise ValueError(_missing_key_message)

# Build the YouTube Data API v3 client used by the fetch functions.
youtube = build("youtube", "v3", developerKey=API_KEY)

# Category name -> category ID to collect.
CATEGORIES = {"News & Politics": "25"}

# 동영상 데이터 가져오기
def fetch_trending_videos(category_id, region_code="KR", max_results=200,
                          max_retries=3, retry_delay=5):
    """Collect most-popular videos of one category through the YouTube client.

    Pages through ``youtube.videos().list(chart="mostPopular", ...)`` using
    the module-level ``youtube`` client and keeps only videos whose duration
    is longer than 80 seconds.

    :param category_id: value sent as ``videoCategoryId``; also stored on
        every returned record.
    :param region_code: value sent as ``regionCode``.
    :param max_results: stop once this many videos have been kept.
    :param max_retries: number of consecutive failed requests that are
        retried before the function gives up and returns what it has.
    :param retry_delay: seconds to sleep before retrying a failed request.
    :return: list of dicts with the keys ``video_id``, ``title``,
        ``description``, ``tags``, ``duration``, ``view_count``,
        ``like_count``, ``comment_count`` and ``category_id``.
    """
    videos = []
    next_page_token = None
    consecutive_failures = 0

    while len(videos) < max_results:
        try:
            request = youtube.videos().list(
                part="snippet,statistics,contentDetails",
                chart="mostPopular",
                regionCode=region_code,
                videoCategoryId=category_id,
                maxResults=min(50, max_results - len(videos)),
                pageToken=next_page_token
            )
            response = request.execute()
        except Exception as e:
            print(f"Error fetching videos: {e}")
            consecutive_failures += 1
            # A persistent error (quota, bad key, ...) used to retry forever.
            if consecutive_failures > max_retries:
                print(f"Giving up after {consecutive_failures} consecutive errors.")
                break
            time.sleep(retry_delay)  # wait a moment, then retry the same page
            continue

        consecutive_failures = 0

        for item in response.get("items", []):
            # Parse each item on its own: one malformed item must not cause
            # the whole page to be requested (and appended) a second time.
            try:
                duration = isodate.parse_duration(item["contentDetails"]["duration"])
                duration_in_seconds = duration.total_seconds()  # convert to seconds
                record = {
                    "video_id": item["id"],
                    "title": item["snippet"]["title"],
                    "description": item["snippet"]["description"],
                    "tags": item["snippet"].get("tags", []),
                    "duration": str(duration),
                    "view_count": int(item["statistics"].get("viewCount", 0)),
                    "like_count": int(item["statistics"].get("likeCount", 0)),
                    "comment_count": int(item["statistics"].get("commentCount", 0)),
                    "category_id": category_id,
                }
            except (KeyError, TypeError, ValueError) as e:
                print(f"Skipping malformed video item: {e}")
                continue

            if duration_in_seconds > 80:  # keep only videos longer than 80 seconds
                videos.append(record)

        next_page_token = response.get("nextPageToken")
        if not next_page_token:
            break

    return videos



# Run: fetch the trending videos of every configured category.
all_videos = {}

for category_name, category_id in CATEGORIES.items():
    print(f"Fetching trending videos for category: {category_name}")
    videos = fetch_trending_videos(category_id, region_code="KR", max_results=200)
    all_videos[category_name] = videos
    print(f"비디오 {len(videos)} 개 카테고리: {category_name} fetch 완료.")

# Store all categories in a single JSON file (save_to_json writes it under data/).
output_file = "raw_video_data.json"
save_to_json(all_videos, output_file)
# The message previously ended with a stray apostrophe.
print(f"데이터 저장 : data/{output_file}")

# Sample output: the first five videos of each category.
for category, videos in all_videos.items():
    print(f"\nCategory: {category}")
    for video in videos[:5]:
        print(f" - {video['title']} ({video['video_id']}), 조회수: {video['view_count']} 회, 좋아요: {video['like_count']} 개")

Fetching trending videos for category: News & Politics
비디오 44 개 카테고리: News & Politics fetch 완료.
데이터 저장: data\raw_video_data.json
데이터 저장 : data/raw_video_data.json'

Category: News & Politics
 - 김어준의 겸손은힘들다 뉴스공장 2025년 2월 11일 화요일 [이재명, 신장식, 박범계, 박시동, 이광수, 패션공장] (merQIjiKU3s), 조회수: 1683911 회, 좋아요: 152676 개
 - "그날의 진짜 이야기 담긴 녹취록을!" "아니 잠깐, 채택 안 됐잖아요"...야심차게 꺼냈지만 재판관이 끊은 이유가  / SBS / 바로 이 뉴스 (U8KJo_RKeXs), 조회수: 2878389 회, 좋아요: 29013 개
 - [오늘 이 뉴스] "그때 법무실장이 안 말렸으면.." 재판관 앞 '무시무시한' 증언 (2025.02.06/MBC뉴스) (zYsXdEKSmaA), 조회수: 1905473 회, 좋아요: 44445 개
 - 대구 찾은 전한길, 헌재 겨냥 “제2의 을사오적” / 채널A / 뉴스 TOP10 (Oqv0A6dyxw0), 조회수: 651235 회, 좋아요: 33556 개
 - 범여권 지지율 1위 김문수 장관... 이재명 대표연설 듣고 일침 작렬 (7Clx1bGNyX0), 조회수: 521968 회, 좋아요: 27753 개


In [None]:
import json
import os
from googleapiclient.discovery import build


# 비디오 댓글 가져오기 함수
def get_video_comments(youtube, video_id, max_results=100):
    """Collect the text of top-level comments of one video.

    Pages through ``youtube.commentThreads().list(...)`` until
    ``max_results`` comments were gathered or no further page is offered.
    Any exception ends the collection for this video; the comments gathered
    so far are still returned.

    :param youtube: API client exposing ``commentThreads().list(...).execute()``.
    :param video_id: value sent as ``videoId``.
    :param max_results: upper bound on the number of comments to return.
    :return: list of ``textDisplay`` strings, in the order received.
    """
    comments = []
    next_page_token = None

    while len(comments) < max_results:
        try:
            request = youtube.commentThreads().list(
                part="snippet",
                videoId=video_id,
                maxResults=min(50, max_results - len(comments)),
                pageToken=next_page_token
            )
            response = request.execute()

            for item in response.get("items", []):
                comment = item["snippet"]["topLevelComment"]["snippet"]["textDisplay"]
                comments.append(comment)

            next_page_token = response.get("nextPageToken")
            if not next_page_token:
                break

        except Exception as e:
            print(f"Error fetching comments for video {video_id}: {e}")
            break

    # The orphaned "with open(filename, ...)" fragment that followed this
    # return was unreachable and referenced undefined names; it was removed.
    return comments

# Input file with the fetched videos and the output file name for the comments.
input_file_path = 'data/raw_video_data.json'
output_file_path = 'video_comments.json'

# Load the fetched video data (category -> list of video records).
data = load_json(input_file_path)

# Fetch up to 1000 comments per video, keyed by video ID.
all_comments = {}
for videos in data.values():
    for video in videos:
        video_id = video["video_id"]
        all_comments[video_id] = get_video_comments(youtube, video_id, max_results=1000)

# Store the result.
save_to_json(all_comments, output_file_path)
print(f"비디오 댓글이 '{output_file_path}'에 저장되었습니다.")

In [8]:
import re
from pykospacing import Spacing

def clean_text(text):
    """Strip links, mentions, hashtags, symbols and Latin letters from ``text``.

    The removals run in a fixed order; afterwards the text is trimmed and
    every run of two or more whitespace characters becomes a single space.

    :param text: raw comment string.
    :return: the cleaned string (Hangul syllables, digits and whitespace only).
    """
    removal_patterns = (
        r'http\S+|www\S+|@\S+|#\S+',   # URLs, @mentions, #hashtags
        r"[^가-힣a-zA-Z0-9\s]",         # anything but Hangul, Latin letters, digits, whitespace
        r"[a-zA-Z]",                   # remaining Latin letters
    )
    for pattern in removal_patterns:
        text = re.sub(pattern, '', text)

    return re.sub(r'\s{2,}', ' ', text.strip())

data = load_json('data/video_comments.json')

# Clean every comment; entries whose value is not a list are dropped.
processed_data = {
    key: [clean_text(value) for value in values]
    for key, values in data.items()
    if isinstance(values, list)
}

# Count the total number of sentences (comments).
total_sentences = sum(map(len, processed_data.values()))
print(f"총 문장 수: {total_sentences}")

save_to_json(processed_data, 'cleaned_video_comments.json')

총 문장 수: 30147
데이터 저장: data\cleaned_video_comments.json


# 토크나이저 생성 Soynlp

In [14]:
from soynlp.tokenizer import MaxScoreTokenizer
from soynlp.utils import DoublespaceLineCorpus
from soynlp.tokenizer import LTokenizer
from soynlp.word import WordExtractor

# Load the cleaned video comments
data = load_json('data/cleaned_video_comments.json')

# Flatten the comment lists of all videos into one corpus (one comment per item).
# The former bare `corpus[5]` expression was dropped: mid-cell it displays
# nothing, it raises IndexError on a corpus with fewer than six comments, and
# no DoublespaceLineCorpus object was ever created here.
corpus = [
    comment
    for values in data.values()
    if isinstance(values, list)
    for comment in values
]

# Create a WordExtractor, train it on the corpus and extract the word scores.
word_extractor = WordExtractor()
word_extractor.train(corpus)
word_scores = word_extractor.extract()

# Notebook display: the scores of one sample word.
word_scores["이재명"]


training was done. used memory 0.557 Gb
all cohesion probabilities was computed. # words = 27413
all branching entropies was computed # words = 37145
all accessor variety was computed # words = 37145


Scores(cohesion_forward=np.float64(0.3958368792415143), cohesion_backward=np.float64(0.22787631744074252), left_branching_entropy=3.6319164965071917, right_branching_entropy=3.3525386724289588, left_accessor_variety=115, right_accessor_variety=103, leftside_frequency=1502, rightside_frequency=66)

In [27]:
from soynlp.tokenizer import LTokenizer
from soynlp.noun import LRNounExtractor_v2

# Forward cohesion score of every extracted word.
cohesion_score = {word: score.cohesion_forward for word, score in word_scores.items()}

# Train the noun extractor; the result is used below as a mapping from noun
# to a score object exposing `.score`.
noun_extractor = LRNounExtractor_v2()
nouns = noun_extractor.train_extract(corpus)

# Noun score plus cohesion for nouns; plain cohesion for every other subword.
noun_scores = {noun: score.score for noun, score in nouns.items()}
combined_scores = {noun: score + cohesion_score.get(noun, 0)
    for noun, score in noun_scores.items()}
combined_scores.update(
    {subword: cohesion for subword, cohesion in cohesion_score.items()
    if subword not in combined_scores}
)

# Load the cleaned video comments
data = load_json('data/cleaned_video_comments.json')
# Only the LTokenizer is used; the MaxScoreTokenizer that used to be created
# first was overwritten before any use and has been removed.
tokenizer = LTokenizer(scores=combined_scores)

# Tokenize each comment
tokenized_comments = {}
for key, values in data.items():
    if not isinstance(values, list):  # skip anything that is not a comment list
        continue
    # Preview the second comment of the video. This used to run before the
    # type check and raised IndexError for videos with fewer than two comments.
    if len(values) > 1:
        print(values[1])
    tokenized_comments[key] = [tokenizer.tokenize(value) for value in values]

# Save the tokenized comments back to JSON
save_to_json(tokenized_comments, 'tokenized_video_comments.json')



[Noun Extractor] use default predictors
[Noun Extractor] num features: pos=3929, neg=2321, common=107
[Noun Extractor] counting eojeols
[EojeolCounter] n eojeol = 124921 from 30147 sents. mem=0.580 Gb                    
[Noun Extractor] complete eojeol counter -> lr graph
[Noun Extractor] has been trained. #eojeols=311011, mem=0.764 Gb
[Noun Extractor] batch prediction was completed for 41747 words
[Noun Extractor] checked compounds. discovered 22581 compounds
[Noun Extractor] postprocessing detaching_features : 30190 -> 23169
[Noun Extractor] postprocessing ignore_features : 23169 -> 22967
[Noun Extractor] postprocessing ignore_NJ : 22967 -> 22535
[Noun Extractor] 22535 nouns (22581 compounds) with min frequency=1
[Noun Extractor] flushing was done. mem=0.820 Gb                    
[Noun Extractor] 66.60 % eojeols are covered
그날 대한민국을 구한 시민들에게 경의를 표합니다 감사드립니다
이새끼는 희기종이네
한국군 제1군인곽종근
전한길 선생님은 애국자이십니다 감사합니다
남한내 중국인 전원미국처럼 추방조치 해야 한다 중국인 입국 불허조치 해야 한다
미국이 산업기반을 동맹국과는 다르게 해왔는데 그것을 모두 독식하기

In [28]:
from soynlp.noun import LRNounExtractor_v2
# Train the noun extractor on the corpus; the result is iterated as
# noun -> score object.
noun_extractor = LRNounExtractor_v2()
nouns = noun_extractor.train_extract(corpus)

noun_scores = {}
for noun, score in nouns.items():
    noun_scores[noun] = score.score

# Nouns get noun score + cohesion; every other subword keeps its cohesion.
combined_scores = {}
for noun, noun_score in noun_scores.items():
    combined_scores[noun] = noun_score + cohesion_score.get(noun, 0)
for subword, cohesion in cohesion_score.items():
    if subword not in combined_scores:
        combined_scores[subword] = cohesion

tokenizer = LTokenizer(scores=combined_scores)

[Noun Extractor] use default predictors
[Noun Extractor] num features: pos=3929, neg=2321, common=107
[Noun Extractor] counting eojeols
[EojeolCounter] n eojeol = 124921 from 30147 sents. mem=0.753 Gb                    
[Noun Extractor] complete eojeol counter -> lr graph
[Noun Extractor] has been trained. #eojeols=311011, mem=0.780 Gb
[Noun Extractor] batch prediction was completed for 41747 words
[Noun Extractor] checked compounds. discovered 22581 compounds
[Noun Extractor] postprocessing detaching_features : 30190 -> 23169
[Noun Extractor] postprocessing ignore_features : 23169 -> 22967
[Noun Extractor] postprocessing ignore_NJ : 22967 -> 22535
[Noun Extractor] 22535 nouns (22581 compounds) with min frequency=1
[Noun Extractor] flushing was done. mem=0.830 Gb                    
[Noun Extractor] 66.60 % eojeols are covered


# Konlpy

In [None]:
from konlpy.tag import Okt

okt = Okt()
data = load_json('data/cleaned_video_comments.json')

# Split every comment into morphemes with Okt; non-list entries are skipped.
tokenized_comments_okt = {
    key: [okt.morphs(comment) for comment in comments]
    for key, comments in data.items()
    if isinstance(comments, list)
}

# Write the tokenized comments to JSON.
save_to_json(tokenized_comments_okt, 'okt_tokenized_video_comments.json')





데이터 저장: data\okt_tokenized_video_comments.json


In [30]:
# Pull the nouns out of every comment with Okt; non-list entries are skipped.
nouns_comments_okt = {
    key: [okt.nouns(comment) for comment in comments]
    for key, comments in data.items()
    if isinstance(comments, list)
}

# Write the extracted nouns to JSON.
save_to_json(nouns_comments_okt, 'okt_nouns_video_comments.json')

데이터 저장: data\okt_nouns_video_comments.json


# Kiwi

In [12]:
from kiwipiepy import Kiwi

# Create the Kiwi analyzer.
kiwi = Kiwi()

# Load the cleaned video comments.
data = load_json('data/cleaned_video_comments.json')

# Gather the comments of all videos into one flat corpus.
corpus = []
for comments in data.values():
    if isinstance(comments, list):
        corpus += comments

# Let Kiwi extract words from the corpus and add them; as the last
# expression of the cell, the returned value is displayed by the notebook.
kiwi.extract_add_words(corpus)





[('부정선거', 0.7329921722412109, 713, -2.150365114212036),
 ('곽종근', 0.6221180558204651, 571, -2.5893051624298096),
 ('내란수괴', 0.41759631037712097, 161, -2.3843839168548584),
 ('문형배', 0.40476036071777344, 578, -2.8553996086120605),
 ('윤석열대통령', 0.40355342626571655, 139, 0.11226491630077362),
 ('우리법연구회', 0.40248537063598633, 36, -2.949073553085327),
 ('사전투표', 0.40150853991508484, 91, -1.8540213108062744),
 ('계엄선포', 0.37189987301826477, 38, -0.5901609063148499),
 ('석열', 0.36449238657951355, 1481, -2.452401876449585),
 ('윤대통령', 0.3614285886287689, 469, 0.24186985194683075),
 ('그라운드씨', 0.34617775678634644, 40, -1.5572603940963745),
 ('렉스턴', 0.34515896439552307, 25, -2.0683038234710693),
 ('사법리스크', 0.34044161438941956, 13, -1.316566824913025),
 ('대통령탄핵', 0.333201140165329, 28, -1.916110873222351),
 ('더불어공산당', 0.33175933361053467, 21, -2.059748649597168),
 ('국민들', 0.31844037771224976, 1041, -2.1782732009887695),
 ('구준엽씨', 0.3088982105255127, 51, -2.8865554332733154),
 ('홍장원', 0.3052136301994324, 3

In [None]:
# Tokenize with Kiwi and keep nouns only.
_NOUN_TAGS = ["NNG", "NNP"]
tokenized_nouns = {}

for key, values in data.items():
    if not isinstance(values, list):  # only comment lists are processed
        continue

    nouns_per_comment = []
    for value in values:
        if not value:  # skip empty strings
            continue
        # Keep noun tokens whose surface form is longer than one character.
        nouns_per_comment.append([
            token.form
            for token in kiwi.tokenize(value)
            if token.tag in _NOUN_TAGS and len(token.form) > 1
        ])
    tokenized_nouns[key] = nouns_per_comment

# Store the result as JSON.
save_to_json(tokenized_nouns, 'kiwi_nouns_video_comments.json')

SyntaxError: invalid syntax (1630096044.py, line 10)

# KRWordRank

In [None]:
# Load the JSON stored as kiwi_nouns_video_comments.json
data = load_json('data/kiwi_nouns_video_comments.json')

    
# Merge each key's nested string lists into one space-separated string
processed_data = merge_values(data)

# Store the merged strings as kiwi_data.json
save_to_json(processed_data,'kiwi_data.json')
     

데이터 저장: data\kiwi_data.json


In [None]:
from krwordrank.word import KRWordRank

# KRWordRank settings.
min_count = 10   # minimum word frequency (used when building the graph)
max_length = 10  # maximum word length
beta = 0.85      # PageRank damping factor
max_iter = 10    # number of iterations

wordrank_extractor = KRWordRank(min_count=min_count, max_length=max_length)

data = load_json('data/kiwi_data.json')

# One document per key of the loaded file.
documents = list(data.values())

keywords, rank, graph = wordrank_extractor.extract(documents, beta, max_iter)

# Ten keywords with the highest score, best first.
ranked_keywords = sorted(keywords.items(), key=lambda pair: pair[1], reverse=True)
top_keywords = ranked_keywords[:10]


In [None]:
# Run the extraction again on the same documents and settings.
keywords, rank, graph = wordrank_extractor.extract(documents, beta, max_iter)

# Ten keywords with the highest score, best first.
ranked_keywords = sorted(keywords.items(), key=lambda pair: pair[1], reverse=True)
top_keywords = ranked_keywords[:10]


In [22]:
# Display the ten highest-scoring (keyword, score) pairs selected above.
top_keywords

[('국민', 23.804733480548844),
 ('대통령', 23.177865811936364),
 ('나라', 17.927918897117337),
 ('사람', 17.014688347002217),
 ('이재명', 14.159581424001649),
 ('민주당', 13.578475480755616),
 ('탄핵', 11.904426635750083),
 ('윤석열', 11.296851125747969),
 ('중국', 11.097538875608626),
 ('내란', 10.85458190281492)]

# -------------------------------

In [None]:
# Load the JSON stored as merged_kiwi_video_comments.json
data = load_json('data/merged_kiwi_video_comments.json')

# Join each key's list of strings into one space-separated string, leaving out
# empty or whitespace-only strings; entries that are not lists are dropped.
processed_data = {
    key: " ".join(s for s in values if s.strip())
    for key, values in data.items()
    if isinstance(values, list)
}

save_to_json(processed_data,'kiwi_data.json')