In [40]:
import googleapiclient.discovery
from googleapiclient.errors import HttpError

import pandas as pd
import time
import tqdm

In [41]:
# YouTube API key
API_KEY = "API"
youtube = googleapiclient.discovery.build("youtube", "v3", developerKey=API_KEY)

In [42]:
# Get video ids for query. Youtube API allows only up to 50 videos

def get_video_ids(query, max_results=100):
    video_ids = []
    results_per_page = 50  # YouTube API maxResults 
    pages = (max_results + results_per_page - 1) // results_per_page  # calculate #pages
    next_page_token = None
    
    for _ in range(pages): # call api as many times as #pages
        try:
            request = youtube.search().list(
                q=query,
                part="snippet",
                maxResults=results_per_page,
                type="video",
                pageToken=next_page_token
            )
            response = request.execute()      

            # Only check if 'id' key exists and 'videoId' is accessible
            for item in response['items']:
                if isinstance(item, dict) and 'id' in item and 'videoId' in item['id']:
                    video_ids.append(item['id']['videoId'])

            next_page_token = response.get('nextPageToken')
            if not next_page_token:
                break

        except HttpError as e:
            error_reason = e.resp.get('reason')
            if error_reason == 'quotaExceeded':
                print("Quota exceeded. Saving collected data...")
                save_data_to_csv(video_comments)
                exit()
            else:
                print(f"An error occurred: {e}")
                
    return video_ids[:max_results]

In [43]:
# Get comments for 1 video. Youtube API allows only up to 100 comments per video
def get_top_korean_comments(video_id, max_results=100):
    comments = []
    try:
        request = youtube.commentThreads().list(
            part="snippet",
            videoId=video_id,
            maxResults=max_results,
            textFormat="plainText"
        )
        response = request.execute()

        for item in response['items']:
            comment = item['snippet']['topLevelComment']['snippet']['textDisplay']
            comments.append(comment)  
                
    except HttpError as e:
        error_reason = e.resp.get('reason')
        if error_reason == 'commentsDisabled':
            print(f"Comments are disabled for video {video_id}. Skipping.")
        else:
            print(f"An error occurred: {e}")
    
    return comments

In [44]:
# Make data to dataframe
# video_comments looks like: {"4DUYBXdUYzA": ["와 재밌다", "재미없다", ]}
def save_data_to_csv(video_comments):    
    
    data = {"Video_ID": [], "Comment": []}
    
    for video_id, comments in video_comments.items():
        for comment in comments:
            data["Video_ID"].append(video_id)
            data["Comment"].append(comment)

    df = pd.DataFrame(data)
    
    # Export to CSV 
    df.to_csv("youtube_comments.csv", index=False)

In [62]:
participants = ["챗지피티","챗GPT","cursor","커서","AI","노아","크롤링","crawling","생성형","자동화","클로드","무료","유료"]

In [63]:
video_comments = {}

start = time.time()
query_baisic = "인공지능"

for participant in tqdm.tqdm(participants):
    query = query_baisic + " " + participant 

    try:
        video_ids = get_video_ids(query, max_results=50) 

        for video_id in video_ids:
            comments = get_top_korean_comments(video_id)
            video_comments[video_id] = comments
    except HttpError as e:
        if e.resp.get('reason') == 'quotaExceeded':
            print("Quota exceeded. Saving collected data...")
            save_data_to_csv(video_comments)
            exit()

    end = time.time()    
    print(f"{end - start}s for query: {query}")    

save_data_to_csv(video_comments)

  8%|▊         | 1/13 [00:05<01:08,  5.69s/it]

5.691386699676514s for query: 인공지능 챗지피티


 15%|█▌        | 2/13 [00:10<00:53,  4.91s/it]

10.05193042755127s for query: 인공지능 챗GPT


 23%|██▎       | 3/13 [00:12<00:38,  3.85s/it]

12.63810920715332s for query: 인공지능 cursor


 31%|███       | 4/13 [00:14<00:28,  3.15s/it]

14.728315114974976s for query: 인공지능 커서


 38%|███▊      | 5/13 [00:18<00:27,  3.38s/it]

18.52113914489746s for query: 인공지능 AI
An error occurred: <HttpError 403 when requesting https://youtube.googleapis.com/youtube/v3/commentThreads?part=snippet&videoId=ByrLa-fi2yo&maxResults=100&textFormat=plainText&key=AIzaSyA6bcM_TMKH2wRNO7QX78lElGSH4z-v3VM&alt=json returned "The video identified by the <code><a href="/youtube/v3/docs/commentThreads/list#videoId">videoId</a></code> parameter has disabled comments.". Details: "[{'message': 'The video identified by the <code><a href="/youtube/v3/docs/commentThreads/list#videoId">videoId</a></code> parameter has disabled comments.', 'domain': 'youtube.commentThread', 'reason': 'commentsDisabled', 'location': 'videoId', 'locationType': 'parameter'}]">


 46%|████▌     | 6/13 [00:21<00:22,  3.17s/it]

21.28913164138794s for query: 인공지능 노아
An error occurred: <HttpError 403 when requesting https://youtube.googleapis.com/youtube/v3/commentThreads?part=snippet&videoId=j1NgntvN6KA&maxResults=100&textFormat=plainText&key=AIzaSyA6bcM_TMKH2wRNO7QX78lElGSH4z-v3VM&alt=json returned "The video identified by the <code><a href="/youtube/v3/docs/commentThreads/list#videoId">videoId</a></code> parameter has disabled comments.". Details: "[{'message': 'The video identified by the <code><a href="/youtube/v3/docs/commentThreads/list#videoId">videoId</a></code> parameter has disabled comments.', 'domain': 'youtube.commentThread', 'reason': 'commentsDisabled', 'location': 'videoId', 'locationType': 'parameter'}]">


 54%|█████▍    | 7/13 [00:23<00:17,  2.88s/it]

23.555012226104736s for query: 인공지능 크롤링
An error occurred: <HttpError 403 when requesting https://youtube.googleapis.com/youtube/v3/commentThreads?part=snippet&videoId=Cb3ko_Ij99Q&maxResults=100&textFormat=plainText&key=AIzaSyA6bcM_TMKH2wRNO7QX78lElGSH4z-v3VM&alt=json returned "The video identified by the <code><a href="/youtube/v3/docs/commentThreads/list#videoId">videoId</a></code> parameter has disabled comments.". Details: "[{'message': 'The video identified by the <code><a href="/youtube/v3/docs/commentThreads/list#videoId">videoId</a></code> parameter has disabled comments.', 'domain': 'youtube.commentThread', 'reason': 'commentsDisabled', 'location': 'videoId', 'locationType': 'parameter'}]">
An error occurred: <HttpError 403 when requesting https://youtube.googleapis.com/youtube/v3/commentThreads?part=snippet&videoId=2DLzoAat-5Y&maxResults=100&textFormat=plainText&key=AIzaSyA6bcM_TMKH2wRNO7QX78lElGSH4z-v3VM&alt=json returned "The video identified by the <code><a href="/youtube/

 62%|██████▏   | 8/13 [00:26<00:13,  2.76s/it]

26.073259353637695s for query: 인공지능 crawling


 69%|██████▉   | 9/13 [00:28<00:11,  2.75s/it]

28.799410104751587s for query: 인공지능 생성형


 77%|███████▋  | 10/13 [00:31<00:08,  2.76s/it]

31.592339754104614s for query: 인공지능 자동화


 85%|████████▍ | 11/13 [00:33<00:05,  2.64s/it]

33.959898233413696s for query: 인공지능 클로드


 92%|█████████▏| 12/13 [00:39<00:03,  3.63s/it]

39.85611128807068s for query: 인공지능 무료


100%|██████████| 13/13 [00:43<00:00,  3.35s/it]

43.51683044433594s for query: 인공지능 유료





## Merge youtube_comments with movie_rating_dataset

In [64]:
comments = pd.read_csv("youtube_comments.csv")

In [None]:
import urllib.request
# download naver movie ratings dataset
urllib.request.urlretrieve("https://raw.githubusercontent.com/e9t/nsmc/master/ratings.txt", filename="ratings.txt")

In [None]:
movie_data = pd.read_table('ratings.txt')
movie_data.head()

In [None]:
comments.head()

In [None]:
print(f"movie data length: {len(movie_data)}")
print(f"comments data length: {len(comments)}")

In [None]:
# Merge two dataset because number of Comments dataset is not big enough to train word vectors.
df1_text = movie_data[['document']].rename(columns={'document': 'text'})
df2_text = comments[['Comment']].rename(columns={'Comment': 'text'})

# merge movie_data and yt_comments_data
merged_df = pd.concat([df1_text, df2_text], ignore_index=True)
merged_df

In [None]:
# NULL check
print(merged_df.isnull().values.any())

In [None]:
merged_df = merged_df.dropna(how = 'any') # drop rows with null values
print(merged_df.isnull().values.any()) 

In [None]:
print(len(merged_df)) 

In [None]:
# remove all characters other than Hangeul
merged_df['text'] = merged_df['text'].str.replace("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]","", regex=True)

In [47]:
pip install konlpy

Note: you may need to restart the kernel to use updated packages.


In [48]:
from konlpy.tag import Okt
okt = Okt()

In [76]:
# NULL check
print(merged_df.isnull().values.any())
merged_df = merged_df.dropna(how = 'any') # drop rows with null values
print(merged_df.isnull().values.any()) 

False
False


In [77]:
stopwords = ['의','가','이','은','들','는','좀','잘','걍','과','도','를','으로','자','에','와','한','하다']

tokenized_data = []
for sentence in tqdm.tqdm(comments['Comment']):
    sentence=str(sentence).strip()
    
    if not sentence: #빈 문자열이면 건더뛰기
        continue
        
    tokenized_sentence = okt.morphs(sentence, stem=True) # 토큰화
    stopwords_removed_sentence = [word for word in tokenized_sentence 
                                  if not word in stopwords
                                     and len(word) >= 2
                                     and word.isalpha()] #  불용어 제거

    if stopwords_removed_sentence: #빈 리스트가 아니라면 추가
        tokenized_data.append(stopwords_removed_sentence)

100%|██████████| 17410/17410 [01:11<00:00, 241.98it/s]


In [84]:
from gensim.models import Word2Vec

model = Word2Vec(sentences = tokenized_data, vector_size = 100, window = 5, min_count = 5, workers = 4, sg = 0)

In [85]:
model.wv.vectors.shape

(6088, 100)

In [86]:
print(model.wv.most_similar("커서"))

[('쓸다', 0.9975815415382385), ('ㅠㅠ', 0.9974247813224792), ('가요', 0.9939100742340088), ('코드', 0.9929696321487427), ('이미지', 0.9928091168403625), ('먹다', 0.9910512566566467), ('영어', 0.9906067252159119), ('무료', 0.9903480410575867), ('클로드', 0.9896458983421326), ('그런데', 0.9895000457763672)]


In [87]:
print(model.wv.most_similar("자동화"))

[('노력', 0.9983398914337158), ('직접', 0.9982675313949585), ('생기다', 0.997830331325531), ('업무', 0.997702956199646), ('대로', 0.9974852800369263), ('통해', 0.997167706489563), ('다양하다', 0.9971607327461243), ('개인', 0.9971402287483215), ('에도', 0.9970384240150452), ('조금', 0.9969872236251831)]


## Save W2V model

In [88]:
model.wv.save_word2vec_format('ko_w2v')

In [90]:
!python -m gensim.scripts.word2vec2tensor --input ko_w2v --output ko_w2v


## Visualization for embedding

In [None]:
## Go to https://projector.tensorflow.org/
## and load ko_w2v_tensor.tsv and ko_w2v_metadata.tsv