In [1]:
import googleapiclient.discovery
from googleapiclient.errors import HttpError

import pandas as pd
import time
import tqdm

In [2]:
# YouTube API key
API_KEY = "your api key"
youtube = googleapiclient.discovery.build("youtube", "v3", developerKey=API_KEY)

In [5]:
# Get video ids for query. Youtube API allows only up to 50 videos

def get_video_ids(query, max_results=100):
    video_ids = []
    results_per_page = 50  # YouTube API maxResults 
    pages = (max_results + results_per_page - 1) // results_per_page  # calculate #pages
    next_page_token = None
    
    for _ in range(pages): # call api as many times as #pages
        try:
            request = youtube.search().list(
                q=query,
                part="snippet",
                maxResults=results_per_page,
                type="video",
                pageToken=next_page_token
            )
            response = request.execute()      

            # Only check if 'id' key exists and 'videoId' is accessible
            for item in response['items']:
                if isinstance(item, dict) and 'id' in item and 'videoId' in item['id']:
                    video_ids.append(item['id']['videoId'])

            next_page_token = response.get('nextPageToken')
            if not next_page_token:
                break

        except HttpError as e:
            error_reason = e.resp.get('reason')
            if error_reason == 'quotaExceeded':
                print("Quota exceeded. Saving collected data...")
                save_data_to_csv(video_comments)
                exit()
            else:
                print(f"An error occurred: {e}")
                
    return video_ids[:max_results]

In [6]:
# Get comments for 1 video. Youtube API allows only up to 100 comments per video
def get_top_korean_comments(video_id, max_results=100):
    comments = []
    try:
        request = youtube.commentThreads().list(
            part="snippet",
            videoId=video_id,
            maxResults=max_results,
            textFormat="plainText"
        )
        response = request.execute()

        for item in response['items']:
            comment = item['snippet']['topLevelComment']['snippet']['textDisplay']
            comments.append(comment)  
                
    except HttpError as e:
        error_reason = e.resp.get('reason')
        if error_reason == 'commentsDisabled':
            print(f"Comments are disabled for video {video_id}. Skipping.")
        else:
            print(f"An error occurred: {e}")
    
    return comments

In [7]:
# Make data to dataframe
# video_comments looks like: {"4DUYBXdUYzA": ["와 재밌다", "재미없다", ]}
def save_data_to_csv(video_comments):    
    
    data = {"Video_ID": [], "Comment": []}
    
    for video_id, comments in video_comments.items():
        for comment in comments:
            data["Video_ID"].append(video_id)
            data["Comment"].append(comment)

    df = pd.DataFrame(data)
    
    # Export to CSV 
    df.to_csv("youtube_comments.csv", index=False)

In [8]:
participants = ["흑백요리사","백종원","안성재","에드워드 리","나폴리 맛피아","트리플스타","요리하는 돌아이","최현석","장호준","여경래","안유성","정지선","최강록","조은주","오세득","파브리치오 페라리","이영숙","선경 롱게스트","김도윤","박준우"]

In [9]:
video_comments = {}

start = time.time()
query_baisic = "흑백요리사"

for participant in tqdm.tqdm(participants):
    query = query_baisic + " " + participant 

    try:
        video_ids = get_video_ids(query, max_results=50) 

        for video_id in video_ids:
            comments = get_top_korean_comments(video_id)
            video_comments[video_id] = comments
    except HttpError as e:
        if e.resp.get('reason') == 'quotaExceeded':
            print("Quota exceeded. Saving collected data...")
            save_data_to_csv(video_comments)
            exit()

    end = time.time()    
    print(f"{end - start}s for query: {query}")    

save_data_to_csv(video_comments)

  5%|▌         | 1/20 [00:04<01:22,  4.34s/it]

4.348242998123169s for query: 흑백요리사 흑백요리사


 10%|█         | 2/20 [00:07<01:10,  3.89s/it]

7.926848411560059s for query: 흑백요리사 백종원


 15%|█▌        | 3/20 [00:11<01:01,  3.64s/it]

11.262072086334229s for query: 흑백요리사 안성재


 20%|██        | 4/20 [00:14<00:55,  3.47s/it]

14.466182231903076s for query: 흑백요리사 에드워드 리


 25%|██▌       | 5/20 [00:17<00:48,  3.21s/it]

17.234874725341797s for query: 흑백요리사 나폴리 맛피아


 30%|███       | 6/20 [00:20<00:45,  3.25s/it]

20.548239946365356s for query: 흑백요리사 트리플스타


 35%|███▌      | 7/20 [00:23<00:41,  3.22s/it]

23.706836700439453s for query: 흑백요리사 요리하는 돌아이


 40%|████      | 8/20 [00:27<00:39,  3.33s/it]

27.26221489906311s for query: 흑백요리사 최현석


 45%|████▌     | 9/20 [00:30<00:35,  3.19s/it]

30.152323007583618s for query: 흑백요리사 장호준


 50%|█████     | 10/20 [00:32<00:30,  3.09s/it]

33.003620624542236s for query: 흑백요리사 여경래


 55%|█████▌    | 11/20 [00:36<00:28,  3.12s/it]

36.21546149253845s for query: 흑백요리사 안유성


 60%|██████    | 12/20 [00:39<00:24,  3.08s/it]

39.19672250747681s for query: 흑백요리사 정지선


 65%|██████▌   | 13/20 [00:42<00:21,  3.13s/it]

42.43573570251465s for query: 흑백요리사 최강록


 70%|███████   | 14/20 [00:45<00:18,  3.14s/it]

45.60477352142334s for query: 흑백요리사 조은주


 75%|███████▌  | 15/20 [00:48<00:15,  3.11s/it]

48.651283264160156s for query: 흑백요리사 오세득


 80%|████████  | 16/20 [00:51<00:11,  2.94s/it]

51.1911187171936s for query: 흑백요리사 파브리치오 페라리


 85%|████████▌ | 17/20 [00:54<00:09,  3.04s/it]

54.460227489471436s for query: 흑백요리사 이영숙
An error occurred: <HttpError 403 when requesting https://youtube.googleapis.com/youtube/v3/commentThreads?part=snippet&videoId=_yOU-oKKSXg&maxResults=100&textFormat=plainText&key=AIzaSyAaSJ4ttVS5DmwqUyn-QEWfXK7avxhhAxk&alt=json returned "The video identified by the <code><a href="/youtube/v3/docs/commentThreads/list#videoId">videoId</a></code> parameter has disabled comments.". Details: "[{'message': 'The video identified by the <code><a href="/youtube/v3/docs/commentThreads/list#videoId">videoId</a></code> parameter has disabled comments.', 'domain': 'youtube.commentThread', 'reason': 'commentsDisabled', 'location': 'videoId', 'locationType': 'parameter'}]">


 90%|█████████ | 18/20 [00:57<00:06,  3.14s/it]

57.843557596206665s for query: 흑백요리사 선경 롱게스트


 95%|█████████▌| 19/20 [01:00<00:02,  2.98s/it]

60.454062938690186s for query: 흑백요리사 김도윤


100%|██████████| 20/20 [01:03<00:00,  3.18s/it]

63.5085768699646s for query: 흑백요리사 박준우





## Merge youtube_comments with movie_rating_dataset

In [10]:
comments = pd.read_csv("youtube_comments.csv")

In [11]:
comments.head()

Unnamed: 0,Video_ID,Comment
0,3ZUL9il_beI,"《흑백요리사: 요리 계급 전쟁》, 지금 넷플릭스에서 시청하세요: https://ww..."
1,3ZUL9il_beI,흑백요리사 2기다리고 있다
2,3ZUL9il_beI,"이미 셰프로써 이룰걸 다 이룬 베테랑과 이제 막 이름을 알려야 하는 패기있는 루키,..."
3,3ZUL9il_beI,나폴리 뭐시기는 그냥 흑이 백을 이기는 구도연출을 위해서 우승시켜준거고 ㄹㅇ우승자는...
4,3ZUL9il_beI,나폴리가 두부지옥을 안해서 인정을 못받는거.


In [13]:
import urllib.request
# download naver movie ratings dataset
urllib.request.urlretrieve("https://raw.githubusercontent.com/e9t/nsmc/master/ratings.txt", filename="ratings.txt")

('ratings.txt', <http.client.HTTPMessage at 0x786630f36d10>)

In [None]:
movie_data = pd.read_table('ratings.txt')
movie_data.head()

In [None]:
comments.head()

In [None]:
print(f"movie data length: {len(movie_data)}")
print(f"comments data length: {len(comments)}")

In [None]:
# Merge two dataset because number of Comments dataset is not big enough to train word vectors.
df1_text = movie_data[['document']].rename(columns={'document': 'text'})
df2_text = comments[['Comment']].rename(columns={'Comment': 'text'})

# merge movie_data and yt_comments_data
merged_df = pd.concat([df1_text, df2_text], ignore_index=True)
merged_df

In [None]:
# NULL check
print(merged_df.isnull().values.any())

In [None]:
merged_df = merged_df.dropna(how = 'any') # drop rows with null values
print(merged_df.isnull().values.any()) 

In [None]:
print(len(merged_df)) 

In [None]:
# remove all characters other than Hangeul
merged_df['text'] = merged_df['text'].str.replace("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]","", regex=True)

In [22]:
#skip end

In [14]:
pip install konlpy

Collecting konlpy
  Downloading konlpy-0.6.0-py2.py3-none-any.whl.metadata (1.9 kB)
Collecting JPype1>=0.7.0 (from konlpy)
  Downloading jpype1-1.5.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.9 kB)
Downloading konlpy-0.6.0-py2.py3-none-any.whl (19.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.4/19.4 MB[0m [31m66.7 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hDownloading jpype1-1.5.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (493 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m493.8/493.8 kB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hInstalling collected packages: JPype1, konlpy
Successfully installed JPype1-1.5.1 konlpy-0.6.0
Note: you may need to restart the kernel to use updated packages.


In [15]:
from konlpy.tag import Okt
okt = Okt()

In [18]:
stopwords = ['의','가','이','은','들','는','좀','잘','걍','과','도','를','으로','자','에','와','한','하다']

tokenized_data = []

# merged_df['text']=>comments['Comment']
for sentence in tqdm.tqdm(comments['Comment']):
    sentence = str(sentence).strip()

    if not sentence:
        continue
    tokenized_sentence = okt.morphs(sentence, stem=True) # 토큰화
    stopwords_removed_sentence = [word for word in tokenized_sentence 
                                  if not word in stopwords #조건1
                                    and len(word) >=2 #조건2
                                    and word.isalpha() ] #한글이나 영어
    
    if stopwords_removed_sentence: # 빈 리스트가 아니라면 추가
        tokenized_data.append(stopwords_removed_sentence)

100%|██████████| 44204/44204 [01:54<00:00, 384.42it/s]


In [19]:
!pip install gensim

Collecting scipy<1.14.0,>=1.7.0 (from gensim)
  Downloading scipy-1.13.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.6/60.6 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
Downloading scipy-1.13.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (38.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m38.6/38.6 MB[0m [31m38.6 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: scipy
  Attempting uninstall: scipy
    Found existing installation: scipy 1.14.1
    Uninstalling scipy-1.14.1:
      Successfully uninstalled scipy-1.14.1
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
cesium 0.12.3 requires numpy<3.0,>=2.0, but you have numpy 1.26.4 which is incompatible.
libpysal 4.9.2 requires packaging>=2

In [20]:
from gensim.models import Word2Vec

model = Word2Vec(sentences = tokenized_data, vector_size = 100, window = 5, min_count = 5, workers = 4, sg = 0)

In [21]:
model.wv.vectors.shape

(6665, 100)

In [22]:
print(model.wv.most_similar("백종원"))

[('기준', 0.9068042039871216), ('안성', 0.9037588238716125), ('의원', 0.9000785946846008), ('성재', 0.8870858550071716), ('위원', 0.8847585916519165), ('블라인드', 0.8733348250389099), ('한테', 0.8586214184761047), ('받다', 0.8472944498062134), ('재는', 0.8400877714157104), ('고든', 0.8323085904121399)]


In [23]:
print(model.wv.most_similar("최현석"))

[('여경', 0.9525101780891418), ('정지선', 0.9480866193771362), ('셰프', 0.922046422958374), ('이랑', 0.9095296263694763), ('쉐프', 0.9080196022987366), ('성재', 0.9013208746910095), ('이영숙', 0.8923614621162415), ('헤드', 0.8846525549888611), ('안유', 0.8795832991600037), ('호준', 0.8780210614204407)]


## Save W2V model

In [24]:
model.wv.save_word2vec_format('ko_w2v')

In [25]:
!python -m gensim.scripts.word2vec2tensor --input ko_w2v --output ko_w2v

## Visualization for embedding

In [26]:
## Go to https://projector.tensorflow.org/
## and load ko_w2v_tensor.tsv and ko_w2v_metadata.tsv