In [13]:
from modules.search import Search
from modules.video import Video

import pandas as pd
import yaml
from datetime import datetime
import os
from tqdm import tqdm
import re

In [2]:
youtube_search_obj = Search()
youtube_video_obj = Video()

# Variables loaded from the variables.yaml file
with open("variables.yaml") as file_yaml:
    dict_variables = yaml.load(file_yaml, Loader=yaml.FullLoader)

In [3]:
list_queries = dict_variables["query_terms"]
number_of_search_results = dict_variables["number_of_search_results"]
number_of_comment_threads = dict_variables["number_of_comment_threads"]

list_video_IDs_all = []

In [4]:
for query in tqdm(list_queries, desc="Queries..."):
    df_search = youtube_search_obj.search(query=query, max_results=number_of_search_results, type="video", return_table=True, save=False)
    list_video_IDs = df_search[df_search["kind"].str.contains("video")]["kind_ID"].to_list()
    list_video_IDs_all = list_video_IDs_all + [video_ID for video_ID in list_video_IDs]

Queries...: 100%|██████████| 3/3 [00:01<00:00,  2.57it/s]


In [5]:
list_video_IDs_all = list(set(list_video_IDs_all))

list_dict_video_info = []
list_df_comment_threads = []

In [6]:
for video_ID in tqdm(list_video_IDs_all, desc="Videos..."):
    try:
        dict_video_info = youtube_video_obj.get_video_info(video_ID=video_ID, save=False)
        list_dict_video_info.append(dict_video_info)
    except Exception as e:
        print(e)
        print(video_ID)
    try:
        df_comment_thread = youtube_video_obj.get_comment_threads_to_video(video_ID=video_ID, return_table=True, save=False)
        list_df_comment_threads.append(df_comment_thread)
    except Exception as e:
        print(e)
        print(video_ID)

Videos...:  16%|█▌        | 24/148 [00:07<00:31,  3.96it/s]<HttpError 403 when requesting https://youtube.googleapis.com/youtube/v3/commentThreads?part=snippet%2Creplies&videoId=7wF_7kBOiDQ&key=AIzaSyCWyZ3PO_m9uBwnWG6aKg-vRtZrZYyPElM&alt=json returned "The video identified by the <code><a href="/youtube/v3/docs/commentThreads/list#videoId">videoId</a></code> parameter has disabled comments.". Details: "[{'message': 'The video identified by the <code><a href="/youtube/v3/docs/commentThreads/list#videoId">videoId</a></code> parameter has disabled comments.', 'domain': 'youtube.commentThread', 'reason': 'commentsDisabled', 'location': 'videoId', 'locationType': 'parameter'}]">
7wF_7kBOiDQ
Videos...:  24%|██▎       | 35/148 [00:10<00:27,  4.06it/s]<HttpError 403 when requesting https://youtube.googleapis.com/youtube/v3/commentThreads?part=snippet%2Creplies&videoId=P9_wGTGeXuY&key=AIzaSyCWyZ3PO_m9uBwnWG6aKg-vRtZrZYyPElM&alt=json returned "The video identified by the <code><a href="/youtube/

In [7]:
df_video_info_all = pd.DataFrame(list_dict_video_info)
df_comment_threads_all = pd.concat(list_df_comment_threads, ignore_index=True)

dir_saved = "generated_tables"

In [9]:
# Save the dataframes as CSV files
df_video_info_all.to_csv(os.path.join(dir_saved, "video_info") + ".csv", index=False)
df_comment_threads_all.to_csv(os.path.join(dir_saved, "comment_threads") + ".csv", index=False)

In [11]:
df_video_info_all.head()

Unnamed: 0,video_url,video_title,video_description,view_count,like_count,dislike_count,favorite_count,comment_count,channel_url,publish_time,video_duration
0,https://youtube.com/watch?v=yt_hTuwD32I,NEWhere Electronic Cigs Review - Great Diposab...,http://ecigarettenow.com/ne-where-electronic-c...,18648,65,3,0,16,https://youtube.com/channel/UCbzNa9wN67RKJcCv7...,2013-12-30T20:27:54Z,PT5M46S
1,https://youtube.com/watch?v=8rXgTYuRUCw,Aegis X by Geek Vape - Love That Display!,In this video we check out the Aegis X kit by ...,469214,3601,200,0,438,https://youtube.com/channel/UCaABLqKOy33BcQBU0...,2019-10-05T21:33:38Z,PT10M57S
2,https://youtube.com/watch?v=VkErnpTrqxk,Platinum E Hookahs Night clubs Model Pheenix D...,,51,0,0,0,0,https://youtube.com/channel/UCFfDKqJe8HF_MffZi...,2014-07-25T02:52:30Z,PT55S
3,https://youtube.com/watch?v=ELmdFELWwNc,Adam Curry on the Great Vape Scare,Taken from JRE #1436 w/Adam Curry: https://you...,1200971,23431,654,0,3297,https://youtube.com/channel/UCnxGkOGNMqQEUMvro...,2020-03-04T20:30:00Z,PT10M6S
4,https://youtube.com/watch?v=331UY3qvTNw,Seduction of Smoking - Are E Cigarettes Less H...,Award winning journalist Peter Taylor gets exc...,105261,759,75,0,159,https://youtube.com/channel/UCFrO-dKhooOuTtix5...,2018-09-23T16:00:00Z,PT51M30S


In [12]:
df_comment_threads_all.head()

Unnamed: 0,video_ID,comment_thread_ID,comment_ID,comment,author_channel_ID,like_count,updated_at,publish_time
0,yt_hTuwD32I,UgwFXaiyi0yQXEPv5JJ4AaABAg,UgwFXaiyi0yQXEPv5JJ4AaABAg,Can u change this,UCCdMcf0ZGUYbaxus4MoxNZA,0,2018-09-05T04:38:38Z,2018-09-05T04:38:38Z
1,yt_hTuwD32I,Ugh5KeeDMtEtDngCoAEC,Ugh5KeeDMtEtDngCoAEC,"Can you refill them? Like you know, maybe ther...",UCo0pou2cvs7XN9vy5DHtoGA,2,2016-07-03T04:42:37Z,2016-07-03T04:42:37Z
2,yt_hTuwD32I,UggHCvqOae-a7ngCoAEC,UggHCvqOae-a7ngCoAEC,how many puffs?,UCcfvk_Fn0qp9XS0uF-2WWMQ,1,2016-02-06T01:53:28Z,2016-02-06T01:53:28Z
3,yt_hTuwD32I,UghcmrQbBKZab3gCoAEC,UghcmrQbBKZab3gCoAEC,"Just got my first one today, Pink Limonade!",UCXbzIloeL5tqC7B962MX5qQ,5,2015-12-14T01:39:34Z,2015-12-14T01:39:34Z
4,yt_hTuwD32I,UghcmrQbBKZab3gCoAEC,UghcmrQbBKZab3gCoAEC.87ilfcPrmTE9-fmQfNu0Ai,I just had got the same one and I love it,UCZOJoxFrJB8g4qrPrRPJ1tQ,0,2019-10-04T19:06:28Z,2019-10-04T19:06:28Z


In [44]:
def convert_ISO_8061_to_secs(dur):
    s = dur[2:]
    try:
        if("H" in s):
            h = int(s.split("H")[0])
            s = s.split("H")[1]
        else:
            h = 0
        if("M" in s):
            m = int(s.split("M")[0])
            s = s.split("M")[1]
        else:
            m = 0
        if("S" in s):
            s = int(s.split("S")[0])
        else:
            s = 0
    except Exception:
        print(h, m, s)
        print(type(h), type(m), type(s))
        print(dur)
    return(int(h)*3600 + int(m)*60 + int(s))

In [47]:
df_video_info_all["secs"] = df_video_info_all["video_duration"].apply(convert_ISO_8061_to_secs)
df_video_info_all["secs"].mean()

539.5202702702703

In [66]:
def dislikes_to_likes_count(like_count, dislike_count):
    try:
        like_count = int(like_count)
        dislike_count = int(dislike_count)
    except Exception as e:
        print(e)
        print(like_count)
        print(dislike_count)
        return
    if(dislike_count < 0):
        return(0)
    if(like_count == 0):
        like_count = 1
    return(round(dislike_count/like_count, 2))

df_video_info_all["dislikes_to_likes_count"] = df_video_info_all.apply(lambda row: dislikes_to_likes_count(row["like_count"], row["dislike_count"]), axis=1)
df_video_info_all[["video_url", "video_title", "like_count", "dislike_count", "dislikes_to_likes_count"]].sort_values("dislikes_to_likes_count", ascending=False)

Unnamed: 0,video_url,video_title,like_count,dislike_count,dislikes_to_likes_count
144,https://youtube.com/watch?v=n__I7IeYEpU,FDA issues ban on most flavored e-cigarettes,447,901,2.02
13,https://youtube.com/watch?v=SH9XOeZVaMk,E-Hookahs 800 puffs,4,7,1.75
80,https://youtube.com/watch?v=jiUXXx6kPbk,The Dirty Truth About E-Cigs,10486,15965,1.52
36,https://youtube.com/watch?v=tpfbBm7IofQ,The DeNoble Files: e-Cigarettes,64,92,1.44
118,https://youtube.com/watch?v=UiCR7-PWaGs,E-Cigarettes & JUUL School Presentation,12,13,1.08
...,...,...,...,...,...
12,https://youtube.com/watch?v=dexNU06rqG4,Platinum E Hookahs Arkansas,0,0,0.00
11,https://youtube.com/watch?v=cLUjVl3Zno8,E Hookahs,0,0,0.00
96,https://youtube.com/watch?v=A04QI926sdo,Cigavette E-Hookah 'Peach Bellini',0,0,0.00
2,https://youtube.com/watch?v=VkErnpTrqxk,Platinum E Hookahs Night clubs Model Pheenix D...,0,0,0.00
