<a href="https://colab.research.google.com/github/fravitt/YouTube-data/blob/main/Copy_of_YT_download.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Import necessary libraries
from googleapiclient.discovery import build
import pandas as pd
from time import sleep
import traceback
import os

# YouTube Data API v3 credentials. The empty string is only a placeholder:
# every API call below fails until a real key is filled in.
api_key = '' #replace with a valid API key
# Module-level API client; later cells pass it to get_video_ids() and
# get_video_details().
youtube = build('youtube', 'v3', developerKey=api_key)

# Mount Google Drive (Colab only). The CSV files are written to the relative
# path ./drive/MyDrive/youtube/..., which presumably resolves under this mount
# when the working directory is /content -- TODO confirm.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Function to retrieve comments from a YouTube video and save them in a CSV file
def get_comments(api_key, video_id, folder_name='data'):
    """Download the comment threads of one video into a CSV file on Drive.

    The output file is
    ``./drive/MyDrive/youtube/<folder_name>/<video_id>_user_comments.csv`` with
    the columns ``comment, replies, date, user_name``. ``replies`` holds the
    text of the replies embedded in the thread resource, serialised as a
    Python list literal (the API may embed only a subset of all replies).

    Each result page is flushed to disk as soon as it is parsed, so an
    interruption keeps everything fetched so far. The first page overwrites
    any previous file; later pages are appended.

    Args:
        api_key: YouTube Data API v3 developer key.
        video_id: ID of the video whose comments are downloaded.
        folder_name: Sub-folder of ``./drive/MyDrive/youtube/`` for the CSV.

    Returns:
        None. The result is the CSV file written to disk.

    Any exception (quota exceeded, comments disabled, network failure, ...) is
    printed together with its traceback and ends the download for this video;
    it is not re-raised, so a caller iterating over many videos keeps going.
    """
    youtube = build('youtube', 'v3', developerKey=api_key)

    out_dir = f"./drive/MyDrive/youtube/{folder_name}/"
    out_path = f"{out_dir}{video_id}_user_comments.csv"
    columns = ['comment', 'replies', 'date', 'user_name']

    request = youtube.commentThreads().list(
        part="snippet,replies",
        videoId=video_id,
        textFormat="plainText",
        # Largest page size the endpoint accepts; the default of 20 needs
        # five times as many requests for the same data.
        maxResults=100
    )

    first_page = True
    while request:
        try:
            response = request.execute()

            rows = []
            for item in response.get('items', []):
                top_level = item['snippet']['topLevelComment']['snippet']
                # A thread can report totalReplyCount > 0 and still come
                # without a 'replies' object, so never index it directly.
                reply_items = item.get('replies', {}).get('comments', [])
                rows.append({
                    'comment': top_level['textDisplay'],
                    'replies': [r['snippet']['textDisplay'] for r in reply_items],
                    'date': top_level['publishedAt'],
                    'user_name': top_level['authorDisplayName'],
                })

            # Write only this page instead of rewriting everything collected
            # so far; passing `columns` fixes the column order and still
            # emits the header row when the video has no comments at all.
            page_df = pd.DataFrame(rows, columns=columns)
            os.makedirs(out_dir, exist_ok=True)
            page_df.to_csv(
                out_path,
                mode='w' if first_page else 'a',
                header=first_page,
                index=False,
                encoding='utf-8'
            )
            first_page = False

            # Small pause between pages to stay gentle on the API.
            sleep(2)
            request = youtube.commentThreads().list_next(request, response)
            print("Iterating through next page")
        except Exception as e:
            print(str(e))
            print(traceback.format_exc())
            # Back off before returning so the caller's next API request is
            # not fired immediately after a failure.
            print("Sleeping for 10 seconds")
            sleep(10)
            break

In [None]:
# Function to retrieve video details (title, author, description)
def get_video_details(youtube, video_id, df_video_details):
    """Append the title, description and channel name of a video to a frame.

    Args:
        youtube: YouTube Data API client (as returned by ``build``).
        video_id: ID of the video to look up.
        df_video_details: DataFrame with the columns ``video_id``,
            ``video_title``, ``video_description`` and ``video_author``.

    Returns:
        A new DataFrame: ``df_video_details`` plus one row for ``video_id``.
        When the API returns no item for the ID (deleted, private or
        otherwise unavailable video) a message is printed and
        ``df_video_details`` is returned unchanged, so that one bad ID does
        not abort a whole batch.
    """
    response = youtube.videos().list(
        part="snippet",
        id=video_id
    ).execute()

    # videos.list answers with an empty 'items' list for unknown IDs.
    items = response.get('items')
    if not items:
        print(f"No details returned for video {video_id}, skipping.")
        return df_video_details

    video_details = items[0]['snippet']

    # One-row frame for this video; index=[0] is required because every
    # value is a scalar.
    new_row = pd.DataFrame({
        'video_id': video_id,
        'video_title': video_details['title'],
        'video_description': video_details['description'],
        'video_author': video_details['channelTitle']
    }, index=[0])

    # Concatenate the new row with the existing DataFrame
    return pd.concat([df_video_details, new_row], ignore_index=True)

In [None]:
# Function to retrieve all video IDs from a specified YouTube channel
def get_video_ids(youtube, channel_id):
    """Return the IDs of the videos that a channel search yields.

    Pages through ``search.list`` (50 results per request, videos only) and
    follows ``nextPageToken`` until the API stops returning one.

    Args:
        youtube: YouTube Data API client (as returned by ``build``).
        channel_id: ID of the channel to search.

    Returns:
        A list of video ID strings in the order the API returned them.
    """
    # The first request carries no pageToken; later ones add it below.
    query = {
        'part': 'id',
        'channelId': channel_id,
        'maxResults': 50,
        'type': 'video',
    }

    collected = []
    while True:
        page = youtube.search().list(**query).execute()
        collected.extend(entry['id']['videoId'] for entry in page['items'])

        token = page.get('nextPageToken')
        if not token:
            return collected
        query['pageToken'] = token

In [None]:
# Download everything for one channel: the details of every video found by
# get_video_ids() plus one comments CSV per video. Both placeholders below
# must be filled in before running this cell.
channel_id = '' # Change with the channel ID you need
folder_name = '' # sub-folder of ./drive/MyDrive/youtube/ that receives the CSV files
ids = get_video_ids(youtube, channel_id)

print(f'video ids:{ids}')
# Accumulator for the per-video metadata; get_video_details() returns a new
# frame with one more row on every call.
df_video_details = pd.DataFrame(columns=['video_id', 'video_title', 'video_description', 'video_author'])

# Retrieve video details and comments for each video
for video_id in ids:
    df_video_details = get_video_details(youtube, video_id, df_video_details)
    # Writes <video_id>_user_comments.csv into the same folder.
    get_comments(api_key, video_id, folder_name=folder_name)

# Make sure the target folder exists even when `ids` was empty and
# get_comments() therefore never created it.
os.makedirs(f"./drive/MyDrive/youtube/{folder_name}/", exist_ok=True)
# Save all video details in the same file
df_video_details.to_csv(f"./drive/MyDrive/youtube/{folder_name}/video_details.csv", index=False, encoding='utf-8')

In [None]:
# Resume a channel download: same as the cell above, but videos whose comments
# CSV already exists in the target folder are skipped.
channel_id = '' # Change with the channel ID you need
folder_name = '' # must be set BEFORE the folder is inspected below

saved_folder = f"./drive/MyDrive/youtube/{folder_name}/"
details_path = f"{saved_folder}video_details.csv"
comments_suffix = "_user_comments.csv"
details_columns = ['video_id', 'video_title', 'video_description', 'video_author']

# Check which videos have already been saved. Comment files are named
# "<video_id>_user_comments.csv", so the ID is the file name without that
# suffix (splitting on '.' would leave "_user_comments" attached to it).
# NOTE: a file left behind by an interrupted run also counts as saved.
saved_videos = set()
if os.path.exists(saved_folder):
    saved_videos = {
        file[:-len(comments_suffix)]
        for file in os.listdir(saved_folder)
        if file.endswith(comments_suffix)
    }

ids = get_video_ids(youtube, channel_id)

print(f'video ids:{ids}')

# Start from the details collected by earlier runs; otherwise saving at the
# end would overwrite them with only the rows fetched now. dtype=str and
# keep_default_na=False keep IDs and empty descriptions exactly as written.
if os.path.exists(details_path):
    df_video_details = pd.read_csv(details_path, dtype=str, keep_default_na=False, encoding='utf-8')
else:
    df_video_details = pd.DataFrame(columns=details_columns)
detailed_videos = set(df_video_details['video_id'])

# Retrieve video details and comments for each video
for video_id in ids:
    if video_id in saved_videos:
        print(f"Skipping video {video_id}, already saved.")
        continue

    # A video can already have a details row without a comments file (e.g.
    # its comment download failed earlier); do not add a duplicate row.
    if video_id not in detailed_videos:
        df_video_details = get_video_details(youtube, video_id, df_video_details)
    get_comments(api_key, video_id, folder_name=folder_name)

os.makedirs(saved_folder, exist_ok=True)
# Save all video details in the same file
df_video_details.to_csv(details_path, index=False, encoding='utf-8')