In [None]:
import os
import googleapiclient.discovery
import googleapiclient.errors
from dotenv import load_dotenv

# Load variables from a local .env file (if any) into the environment
load_dotenv()

api_service_name = "youtube"
api_version = "v3"
api_key = os.getenv('yt_api_key')
if not api_key:
    # os.getenv returns None for an unset variable; stop here with a clear
    # message instead of letting the missing key surface later as an API
    # error when the first request is executed.
    raise RuntimeError(
        "Environment variable 'yt_api_key' is not set; "
        "define it in the environment or in a .env file."
    )

# API client used by every request in this notebook
youtube = googleapiclient.discovery.build(
    api_service_name, api_version, developerKey=api_key
)

# Request the snippet, contentDetails and statistics parts for one channel ID
request = youtube.channels().list(
    part="snippet,contentDetails,statistics",
    id="UCX6OQ3DkcsbYNE6H8uQQuVA"
)
response = request.execute()

print(response)

In [None]:
from IPython.display import JSON


# Show the raw channel response through IPython's JSON display object
# (imported above) rather than the plain-text dump produced by print()
JSON(response)


In [None]:
import json
import pandas as pd

# Summary stats of the channel

# Guard against a response without any channel entry so the failure is
# explicit instead of a KeyError/IndexError on response['items'][0].
channel_items = response.get('items') or []
if not channel_items:
    raise ValueError("The API response contains no channel items")

# Extract the relevant data
channel_data = channel_items[0]
snippet = channel_data['snippet']
statistics = channel_data['statistics']
content_details = channel_data['contentDetails']['relatedPlaylists']

# Create a one-row DataFrame describing the channel.
# 'customUrl' and 'subscriberCount' are read with .get(), like 'country',
# so a channel lacking them yields 'N/A' instead of a KeyError
# (NOTE(review): presumably optional in the API response - confirm).
df = pd.DataFrame({
    "Channel ID": [channel_data['id']],
    "Title": [snippet['title']],
    "Description": [snippet['description']],
    "Custom URL": [snippet.get('customUrl', 'N/A')],
    "Published At": [snippet['publishedAt']],
    "Country": [snippet.get('country', 'N/A')],
    "Subscriber Count": [statistics.get('subscriberCount', 'N/A')],
    "View Count": [statistics['viewCount']],
    "Video Count": [statistics['videoCount']],
    "Uploads Playlist ID": [content_details['uploads']]
})

# Display the DataFrame
print(df)



In [27]:
def get_video_ids(youtube, playlist_id):
    """
    Collect the video IDs of every item in a playlist.

    The playlist is read page by page (50 items per request) by following
    the 'nextPageToken' of each response until a response carries none.

    Params:

    youtube: the build object from googleapiclient.discovery
    playlist_id: ID of the playlist to read

    Returns:
    List of video IDs, in the order the API returned them

    """
    collected_ids = []
    page_token = None

    while True:
        query = {
            'part': 'contentDetails',
            'playlistId': playlist_id,
            'maxResults': 50,
        }
        # The first request is sent without a pageToken argument at all
        if page_token is not None:
            query['pageToken'] = page_token

        page = youtube.playlistItems().list(**query).execute()

        for item in page['items']:
            collected_ids.append(item['contentDetails']['videoId'])

        page_token = page.get('nextPageToken')
        if page_token is None:
            break

    return collected_ids



In [28]:
# Playlist to enumerate (presumably the uploads playlist of the channel
# queried earlier - verify against the "Uploads Playlist ID" shown above)
target_playlist_id = 'UUX6OQ3DkcsbYNE6H8uQQuVA'

videos_list = get_video_ids(youtube, target_playlist_id)

# Number of video IDs retrieved
print(len(videos_list))

809


In [29]:
def get_video_details(youtube, video_ids):
    """
    Get details and statistics of all videos with the given IDs.

    IDs are sent to the API in batches of 50 per request. A field that is
    absent from a video's response is stored as None.

    Params:

    youtube: the build object from googleapiclient.discovery
    video_ids: list of video IDs

    Returns:
    Dataframe with one row per returned video and the columns:
        'video_id',
        'channelTitle', 'title', 'description', 'tags', 'publishedAt',
        'viewCount', 'likeCount', 'favouriteCount', 'commentCount',
        'duration', 'definition', 'caption'
    The columns are present even when video_ids is empty (no request is
    made in that case).

    Note: the 'favouriteCount' column keeps its historical spelling so that
    existing code selecting or dropping it keeps working; its value is read
    from the API's 'favoriteCount' statistics field.
    """
    # Output columns to extract from each part of the video resource
    columns_by_part = {
        'snippet': ['channelTitle', 'title', 'description', 'tags', 'publishedAt'],
        'statistics': ['viewCount', 'likeCount', 'favouriteCount', 'commentCount'],
        'contentDetails': ['duration', 'definition', 'caption'],
    }
    # Output column -> API field name, for columns whose name differs from
    # the field returned by the API
    api_field_for_column = {'favouriteCount': 'favoriteCount'}
    output_columns = ['video_id'] + [
        column for columns in columns_by_part.values() for column in columns
    ]
    batch_size = 50

    video_ids = list(video_ids)
    all_video_info = []

    for start in range(0, len(video_ids), batch_size):
        request = youtube.videos().list(
            part="snippet,contentDetails,statistics",
            id=','.join(video_ids[start:start + batch_size])
        )
        response = request.execute()

        for video in response.get('items', []):
            video_info = {'video_id': video['id']}

            for part, columns in columns_by_part.items():
                # A missing part is treated like a part with no fields
                part_data = video.get(part) or {}
                for column in columns:
                    api_field = api_field_for_column.get(column, column)
                    video_info[column] = part_data.get(api_field)

            all_video_info.append(video_info)

    # Passing columns keeps the schema stable for an empty result
    return pd.DataFrame(all_video_info, columns=output_columns)

In [None]:
# Build the per-video DataFrame from the collected video IDs
# (this rebinds `df`, which previously held the channel summary)
df = get_video_details(youtube, video_ids=videos_list)

# Report the frame's (rows, columns)
row_count, column_count = df.shape
print((row_count, column_count))

In [None]:
# EDA: structural overview of the video DataFrame via DataFrame.info()

df.info()

In [None]:
# Preview the first rows of the video DataFrame (head() with its default row count)

print(df.head())

In [None]:
import pandas as pd
import isodate

# Work on a copy so the raw frame `df` is left untouched and this cell
# gives the same result every time it is re-run
df_2 = df.copy()

# Step 1: Convert the count columns to numeric types;
# values that cannot be parsed become NaN (errors='coerce')
for count_column in ['viewCount', 'likeCount', 'commentCount']:
    df_2[count_column] = pd.to_numeric(df_2[count_column], errors='coerce')

# Step 2: Drop the 'favouriteCount' column (no error if it is absent)
df_2 = df_2.drop(columns=['favouriteCount'], errors='ignore')

# Step 3: Convert 'publishedAt' to datetime format
df_2['publishedAt'] = pd.to_datetime(df_2['publishedAt'])

# Step 4: Extract the day of the week from 'publishedAt'
df_2['day_of_week'] = df_2['publishedAt'].dt.day_name()

# Step 5: Convert 'duration' to total seconds.
# na_action='ignore' leaves missing durations as missing instead of
# passing None to isodate.parse_duration, which would raise.
df_2['duration'] = df_2['duration'].map(
    lambda raw_duration: isodate.parse_duration(raw_duration).total_seconds(),
    na_action='ignore',
)

# Step 6: Fill missing values for specific columns.
# Assign the result back: calling fillna(inplace=True) on a selected column
# is chained assignment, which warns on recent pandas and does not update
# df_2 when Copy-on-Write is enabled.
df_2['tags'] = df_2['tags'].fillna('')
df_2['description'] = df_2['description'].fillna('')

# Display the first few rows of df_2 to verify changes
print(df_2.head())


In [None]:
import os

# Define the path to the 'dataset' folder
folder_path = 'dataset'

# Create the 'dataset' folder if it doesn't exist. exist_ok=True replaces
# the separate exists() check, so there is no gap between checking and
# creating, and the cell can be re-run safely.
os.makedirs(folder_path, exist_ok=True)

# Define the path for the CSV file
file_path = os.path.join(folder_path, 'mrbeast_channel.csv')

# Save df_2 to a CSV file (without the index column)
df_2.to_csv(file_path, index=False)

print(f"File saved to {file_path}")
