In [64]:
%pip install -q python-youtube pandas google-api-python-client deep_translator
!python -m pip install -q --upgrade pip

Note: you may need to restart the kernel to use updated packages.


In [65]:
import os, requests, json
from pyyoutube import Client, Api
import pandas as pd
from pyyoutube import Client
from googleapiclient.discovery import build
from deep_translator import GoogleTranslator

In [82]:
# Path to the JSON credentials file; it must contain an "api_key" entry.
# Set the YOUTUBE_API_CREDENTIALS environment variable to point somewhere else;
# when it is unset the original local path is used, so existing setups keep working.
yt_api_key = os.environ.get(
    "YOUTUBE_API_CREDENTIALS",
    r"C:\Users\f.gionnane\Documents\Data Engineering\Credentials\youtube_api.json",
)

# Explicit encoding so the file is read the same way on every platform.
with open(yt_api_key, 'r', encoding='utf-8') as file:
    content = json.load(file)

api_key = content["api_key"]

# Channels whose statistics are fetched further down in the notebook.
channel_ids = ['UCtYLUTtgS3k1Fg4y5tAhLbw', # Statquest
               'UCCezIgC97PvUuR4_gbFUs5g', # Corey Schafer
               'UCfzlCWGWYyIQ0aLC5w48gBQ', # Sentdex
               'UCNU_lfiiWBdtULKOw6X0Dig', # Krish Naik
               'UCzL_0nIe8B4-7ShhVPfJkgw', # DatascienceDoJo
               'UCLLw7jmFsvfIVaUFsLs8mlQ', # Luke Barousse
               'UCiT9RITQ9PW6BhXK0y2jaeg', # Ken Jee
               'UC7cs8q-gJRlGwj4A8OmCmXg', # Alex the analyst
               'UC2UXDak6o7rBm23k3Vv5dww', # Tina Huang
              ]

# API client shared by the helper functions of this notebook.
youtube = build('youtube', 'v3', developerKey=api_key)


Functions

In [75]:
def translate_keywords(keywords, lang):
    """
    Return the given keywords plus their translations into every language in `lang`.
    Params:

    keywords: keyword or collection of keywords (passed through `normalize_to_list`)
    lang: target language code(s) handed to GoogleTranslator, or None to skip translation

    Returns:
    List of keywords. When `lang` is None it is the normalized input; otherwise the
    original keywords come first, followed by each new translation in the order it
    was obtained, without duplicates.

    NOTE(review): `normalize_to_list` is not defined in the visible part of this
    notebook - confirm it is defined before this cell runs on a fresh kernel.
    """
    keywords = normalize_to_list(keywords)  # presumably returns a list - verify

    if lang is None:
        return keywords  # no translation requested

    lang = normalize_to_list(lang)

    # A dict is used as an insertion-ordered set: the result no longer depends on
    # set iteration order, so repeated runs issue their queries in the same order.
    translated_keywords = dict.fromkeys(keywords)

    for keyword in keywords:
        for language in lang:
            try:
                translated = GoogleTranslator(source='auto', target=language).translate(keyword)
            except Exception as e:
                # Best effort: report the failure and carry on with the other pairs.
                print(f"Erreur lors de la traduction de '{keyword}' en {language}: {e}")
                continue

            # Defensive: never let a missing result (None) reach the keyword list,
            # callers join these values into a query string.
            if translated is not None:
                translated_keywords.setdefault(translated, None)

    return list(translated_keywords)

def get_channel_stats(channel_ids, youtube=None):
    """
    Get channel statistics: title, subscriber count, view count, video count, upload playlist
    Params:

    channel_ids: list of channel IDs
    youtube: the build object from googleapiclient.discovery; when None, the
             module-level `youtube` client is looked up at call time

    Returns:
    Dataframe with one row per channel returned by the API and the columns
    channelName, subscribers, views, totalVideos, playlistId. A statistic that
    the API does not return for a channel is stored as None.

    """
    if youtube is None:
        # Resolve the shared client when the function is called, not when it is
        # defined, so re-running the configuration cell is picked up here.
        youtube = globals()["youtube"]

    channel_ids = list(channel_ids)
    batch_size = 50  # ids sent per request, same batching as get_video_details
    all_data = []

    for start in range(0, len(channel_ids), batch_size):
        request = youtube.channels().list(
                    part='snippet,contentDetails,statistics',
                    id=','.join(channel_ids[start:start + batch_size]))
        response = request.execute()

        # 'items' may be missing when nothing matched the requested ids.
        for item in response.get('items', []):
            statistics = item['statistics']
            all_data.append(dict(
                channelName=item['snippet']['title'],
                subscribers=statistics.get('subscriberCount'),
                views=statistics.get('viewCount'),
                totalVideos=statistics.get('videoCount'),
                playlistId=item['contentDetails']['relatedPlaylists']['uploads']))

    return pd.DataFrame(all_data)

def get_video_ids(playlist_id, youtube=None):
    """
    Get list of video IDs of all videos in the given playlist
    Params:

    playlist_id: playlist ID of the channel
    youtube: the build object from googleapiclient.discovery; when None, the
             module-level `youtube` client is looked up at call time

    Returns:
    List of video IDs of all videos in the playlist, in the order the API
    returns them across all result pages

    """
    if youtube is None:
        # Resolve the shared client when the function is called, not when it is
        # defined, so re-running the configuration cell is picked up here.
        youtube = globals()["youtube"]

    video_ids = []
    next_page_token = None

    # One loop handles the first page and every following page.
    while True:
        request_kwargs = dict(part='contentDetails',
                              playlistId=playlist_id,
                              maxResults=50)
        if next_page_token is not None:
            request_kwargs['pageToken'] = next_page_token

        response = youtube.playlistItems().list(**request_kwargs).execute()

        video_ids.extend(item['contentDetails']['videoId']
                         for item in response.get('items', []))

        # No token in the response means the last page has been read.
        next_page_token = response.get('nextPageToken')
        if next_page_token is None:
            break

    return video_ids

def get_video_details(video_ids, youtube=None):
    """
    Get video statistics of all videos with given IDs
    Params:

    video_ids: list of video IDs
    youtube: the build object from googleapiclient.discovery; when None, the
             module-level `youtube` client is looked up at call time

    Returns:
    Dataframe with one row per returned video and the columns:
        'video_id',
        'channelTitle', 'title', 'description', 'tags', 'publishedAt'
        'viewCount', 'likeCount', 'favouriteCount', 'commentCount'
        'duration', 'definition', 'caption'
    A field missing from the API response is stored as None. The column
    'favouriteCount' keeps its historical spelling but is filled from the
    API field 'favoriteCount'.
    """
    if youtube is None:
        # Resolve the shared client when the function is called, not when it is
        # defined, so re-running the configuration cell is picked up here.
        youtube = globals()["youtube"]

    # API fields to read from each part of the video resource.
    fields_to_keep = {'snippet': ['channelTitle', 'title', 'description', 'tags', 'publishedAt'],
                      'statistics': ['viewCount', 'likeCount', 'favoriteCount', 'commentCount'],
                      'contentDetails': ['duration', 'definition', 'caption']
                     }
    # The misspelled field was never present in a response, so the column was
    # always None; read the real field and keep the existing column name.
    column_names = {'favoriteCount': 'favouriteCount'}

    video_ids = list(video_ids)
    all_video_info = []

    for start in range(0, len(video_ids), 50):
        request = youtube.videos().list(
            part="snippet,contentDetails,statistics",
            id=','.join(video_ids[start:start + 50])
        )
        response = request.execute()

        for video in response.get('items', []):
            video_info = {'video_id': video['id']}

            for part, fields in fields_to_keep.items():
                part_data = video.get(part) or {}
                for field in fields:
                    video_info[column_names.get(field, field)] = part_data.get(field)

            all_video_info.append(video_info)

    return pd.DataFrame(all_video_info)

def get_comments_in_videos(video_ids, youtube=None):
    """
    Get top level comments as text from all videos with given IDs (only the first 10 comments due to quota limit of Youtube API)
    Params:

    video_ids: list of video IDs
    youtube: the build object from googleapiclient.discovery; when None, the
             module-level `youtube` client is looked up at call time

    Returns:
    Dataframe with the columns 'video_id' and 'comments' (list of up to 10 comment
    texts). Videos whose comments could not be fetched are reported and left out.

    """
    if youtube is None:
        # Resolve the shared client when the function is called, not when it is
        # defined, so re-running the configuration cell is picked up here.
        youtube = globals()["youtube"]

    all_comments = []

    for video_id in video_ids:
        try:
            request = youtube.commentThreads().list(
                part="snippet,replies",
                videoId=video_id
            )
            response = request.execute()

            comments_in_video = [thread['snippet']['topLevelComment']['snippet']['textOriginal']
                                 for thread in response['items'][0:10]]
        except Exception:
            # Most likely comments are disabled on the video. Only Exception is
            # caught so that interrupting the kernel still stops the loop.
            print('Could not get comments for video ' + video_id)
            continue

        all_comments.append({'video_id': video_id, 'comments': comments_in_video})

    return pd.DataFrame(all_comments)

def search_channels_by_tags(keywords=None, main_kw = None, lang=None, maxResults=50):
    """
    Search YouTube channels with one query per keyword; every `main_kw` term is part of each query.
    Params:

    keywords: secondary keyword(s); each one produces its own search query
    main_kw: keyword(s) included in every query
    lang: language code(s) used to translate both keyword sets, or None for no translation
    maxResults: value passed as `maxResults` to each search request

    Returns:
    Sorted list of the distinct channel titles found by all queries.

    Raises:
    ValueError when both `keywords` and `main_kw` are None.
    """

    if main_kw is None and keywords is None:
        raise ValueError("Les deux `main_kw` et `keywords` ne peuvent pas être None. Au moins un doit être fourni.")
    # A blank placeholder lets the query loop run once for the missing argument.
    if main_kw is None :
        main_kw = " "
    if keywords is None:
        keywords = " "

    keywords = translate_keywords(keywords, lang)
    main_kw = translate_keywords(main_kw, lang)

    # All main keywords go into every query; build that part only once.
    base_query = " ".join(main_kw)

    channels = set()
    seen_queries = set()

    for keyword in keywords:
        query = f"{base_query} {keyword}" if keyword else base_query
        # Collapse padding left by blank placeholders so equivalent queries
        # compare equal and are sent to the API only once.
        query = " ".join(query.split())
        if query in seen_queries:
            continue
        seen_queries.add(query)

        request = youtube.search().list(
            q=query,
            type='channel',
            part='snippet',
            maxResults=maxResults
        )
        response = request.execute()

        for item in response.get('items', []):
            channels.add(item['snippet']['channelTitle'])

    # Sorted so the result is the same from one run to the next.
    return sorted(channels)

def search_videos_and_playlists(keywords = None, main_kw= None, lang=None, maxResults=50):
    """
    Search YouTube videos and playlists with one query per keyword; every `main_kw` term is part of each query.
    Params:

    keywords: secondary keyword(s); each one produces its own search query
    main_kw: keyword(s) included in every query
    lang: language code(s) used to translate both keyword sets, or None for no translation
    maxResults: value passed as `maxResults` to each search request (default 50)

    Returns:
    Dict with the keys "videos" and "playlists", each holding the list of raw
    search result items collected over all queries.

    Raises:
    ValueError when both `keywords` and `main_kw` are None.
    """

    if main_kw is None and keywords is None:
        raise ValueError("Les deux `main_kw` et `keywords` ne peuvent pas être None. Au moins un doit être fourni.")
    # A blank placeholder lets the query loop run once for the missing argument.
    if main_kw is None :
        main_kw = " "
    if keywords is None:
        keywords = " "

    keywords = translate_keywords(keywords, lang)
    main_kw = translate_keywords(main_kw, lang)

    # All main keywords go into every query; build that part only once.
    base_query = " ".join(main_kw)

    results = {"videos": [], "playlists": []}
    seen_queries = set()

    for keyword in keywords:
        query = f"{base_query} {keyword}" if keyword else base_query
        # Collapse padding left by blank placeholders so equivalent queries
        # compare equal and are sent to the API only once.
        query = " ".join(query.split())
        if query in seen_queries:
            continue
        seen_queries.add(query)

        # Same request for both result kinds: videos first, then playlists.
        for search_type, bucket in (('video', 'videos'), ('playlist', 'playlists')):
            request = youtube.search().list(
                q=query,
                type=search_type,
                part='snippet',
                maxResults=maxResults
            )
            response = request.execute()

            # Keep the complete result items, not only their ids.
            results[bucket].extend(response.get('items', []))

    return results


In [76]:
# Language codes passed as `lang` to the search helpers, which use them as
# translation targets for the keywords.
lang = ["fr","en","es", "fa"]

In [77]:
# Fetch one row of statistics per channel listed in `channel_ids`.
channel_data = get_channel_stats(channel_ids, youtube=youtube)

In [None]:
# Secondary keywords: each one (and its translations) produces its own search query.
ML_keywords = ["Machine Learning", "Artificial Intelligence","Data", "Data Science","Data Engineering","Python"]
# Keyword included in every video/playlist query.
main_kw = ["xgboost"]

# NOTE(review): `main_kw` is not passed to the channel search, only to the
# video/playlist search - confirm this is intentional.
ML_channels = search_channels_by_tags(ML_keywords, lang=lang)
results = search_videos_and_playlists(ML_keywords, main_kw, lang)

# Show how many items were collected per kind instead of dumping every raw
# search result into the cell output; inspect `results` directly for details.
{kind: len(items) for kind, items in results.items()}

In [80]:
# Number of distinct channel titles returned by the channel search.
len(ML_channels)

976