In [1]:
def VideoDetailExtraction(kw_list,maxResults = 50):
    """Fetch the first page of a YouTube video search.

    Calls ``search().list(...).execute()`` on the notebook-global ``youtube``
    client with a fixed set of filters (``part='snippet'``,
    ``order='viewCount'``, ``relevanceLanguage='en'``, ``type='video'``,
    ``videoCaption='closedCaption'``).

    Args:
        kw_list: Value passed as the search query ``q``.
        maxResults: Page size forwarded to the API (defaults to 50).

    Returns:
        Whatever ``execute()`` returns, or ``None`` if any exception was
        raised (the error is printed, not re-raised).
    """
    # A category/region filter (videoCategoryId=26, regionCode='IN') was
    # previously tried here and is intentionally not sent.
    search_params = dict(
        part='snippet',
        order='viewCount',
        q=kw_list,
        relevanceLanguage='en',
        type='video',
        maxResults=maxResults,
        videoCaption='closedCaption',
    )
    try:
        return youtube.search().list(**search_params).execute()
    except Exception as e:
        print(f"Error during VideoDetailExtraction(): {e}")
        return None

def VideoDetailExtractionNextPageToken(kw_list, nextPageToken, maxResults = 50):
    """Fetch a follow-up page of the YouTube video search.

    Sends the same filters as the first-page search (``part='snippet'``,
    ``order='viewCount'``, ``relevanceLanguage='en'``, ``type='video'``,
    ``videoCaption='closedCaption'``) plus ``pageToken``.

    Args:
        kw_list: Value passed as the search query ``q``.
        nextPageToken: Token forwarded as ``pageToken``.
        maxResults: Page size forwarded to the API (defaults to 50).

    Returns:
        Whatever ``execute()`` returns, or ``None`` if any exception was
        raised (the error is printed, not re-raised).
    """
    try:
        # Category/region filtering (videoCategoryId=26, regionCode='IN')
        # is deliberately left out of the request.
        search_api = youtube.search()
        page_request = search_api.list(
            part='snippet',
            order='viewCount',
            q=kw_list,
            relevanceLanguage='en',
            type='video',
            maxResults=maxResults,
            pageToken=nextPageToken,
            videoCaption='closedCaption',
        )
        page = page_request.execute()
    except Exception as e:
        print(f"Error during VideoDetailExtractionNextPageToken(): {e}")
        return None
    return page

In [2]:
def _timestamp_fields(published_on):
    """Split an API timestamp into its pieces and compute epoch seconds.

    Returns ``(parts, total_seconds)`` where ``parts`` is the timestamp split
    on ``T``, ``Z`` and ``-`` (year, month, day, time, trailing empty string)
    and ``total_seconds`` is the offset from 1970-01-01, or 0 for the
    ``'0000-00-00T00:00:00Z'`` placeholder used when the field is missing.
    """
    parts = re.split(r'[TZ-]', published_on)
    total_seconds = 0
    if published_on != '0000-00-00T00:00:00Z':
        # Timestamps may or may not carry fractional seconds; accept both.
        try:
            parsed = datetime.datetime.strptime(published_on, "%Y-%m-%dT%H:%M:%S.%fZ")
        except ValueError:
            parsed = datetime.datetime.strptime(published_on, "%Y-%m-%dT%H:%M:%SZ")
        total_seconds = int((parsed - datetime.datetime(1970, 1, 1)).total_seconds())
    return parts, total_seconds


def _duration_fields(duration):
    """Convert an ISO-8601 duration (``PT#H#M#S`` or ``P#DT#H#M#S``) to ``(timedelta, seconds)``.

    Returns ``(None, None)`` when the value is missing or not in that format,
    so one odd duration does not abort the whole page.
    """
    if not duration:
        return None, None
    match = re.fullmatch(r"P(?:(\d+)D)?(?:T(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)S)?)?", duration)
    if match is None:
        return None, None
    days, hours, minutes, seconds = (int(group or 0) for group in match.groups())
    length = timedelta(days=days, hours=hours, minutes=minutes, seconds=seconds)
    return length, days * 86400 + hours * 3600 + minutes * 60 + seconds


def _video_detail_fields(video):
    """Flatten one ``videos().list`` item into the ``video*`` columns."""
    statistics = video.get('statistics', {})
    snippet = video.get('snippet', {})
    content_details = video.get('contentDetails', {})
    status = video.get('status', {})
    paid_placement = video.get('paidProductPlacementDetails', {})
    recording_details = video.get('recordingDetails', {})
    location = recording_details.get('location', {})
    duration, duration_seconds = _duration_fields(content_details.get('duration', None))

    return {
        # Statistics
        'videoUniqueId': video['id'],
        'videoViewCount': statistics.get('viewCount', 0),
        'videoLikeCount': statistics.get('likeCount', 0),
        'videoFavoriteCount': statistics.get('favoriteCount', 0),
        'videoCommentCount': statistics.get('commentCount', 0),

        # Snippet
        'videoDescription': snippet.get('description', None),
        'videoTags': snippet.get('tags', []),
        'videoCategoryId': snippet.get('categoryId', None),
        'videoLiveBroadcastContent': snippet.get('liveBroadcastContent', None),
        'videoDefaultLanguage': snippet.get('defaultLanguage', None),
        'videoDefaultAudioLanguage': snippet.get('defaultAudioLanguage', None),

        # Duration
        'videoDuration': duration,
        'videoDurationInSeconds': duration_seconds,

        # Content details
        'videoDimension': content_details.get('dimension', None),
        'videoDefinition': content_details.get('definition', None),
        'videoCaption': content_details.get('caption', None),
        'videoLicensedContent': content_details.get('licensedContent', False),
        'videoProjection': content_details.get('projection', False),

        # Status
        'videoUploadStatus': status.get('uploadStatus', None),
        'videoPrivacyStatus': status.get('privacyStatus', None),
        'videoLicense': status.get('license', None),
        'videoEmbeddable': status.get('embeddable', False),
        'videoPublicStatsViewable': status.get('publicStatsViewable', False),
        'videoMadeForKids': status.get('madeForKids', False),
        # The flag lives in the paidProductPlacementDetails part; the old
        # lookup under `status` is kept only as a fallback.
        'videoHasPaidProductPlacement': paid_placement.get(
            'hasPaidProductPlacement', status.get('hasPaidProductPlacement', False)),

        # Player
        'videoPlayerEmbedHtml': video.get('player', {}).get('embedHtml', None),

        # Recording details and location
        'videoRecordingLocationDescription': recording_details.get('locationDescription', None),
        'videoRecordingDate': recording_details.get('recordingDate', None),
        'videoRecordingLocationLatitude': location.get('latitude', 0),
        'videoRecordingLocationLongitude': location.get('longitude', 0),
        'videoRecordingLocationAltitude': location.get('altitude', 0),

        # Topic details
        'videotopicDetailsUrls': video.get('topicDetails', {}).get('topicCategories', []),
    }


def _channel_fields(item):
    """Flatten one ``channels().list`` item into the ``channel*`` columns."""
    snippet = item.get('snippet', {})
    publishedOn = snippet.get('publishedAt', '0000-00-00T00:00:00Z')
    # Use the channel's own timestamp pieces (not a video's) for the
    # channelPublish* columns.
    publishedAt, total_seconds = _timestamp_fields(publishedOn)

    thumbnails = snippet.get('thumbnails', {})
    related_playlists = item.get('contentDetails', {}).get('relatedPlaylists', {})
    statistics = item.get('statistics', {})
    status = item.get('status', {})
    branding_channel = item.get('brandingSettings', {}).get('channel', {})
    branding_image = item.get('brandingSettings', {}).get('image', {})

    row = {
        'channelIdUnique': item['id'],
        'channelTitleCheck': snippet.get('title', None),
        'channelDescription': snippet.get('description', None),
        'channelCustomUrl': snippet.get('customUrl', None),

        'channelPublishYear': publishedAt[0],   # year
        'channelPublishMonth': publishedAt[1],  # month
        'channelPublishDay': publishedAt[2],    # day
        'channelPublishTime': publishedAt[3],   # hh:mm:ss
        'channelPublishedOn': publishedOn,
        'channelPublishedOnInSeconds': total_seconds,
        'channelCountry': snippet.get('country', None),
    }

    # channelThumbnail{Default,Medium,High}{Url,Width,Height}
    for size in ('default', 'medium', 'high'):
        thumb = thumbnails.get(size, {})
        label = size.capitalize()
        row[f'channelThumbnail{label}Url'] = thumb.get('url', None)
        row[f'channelThumbnail{label}Width'] = thumb.get('width', 0)
        row[f'channelThumbnail{label}Height'] = thumb.get('height', 0)

    row.update({
        'channelPlaylistsLikes': related_playlists.get('likes', 0),
        'channelPlaylistsUploads': related_playlists.get('uploads', None),

        'channelViewCount': statistics.get('viewCount', 0),
        'channelSubscriberCount': statistics.get('subscriberCount', 0),
        'channelHiddenSubscriberCount': statistics.get('hiddenSubscriberCount', 0),
        'channelVideoCount': statistics.get('videoCount', 0),

        'channelTopicCategories': item.get('topicDetails', {}).get('topicCategories', []),
        'channelPrivacyStatus': status.get('privacyStatus', None),
        'channelIsLinked': status.get('isLinked', False),
        'channelLongUploadsStatus': status.get('longUploadsStatus', None),
        'channelMadeForKids': status.get('madeForKids', False),

        'channelKeywords': branding_channel.get('keywords', None),
        'channelAnalyticsAccountId': branding_channel.get('trackingAnalyticsAccountId', None),
        'channelBrandCountry': branding_channel.get('country', None),

        'channelBannerExternalUrl': branding_image.get('bannerExternalUrl', None),
    })
    return row


def VideoDataFrame(response):
    """Turn one search-result page into a video + channel DataFrame.

    For the videos listed in ``response['items']`` this makes one
    ``videos().list`` call and one ``channels().list`` call on the
    notebook-global ``youtube`` client, flattens the results, and left-joins
    the channel columns onto the video rows via ``channelId``.

    Args:
        response: Search response dict with ``items`` and, optionally,
            ``nextPageToken``.

    Returns:
        ``(resultDataFrame, nextPageToken)``. An empty page yields an empty
        DataFrame together with the page token. On any exception the error is
        printed and ``(None, None)`` is returned.
    """
    try:
        nextPageToken = response.get("nextPageToken", None)
        search_items = response.get('items', [])
        if not search_items:
            # Nothing to enrich; skip the two follow-up API calls.
            return pd.DataFrame(), nextPageToken

        # --- Video search block -------------------------------------------
        video_rows = []
        for item in search_items:
            snippet = item.get('snippet', {})
            publishedOn = snippet.get('publishTime', '0000-00-00T00:00:00Z')
            publishTime, total_seconds = _timestamp_fields(publishedOn)
            video_rows.append({
                'channelId': snippet['channelId'],
                'channelName': snippet['channelTitle'],
                'videoId': item['id']['videoId'],
                'videoTitle': snippet['title'],
                'publishYear': publishTime[0],   # year
                'publishMonth': publishTime[1],  # month
                'publishDay': publishTime[2],    # day
                'publishTime': publishTime[3],   # hh:mm:ss
                'publishedOn': publishedOn,
                'publishedOnInSeconds': total_seconds,
            })

        # --- Video block --------------------------------------------------
        video_ids = [row['videoId'] for row in video_rows]
        video_response = youtube.videos().list(
            part='id,statistics,snippet,contentDetails,localizations,status,liveStreamingDetails,paidProductPlacementDetails,player,recordingDetails,topicDetails',
            id=video_ids
        ).execute()

        # Match details by video id rather than by position: the details
        # response can contain fewer items than were requested, which would
        # otherwise shift every following row.
        details_by_id = {video['id']: video for video in video_response.get('items', [])}
        for row in video_rows:
            video = details_by_id.get(row['videoId'])
            if video is not None:
                row.update(_video_detail_fields(video))

        # --- Channel block ------------------------------------------------
        videoDetails = pd.DataFrame(video_rows)
        # dict.fromkeys de-duplicates while keeping first-seen order.
        unique_channel_ids = list(dict.fromkeys(row['channelId'] for row in video_rows))

        channel_response = youtube.channels().list(
            part='id,contentDetails,brandingSettings,contentOwnerDetails,localizations,snippet,statistics,status,topicDetails',
            id=unique_channel_ids
        ).execute()

        channelDetails = pd.DataFrame(
            [_channel_fields(item) for item in channel_response.get('items', [])])
        if channelDetails.empty:
            # Keep the join key present so the merge still returns the videos.
            channelDetails = pd.DataFrame(columns=['channelIdUnique'])

        # --- Result -------------------------------------------------------
        resultDataFrame = pd.merge(videoDetails, channelDetails,
                                   left_on='channelId', right_on='channelIdUnique', how='left')
        return resultDataFrame, nextPageToken
    except Exception as e:
        print(f"Error while processing VideoDataFrame(): {e}")
        return None, None

In [3]:
def VideoDetailsStructuring(max_record_count,kw_list):
    """Fetch up to ``max_record_count`` search results and return them as one DataFrame.

    The request is split into pages of at most 50 records. The first page is
    fetched with ``VideoDetailExtraction`` and later pages with
    ``VideoDetailExtractionNextPageToken``; each page is expanded by
    ``VideoDataFrame``.

    Args:
        max_record_count: Total number of records wanted.
        kw_list: Search query forwarded to the extraction functions.

    Returns:
        A DataFrame with the rows gathered so far. An empty DataFrame is
        returned when nothing is requested, when the first page cannot be
        fetched or processed, or when an unexpected exception occurs.
    """
    try:
        if max_record_count <= 0:
            print("No records requested (max_record_count <= 0), hence returned empty DataFrame.")
            return pd.DataFrame()

        # e.g. 120 -> [50, 50, 20]; 30 -> [30]; 50 -> [50]
        page_size = 50
        full_pages, remainder = divmod(max_record_count, page_size)
        record_fetching_batches = [page_size] * full_pages
        if remainder > 0:
            record_fetching_batches.append(remainder)

        response = VideoDetailExtraction(kw_list, record_fetching_batches[0])
        if response is None:
            print("Failed to fetch initial video details - VideoDetailExtraction() returned None, hence returned empty DataFrame.")
            return pd.DataFrame()

        resultDataFrame, nextPageToken = VideoDataFrame(response)
        if resultDataFrame is None:
            print("Failed to process video data frame - VideoDataFrame() returned None, hence returned empty DataFrame.")
            return pd.DataFrame()

        frames = [resultDataFrame]
        fetched = len(resultDataFrame)

        for batch in record_fetching_batches[1:]:
            # Stop before requesting another page when there is none: a
            # missing page token would otherwise repeat the first page and
            # duplicate its rows.
            if not nextPageToken or fetched >= max_record_count:
                break

            response = VideoDetailExtractionNextPageToken(kw_list, nextPageToken, batch)
            if response is None:
                print("Failed to fetch next page of video details - VideoDetailExtractionNextPageToken() returned None, hence returned till now fetched videoDetails.")
                break

            resultDataFrame_next, nextPageToken = VideoDataFrame(response)
            if resultDataFrame_next is not None:
                frames.append(resultDataFrame_next)
                fetched += len(resultDataFrame_next)

        if len(frames) == 1:
            return resultDataFrame
        # Concatenate once instead of growing the frame inside the loop.
        return pd.concat(frames, ignore_index=True)
    except Exception as e:
        print(f"Error during VideoDetailsStructuring(), hence returned empty DataFrame: {e}")
        return pd.DataFrame()

In [4]:
def RawFile(max_record_count):
    """Fetch the video data and dump it to a timestamped JSON file.

    The file is named ``<YYYY-mm-dd_HH:MM:SS>.json`` using the current time
    in the notebook-global ``ist`` timezone and is written relative to the
    current working directory. The search query comes from the
    notebook-global ``kw_list``.

    Args:
        max_record_count: Forwarded to ``VideoDetailsStructuring``.

    Returns:
        ``True`` when the run completed (whether or not there was data to
        save), ``False`` if an exception was raised (the error is printed).
    """
    try:
        # The name is fixed before fetching, so it reflects the start time.
        stamp = datetime.datetime.now(ist).strftime("%Y-%m-%d_%H:%M:%S")
        filename = stamp + ".json"

        dataframe = VideoDetailsStructuring(max_record_count, kw_list)
        if dataframe.empty:
            print("No data to save since empty DataFrame returned.")
            return True

        # One JSON object per row, pretty-printed.
        dataframe.to_json(filename, orient="records", indent=4)
        print(f"DataFrame saved as {filename}")
        return True
    except Exception as e:
        print(f"Error during raw file creation: {e}")
        return False

In [5]:
def PushToGithub():
    """Copy the newest JSON file from /kaggle/working into the repo clone and push it.

    Steps:
      1. Pick the ``.json`` file in ``/kaggle/working`` with the greatest
         ``os.path.getctime``.
      2. Pull the existing clone at
         ``/kaggle/working/YouTubeFoodChannelAnalysis`` or clone it from the
         notebook-global ``repo_url``.
      3. Copy the file into ``Source/Daily`` inside the clone, commit it with
         a timestamped message (notebook-global ``ist`` timezone) and push to
         ``origin``.

    Returns:
        ``None``. Every failure is reported with ``print`` instead of being
        raised.
    """
    working_dir = '/kaggle/working'
    try:
        # Find the most recent .json file. Listing happens inside the try so
        # a missing directory is reported instead of crashing the cell.
        json_files = [name for name in os.listdir(working_dir) if name.endswith(".json")]
        if not json_files:
            raise ValueError("No JSON files found!")
        # Resolve each name against working_dir; bare names would be looked
        # up relative to the current directory.
        LatestFiles = max(
            json_files,
            key=lambda name: os.path.getctime(os.path.join(working_dir, name)),
        )
    except (ValueError, OSError) as e:
        print(f"An error occurred at fetching recent .json file: {e}")
        return

    kaggle_repo_url = '/kaggle/working/YouTubeFoodChannelAnalysis'
    destination_path = '/kaggle/working/YouTubeFoodChannelAnalysis/Source/Daily'

    print(LatestFiles)
    try:
        if os.path.exists(kaggle_repo_url):
            print("Already cloned and the repo file exist")
            repo = git.Repo(kaggle_repo_url)
            repo.remote(name='origin').pull()
            print("successfully pulled the git repo before push")
        else:
            # repo_url is a notebook global
            repo = git.Repo.clone_from(repo_url, kaggle_repo_url)
            print("successfully cloned the git repo")

        os.makedirs(destination_path, exist_ok=True)
        copied_file = f'{destination_path}/{LatestFiles}'
        shutil.copyfile(f'/kaggle/working/{LatestFiles}', copied_file)

        repo.index.add([copied_file])
        timestamp = datetime.datetime.now(ist).strftime("%Y-%m-%d_%H:%M:%S")
        repo.index.commit(f"{timestamp} Added files from Kaggle notebook, {LatestFiles}")

        push_results = repo.remote(name="origin").push()
        # push() reports failures through its return value rather than by
        # raising, so inspect it before claiming success.
        raise_if_error = getattr(push_results, "raise_if_error", None)
        if callable(raise_if_error):
            raise_if_error()
        failed = [info for info in push_results if info.flags & info.ERROR]
        if failed:
            reasons = "; ".join(str(getattr(info, "summary", "")).strip() for info in failed)
            raise RuntimeError(f"push was rejected: {reasons}")
        print("Output files successfully pushed to GitHub!")

    except Exception as e:
        print(f"An error occurred at git automation code: {e}")

In [6]:
def main(max_record_count):
    """Create the raw JSON file and, if that succeeded, push it to GitHub.

    Args:
        max_record_count: Forwarded to ``RawFile``.
    """
    # RawFile returns False only when it raised; pushing in that case would
    # just re-commit whichever older JSON file happens to be newest.
    if RawFile(max_record_count):
        PushToGithub()
    else:
        print("Skipping GitHub push because raw file creation failed.")

In [7]:
# Entry cell: imports, secrets, API client and run parameters.
# Everything assigned here becomes a notebook global that the functions in
# the cells above look up at call time (re, datetime, timedelta, pd, os, git,
# Repo, shutil, youtube, ist, kw_list, repo_url), so those functions only
# work after this cell has started running.
if __name__=="__main__":
    from googleapiclient.discovery import build
    # NOTE(review): JSON, display and relativedelta are not referenced by any
    # cell visible here — confirm before removing.
    from IPython.display import JSON, display
    import re
    import datetime
    from dateutil.relativedelta import relativedelta
    import pandas as pd
    import os
    from kaggle_secrets import UserSecretsClient
    import git
    from git import Repo
    import shutil
    from pytz import timezone
    from datetime import timedelta

    # UserSecretsClient is imported a second time here (already imported above).
    from kaggle_secrets import UserSecretsClient
    # Credentials are read from Kaggle secrets rather than hard-coded.
    user_secrets = UserSecretsClient()
    secret_value_0 = user_secrets.get_secret("sourceApiKey")
    secret_value_1 = user_secrets.get_secret("sourceRepoUrl")

    api_key = secret_value_0
    repo_url = secret_value_1  # read by PushToGithub when cloning
    api_service_name = "youtube"
    api_version = "v3"
    # Client used by the extraction functions and by VideoDataFrame.
    youtube = build(api_service_name, api_version, developerKey=api_key)

    ist = timezone('Asia/Kolkata')  # timezone for file names and commit messages
    # NOTE(review): 3000 records means up to 60 search pages of 50 — confirm
    # this fits the API quota and the number of results the search returns.
    max_record_count = 3000
    kw_list =  "devops"  # search query; read as a global by RawFile
    main(max_record_count)

DataFrame saved as 2025-01-22_01:33:47.json
2025-01-22_01:33:47.json
successfully cloned the git repo
Output files successfully pushed to GitHub!
