In [1]:
import datetime
from pytz import timezone
import pandas as pd


from kaggle_secrets import UserSecretsClient

user_secrets = UserSecretsClient()
secret_value_0 = user_secrets.get_secret("api_key")
secret_value_1 = user_secrets.get_secret("repo_url_youtube_analysis")

# Assigning secrets to variables
api_key = secret_value_0
repo_url = secret_value_1

# Initialize an empty DataFrame with required columns
FileExecution = pd.DataFrame(columns=['ScriptFile', 'StartTime', 'EndTime', 'TimeTaken', 'Date'])

# SourceDaily

In [2]:
# Recording the start time of execution
start_time = datetime.datetime.now()
# Code block for which execution time need to measure
print("Execution started...")

Execution started...


In [3]:
def VideoDetailExtraction(kw_list, maxResults=50):
    """
    Fetches a list of video details from YouTube based on the given keyword(s) for the initial batch.

    Args:
        kw_list (str): The keyword(s) to search for.
        maxResults (int, optional): The maximum number of results to fetch in this request (default is 50).

    Returns:
        dict: The API response containing video details. Returns None if an error occurs.
    """
    try:
        # Prepare the API request to fetch video details
        request = youtube.search().list(
            part='snippet',            # Fetch metadata such as title, description, and thumbnails
            order='viewCount',         # Order results by view count
            q=kw_list,                 # Search query
            relevanceLanguage='en',    # Limit results to English-relevant videos
            type='video',              # Restrict results to videos only
            # videoCategoryId=26,      # Optional: Filter by specific category (currently commented)
            # regionCode='IN',         # Optional: Restrict to a specific region (currently commented)
            maxResults=maxResults,     # Number of results to fetch (up to 50 per API limits)
            videoCaption='closedCaption'  # Restrict results to videos with closed captions
        )

        # Execute the API request
        response = request.execute()
        return response

    except Exception as e:
        # Log any errors encountered during the API call
        print(f"Error during VideoDetailExtraction(): {e}")
        return None


def VideoDetailExtractionNextPageToken(kw_list, nextPageToken, maxResults=50):
    """
    Fetches the next page of video details from YouTube using a continuation token.

    Args:
        kw_list (str): The keyword(s) to search for.
        nextPageToken (str): The token for fetching the next page of results.
        maxResults (int, optional): The maximum number of results to fetch in this request (default is 50).

    Returns:
        dict: The API response containing video details for the next page. Returns None if an error occurs.
    """
    try:
        # Prepare the API request to fetch the next page of video details
        request = youtube.search().list(
            part='snippet',            # Fetch metadata such as title, description, and thumbnails
            order='viewCount',         # Order results by view count
            q=kw_list,                 # Search query
            relevanceLanguage='en',    # Limit results to English-relevant videos
            type='video',              # Restrict results to videos only
            # videoCategoryId=26,      # Optional: Filter by specific category (currently commented)
            # regionCode='IN',         # Optional: Restrict to a specific region (currently commented)
            maxResults=maxResults,     # Number of results to fetch (up to 50 per API limits)
            pageToken=nextPageToken,   # Token for fetching the next page
            videoCaption='closedCaption'  # Restrict results to videos with closed captions
        )

        # Execute the API request
        response = request.execute()
        return response

    except Exception as e:
        # Log any errors encountered during the API call
        print(f"Error during VideoDetailExtractionNextPageToken(): {e}")
        return None


def VideoDataFrame(response):
    """
    Processes video and channel details from the YouTube API response, structures the data into DataFrames,
    and merges them to create a comprehensive dataset.

    Args:
        response (dict): The response object returned by the YouTube API containing video details.

    Returns:
        tuple:
            - pd.DataFrame: A DataFrame containing merged video and channel details.
            - str or None: The next page token if available, otherwise None.
    """
    try:
        # Initialize lists to store video and channel details
        videoDetails = []
        videoIds = []
        channelIds = []
        channelDetails = []
        
        '''
        Video Search Block: Extract basic video details from the response.
        '''
        for i in range(len(response['items'])):
            # Extract publication time and convert to components
            publishedOn = response['items'][i].get('snippet', '0000-00-00T00:00:00Z').get('publishTime', '0000-00-00T00:00:00Z')
            publishTime = re.split(r'[TZ-]', publishedOn)
            total_seconds = 0
            if publishedOn != '0000-00-00T00:00:00Z':
                try:
                    dt = datetime.datetime.strptime(publishedOn, "%Y-%m-%dT%H:%M:%S.%fZ")
                except ValueError:
                    dt = datetime.datetime.strptime(publishedOn, "%Y-%m-%dT%H:%M:%SZ")
                epoch = datetime.datetime(1970, 1, 1)
                total_seconds = int((dt - epoch).total_seconds())
            
            # Append extracted video details
            videoDetails.append({
                'channelId': response['items'][i]['snippet']['channelId'],
                'channelName': response['items'][i]['snippet']['channelTitle'],
                'videoId': response['items'][i]['id']['videoId'],
                'videoTitle': response['items'][i]['snippet']['title'],
                'videoPublishYear': publishTime[0],  # Extracted year
                'videoPublishMonth': publishTime[1],  # Extracted month
                'videoPublishDay': publishTime[2],  # Extracted day
                'videoPublishTime': publishTime[3],  # Extracted time
                'videoPublishedOn': publishedOn,
                'videoPublishedOnInSeconds': total_seconds
            })
            
            # Collect video and channel IDs
            videoIds.append(response['items'][i]['id']['videoId'])
            channelIds.append(response['items'][i]['snippet']['channelId'])
        
        # Extract next page token if available
        nextPageToken = response.get("nextPageToken", None)
        
        '''
        Video Block: Fetch additional details about each video using its ID.
        '''
        try:
            request = youtube.videos().list(
                part='id,statistics,snippet,contentDetails,localizations,status,liveStreamingDetails,paidProductPlacementDetails,player,recordingDetails,topicDetails',
                id=videoIds
            )
            response = request.execute()
        except Exception as e:
            # Log any errors encountered during the API call
            print(f"Error during videos().list(): {e}")
            return None
        
        for i in range(len(response['items'])):
            video = response['items'][i]

            # Video id
            videoDetails[i]['videoUniqueId'] = video.get('id',None)
            
            # Video statistics
            statistics = video.get('statistics', {})
            videoDetails[i]['videoViewCount'] = statistics.get('viewCount', 0)
            videoDetails[i]['videoLikeCount'] = statistics.get('likeCount', 0)
            videoDetails[i]['videoFavoriteCount'] = statistics.get('favoriteCount', 0)
            videoDetails[i]['videoCommentCount'] = statistics.get('commentCount', 0)
            
            # Video snippet details
            snippet = video.get('snippet', {})
            videoDetails[i]['videoDescription'] = snippet.get('description', None)
            videoDetails[i]['videoTags'] = snippet.get('tags', [])
            videoDetails[i]['videoCategoryId'] = snippet.get('categoryId', None)
            videoDetails[i]['videoLiveBroadcastContent'] = snippet.get('liveBroadcastContent', None)
            videoDetails[i]['videoDefaultLanguage'] = snippet.get('defaultLanguage', None)
            videoDetails[i]['videoDefaultAudioLanguage'] = snippet.get('defaultAudioLanguage', None)
            
            # Video duration (convert ISO 8601 to seconds)
            duration = video.get('contentDetails', {}).get('duration', None)
            if duration:
                # Match ISO 8601 duration format
                match = re.match(r"PT(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)S)?", duration)
                
                if match:
                    # Extract hours, minutes, and seconds; fallback to 0 if missing
                    hours = int(match.group(1) or 0)
                    minutes = int(match.group(2) or 0)
                    seconds = int(match.group(3) or 0)
                    
                    # Calculate total duration in seconds
                    total_duration_in_seconds = hours * 3600 + minutes * 60 + seconds
                    
                    # Assign duration and classify content type
                    videoDetails[i]['videoDuration'] = duration
                    videoDetails[i]['videoDurationInSeconds'] = total_duration_in_seconds
                    
                    if total_duration_in_seconds <= 60:
                        videoDetails[i]['videoContentType'] = 'Short'
                    elif total_duration_in_seconds <= 90 and video.get('snippet', {}).get('liveBroadcastContent', '') == 'short':
                        videoDetails[i]['videoContentType'] = 'Short'
                    else:
                        videoDetails[i]['videoContentType'] = 'Video'
                else:
                    # Handle invalid duration format
                    videoDetails[i]['videoDuration'] = None
                    videoDetails[i]['videoDurationInSeconds'] = None
                    videoDetails[i]['videoContentType'] = 'Unknown'
            else:
                # If duration is missing
                videoDetails[i]['videoDuration'] = None
                videoDetails[i]['videoDurationInSeconds'] = None
                videoDetails[i]['videoContentType'] = 'Unknown'
    
            # Additional video details
            content_details = video.get('contentDetails', {})
            videoDetails[i]['videoDimension'] = content_details.get('dimension', None)
            videoDetails[i]['videoDefinition'] = content_details.get('definition', None)
            videoDetails[i]['videoCaption'] = content_details.get('caption', None)
            videoDetails[i]['videoLicensedContent'] = content_details.get('licensedContent', False)
            videoDetails[i]['videoProjection'] = content_details.get('projection', False)
        
        '''
        Channel Block: Fetch details for channels associated with the videos.
        '''
        videoDetails = pd.DataFrame(videoDetails)
        Unique_ChannelIds = list(set(videoDetails['channelId']))
        try:
            request = youtube.channels().list(
                part='id,contentDetails,brandingSettings,contentOwnerDetails,localizations,snippet,statistics,status,topicDetails',
                id=Unique_ChannelIds
            )
            response = request.execute()
        except Exception as e:
            # Log any errors encountered during the API call
            print(f"Error during channels().list(): {e}")
            return None
        
        for i in range(len(response['items'])):
            item = response['items'][i]
            snippet = item.get('snippet', {})
            publishedOn = snippet.get('publishedAt', '0000-00-00T00:00:00Z')
            publishedAt = re.split(r'[TZ-]', publishedOn)
            total_seconds = 0
            if publishedOn != '0000-00-00T00:00:00Z':
                try:
                    dt = datetime.datetime.strptime(publishedOn, "%Y-%m-%dT%H:%M:%S.%fZ")
                except ValueError:
                    dt = datetime.datetime.strptime(publishedOn, "%Y-%m-%dT%H:%M:%SZ")
                epoch = datetime.datetime(1970, 1, 1)
                total_seconds = int((dt - epoch).total_seconds())
            
            # Extract channel details
            channelDetails.append({
                'channelIdUnique': item['id'],
                'channelTitleCheck': snippet.get('title', None),
                'channelDescription': snippet.get('description', None),
                'channelCustomUrl': snippet.get('customUrl', None),
                'channelPublishYear': publishedAt[0],
                'channelPublishMonth': publishedAt[1],
                'channelPublishDay': publishedAt[2],
                'channelPublishTime': publishedAt[3],
                'channelPublishedOn': publishedOn,
                'channelPublishedOnInSeconds': total_seconds,
                'channelCountry': snippet.get('country', None),
                'channelViewCount': item.get('statistics', {}).get('viewCount', 0),
                'channelSubscriberCount': item.get('statistics', {}).get('subscriberCount', 0),
                'channelVideoCount': item.get('statistics', {}).get('videoCount', 0),
            })
        
        # Convert channel details to DataFrame
        channelDetails = pd.DataFrame(channelDetails)
        
        '''
        Result: Merge video and channel details into a single DataFrame.
        '''
        resultDataFrame = pd.merge(videoDetails, channelDetails, left_on='channelId', right_on='channelIdUnique', how='left')
        return resultDataFrame, nextPageToken
    
    except Exception as e:
        print(f"Error while processing VideoDataFrame(): {e}")
        return None, None

def VideoDetailsStructuring(max_record_count, kw_list):
    """
    Fetches and structures video details into a DataFrame, handling pagination if necessary.

    Args:
        max_record_count (int): The maximum number of video records to fetch.
        kw_list (str): The keyword(s) to use for fetching video details.

    Returns:
        pd.DataFrame: A Pandas DataFrame containing video details. Returns an empty DataFrame on failure.
    """
    try:
        # Initialize an empty DataFrame to store results
        resultDataFrame = pd.DataFrame()

        # Initialize the nextPageToken for pagination
        nextPageToken = None

        # Define the batch sizes for video fetching
        record_fetching_batches = [50]  # Default batch size for YouTube API requests

        # Adjust the batch sizes based on the max_record_count
        if max_record_count > 50:
            quotient = max_record_count // 50  # Number of full batches
            remainder = [max_record_count % 50]  # Remaining records in the last batch
            record_fetching_batches = record_fetching_batches * quotient
            if remainder[0] > 0:
                record_fetching_batches.extend(remainder)  # Add the remainder as a batch
        else:
            record_fetching_batches = [max_record_count]  # Single batch if max_record_count <= 50

        # Case 1: Only one batch needed
        if len(record_fetching_batches) == 1:
            # Fetch video details for the single batch
            response = VideoDetailExtraction(kw_list, record_fetching_batches[0])
            if response is None:
                print("Failed to fetch initial video details - VideoDetailExtraction() returned None, hence returned empty DataFrame.")
                return pd.DataFrame()

            # Process the response into a DataFrame and get the nextPageToken
            resultDataFrame, nextPageToken = VideoDataFrame(response)
            nextPageToken = None  # Reset the token as no further pages are needed
            if resultDataFrame is None:
                print("Failed to process video data frame - VideoDataFrame() returned None, hence returned empty DataFrame.")
                return pd.DataFrame()
            return resultDataFrame

        # Case 2: Multiple batches needed
        elif len(record_fetching_batches) > 1:
            # Fetch initial batch of video details
            response = VideoDetailExtraction(kw_list, record_fetching_batches[0])
            if response is None:
                print("Failed to fetch initial video details - VideoDetailExtraction() returned None, hence returned empty DataFrame.")
                return pd.DataFrame()

            # Process the response into a DataFrame and get the nextPageToken
            resultDataFrame, nextPageToken = VideoDataFrame(response)
            if resultDataFrame is None:
                print("Failed to process video data frame - VideoDataFrame() returned None, hence returned empty DataFrame.")
                return pd.DataFrame()

            # Loop through subsequent batches
            for batch in record_fetching_batches[1:]:
                # Fetch details for the next batch using nextPageToken
                response = VideoDetailExtractionNextPageToken(kw_list, nextPageToken, batch)
                if response is None:
                    print("Failed to fetch next page of video details - VideoDetailExtractionNextPageToken() returned None, hence returned till now fetched videoDetails.")
                    break

                # Process the response into a DataFrame
                resultDataFrame_next, nextPageToken = VideoDataFrame(response)
                if resultDataFrame_next is not None:
                    # Concatenate the new DataFrame to the result DataFrame
                    resultDataFrame = pd.concat([resultDataFrame, resultDataFrame_next], ignore_index=True)

                # Break the loop if we've reached the max record count or no more pages are available
                if len(resultDataFrame) >= max_record_count or not nextPageToken:
                    break

        return resultDataFrame  # Return the final result DataFrame
    except Exception as e:
        print(f"Error during VideoDetailsStructuring(), hence returned empty DataFrame: {e}")
        return pd.DataFrame()


def RawFile(max_record_count):
    """
    Processes video details, structures the data, and saves it as a JSON file.

    Args:
        max_record_count (int): The maximum number of records to process.

    Returns:
        bool: True if the file is successfully created and saved, False otherwise.
    """
    try:
        # Call the function to structure video details and return a DataFrame.
        # `kw_list` is assumed to be a global variable containing the search keyword(s).
        dataframe = VideoDetailsStructuring(max_record_count, kw_list)
        
        # Check if the DataFrame is not empty before saving.
        if not dataframe.empty:
            # Count the number of records (rows) in the DataFrame
            record_count = len(dataframe)
            
            # Generate a timestamp for the file name using the current time in IST (Indian Standard Time).
            timestamp = datetime.datetime.now(ist).strftime("%Y-%m-%d_%H_%M_%S")
        
            # Create a filename using the generated timestamp to ensure uniqueness with number of records.
            filename = f"S_{timestamp}_{record_count}_records.json"
            
            # Save the DataFrame to a JSON file with readable formatting.
            dataframe.to_json(filename, orient="records", indent=4)
            print(f"DataFrame saved as {filename}")
        else:
            # Log a message if the DataFrame is empty.
            print("No data to save since empty DataFrame returned.")
        
        # Return True indicating the process was successful.
        return True
    except Exception as e:
        # Handle and log any errors that occur during the process.
        print(f"Error during raw file creation: {e}")
        
        # Return False indicating the process failed.
        return False

def PushToGithub():
    """
    Automates the process of identifying the latest .json file, copying it 
    to a GitHub repository, and pushing the changes to the same branch.

    Args:
        None

    Returns:
        bool: True if the process completes successfully and the file is pushed to GitHub, 
              False if an error occurs during any step.
    """
    # List all files in the working directory
    output_files = os.listdir('/kaggle/working')
    
    try:
        # Filter and find the most recent .json file
        json_files = [file for file in output_files if file.startswith("S_") and file.endswith("_records.json")]
        if json_files:
            LatestFiles = max(json_files, key=os.path.getctime)  # Get the latest file based on creation time
        else:
            raise ValueError("No JSON files found!")  # Raise an error if no JSON files are found
    except ValueError as e:
        print(f"An error occurred at fetching recent .json file: {e}")
        return False  # Exit the function if there's an error in fetching JSON files
    
    # Define repository and destination paths
    kaggle_repo_url = '/kaggle/working/YouTubeFoodChannelAnalysis'
    destination_path = '/kaggle/working/YouTubeFoodChannelAnalysis/Source/Daily'

    
    print(LatestFiles)  # Print the latest JSON file name
    try:
        # Check if the repository already exists
        if os.path.exists(kaggle_repo_url):
            print("Already cloned and the repo file exists")
            repo = git.Repo(kaggle_repo_url)  # Access the existing repository
            origin = repo.remote(name='origin')  # Get the remote repository
            origin.pull()  # Pull the latest changes from the repository
            print("Successfully pulled the git repo before push")
        else:
            # Clone the repository if it doesn't exist
            repo = git.Repo.clone_from(repo_url, kaggle_repo_url)
            print("Successfully cloned the git repo")
        
        # Check if the destination path exists, and copy the latest file
        if os.path.exists(destination_path):
            shutil.copyfile(f'/kaggle/working/{LatestFiles}', f'{destination_path}/{LatestFiles}')
        else:
            # Create the destination directory if it doesn't exist
            os.makedirs(destination_path)
            shutil.copyfile(f'/kaggle/working/{LatestFiles}', f'{destination_path}/{LatestFiles}')
        
        # Initialize the repository for git operations
        repo = Repo(kaggle_repo_url)
        
        # Add the copied file to the staging area
        repo.index.add([f"{destination_path}/{LatestFiles}"])
        
        timestamp = datetime.datetime.now(ist).strftime("%Y-%m-%d_%H:%M:%S")
        # Commit the changes with a message including the timestamp and file name
        repo.index.commit(f"{timestamp} Added files from Kaggle notebook, {LatestFiles}")
        
        # Push the changes to the remote repository
        origin = repo.remote(name="origin")
        push_result = origin.push()
        if push_result:
            print("Output files successfully pushed to GitHub!")
        else:
            print("Output files pushed to GitHub failed:(")
        return True  # Return True if the process completes successfully
    
    except Exception as e:
        # Handle any errors that occur during the git automation process
        print(f"An error occurred at git automation code: {e}")
        return False  # Return False if an error occurs
        
def main(max_record_count):
    """
    Main function to orchestrate the execution of raw data extraction and pushing data to GitHub.

    Args:
        max_record_count (int): The maximum number of records to process.
    """
    # Call the RawFile function to process and extract raw data.
    # This function likely handles fetching data, processing it, and storing it in a file.
    RawFile(max_record_count)
    
    # Call the PushToGithub function to push the processed data to a GitHub repository.
    # This function likely handles staging, committing, and pushing the file to the repository.
    PushToGithub()

# Entry point of the script
if __name__ == "__main__":
    # Importing necessary libraries
    from googleapiclient.discovery import build  # For interacting with YouTube API
    from IPython.display import JSON, display  # For displaying JSON responses in Jupyter Notebooks
    import re  # For regular expressions
    import datetime  # For date and time manipulations
    # from dateutil.relativedelta import relativedelta  # For handling relative date differences
    import pandas as pd  # For data manipulation and analysis
    import os  # For interacting with the operating system
    from kaggle_secrets import UserSecretsClient  # For securely managing API keys in Kaggle
    import git  # For Git-related operations
    from git import Repo  # For working with repositories
    import shutil  # For file and directory operations
    from pytz import timezone  # For handling time zones
    from datetime import timedelta  # For handling time differences
    
    # Fetching secrets from Kaggle's secure environment
    from kaggle_secrets import UserSecretsClient
    user_secrets = UserSecretsClient()
    secret_value_0 = user_secrets.get_secret("api_key")  # Fetch the YouTube API key
    secret_value_1 = user_secrets.get_secret("repo_url_youtube_analysis")  # Fetch the source repository URL
    
    # Assigning secrets to variables
    api_key = secret_value_0
    repo_url = secret_value_1
    
    # Setting up YouTube API details
    api_service_name = "youtube"
    api_version = "v3"
    youtube = build(api_service_name, api_version, developerKey=api_key)  # Initialize YouTube API client
    
    # Setting the timezone to Indian Standard Time (IST)
    ist = timezone('Asia/Kolkata')
    
    # Maximum number of records to fetch
    max_record_count = 4000
    
    # Keyword list for searching YouTube videos
    kw_list = "devops"
    
    # Call the main function with the maximum record count as an argument
    main(max_record_count)

Error during VideoDetailExtraction(): <HttpError 403 when requesting https://youtube.googleapis.com/youtube/v3/search?part=snippet&order=viewCount&q=devops&relevanceLanguage=en&type=video&maxResults=50&videoCaption=closedCaption&key=AIzaSyCSSL9uXJ7kJlKpfLGFaEQ43rkyBr25zsQ&alt=json returned "The request cannot be completed because you have exceeded your <a href="/youtube/v3/getting-started#quota">quota</a>.". Details: "[{'message': 'The request cannot be completed because you have exceeded your <a href="/youtube/v3/getting-started#quota">quota</a>.', 'domain': 'youtube.quota', 'reason': 'quotaExceeded'}]">
Failed to fetch initial video details - VideoDetailExtraction() returned None, hence returned empty DataFrame.
No data to save since empty DataFrame returned.
An error occurred at fetching recent .json file: No JSON files found!


In [4]:
print("Execution ended.")

# Record the end time of execution
end_time = datetime.datetime.now()

# Calculate the time taken for execution
time_taken = end_time - start_time

# Get the current time in the 'Asia/Kolkata' timezone
current_time = datetime.datetime.now(timezone('Asia/Kolkata')).strftime("%Y-%m-%d_%H_%M_%S")

# Create a new row as a DataFrame
new_row = pd.DataFrame([{
    'ScriptFile': 'sourcedaily.ipynb',
    'StartTime': start_time.strftime('%Y-%m-%d %H:%M:%S'),
    'EndTime': end_time.strftime('%Y-%m-%d %H:%M:%S'),
    'TimeTaken': str(time_taken),
    'Date': current_time
}])

# Append the new row using pd.concat()
FileExecution = pd.concat([FileExecution, new_row], ignore_index=True)

# Display the DataFrame
# display(FileExecution)

# Save the DataFrame to a JSON file
FileExecution.to_json(f"{current_time}_ScriptFileExecution.json", orient="records", indent=4)
# print(FileExecution)

Execution ended.


# DataCleaning

In [5]:
# Recording the start time of execution
start_time = datetime.datetime.now()
# Code block for which execution time need to measure
print("Execution started...")

Execution started...


In [6]:
def Source_File_Extraction(repo_url, kaggle_repo_url, source_path):
    """
    This function checks if a specified Git repository already exists in the local system.
    If the repository exists, it pulls the latest changes from the remote repository.
    If the repository doesn't exist, it clones the repository from the provided URL.
    
    After ensuring the repository is up-to-date, it searches for a JSON file that starts with "S_" 
    and ends with "records.json" in the specified source directory, loads the file using pandas, 
    and returns the data as a DataFrame.

    Args:
    - repo_url (str): The URL of the Git repository to clone if not already present.
    - kaggle_repo_url (str): The local path where the repository is stored or will be cloned to.
    - source_path (str): The directory where the JSON file is stored.

    Returns:
    - pd.DataFrame: The data from the JSON file as a pandas DataFrame.
    """
    if os.path.exists(kaggle_repo_url):
        print("Already cloned and the repo file exists")
        repo = git.Repo(kaggle_repo_url)  # Access the existing repository
        origin = repo.remote(name='origin')  # Get the remote repository
        origin.pull()  # Pull the latest changes from the repository
        print("Successfully pulled the git repo before push")
    else:
        # Clone the repository if it doesn't exist
        repo = git.Repo.clone_from(repo_url, kaggle_repo_url)
        print("Successfully cloned the git repo")
    
    # List all files in the source path and find the relevant JSON file
    output_files = os.listdir(source_path)
    Source_File = max([i for i in output_files if i.startswith("S_") and i.endswith('records.json')])
    
    # Read the found JSON file into a pandas DataFrame
    Source_File = pd.read_json(f'{source_path}/{Source_File}')
    
    return Source_File

def DataCleaning(Target_File):
    """
    Cleans the input DataFrame by performing the following operations:
    1. Drops irrelevant columns.
    2. Removes duplicate rows.
    3. Filters videos based on language (only those with 'videoDefaultAudioLanguage' starting with 'en').
    4. Translates non-ASCII characters in 'channelName' and 'videoTitle' to English.
    5. Removes emojis and decodes HTML entities from 'channelName' and 'videoTitle'.
    6. Removes non-ASCII characters from 'channelName' and 'videoTitle'.
    7. Fills missing values in 'channelCountry' with 'Unknown'.
    8. Returns the cleaned DataFrame.

    Args:
    - Target_File (pd.DataFrame): The DataFrame to clean.

    Returns:
    - pd.DataFrame: The cleaned DataFrame.
    """
    
    # Drop irrelevant columns
    Target_File = Target_File.drop(['videoDescription', 'videoLiveBroadcastContent', 'videoFavoriteCount',
                                    'videoTags', 'videoUniqueId', 'channelIdUnique', 'channelTitleCheck', 'channelDescription'], axis=1)
    
    # Identify and keep all duplicates
    duplicates = Target_File[Target_File.duplicated(keep=False)]  # Selects all duplicates, including the first occurrence
    
    # Remove duplicates
    Target_File = Target_File.drop_duplicates(ignore_index=True)
    
    # Filter for videos with 'videoDefaultAudioLanguage' starting with 'en'
    Target_File_EN = Target_File[Target_File['videoDefaultAudioLanguage'].str.startswith("en", na=False)].reset_index(drop=True)

    # Iterate through each row in 'Target_File_EN' to clean 'channelName' and 'videoTitle'
    for i in range(len(Target_File_EN['channelName'])):
        try:
            # Translate non-ASCII characters in 'channelName' and 'videoTitle' to English
            if not Target_File_EN['channelName'][i].isascii():
                Target_File_EN.loc[i, 'channelName'] = GoogleTranslator(source='auto', target='en').translate(Target_File_EN['channelName'][i])
            if not Target_File_EN['videoTitle'][i].isascii():
                Target_File_EN.loc[i, 'videoTitle'] = GoogleTranslator(source='auto', target='en').translate(Target_File_EN['videoTitle'][i])

            # Remove emojis
            Target_File_EN.loc[i, 'channelName'] = emoji.replace_emoji(Target_File_EN['channelName'][i], replace='')
            Target_File_EN.loc[i, 'videoTitle'] = emoji.replace_emoji(Target_File_EN['videoTitle'][i], replace='')

            # Decode HTML entities like &amp; and &#39;
            Target_File_EN.loc[i, 'channelName'] = html.unescape(Target_File_EN['channelName'][i])
            Target_File_EN.loc[i, 'videoTitle'] = html.unescape(Target_File_EN['videoTitle'][i])

            # Remove non-ASCII characters from 'channelName' and 'videoTitle'
            Target_File_EN.loc[i, 'channelName'] = re.sub(r'[^\x00-\x7F]+', '', Target_File_EN['channelName'][i])
            Target_File_EN.loc[i, 'videoTitle'] = re.sub(r'[^\x00-\x7F]+', '', Target_File_EN['videoTitle'][i])

        except Exception as e:
            print(e)
    
    # Remove duplicates after the transformations
    Target_File_EN = Target_File_EN.drop_duplicates(ignore_index=True)
    
    # Drop 'videoDefaultLanguage' column as it is no longer needed
    Target_File_EN = Target_File_EN.drop(['videoDefaultLanguage'], axis=1)
    
    # Fill missing values in 'channelCountry' with 'Unknown'
    Target_File_EN['channelCountry'] = Target_File_EN['channelCountry'].fillna('Unknown')
    
    return Target_File_EN

def GitHubPush(Target_File_EN):
    """
    This function handles the process of saving a cleaned and processed DataFrame as a JSON file, 
    pushing it to a GitHub repository. It ensures that the file is properly named with a timestamp 
    and number of records, creates necessary directories, and commits the changes to the repository.
    
    Args:
    - Target_File_EN (pd.DataFrame): The DataFrame that contains the processed data to be saved and pushed.
    
    Returns:
    - None: This function performs file handling and Git operations but does not return anything.
    """

    # Count the number of records in the DataFrame
    record_count = len(Target_File_EN)
    
    # Generate a timestamp for the file name using the current time in IST (Indian Standard Time).
    timestamp = datetime.datetime.now(ist).strftime("%Y-%m-%d_%H_%M_%S")
    
    # Create a filename using the generated timestamp and number of records to ensure uniqueness.
    filename = f"DC_{timestamp}_{record_count}_records.json"
    
    # Save the DataFrame to a JSON file in a readable format (with indentation)
    Target_File_EN.to_json(filename, orient="records", indent=4)
    print(f"DataFrame saved as {filename}")
    
    # Check if the destination directory exists
    if not os.path.exists(destination_path):
        # If the directory does not exist, create it
        os.makedirs(destination_path)
        print('Created the destination directory, DataCleaning/Daily')
        # Copy the saved file into the newly created directory
        shutil.copyfile(f'/kaggle/working/{filename}', f'{destination_path}/{filename}')
    else:
        print('Destination directory already exists')
        # Copy the file to the existing directory
        shutil.copyfile(f'/kaggle/working/{filename}', f'{destination_path}/{filename}')
    
    # Initialize the repository for git operations using the local GitHub repository URL
    repo = Repo(kaggle_repo_url)
    
    # Add the copied file to the staging area for git commit
    repo.index.add([f"{destination_path}/{filename}"])
    
    # Create a timestamp for the commit message
    timestamp = datetime.datetime.now(ist).strftime("%Y-%m-%d_%H:%M:%S")
    # Commit the changes with a message that includes the timestamp and the filename
    repo.index.commit(f"{timestamp} Added files from Kaggle notebook, {filename}")
    
    # Push the changes to the remote repository
    origin = repo.remote(name="origin")
    push_result = origin.push()
    
    # Check if the push was successful and print the result
    if push_result:
        print("Push successful.")
    else:
        print("Push failed.")

def main():
    """
    The main function orchestrates the entire data pipeline by:
    1. Extracting the source data from the given repository URL.
    2. Cleaning the extracted data using the DataCleaning function.
    3. Pushing the final cleaned file to a GitHub repository.
    
    This function executes the steps in sequence to process and upload data.
    
    Args:
    - None: This function does not accept any arguments. It uses predefined repository URLs and paths.
    
    Returns:
    - None: This function does not return anything but performs data processing and Git operations.
    """
    
    # Step 1: Extract the source file from the repository based on the provided URL and path.
    Source_File = Source_File_Extraction(repo_url, kaggle_repo_url, source_path)
    
    # Step 2: Clean the extracted data using the DataCleaning function.
    Cleaned_File = DataCleaning(Source_File)
    
    # Optional: Uncomment to display the cleaned file sorted by video duration.
    # display(Cleaned_File.sort_values(by='videoDurationInSeconds', ascending=True))
    
    # Step 3: Push the processed and feature-engineered data to GitHub using GitHubPush function.
    GitHubPush(Cleaned_File)

if __name__ == "__main__":
    """
    This script is the entry point for the data cleaning pipeline.
    It performs the following tasks:
    1. Imports necessary libraries for data processing, file handling, and Git operations.
    2. Retrieves user secrets for repository URL.
    3. Sets up paths for different directories (source, destination, etc.).
    4. Configures pandas to display all columns and rows without truncation.
    5. Calls the main function to execute the pipeline.

    The script is designed to be executed as the main module in a Python environment.
    It ensures that all necessary operations are performed, including fetching source data, 
    cleaning, and pushing the final data to a GitHub repository.
    """

    # Import necessary libraries
    import os  
    import git  # Git library for interacting with repositories
    from git import Repo  # GitHub repository interaction
    import time  # For time-related operations
    import datetime  # For working with date and time
    from pytz import timezone  # For timezone management
    import pytz  # Timezone handling
    import pandas as pd  # For data manipulation and analysis
    import deep_translator  # For translation services
    from deep_translator import GoogleTranslator  # Google Translate API integration
    import shutil  # For file operations like copying or removing
    import emoji  # For handling emojis in the data
    import re  # For regular expression operations
    import html  # For HTML parsing and escaping
    from kaggle_secrets import UserSecretsClient  # For accessing Kaggle's secret management system
    
    # Retrieve secret value for repository URL from Kaggle secrets storage
    user_secrets = UserSecretsClient()
    secret_value_0 = user_secrets.get_secret("repo_url_youtube_analysis")
    repo_url = secret_value_0  # URL for the GitHub repository used in this pipeline
    
    # Set timezone to Indian Standard Time (IST)
    ist = timezone('Asia/Kolkata')
    
    # Define paths for different directories
    kaggle_repo_url = '/kaggle/working/YouTubeFoodChannelAnalysis'  # Path to the working repository on Kaggle
    destination_path = '/kaggle/working/YouTubeFoodChannelAnalysis/DataCleaning/Daily'  # Path to store cleaned data
    source_path = '/kaggle/working/YouTubeFoodChannelAnalysis/Source/Daily'  # Path to source raw data
    
    # Configure pandas to display all columns and rows without truncation for easier debugging
    pd.set_option("display.max_columns", None)  # Prevent truncating columns
    pd.set_option("display.max_rows", None)  # Prevent truncating rows
    
    # Call the main function to execute the data pipeline
    main()

Successfully cloned the git repo
DataFrame saved as DC_2025-02-07_06_36_57_406_records.json
Destination directory already exists
Push successful.


In [7]:
print("Execution ended.")

# Record the end time of execution
end_time = datetime.datetime.now()

# Calculate the time taken for execution
time_taken = end_time - start_time

# Get the current time in the 'Asia/Kolkata' timezone
current_time = datetime.datetime.now(timezone('Asia/Kolkata')).strftime("%Y-%m-%d_%H_%M_%S")

# Create a new row as a DataFrame
new_row = pd.DataFrame([{
    'ScriptFile': 'dataCleaning.ipynb',
    'StartTime': start_time.strftime('%Y-%m-%d %H:%M:%S'),
    'EndTime': end_time.strftime('%Y-%m-%d %H:%M:%S'),
    'TimeTaken': str(time_taken),
    'Date': current_time
}])

# Append the new row using pd.concat()
FileExecution = pd.concat([FileExecution, new_row], ignore_index=True)

# Display the DataFrame
# display(FileExecution)

# Save the DataFrame to a JSON file
FileExecution.to_json(f"{current_time}_ScriptFileExecution.json", orient="records", indent=4)
# print(FileExecution)

Execution ended.


# Feature Engineering

In [8]:
# Recording the start time of execution
start_time = datetime.datetime.now()
# Code block for which execution time need to measure
print("Execution started...")

Execution started...


In [9]:
def DataCleaning_File_Extraction(repo_url, kaggle_repo_url, DataCleaning_path):
    """
    Extracts and processes a data cleaning file from a specified GitHub repository.

    This function checks if the repository is already cloned locally. If found, it pulls the latest changes;
    otherwise, it clones the repository. It then searches for a JSON file in the specified directory that starts
    with 'DC_' and ends with 'records.json', reads it into a pandas DataFrame, and returns the DataFrame.

    Parameters:
    repo_url (str): The URL of the GitHub repository to clone or update.
    kaggle_repo_url (str): The local path where the repository is cloned.
    DataCleaning_path (str): The directory path where the data cleaning files are stored.

    Returns:
    pd.DataFrame: A pandas DataFrame containing the extracted data from the JSON file.
    """

    # Check if the repository already exists locally
    if os.path.exists(kaggle_repo_url):
        print("Repository already exists locally.")
        repo = git.Repo(kaggle_repo_url)  # Access the existing repository
        origin = repo.remote(name='origin')  # Get the remote repository reference
        origin.pull()  # Pull the latest updates from the remote repository
        print("Successfully pulled the latest changes.")
    else:
        # Clone the repository if it does not exist
        repo = git.Repo.clone_from(repo_url, kaggle_repo_url)
        print("Successfully cloned the repository.")

    # List all files in the specified directory and filter for the relevant JSON file
    output_files = os.listdir(DataCleaning_path)
    DataCleaning_File = max(
        [file for file in output_files if file.startswith("DC_") and file.endswith('records.json')]
    )

    # Read the identified JSON file into a pandas DataFrame
    DataCleaning_File = pd.read_json(os.path.join(DataCleaning_path, DataCleaning_File))

    return DataCleaning_File

def Requirement_File_Extraction(repo_url, kaggle_repo_url, requirement_path):
    """
    Ensures the repository is up-to-date by either pulling the latest changes or cloning it.
    Then, extracts and returns the most recent JSON file starting with "RE_" and ending with 
    "country_details.json" from the specified requirement directory as a pandas DataFrame.

    Args:
    - repo_url (str): Git repository URL to clone if not present.
    - kaggle_repo_url (str): Local directory path for the repository.
    - requirement_path (str): Directory containing the JSON files.

    Returns:
    - pd.DataFrame: Data from the most recent JSON file.
    """
    
    # Check if the repository already exists locally
    if os.path.exists(kaggle_repo_url):
        print("Already cloned and the repo file exists")
        
        # Access the existing repository and pull the latest changes
        repo = git.Repo(kaggle_repo_url)
        origin = repo.remote(name='origin')
        origin.pull()  # Pull the latest changes
        print("Successfully pulled the git repo before push")
    else:
        # Clone the repository if it doesn't exist locally
        repo = git.Repo.clone_from(repo_url, kaggle_repo_url)
        print("Successfully cloned the git repo")
    
    # List all files in the requirement directory
    output_files = os.listdir(requirement_path)
    
    # Find the most recent JSON file that starts with "RE_" and ends with "country_details.json"
    Requirement_File = max([i for i in output_files if i.startswith("RE_") and i.endswith('country_details.json')])
    
    # Read the found JSON file into a pandas DataFrame
    Requirement_File = pd.read_json(f'{requirement_path}/{Requirement_File}')
    
    return Requirement_File

def videoDurationClassification(videoDurationInSeconds):
    """
    Classifies the video duration into categories based on its length in seconds.

    Args:
    - videoDurationInSeconds (int): Duration of the video in seconds.

    Returns:
    - str: A string indicating the classification of the video duration.
    """
    
    # Classifying the video duration into different categories
    if 0 <= videoDurationInSeconds <= 60:
        return "Very Short"  # Video duration between 0 and 60 seconds
    elif 61 <= videoDurationInSeconds <= 120:
        return "Short"  # Video duration between 61 and 120 seconds
    elif 121 <= videoDurationInSeconds <= 300:
        return "Medium"  # Video duration between 121 and 300 seconds (2-5 minutes)
    elif 301 <= videoDurationInSeconds <= 600:
        return "Long"  # Video duration between 301 and 600 seconds (5-10 minutes)
    elif 601 <= videoDurationInSeconds <= 3600:
        return "Very Long"  # Video duration between 601 seconds (10 minutes) and 3600 seconds (1 hour)
    elif 3601 <= videoDurationInSeconds <= 10800:
        return "Extended"  # Video duration between 3601 seconds (1 hour) and 10800 seconds (3 hours)
    elif videoDurationInSeconds > 10800:
        return "Ultra Long"  # Video duration greater than 10800 seconds (3 hours)
    else:
        return "Invalid video duration"  # Invalid value for video duration

def normalize(series):
    """
    Applies Min-Max normalization to a Pandas Series, scaling the values to a range between 0 and 1.

    Args:
    - series (pd.Series): The input data series to normalize.

    Returns:
    - pd.Series: A normalized series with values scaled between 0 and 1.
    """
    
    # Min-Max normalization formula: (x - min) / (max - min)
    return (series - series.min()) / (series.max() - series.min())

def parse_datetime(value):
    """
    Parses a datetime string into a Pandas datetime object based on specific formats.
    
    Handles two datetime formats:
    1. "%Y-%m-%dT%H:%M:%SZ" for ISO 8601 format without fractional seconds.
    2. "%Y-%m-%dT%H:%M:%S.%fZ" for ISO 8601 format with fractional seconds.

    Args:
    - value (str): The input datetime string to be parsed.

    Returns:
    - pd.Timestamp or pd.NaT: A Pandas Timestamp object if the format matches, otherwise pd.NaT.
    """
    
    # Check for the presence of "Z" and determine the format based on whether the string contains a decimal point
    if "Z" in value and "." not in value:
        return pd.to_datetime(value, format="%Y-%m-%dT%H:%M:%SZ")  # Format without fractional seconds
    elif "Z" in value and "." in value:
        return pd.to_datetime(value, format="%Y-%m-%dT%H:%M:%S.%fZ")  # Format with fractional seconds
    else:
        return pd.NaT  # Return Not a Time (NaT) if the format doesn't match

def calculate_channel_growth(Cleaned_File):
    """
    Calculates the growth score for channels and engagement score for videos based on various metrics.
    
    The growth score for each channel is calculated using the normalized values of:
    - View count
    - Subscriber count
    - Video count
    - Channel age (in years)
    
    The engagement score for each video is calculated using:
    - Views per day
    - Like-to-view ratio
    - Comment-to-view ratio

    Args:
    - Cleaned_File (pd.DataFrame): The dataframe containing channel and video data to calculate growth and engagement scores.

    Returns:
    - pd.DataFrame: The cleaned dataframe with the added columns for growth and engagement scores.
    """
    
    # Get the current IST time (Indian Standard Time) to calculate age-based metrics
    utc_timestamp = int(time.time())
    zone = pytz.timezone('Asia/Kolkata')
    current_ist_time = datetime.datetime.fromtimestamp(utc_timestamp, zone).replace(tzinfo=None) 
    
    # Channel Age Calculation (in years)
    channelPublishedOn = Cleaned_File["channelPublishedOn"].apply(parse_datetime)
    Cleaned_File['channelAgeInYears'] = (current_ist_time - channelPublishedOn).dt.total_seconds() / (365 * 24 * 60 * 60)
    
    # Min-Max normalization for channel metrics
    Cleaned_File['channelNormalizedViewCount'] = normalize(Cleaned_File['channelViewCount'])
    Cleaned_File['channelNormalizedSubscriberCount'] = normalize(Cleaned_File['channelSubscriberCount'])
    Cleaned_File['channelNormalizedVideoCount'] = normalize(Cleaned_File['channelVideoCount'])
    Cleaned_File['channelNormalizedChannelAge'] = normalize(Cleaned_File['channelAgeInYears'])
    
    # Define weights for each metric in the growth score calculation
    weight_views = 0.5
    weight_subscribers = 0.3
    weight_videos = 0.2
    
    # Growth Score Calculation for the channel
    Cleaned_File['channelGrowthScore'] = (
        (Cleaned_File['channelNormalizedViewCount'] * weight_views) +
        (Cleaned_File['channelNormalizedSubscriberCount'] * weight_subscribers) +
        (Cleaned_File['channelNormalizedVideoCount'] * weight_videos)
    ) / Cleaned_File['channelNormalizedChannelAge']
    
    # Video Age Calculation (in days)
    videoPublishedOn = Cleaned_File["videoPublishedOn"].apply(parse_datetime)
    Cleaned_File["videoAgeInDays"] = (current_ist_time - videoPublishedOn).dt.total_seconds() / (24 * 60 * 60)
    
    # Engagement Metrics for videos
    Cleaned_File["videoViewsPerDay"] = Cleaned_File["videoViewCount"] / Cleaned_File["videoAgeInDays"]
    Cleaned_File["videoLikeToViewRatio"] = Cleaned_File["videoLikeCount"] / Cleaned_File["videoViewCount"]
    Cleaned_File["videoCommentToViewRatio"] = Cleaned_File["videoCommentCount"] / Cleaned_File["videoViewCount"]
    
    # Engagement Score Calculation for the video
    Cleaned_File["videoEngagementScore"] = (
        (Cleaned_File["videoViewsPerDay"] * 0.5) +
        (Cleaned_File["videoLikeToViewRatio"] * 100 * 0.3) +
        (Cleaned_File["videoCommentToViewRatio"] * 100 * 0.2)
    )
    
    # Return the dataframe with added growth and engagement scores
    return Cleaned_File

def FeatureEngineering(Cleaned_File):
    """
    This function performs feature engineering to enhance the dataset for analysis by creating new features 
    and transforming existing ones, such as categorizing video duration, calculating channel growth and 
    video engagement scores, and enriching geographic details.

    The key steps include:
    - Extracting the day of the week from the video publish timestamp.
    - Classifying video durations into predefined categories.
    - Calculating channel growth and video engagement scores.
    - Ranking channels and videos based on their growth and engagement scores.
    - Merging geographic details like country, continent, and IT hub information with the dataset.
    
    Args:
    - Cleaned_File (pd.DataFrame): The input dataframe containing video and channel data for feature engineering.

    Returns:
    - pd.DataFrame: The transformed dataframe with newly engineered features.
    """

    # Feature: videoPublishedWeekDay - Derive the day of the week from the videoPublishedOn timestamp.
    Cleaned_File['videoPublishedWeekDay'] = pd.to_datetime(Cleaned_File["videoPublishedOn"]).dt.day_name()
    
    # Feature: videoDurationClassification - Categorize videos based on their duration in seconds into predefined segments.
    # Categories:
    #     Very Short (0 - 60 sec), Short (61 sec - 2 min), Medium (2 min 1 sec - 5 min),
    #     Long (5 min 1 sec - 10 min), Very Long (10 min 1 sec - 1 hour),
    #     Extended (1 hour 1 sec - 3 hours), Ultra Long (3 hours 1 sec and above)
    Cleaned_File['videoDurationClassification'] = Cleaned_File['videoDurationInSeconds'].apply(videoDurationClassification)
    
    # Feature: channelGrowth metric - Calculate channel growth using factors such as views, subscribers, video count, and age.
    # Normalization of key columns: channelPublishedOn, channelViewCount, channelSubscriberCount, and channelVideoCount
    Cleaned_File = calculate_channel_growth(Cleaned_File)
    
    # Feature: videoEngagementScore - Calculate the video engagement score using video views, likes, and comments.
    # Normalization of key columns: videoPublishedOn, videoViewCount, videoLikeCount, and videoCommentCount
    Cleaned_File = calculate_channel_growth(Cleaned_File)  # This also handles the video engagement scores
    
    # Feature: channelGrowthScoreRank - Rank channels based on their growth score.
    Cleaned_File["channelGrowthScoreRank"] = Cleaned_File["channelGrowthScore"].rank()
    
    # Feature: videoEngagementScoreRank - Rank videos based on their engagement score.
    Cleaned_File["videoEngagementScoreRank"] = Cleaned_File["videoEngagementScore"].rank()
    
    # Feature: Geographic Classification - Enrich dataset with geographic details (country, continent, IT hub classification).
    # This merges additional country and continent details from an external source based on the channel's country.
    
    # Fetch geographic details (ISO codes, country names, continent, etc.) from an external file
    Country_Details_ISO = Requirement_File_Extraction(repo_url, kaggle_repo_url, requirement_path).transpose()
    Country_Details_ISO = Country_Details_ISO.reset_index()
    Country_Details_ISO.rename(columns={'index': 'country_code'}, inplace=True)
    
    # Merge geographic details (from Country_Details_ISO) with the cleaned file
    resultDataFrame = pd.merge(Cleaned_File, Country_Details_ISO, left_on='channelCountry', right_on='country_code', how='left')
    
    # Fill missing geographic data with 'Unknown' (in case a country code doesn't match)
    cols_to_fill = ['country_code', 'country_name', 'continent', 'continent_code', 'it_hub_country']
    resultDataFrame[cols_to_fill] = resultDataFrame[cols_to_fill].fillna('Unknown')
    
    # Return the enriched dataframe with new features
    return resultDataFrame

def GitHubPush(Feature_File):
    """
    Saves a DataFrame as a JSON file and pushes it to a GitHub repository.

    This function:
    - Counts the number of records in the DataFrame.
    - Generates a unique filename using a timestamp in IST (Indian Standard Time) and the record count.
    - Saves the DataFrame as a JSON file in a readable format.
    - Checks if the destination directory exists; if not, creates it.
    - Copies the saved file to the destination directory.
    - Commits and pushes the file to a GitHub repository.

    Parameters:
    Feature_File (pd.DataFrame): The DataFrame to be saved and pushed.

    Returns:
    None
    """

    # Count the number of records in the DataFrame
    record_count = len(Feature_File)
    
    # Generate a timestamp for the filename using the current time in IST (Indian Standard Time)
    timestamp = datetime.datetime.now(ist).strftime("%Y-%m-%d_%H_%M_%S")
    
    # Create a unique filename using the timestamp and number of records
    filename = f"FE_{timestamp}_{record_count}_records.json"
    
    # Save the DataFrame as a JSON file with indentation for readability
    Feature_File.to_json(filename, orient="records", indent=4)
    print(f"DataFrame saved as {filename}")
    
    # Check if the destination directory exists
    if not os.path.exists(destination_path):
        # Create the directory if it does not exist
        os.makedirs(destination_path)
        print("Created the destination directory: FeatureEngineering/Daily")
    
    # Copy the saved file to the destination directory
    shutil.copyfile(f'/kaggle/working/{filename}', f'{destination_path}/{filename}')
    
    # Initialize the local Git repository
    repo = Repo(kaggle_repo_url)
    
    # Add the copied file to the Git staging area
    repo.index.add([f"{destination_path}/{filename}"])
    
    # Create a commit message including the timestamp and filename
    timestamp = datetime.datetime.now(ist).strftime("%Y-%m-%d_%H:%M:%S")
    repo.index.commit(f"{timestamp} Added files from Kaggle notebook, {filename}")
    
    # Push the committed changes to the remote GitHub repository
    origin = repo.remote(name="origin")
    push_result = origin.push()
    
    # Check if the push was successful and print the result
    if push_result:
        print("Push successful.")
    else:
        print("Push failed.")

def main():
    """
    Main function to execute the data extraction, feature engineering, and GitHub push process.

    Steps:
    1. Extracts the cleaned data file from the repository using the provided URL and path.
    2. Applies feature engineering to enhance the cleaned data.
    3. Pushes the processed and feature-engineered data to GitHub.

    Returns:
    None
    """

    # Step 1: Extract the cleaned data file from the repository using the provided URL and path.
    DataCleaning_File = DataCleaning_File_Extraction(repo_url, kaggle_repo_url, DataCleaning_path)
    
    # Optional: Uncomment the following line to display the cleaned file sorted by video duration.
    # display(DataCleaning_File.sort_values(by='videoDurationInSeconds', ascending=True))
    
    # Step 2: Apply feature engineering transformations to the cleaned data.
    Feature_File = FeatureEngineering(DataCleaning_File)
    
    # Optional: Uncomment the following line to display the feature-engineered file.
    # display(Feature_File)
    
    # Step 3: Push the processed and feature-engineered data to GitHub.
    GitHubPush(Feature_File)


if __name__ == "__main__":
    """
    Entry point for the data pipeline execution. 

    This script:
    - Imports necessary libraries for file handling, Git operations, time management, data manipulation, and Kaggle secret access.
    - Retrieves the GitHub repository URL from Kaggle secrets.
    - Sets up the Indian Standard Time (IST) timezone for consistent timestamping.
    - Defines paths for various directories used in the pipeline, including repositories, data cleaning, and feature engineering storage.
    - Configures pandas to display all columns and rows for better debugging.
    - Calls the `main()` function to execute the full data pipeline, including data extraction, feature engineering, and pushing data to GitHub.

    Returns:
    None
    """

    # Import necessary libraries
    import os  # For file and directory operations
    import git  # For interacting with Git repositories
    from git import Repo  # For handling GitHub repository interactions
    import time  # For time-related operations
    import datetime  # For date and time manipulations
    import pytz  # For timezone handling
    from pytz import timezone  # To manage different timezones
    import pandas as pd  # For data manipulation and analysis
    import shutil  # For file operations like copying and removing files
    from kaggle_secrets import UserSecretsClient  # For securely accessing secrets in Kaggle

    # Retrieve the GitHub repository URL stored in Kaggle's secret management system
    user_secrets = UserSecretsClient()
    secret_value_0 = user_secrets.get_secret("repo_url_youtube_analysis")
    repo_url = secret_value_0  # URL for the GitHub repository used in this pipeline

    # Set the timezone to Indian Standard Time (IST) for consistent timestamping
    ist = timezone('Asia/Kolkata')

    # Define paths for various directories used in the data pipeline
    kaggle_repo_url = '/kaggle/working/YouTubeFoodChannelAnalysis'  # Local path to the cloned GitHub repository
    destination_path = '/kaggle/working/YouTubeFoodChannelAnalysis/FeatureEngineering/Daily'  # Directory for storing feature-engineered data
    DataCleaning_path = '/kaggle/working/YouTubeFoodChannelAnalysis/DataCleaning/Daily'  # Directory for cleaned data files
    requirement_path = '/kaggle/working/YouTubeFoodChannelAnalysis/Requirement/Daily'  # Directory for requirement-related files

    # Configure pandas display settings to show all columns and rows for better visibility during debugging
    pd.set_option("display.max_columns", None)  # Display all columns without truncation
    pd.set_option("display.max_rows", None)  # Display all rows without truncation

    # Execute the main function to run the data pipeline
    main()

Repository already exists locally.
Successfully pulled the latest changes.
Already cloned and the repo file exists
Successfully pulled the git repo before push
DataFrame saved as FE_2025-02-07_06_36_59_406_records.json
Push successful.


In [10]:
print("Execution ended.")

# Record the end time of execution
end_time = datetime.datetime.now()

# Calculate the time taken for execution
time_taken = end_time - start_time

# Get the current time in the 'Asia/Kolkata' timezone
current_time = datetime.datetime.now(timezone('Asia/Kolkata')).strftime("%Y-%m-%d_%H_%M_%S")

# Create a new row as a DataFrame
new_row = pd.DataFrame([{
    'ScriptFile': 'feature-engineering.ipynb',
    'StartTime': start_time.strftime('%Y-%m-%d %H:%M:%S'),
    'EndTime': end_time.strftime('%Y-%m-%d %H:%M:%S'),
    'TimeTaken': str(time_taken),
    'Date': current_time
}])

# Append the new row using pd.concat()
FileExecution = pd.concat([FileExecution, new_row], ignore_index=True)

# Display the DataFrame
# display(FileExecution)

# Save the DataFrame to a JSON file
FileExecution.to_json(f"{current_time}_ScriptFileExecution.json", orient="records", indent=4)
# print(FileExecution)

Execution ended.


In [11]:
"""
Automates the process of identifying the latest .json file, copying it 
to a GitHub repository, and pushing the changes.

Args:
    None

Returns:
    bool: True if the process completes successfully and the file is pushed to GitHub, 
          False if an error occurs during any step.
"""
# List all files in the working directory
output_files = os.listdir('/kaggle/working')

try:
    # Filter and find the most recent .json file
    json_files = [file for file in output_files if file.endswith("ScriptFileExecution.json")]
    if json_files:
        LatestFiles = max(json_files, key=os.path.getctime)  # Get the latest file based on creation time
    else:
        raise ValueError("No JSON files found!")  # Raise an error if no JSON files are found
except ValueError as e:
    print(f"An error occurred at fetching recent .json file: {e}")

# Define repository and destination paths
kaggle_repo_url = '/kaggle/working/YouTubeFoodChannelAnalysis'
destination_path = '/kaggle/working/YouTubeFoodChannelAnalysis/ExecutionTracker/Daily'

print(LatestFiles)  # Print the latest JSON file name
try:
    # Check if the repository already exists
    if os.path.exists(kaggle_repo_url):
        print("Already cloned and the repo file exists")
        repo = git.Repo(kaggle_repo_url)  # Access the existing repository
        origin = repo.remote(name='origin')  # Get the remote repository
        origin.pull()  # Pull the latest changes from the repository
        print("Successfully pulled the git repo before push")
    else:
        # Clone the repository if it doesn't exist
        repo = git.Repo.clone_from(repo_url, kaggle_repo_url)
        print("Successfully cloned the git repo")
    
    # Check if the destination path exists, and copy the latest file
    if os.path.exists(destination_path):
        shutil.copyfile(f'/kaggle/working/{LatestFiles}', f'{destination_path}/{LatestFiles}')
    else:
        # Create the destination directory if it doesn't exist
        os.makedirs(destination_path)
        shutil.copyfile(f'/kaggle/working/{LatestFiles}', f'{destination_path}/{LatestFiles}')
    
    # Initialize the repository for git operations
    repo = Repo(kaggle_repo_url)
    # Add the copied file to the staging area
    repo.index.add([f"{destination_path}/{LatestFiles}"])
    
    # Create a timestamp for the commit message
    ist = timezone('Asia/Kolkata')  # IST timezone
    timestamp = datetime.datetime.now(ist).strftime("%Y-%m-%d_%H:%M:%S")
    
    # Commit the changes with a message including the timestamp and file name
    if repo.is_dirty(untracked_files=True):
        repo.index.commit(f"{timestamp} Added files from Kaggle notebook, {LatestFiles}")
        print("Changes committed successfully.")
    else:
        # If no changes are detected, create an empty commit
        repo.git.commit(m="Empty commit to trigger contribution", allow_empty=True)
        print("Empty commit created as no changes were detected.")
    
    # Push the changes to the remote repository
    origin = repo.remote(name="origin")
    origin.push()
    print("Execution Tracking file successfully pushed to GitHub!")

except Exception as e:
    # Handle any errors that occur during the git automation process
    print(f"An error occurred at git automation code: {e}")
    

2025-02-07_06_37_00_ScriptFileExecution.json
Already cloned and the repo file exists
Successfully pulled the git repo before push
Changes committed successfully.
Execution Tracking file successfully pushed to GitHub!
