---
**Observation:**  

1. Null values are present in the following columns:
   - **`videoDefaultLanguage`** (will be dropped during data cleaning)
   - **`videoDefaultAudioLanguage`** 
   - **`channelCountry`**

---

2. The following columns will be dropped as part of data cleaning:
   - **`videoDescription`**: Reserved for analysis in a future NLP project with a broader dataset.
   - **`videoLiveBroadcastContent`**: All values are `'none'`, providing no variability or insights.
   - **`videoFavoriteCount`**: All values are `0`, making it redundant.
   - **`videoTags`**: Reserved for analysis in a future NLP project with a broader dataset.
   - **`videoUniqueId`**: Identified as a duplicate column.
   - **`channelIdUnique`**: Identified as a duplicate column.
   - **`channelTitleCheck`**: Identified as a duplicate column.
   - **`channelDescription`**: Reserved for analysis in a future NLP project with a broader dataset.
---

3. The columns **`channelName`** and **`videoTitle`** require further processing due to the presence of:
    - Multilingual text.  
    - Emojis and special characters.  

---

In [1]:
def Source_File_Extraction(repo_url, kaggle_repo_url, source_path):
    """
    Sync the local clone of the repository and load the selected source JSON file.

    If `kaggle_repo_url` already exists on disk, it is opened as a Git repository
    and the latest changes are pulled from its `origin` remote. Otherwise the
    repository is cloned from `repo_url` into that path.

    Afterwards `source_path` is scanned for files whose names start with "S_" and
    end with "records.json". The lexicographically greatest name is loaded with
    pandas (presumably the newest file, assuming names embed a sortable
    timestamp -- confirm against the producer of these files).

    Args:
    - repo_url (str): The URL of the Git repository to clone if not already present.
    - kaggle_repo_url (str): The local path where the repository is stored or will be cloned to.
    - source_path (str): The directory where the JSON file is stored.

    Returns:
    - pd.DataFrame: The data from the JSON file as a pandas DataFrame.

    Raises:
    - ValueError: If `source_path` contains no file matching "S_*records.json".
    """
    if os.path.exists(kaggle_repo_url):
        print("Already cloned and the repo file exists")
        repo = git.Repo(kaggle_repo_url)  # Access the existing repository
        origin = repo.remote(name='origin')  # Get the remote repository
        origin.pull()  # Pull the latest changes from the repository
        print("Successfully pulled the git repo before push")
    else:
        # Clone the repository if it doesn't exist
        git.Repo.clone_from(repo_url, kaggle_repo_url)
        print("Successfully cloned the git repo")

    # Collect every candidate source file in the directory
    candidates = [
        name for name in os.listdir(source_path)
        if name.startswith("S_") and name.endswith('records.json')
    ]

    # Fail with an actionable message instead of the opaque
    # "max() arg is an empty sequence" raised by max([]).
    if not candidates:
        raise ValueError(
            f"No source file matching 'S_*records.json' found in {source_path!r}"
        )

    # Keep the file name and the loaded DataFrame in separate variables
    latest_name = max(candidates)
    return pd.read_json(os.path.join(source_path, latest_name))

In [2]:
def DataCleaning(Target_File):
    """
    Cleans the input DataFrame by performing the following operations:
    1. Drops irrelevant columns.
    2. Removes duplicate rows.
    3. Filters videos based on language (only those with 'videoDefaultAudioLanguage' starting with 'en').
    4. Cleans 'channelName' and 'videoTitle' value by value (see `_clean_text`):
       translation of non-ASCII text to English, emoji removal, HTML entity
       decoding, and removal of any remaining non-ASCII characters.
    5. Removes rows that became duplicates after the text clean-up.
    6. Drops the 'videoDefaultLanguage' column.
    7. Fills missing values in 'channelCountry' with 'Unknown'.

    Args:
    - Target_File (pd.DataFrame): The DataFrame to clean.

    Returns:
    - pd.DataFrame: The cleaned DataFrame.
    """

    # Drop irrelevant columns
    Target_File = Target_File.drop(
        ['videoDescription', 'videoLiveBroadcastContent', 'videoFavoriteCount',
         'videoTags', 'videoUniqueId', 'channelIdUnique', 'channelTitleCheck',
         'channelDescription'],
        axis=1,
    )

    # Remove duplicates
    Target_File = Target_File.drop_duplicates(ignore_index=True)

    # Filter for videos with 'videoDefaultAudioLanguage' starting with 'en'
    # (na=False treats missing languages as non-English)
    english_mask = Target_File['videoDefaultAudioLanguage'].str.startswith("en", na=False)
    Target_File_EN = Target_File[english_mask].reset_index(drop=True)

    # One translator instance is enough for the whole run
    translator = GoogleTranslator(source='auto', target='en')

    # Clean each text column independently so that a problem with one value
    # never prevents the other column (or the remaining steps) from being cleaned.
    for column in ('channelName', 'videoTitle'):
        Target_File_EN[column] = Target_File_EN[column].map(
            lambda value: _clean_text(value, translator)
        )

    # Remove duplicates after the transformations
    Target_File_EN = Target_File_EN.drop_duplicates(ignore_index=True)

    # Drop 'videoDefaultLanguage' column as it is no longer needed
    Target_File_EN = Target_File_EN.drop(['videoDefaultLanguage'], axis=1)

    # Fill missing values in 'channelCountry' with 'Unknown'
    Target_File_EN['channelCountry'] = Target_File_EN['channelCountry'].fillna('Unknown')

    return Target_File_EN


def _clean_text(value, translator):
    """
    Normalise a single 'channelName' / 'videoTitle' value to plain ASCII English text.

    Args:
    - value: The cell value; anything that is not a `str` (e.g. None/NaN) is returned unchanged.
    - translator: An object exposing `translate(text)`, used only for non-ASCII text.

    Returns:
    - The cleaned string, or `value` itself when it is not a string.
    """
    if not isinstance(value, str):
        return value

    text = value
    if not text.isascii():
        # Translation is best-effort: on failure, report it and continue
        # with the untranslated text so the remaining clean-up still runs.
        try:
            translated = translator.translate(text)
        except Exception as e:
            print(e)
            translated = None
        if isinstance(translated, str):
            text = translated

    # Remove emojis
    text = emoji.replace_emoji(text, replace='')

    # Decode HTML entities like &amp; and &#39;
    text = html.unescape(text)

    # Remove whatever non-ASCII characters are still left
    return re.sub(r'[^\x00-\x7F]+', '', text)

In [3]:
def GitHubPush(Target_File_EN):
    """
    Save the processed DataFrame as a JSON file, copy it into the repository's
    destination directory, commit it and push the commit to the `origin` remote.

    The file is named "DC_<timestamp>_<record count>_records.json", where the
    timestamp is the current time in the module-level `ist` timezone. The
    function relies on the module-level names `ist`, `destination_path` and
    `kaggle_repo_url`.

    Args:
    - Target_File_EN (pd.DataFrame): The DataFrame that contains the processed data to be saved and pushed.

    Returns:
    - None: This function performs file handling and Git operations but does not return anything.
    """

    # Count the number of records in the DataFrame
    record_count = len(Target_File_EN)

    # Timestamp + record count make the file name unique per run
    timestamp = datetime.datetime.now(ist).strftime("%Y-%m-%d_%H:%M:%S")
    filename = f"DC_{timestamp}_{record_count}_records.json"

    # Save the DataFrame to a JSON file in a readable format (with indentation);
    # the relative name means it lands in the current working directory.
    Target_File_EN.to_json(filename, orient="records", indent=4)
    print(f"DataFrame saved as {filename}")

    # Make sure the destination directory exists
    if not os.path.exists(destination_path):
        os.makedirs(destination_path)
        print('Created the destination directory, DataCleaning/Daily')
    else:
        print('Destination directory already exists')

    # Copy from the path the file was actually written to, rather than
    # assuming the working directory is /kaggle/working.
    destination_file = os.path.join(destination_path, filename)
    shutil.copyfile(filename, destination_file)

    # Open the local clone for git operations
    repo = Repo(kaggle_repo_url)

    # Stage the copied file
    repo.index.add([destination_file])

    # Commit with a message that includes a fresh timestamp and the filename
    commit_timestamp = datetime.datetime.now(ist).strftime("%Y-%m-%d_%H:%M:%S")
    repo.index.commit(f"{commit_timestamp} Added files from Kaggle notebook, {filename}")

    # Push the changes to the remote repository
    origin = repo.remote(name="origin")
    push_result = origin.push()

    # An empty result means the push failed outright; a non-empty result can
    # still contain rejected refs, which are flagged with PushInfo.ERROR.
    push_failed = len(push_result) == 0 or any(
        info.flags & info.ERROR for info in push_result
    )
    if push_failed:
        print("Push failed.")
    else:
        print("Push successful.")

In [4]:
def main():
    """
    Run the data-cleaning pipeline from start to finish:
    1. Load the source data via Source_File_Extraction.
    2. Clean it via DataCleaning.
    3. Save the cleaned data and push it to GitHub via GitHubPush.

    Args:
    - None: The repository URL and paths are read from the module-level
      names `repo_url`, `kaggle_repo_url` and `source_path`.

    Returns:
    - None: The function only performs data processing and Git operations.
    """

    # Step 1: Sync the repository and load the raw source records.
    raw_records = Source_File_Extraction(repo_url, kaggle_repo_url, source_path)

    # Step 2: Produce the cleaned, English-only DataFrame.
    cleaned_records = DataCleaning(raw_records)

    # Step 3: Write the cleaned data to disk, commit it and push it.
    GitHubPush(cleaned_records)

In [5]:
if __name__ == "__main__":
    # Entry point of the data-cleaning pipeline. Running this block:
    #   1. imports the libraries used by the functions defined in this file,
    #   2. defines the timezone and directory paths those functions read as globals,
    #   3. fetches the repository URL from Kaggle's secret storage,
    #   4. configures pandas to display frames without truncation,
    #   5. calls main() to extract, clean and push the data.

    # --- Standard library ---
    import datetime  # For working with date and time
    import html  # For decoding HTML entities
    import os
    import re  # For regular expression operations
    import shutil  # For file operations like copying or removing
    import time  # For time-related operations

    # --- Third-party ---
    import deep_translator  # For translation services
    import emoji  # For handling emojis in the data
    import git  # Git library for interacting with repositories
    import pandas as pd  # For data manipulation and analysis
    import pytz  # Timezone handling
    from deep_translator import GoogleTranslator  # Google Translate integration
    from git import Repo  # Repository object used for commit/push
    from kaggle_secrets import UserSecretsClient  # Access to Kaggle's secret storage
    from pytz import timezone  # For timezone management

    # Timestamps in file names and commit messages use Indian Standard Time (IST)
    ist = timezone('Asia/Kolkata')

    # Local clone of the repository and the directories inside it
    kaggle_repo_url = '/kaggle/working/YouTubeFoodChannelAnalysis'
    destination_path = f'{kaggle_repo_url}/DataCleaning/Daily'  # Where cleaned data is stored
    source_path = f'{kaggle_repo_url}/Source/Daily'  # Where raw source data is read from

    # The repository URL is kept out of the notebook and read from the
    # Kaggle secret named "dataCleanRepoUrl"
    user_secrets = UserSecretsClient()
    secret_value_0 = user_secrets.get_secret("dataCleanRepoUrl")
    repo_url = secret_value_0

    # Show every column and row when displaying DataFrames (easier debugging)
    pd.set_option("display.max_columns", None)
    pd.set_option("display.max_rows", None)

    # Execute the data pipeline
    main()

Successfully cloned the git repo
DataFrame saved as DC_2025-02-06_00:37:13_423_records.json
Destination directory already exists
Push successful.
