In [1]:
def DataCleaning_File_Extraction(repo_url, kaggle_repo_url, DataCleaning_path):
    """
    Sync the local clone of the repository and load the newest data-cleaning export.

    If ``kaggle_repo_url`` already exists on disk it is opened as a Git repository and
    its ``origin`` remote is pulled; otherwise the repository is cloned from ``repo_url``
    into that path. The function then lists ``DataCleaning_path``, keeps the file names
    that start with ``DC_`` and end with ``records.json``, selects the lexicographically
    greatest name, and reads that file with ``pd.read_json``.

    Parameters:
    repo_url (str): URL of the remote repository; only used when a clone is needed.
    kaggle_repo_url (str): Local filesystem path of the clone (despite the name, not a URL).
    DataCleaning_path (str): Directory that holds the ``DC_*records.json`` files.

    Returns:
    pd.DataFrame: Contents of the selected JSON file.

    Raises:
    FileNotFoundError: If ``DataCleaning_path`` contains no ``DC_*records.json`` file.
    """

    # Bring the local working copy up to date (pull if present, clone otherwise)
    if os.path.exists(kaggle_repo_url):
        print("Repository already exists locally.")
        repo = git.Repo(kaggle_repo_url)
        repo.remote(name='origin').pull()
        print("Successfully pulled the latest changes.")
    else:
        git.Repo.clone_from(repo_url, kaggle_repo_url)
        print("Successfully cloned the repository.")

    # Keep only the data-cleaning exports
    candidate_names = [
        name for name in os.listdir(DataCleaning_path)
        if name.startswith("DC_") and name.endswith('records.json')
    ]
    if not candidate_names:
        # Without this guard max() fails with an opaque "arg is an empty sequence" error
        raise FileNotFoundError(
            f"No file matching 'DC_*records.json' found in {DataCleaning_path!r}"
        )

    # max() compares names as strings.
    # NOTE(review): presumably the names embed a sortable timestamp so this is the newest export - confirm.
    latest_name = max(candidate_names)

    return pd.read_json(os.path.join(DataCleaning_path, latest_name))

In [2]:
def Requirement_File_Extraction(repo_url, kaggle_repo_url, requirement_path):
    """
    Sync the local clone of the repository and load the newest country-details file.

    If ``kaggle_repo_url`` already exists on disk it is opened as a Git repository and
    its ``origin`` remote is pulled; otherwise the repository is cloned from ``repo_url``
    into that path. The function then lists ``requirement_path``, keeps the file names
    that start with ``RE_`` and end with ``country_details.json``, selects the
    lexicographically greatest name, and reads that file with ``pd.read_json``.

    Args:
    - repo_url (str): URL of the remote repository; only used when a clone is needed.
    - kaggle_repo_url (str): Local filesystem path of the clone (despite the name, not a URL).
    - requirement_path (str): Directory that holds the ``RE_*country_details.json`` files.

    Returns:
    - pd.DataFrame: Contents of the selected JSON file.

    Raises:
    - FileNotFoundError: If ``requirement_path`` contains no ``RE_*country_details.json`` file.
    """

    # Bring the local working copy up to date (pull if present, clone otherwise)
    if os.path.exists(kaggle_repo_url):
        print("Already cloned and the repo file exists")
        repo = git.Repo(kaggle_repo_url)
        repo.remote(name='origin').pull()
        print("Successfully pulled the git repo before push")
    else:
        git.Repo.clone_from(repo_url, kaggle_repo_url)
        print("Successfully cloned the git repo")

    # Keep only the country-details requirement files
    candidate_names = [
        name for name in os.listdir(requirement_path)
        if name.startswith("RE_") and name.endswith('country_details.json')
    ]
    if not candidate_names:
        # Without this guard max() fails with an opaque "arg is an empty sequence" error
        raise FileNotFoundError(
            f"No file matching 'RE_*country_details.json' found in {requirement_path!r}"
        )

    # max() compares names as strings.
    # NOTE(review): presumably the names embed a sortable timestamp so this is the newest file - confirm.
    latest_name = max(candidate_names)

    return pd.read_json(os.path.join(requirement_path, latest_name))

In [3]:
def videoDurationClassification(videoDurationInSeconds):
    """
    Map a video length in seconds to a named duration bucket.

    Args:
    - videoDurationInSeconds (int | float): Length of the video in seconds.

    Returns:
    - str: The bucket label, or "Invalid video duration" when the value is not
      greater than or equal to zero (negative numbers, NaN).
    """

    # Values that fail the ">= 0" test belong to no bucket
    if not (0 <= videoDurationInSeconds):
        return "Invalid video duration"

    # (exclusive upper bound in seconds, label), ordered from shortest to longest
    duration_buckets = (
        (120, "Shorts (<2 min)"),
        (600, "Standard (2–10 min)"),
        (1800, "Intermediate (10–30 min)"),
        (3600, "Workshop (30 min–1 hr)"),
        (7200, "Deep Dive (1–2 hr)"),
        (18000, "End-to-End (2–5 hr)"),
    )
    for upper_bound, label in duration_buckets:
        if videoDurationInSeconds < upper_bound:
            return label

    # Five hours or longer
    return "Marathon (5 hr+)"

# Channel Growth Calculation

## **Overview**
The channel growth metric is designed to assess the growth of a YouTube channel based on key engagement indicators: 
- **Channel View Count**
- **Channel Subscriber Count**
- **Channel Video Count**
- **Channel Age (in years)**

## **Formula**
The channel growth score is computed as:

```python
channel_growth = ((normalized_views * weight_views) + 
                  (normalized_subscribers * weight_subscribers) + 
                  (normalized_videos * weight_videos)) / channel_age_in_years
```

where:
- **Min-Max Normalization** is applied to views, subscribers, and video count:
  ```python
  normalized_value = (value - min_value) / (max_value - min_value)
  ```
- **Weighting factors** determine the relative importance of each metric:
  - `weight_views = 0.5`
  - `weight_subscribers = 0.3`
  - `weight_videos = 0.2`
- **Channel Age (years)** is computed from:
  ```python
  channel_age_in_years = (current_timestamp - channelPublishedOnInSeconds) / (365 * 24 * 60 * 60)
  ```

## **Concepts Used**
1. **Min-Max Normalization**: Scales values between 0 and 1.
   - [Feature Scaling (Wikipedia)](https://en.wikipedia.org/wiki/Feature_scaling)
2. **Weighted Scoring**: Prioritizes key metrics based on impact.
   - [Weighted Scoring Model](https://theproductmanager.com/topics/weighted-scoring-model/)
3. **YouTube Analytics Metrics**: Defines the importance of views, subscribers, and videos.
   - [YouTube Analytics Help](https://support.google.com/youtube/answer/9002587?hl=en)
4. **Channel Age Calculation**: Determines the time span since the channel was created.
   - [Unix Time Conversion](https://www.unixtimestamp.com/)

## **Implementation Example**
```python
import time
import datetime
import pytz

def calculate_channel_growth(view_count, subscriber_count, video_count, channel_published_on):
    utc_timestamp = int(time.time())
    zone = pytz.timezone('Asia/Kolkata')
    current_ist_time = datetime.datetime.fromtimestamp(utc_timestamp, zone)
    channel_age_in_years = (current_ist_time - channel_published_on).total_seconds() / (365 * 24 * 60 * 60)
    
    # Min-Max normalization (Assume predefined min/max values from dataset)
    min_views, max_views = 1000, 10000000
    min_subscribers, max_subscribers = 10, 1000000
    min_videos, max_videos = 1, 10000
    
    normalized_views = (view_count - min_views) / (max_views - min_views)
    normalized_subscribers = (subscriber_count - min_subscribers) / (max_subscribers - min_subscribers)
    normalized_videos = (video_count - min_videos) / (max_videos - min_videos)
    
    # Weighted sum
    growth_score = ((normalized_views * 0.5) + (normalized_subscribers * 0.3) + (normalized_videos * 0.2)) / channel_age_in_years
    
    return growth_score
```

## **Conclusion**
This approach helps in analyzing a YouTube channel's growth potential by factoring in **engagement, longevity, and content volume**. It provides a scalable and adaptable framework for evaluating growth trends over time.

In [4]:
# def parse_datetime(value):
#     """
#     Parses a datetime string into a Pandas datetime object based on specific formats.
    
#     Handles two datetime formats:
#     1. "%Y-%m-%dT%H:%M:%SZ" for ISO 8601 format without fractional seconds.
#     2. "%Y-%m-%dT%H:%M:%S.%fZ" for ISO 8601 format with fractional seconds.

#     Args:
#     - value (str): The input datetime string to be parsed.

#     Returns:
#     - pd.Timestamp or pd.NaT: A Pandas Timestamp object if the format matches, otherwise pd.NaT.
#     """
    
#     # Check for the presence of "Z" and determine the format based on whether the string contains a decimal point
#     if "Z" in value and "." not in value:
#         return pd.to_datetime(value, format="%Y-%m-%dT%H:%M:%SZ")  # Format without fractional seconds
#     elif "Z" in value and "." in value:
#         return pd.to_datetime(value, format="%Y-%m-%dT%H:%M:%S.%fZ")  # Format with fractional seconds
#     else:
#         return pd.NaT  # Return Not a Time (NaT) if the format doesn't match

In [5]:
def parse_datetime(value):
    """
    Convert a datetime string into a pandas datetime value.

    The format is left for ``pd.to_datetime`` to detect. Input that cannot be
    parsed is coerced to ``NaT``, and ``NaT`` is also returned if
    ``pd.to_datetime`` raises.
    """
    try:
        parsed = pd.to_datetime(value, errors="coerce")
    except Exception:
        # errors="coerce" covers unparseable text; this covers anything else to_datetime raises
        parsed = pd.NaT
    return parsed

In [6]:
# def normalize(series):
#     """
#     Normalizes a Pandas Series using Min-Max Scaling.

#     Parameters:
#     series (pd.Series): The input series to be normalized.

#     Returns:
#     np.ndarray: A NumPy array containing normalized values between 0 and 1.
#     """
#     scaler = MinMaxScaler()  # Initialize the MinMaxScaler
#     return scaler.fit_transform(series.values.reshape(-1, 1)).flatten()  # Normalize and return a 1D array

In [7]:
def normalize(series):
    """
    Min-Max scale a pandas Series and return the result as a 1-D NumPy array.

    Missing values are replaced with 1e-6 before the scaler is fitted, so they
    take part in the min/max computation like any other value.
    """
    filled = series.fillna(1e-6)
    # MinMaxScaler expects a 2-D input, so present the series as a single column
    as_column = filled.values.reshape(-1, 1)
    scaled_column = MinMaxScaler().fit_transform(as_column)
    return scaled_column.flatten()

In [8]:
def safe_log_transform(series):
    """
    Compute ``log(1 + x)`` element-wise after raising negative values to zero.

    Negative inputs therefore map to 0, as does 0 itself.
    """
    floored_at_zero = np.maximum(series, 0)
    return np.log1p(floored_at_zero)

In [9]:
# def calculate_channel_growth(Cleaned_File):
#     """
#     Calculates the growth score for channels and engagement score for videos based on various metrics.
    
#     The growth score for each channel is calculated using the normalized values of:
#     - View count
#     - Subscriber count
#     - Video count
#     - Channel age (in years)
    
#     The engagement score for each video is calculated using:
#     - Views per day
#     - Like-to-view ratio
#     - Comment-to-view ratio

#     Args:
#     - Cleaned_File (pd.DataFrame): The dataframe containing channel and video data to calculate growth and engagement scores.

#     Returns:
#     - pd.DataFrame: The cleaned dataframe with the added columns for growth and engagement scores.
#     """
    
#     # Get the current IST time (Indian Standard Time) to calculate age-based metrics
#     utc_timestamp = int(time.time())
#     zone = pytz.timezone('Asia/Kolkata')
#     current_ist_time = datetime.datetime.fromtimestamp(utc_timestamp, zone).replace(tzinfo=None)
#     # Channel Age Calculation (in years)
#     channelPublishedOn = Cleaned_File["channelPublishedOn"].apply(parse_datetime)
#     Cleaned_File['channelAgeInYears'] = (current_ist_time - channelPublishedOn).dt.total_seconds() / (365 * 24 * 60 * 60)
    
#     # Min-Max normalization for channel metrics
#     Cleaned_File['channelNormalizedViewCount'] = normalize(Cleaned_File['channelViewCount'])
#     Cleaned_File['channelNormalizedSubscriberCount'] = normalize(Cleaned_File['channelSubscriberCount'])
#     Cleaned_File['channelNormalizedVideoCount'] = normalize(Cleaned_File['channelVideoCount'])
#     Cleaned_File['channelNormalizedChannelAge'] = normalize(Cleaned_File['channelAgeInYears'])
    
#     # Define weights for each metric in the growth score calculation
#     weight_views = 50
#     weight_subscribers = 30
#     weight_videos = 20
    
#     # Growth Score Calculation for the channel
#     Cleaned_File['channelGrowthScore'] = (
#         (Cleaned_File['channelNormalizedViewCount'] * weight_views) +
#         (Cleaned_File['channelNormalizedSubscriberCount'] * weight_subscribers) +
#         (Cleaned_File['channelNormalizedVideoCount'] * weight_videos)
#     ) / (Cleaned_File['channelNormalizedChannelAge'] + 1e-6)  # Avoid division by zero
    
#     # Video Age Calculation (in days)
#     videoPublishedOn = Cleaned_File["videoPublishedOn"].apply(parse_datetime)
#     Cleaned_File["videoAgeInDays"] = (current_ist_time - videoPublishedOn).dt.total_seconds() / (24 * 60 * 60)
    
#     # Engagement Metrics for videos
#     Cleaned_File["videoViewsPerDay"] = Cleaned_File["videoViewCount"] / (Cleaned_File["videoAgeInDays"] + 1e-6)  # Avoid division by zero
#     Cleaned_File["videoLikeToViewRatio"] = Cleaned_File["videoLikeCount"] / (Cleaned_File["videoViewCount"] + 1e-6)
#     Cleaned_File["videoCommentToViewRatio"] = Cleaned_File["videoCommentCount"] / (Cleaned_File["videoViewCount"] + 1e-6)
    
#     # Engagement Score Calculation for the video
#     Cleaned_File["videoEngagementScore"] = (
#         (Cleaned_File["videoViewsPerDay"] * 50) +
#         (Cleaned_File["videoLikeToViewRatio"] * 100 * 30) +
#         (Cleaned_File["videoCommentToViewRatio"] * 100 * 20)
#     )
    
#     # Return the dataframe with added growth and engagement scores
#     return Cleaned_File

In [10]:
def calculate_channel_growth(df):
    """
    Add channel growth and video engagement scores to ``df``.

    ``df`` is modified in place and also returned: the ``channelPublishedOn`` and
    ``videoPublishedOn`` columns are replaced by timezone-naive UTC datetimes and the
    derived columns listed below are added.

    Channel growth score
      1. ``channelAgeInYears``: time since ``channelPublishedOn`` in 365-day years;
         missing values and values below 1 are set to 1.
      2. View count, subscriber count, video count and age are passed through
         ``safe_log_transform`` (``log*`` columns) and then ``normalize``
         (``channelNormalized*`` columns).
      3. ``channelGrowthScore`` = (50 * views + 30 * subscribers + 20 * videos)
         / normalized age, with the normalized age floored at 0.1.

    Video engagement score
      1. ``videoAgeInDays``: time since ``videoPublishedOn`` in days; missing values
         and values below 1 are set to 1.
      2. View, like and comment counts and the age are passed through
         ``safe_log_transform``.
      3. ``videoViewsPerDay``, ``videoLikeToViewRatio`` and ``videoCommentToViewRatio``
         are ratios of those *log-transformed* values (not of the raw counts), each
         then normalized.
      4. ``videoEngagementScore`` is the weighted sum of the three normalized ratios.

    Args:
    - df (pd.DataFrame): Frame with the columns ``channelPublishedOn``,
      ``videoPublishedOn``, ``channelViewCount``, ``channelSubscriberCount``,
      ``channelVideoCount``, ``videoViewCount``, ``videoLikeCount`` and
      ``videoCommentCount``.

    Returns:
    - pd.DataFrame: The same ``df`` object with the columns described above added.
    """

    seconds_per_day = 24 * 60 * 60
    seconds_per_year = 365 * seconds_per_day

    # Parse both timestamp columns as UTC, then drop the tz marker. Converting to UTC
    # first means values carrying a non-UTC offset (or a mix of offsets) still line up
    # with the naive-UTC "now" below; unparseable values become NaT.
    for column in ("channelPublishedOn", "videoPublishedOn"):
        df[column] = pd.to_datetime(df[column], errors="coerce", utc=True).dt.tz_localize(None)

    # Current time as a timezone-naive UTC datetime
    now_utc = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)

    # Channel age in years, never below 1 (NaT ages are treated as 1 as well)
    df["channelAgeInYears"] = (now_utc - df["channelPublishedOn"]).dt.total_seconds() / seconds_per_year
    df["channelAgeInYears"] = df["channelAgeInYears"].fillna(1).clip(lower=1)

    # Log transform before Min-Max scaling
    df["logChannelViewCount"] = safe_log_transform(df["channelViewCount"])
    df["logChannelSubscriberCount"] = safe_log_transform(df["channelSubscriberCount"])
    df["logChannelVideoCount"] = safe_log_transform(df["channelVideoCount"])
    df["logChannelAgeInYears"] = safe_log_transform(df["channelAgeInYears"])

    # Min-Max scaling of the log-transformed channel metrics
    df["channelNormalizedViewCount"] = normalize(df["logChannelViewCount"])
    df["channelNormalizedSubscriberCount"] = normalize(df["logChannelSubscriberCount"])
    df["channelNormalizedVideoCount"] = normalize(df["logChannelVideoCount"])
    df["channelNormalizedChannelAge"] = normalize(df["logChannelAgeInYears"])

    # The normalized age is the divisor below; floor it so the score cannot blow up
    df["channelNormalizedChannelAge"] = df["channelNormalizedChannelAge"].clip(lower=0.1)

    # Weighted sum of the normalized channel metrics, divided by the normalized age
    weight_views, weight_subscribers, weight_videos = 50, 30, 20
    df["channelGrowthScore"] = (
        (df["channelNormalizedViewCount"] * weight_views) +
        (df["channelNormalizedSubscriberCount"] * weight_subscribers) +
        (df["channelNormalizedVideoCount"] * weight_videos)
    ) / df["channelNormalizedChannelAge"]

    # Video age in days, never below 1 (NaT ages are treated as 1 as well)
    df["videoAgeInDays"] = (now_utc - df["videoPublishedOn"]).dt.total_seconds() / seconds_per_day
    df["videoAgeInDays"] = df["videoAgeInDays"].fillna(1).clip(lower=1)

    # Log transform of the video metrics
    df["logVideoViewCount"] = safe_log_transform(df["videoViewCount"])
    df["logVideoLikeCount"] = safe_log_transform(df["videoLikeCount"])
    df["logVideoCommentCount"] = safe_log_transform(df["videoCommentCount"])
    df["logVideoAgeInDays"] = safe_log_transform(df["videoAgeInDays"])

    # Engagement ratios, computed on the log-transformed values.
    # The age is at least 1 day, so its log1p is at least log(2) and the first
    # division is safe; the other two add 1e-6 because the view count can be 0.
    df["videoViewsPerDay"] = df["logVideoViewCount"] / df["logVideoAgeInDays"]
    df["videoLikeToViewRatio"] = df["logVideoLikeCount"] / (df["logVideoViewCount"] + 1e-6)
    df["videoCommentToViewRatio"] = df["logVideoCommentCount"] / (df["logVideoViewCount"] + 1e-6)

    # Min-Max scaling of the engagement ratios
    df["normalizedVideoViewsPerDay"] = normalize(df["videoViewsPerDay"])
    df["normalizedVideoLikeToViewRatio"] = normalize(df["videoLikeToViewRatio"])
    df["normalizedVideoCommentToViewRatio"] = normalize(df["videoCommentToViewRatio"])

    # NOTE(review): the like and comment terms carry an extra factor of 100, so they
    # contribute up to 3000 and 2000 while the views-per-day term contributes at most 50.
    # Confirm this is intended now that all three inputs are normalized to the same range.
    df["videoEngagementScore"] = (
        (df["normalizedVideoViewsPerDay"] * 50) +
        (df["normalizedVideoLikeToViewRatio"] * 100 * 30) +
        (df["normalizedVideoCommentToViewRatio"] * 100 * 20)
    )

    return df

In [11]:
def HierarchicalWeightRanking(Cleaned_File):
    """
    Add ``channelGrowthScoreRank`` and ``videoEngagementScoreRank`` columns.

    - Channels: one row per ``channelId`` is kept, those rows are sorted in descending
      order by the growth score followed by a fixed list of tie-breaker columns, and
      ranks 1..N are assigned in that order. The rank is then joined back so every
      row of a channel carries the same channel rank.
    - Videos: all rows are sorted by the engagement score followed by tie-breaker
      columns (all descending except ``videoAgeInDays``, which is ascending) and ranks
      1..N are assigned in that order.

    Parameters:
    Cleaned_File (pd.DataFrame): Frame holding the channel and video score columns.

    Returns:
    pd.DataFrame: A new frame, ordered by video rank, with both rank columns added.
    """

    channel_sort_keys = [
        'channelGrowthScore',
        'channelNormalizedViewCount', 'channelViewCount',
        'channelNormalizedSubscriberCount', 'channelSubscriberCount',
        'channelNormalizedVideoCount', 'channelVideoCount',
        'channelNormalizedChannelAge', 'channelAgeInYears',
    ]
    video_sort_keys = [
        'videoEngagementScore',
        'videoViewsPerDay', 'videoViewCount',
        'videoLikeToViewRatio', 'videoLikeCount',
        'videoCommentToViewRatio', 'videoCommentCount',
        'videoAgeInDays',
    ]

    # One representative row per channel, best channel first
    ranked_channels = (
        Cleaned_File
        .drop_duplicates(subset=['channelId'])
        .copy()
        .sort_values(by=channel_sort_keys, ascending=[False] * len(channel_sort_keys))
    )
    ranked_channels["channelGrowthScoreRank"] = range(1, len(ranked_channels) + 1)

    # Attach each channel's rank to all of its rows
    with_channel_rank = Cleaned_File.merge(
        ranked_channels[['channelId', 'channelGrowthScoreRank']],
        on='channelId',
        how='left',
    )

    # Every key is descending except the last one (video age), which is ascending
    video_directions = [False] * (len(video_sort_keys) - 1) + [True]
    ranked_videos = with_channel_rank.sort_values(by=video_sort_keys, ascending=video_directions)
    ranked_videos["videoEngagementScoreRank"] = range(1, len(ranked_videos) + 1)

    return ranked_videos

In [12]:
def FeatureEngineering(Cleaned_File):
    """
    Derive the analysis features from the cleaned video/channel data.

    Steps, in order:
    - ``videoPublishedWeekDay``: day name of the ``videoPublishedOn`` timestamp.
    - ``videoDurationClassification``: duration bucket from ``videoDurationInSeconds``.
    - Channel growth and video engagement scores (``calculate_channel_growth``).
    - Channel and video ranks (``HierarchicalWeightRanking``).
    - Geographic columns joined from the requirement file on ``channelCountry``.

    The function reads the module-level names ``repo_url``, ``kaggle_repo_url`` and
    ``requirement_path`` to locate the requirement file, and it adds columns to the
    frame that is passed in before producing the merged result.

    Args:
    - Cleaned_File (pd.DataFrame): Cleaned video and channel records.

    Returns:
    - pd.DataFrame: The input records with the engineered features and geographic columns.
    """

    # Day of the week on which the video was published
    Cleaned_File['videoPublishedWeekDay'] = pd.to_datetime(Cleaned_File["videoPublishedOn"]).dt.day_name()

    # Duration bucket; the bucket boundaries and labels live in videoDurationClassification
    Cleaned_File['videoDurationClassification'] = Cleaned_File['videoDurationInSeconds'].apply(videoDurationClassification)

    # A single call computes both the channel growth score and the video engagement
    # score, so it does not need to be invoked once per score.
    Cleaned_File = calculate_channel_growth(Cleaned_File)

    # channelGrowthScoreRank and videoEngagementScoreRank
    Cleaned_File = HierarchicalWeightRanking(Cleaned_File)

    # Load the country details; after transposing, the index is exposed as 'country_code'
    Country_Details_ISO = (
        Requirement_File_Extraction(repo_url, kaggle_repo_url, requirement_path)
        .transpose()
        .reset_index()
        .rename(columns={'index': 'country_code'})
    )

    # Left join keeps every record even when its country has no match
    resultDataFrame = pd.merge(
        Cleaned_File,
        Country_Details_ISO,
        left_on='channelCountry',
        right_on='country_code',
        how='left',
    )

    # Records without a matching country get 'Unknown' in the geographic columns
    cols_to_fill = ['country_code', 'country_name', 'continent', 'continent_code', 'it_hub_country']
    resultDataFrame[cols_to_fill] = resultDataFrame[cols_to_fill].fillna('Unknown')

    return resultDataFrame

In [13]:
def GitHubPush(Feature_File):
    """
    Save a DataFrame as JSON, commit it to the local clone and push it to GitHub.

    This function:
    - Builds the file name ``FE_<IST timestamp>_<record count>_records.json``.
    - Writes the DataFrame to that file in the current working directory.
    - Creates ``destination_path`` if it does not exist and copies the file there.
    - Stages and commits the copy in the repository at ``kaggle_repo_url``.
    - Pushes ``HEAD`` to the remote branch ``feature-engineering`` on ``origin`` and
      prints whether the push succeeded.

    It reads the module-level names ``ist``, ``destination_path`` and ``kaggle_repo_url``.

    Parameters:
    Feature_File (pd.DataFrame): The DataFrame to be saved and pushed.

    Returns:
    None
    """

    record_count = len(Feature_File)

    # File name: IST timestamp plus number of records
    file_timestamp = datetime.datetime.now(ist).strftime("%Y-%m-%d_%H_%M_%S")
    filename = f"FE_{file_timestamp}_{record_count}_records.json"

    # Written relative to the current working directory
    Feature_File.to_json(filename, orient="records", indent=4)
    print(f"DataFrame saved as {filename}")

    if not os.path.exists(destination_path):
        os.makedirs(destination_path)
        print("Created the destination directory: FeatureEngineering/Daily")

    # Resolve the source against the working directory the file was just written to,
    # instead of assuming that directory is /kaggle/working
    destination_file = os.path.join(destination_path, filename)
    shutil.copyfile(os.path.abspath(filename), destination_file)

    # Stage and commit the copied file
    repo = Repo(kaggle_repo_url)
    repo.index.add([destination_file])
    commit_timestamp = datetime.datetime.now(ist).strftime("%Y-%m-%d_%H:%M:%S")
    repo.index.commit(f"{commit_timestamp} Added files from Kaggle notebook, {filename}")

    # Push the commit to the feature-engineering branch
    origin = repo.remote(name="origin")
    push_result = origin.push(refspec="HEAD:refs/heads/feature-engineering")

    # push() returns one PushInfo per updated ref. A non-empty list alone does not
    # mean success: a rejected ref is still listed, with the ERROR bit set in its flags.
    failed_refs = [info for info in push_result if info.flags & info.ERROR]
    if push_result and not failed_refs:
        print("Push successful.")
    else:
        print("Push failed.")
        for info in failed_refs:
            print(f"  {info.summary.strip()}")

In [14]:
def main():
    """
    Run the pipeline end to end: load the cleaned data, engineer features, push the result.

    The repository URL, the local clone path and the data-cleaning directory are read
    from the module-level names ``repo_url``, ``kaggle_repo_url`` and ``DataCleaning_path``.

    Returns:
    None
    """

    # 1. Load the latest cleaned data set from the (freshly synced) repository clone
    cleaned_records = DataCleaning_File_Extraction(repo_url, kaggle_repo_url, DataCleaning_path)

    # 2. Derive the engineered features
    engineered_records = FeatureEngineering(cleaned_records)

    # 3. Save the result and push it to GitHub
    GitHubPush(engineered_records)

In [15]:
if __name__ == "__main__":
    # Entry point of the pipeline.
    #
    # This block:
    # - imports the libraries used by the functions defined above,
    # - reads the GitHub repository URL from the Kaggle secret store,
    # - defines the IST timezone used for timestamps,
    # - defines the local paths of the repository clone and its data directories,
    # - lifts the pandas display limits on rows and columns,
    # - and finally calls main().

    # Standard library
    import datetime
    import os
    import shutil
    import time

    # Third-party
    import git
    import numpy as np
    import pandas as pd
    import pytz
    from git import Repo
    from kaggle_secrets import UserSecretsClient
    from pytz import timezone
    from sklearn.preprocessing import MinMaxScaler

    # GitHub repository URL, stored as a Kaggle secret
    user_secrets = UserSecretsClient()
    secret_value_0 = user_secrets.get_secret("featureEngineeringRepoUrl")
    repo_url = secret_value_0

    # Indian Standard Time, used when stamping file names and commit messages
    ist = timezone('Asia/Kolkata')

    # Local clone of the repository and the directories inside it
    kaggle_repo_url = '/kaggle/working/DevOps-YouTube-Trends'
    destination_path = f'{kaggle_repo_url}/FeatureEngineering/Daily'
    DataCleaning_path = f'{kaggle_repo_url}/DataCleaning/Daily'
    requirement_path = f'{kaggle_repo_url}/Requirement/Daily'

    # Show every column and every row when a frame is displayed
    pd.set_option("display.max_columns", None)
    pd.set_option("display.max_rows", None)

    # Run the pipeline
    main()

Successfully cloned the repository.
Already cloned and the repo file exists
Successfully pulled the git repo before push
DataFrame saved as FE_2025-09-10_19_58_48_428_records.json
Push successful.
