---
**Observation:**  

1. Null values are present in the following columns:
   - **`videoDefaultLanguage`**  (will be dropped after data cleaning)
   - **`videoDefaultAudioLanguage`** 
   - **`channelCountry`**

---

2. The following columns will be dropped as part of data cleaning:
   - **`videoDescription`**: Reserved for analysis in a future NLP project with a broader dataset.  
   - **`videoLiveBroadcastContent`**: All values are `'none'`, providing no variability or insights. 
   - **`videoFavoriteCount`**: All values are `0`, making it redundant.  
   - **`videoTags`**: Reserved for analysis in a future NLP project with a broader dataset.  
   - **`videoUniqueId`**: Identified as a duplicate column.  
   - **`channelIdUnique`**: Identified as a duplicate column.  
   - **`channelTitleCheck`**: Identified as a duplicate column.  
   - **`channelDescription`**: Reserved for analysis in a future NLP project with a broader dataset.
---

3. The columns **`channelName`** and **`videoTitle`** require further processing due to the presence of:
    - Multilingual text.  
    - Emojis and special characters.  

---

In [1]:
def Source_File_Extraction(repo_url,kaggle_repo_url):
    """Sync the local repository checkout and load one source JSON file.

    Args:
        repo_url: Remote URL to clone from when no local checkout exists.
        kaggle_repo_url: Local directory of the checkout.

    Returns:
        The result of ``pd.read_json`` for the file in ``source_path`` whose
        name starts with ``S_``, ends with ``records.json`` and is the
        lexicographically greatest such name.

    Raises:
        ValueError: From ``max`` when no file in ``source_path`` matches.
    """
    if not os.path.exists(kaggle_repo_url):
        # No local checkout yet: clone the repository
        git.Repo.clone_from(repo_url, kaggle_repo_url)
        print("Successfully cloned the git repo")
    else:
        print("Already cloned and the repo file exists")
        # Reuse the existing checkout and bring it up to date from 'origin'
        local_repo = git.Repo(kaggle_repo_url)
        local_repo.remote(name='origin').pull()
        print("Successfully pulled the git repo before push")

    candidates = [
        file_name
        for file_name in os.listdir(source_path)
        if file_name.startswith("S_") and file_name.endswith('records.json')
    ]
    # max() on strings picks the lexicographically greatest file name
    chosen_file = max(candidates)
    return pd.read_json(f'{source_path}/{chosen_file}')

In [2]:
def Requirement_File_Extraction(repo_url,kaggle_repo_url):
    """Sync the local repository checkout and load one requirement JSON file.

    Args:
        repo_url: Remote URL to clone from when no local checkout exists.
        kaggle_repo_url: Local directory of the checkout.

    Returns:
        The result of ``pd.read_json`` for the file in ``requirement_path``
        whose name starts with ``RE_``, ends with ``country_details.json`` and
        is the lexicographically greatest such name.

    Raises:
        ValueError: From ``max`` when no file in ``requirement_path`` matches.
    """
    if not os.path.exists(kaggle_repo_url):
        # No local checkout yet: clone the repository
        git.Repo.clone_from(repo_url, kaggle_repo_url)
        print("Successfully cloned the git repo")
    else:
        print("Already cloned and the repo file exists")
        # Reuse the existing checkout and bring it up to date from 'origin'
        local_repo = git.Repo(kaggle_repo_url)
        local_repo.remote(name='origin').pull()
        print("Successfully pulled the git repo before push")

    candidates = [
        file_name
        for file_name in os.listdir(requirement_path)
        if file_name.startswith("RE_") and file_name.endswith('country_details.json')
    ]
    # max() on strings picks the lexicographically greatest file name
    chosen_file = max(candidates)
    return pd.read_json(f'{requirement_path}/{chosen_file}')

In [3]:
def _clean_text_value(text, translator):
    """Return *text* translated to English when needed and reduced to plain ASCII.

    Non-string values (for example missing values) are returned unchanged.
    A failed translation is reported and the untranslated text is still
    cleaned, so one network error never leaves emojis or HTML entities behind.
    """
    if not isinstance(text, str):
        return text

    if not text.isascii():
        try:
            translated = translator.translate(text)
        except Exception as e:
            # Translation is a best-effort remote call: report and keep going
            print(e)
        else:
            if isinstance(translated, str):
                text = translated

    # Remove emojis
    text = emoji.replace_emoji(text, replace='')
    # Decode HTML entities like &amp; and &#39;
    text = html.unescape(text)
    # Remove whatever non-ASCII characters are still left
    return re.sub(r'[^\x00-\x7F]+', '', text)


def DataCleaning(Target_File):
    """Drop unused columns, keep English-audio videos and clean the text columns.

    Steps:
        1. Drop the columns that are constant, duplicated or reserved for a
           later analysis.
        2. Remove exact duplicate rows.
        3. Keep only rows whose ``videoDefaultAudioLanguage`` starts with
           ``"en"`` (missing values are excluded).
        4. Translate non-ASCII ``channelName`` / ``videoTitle`` values to
           English, then strip emojis, decode HTML entities and remove any
           remaining non-ASCII characters.
        5. Remove duplicates again, drop ``videoDefaultLanguage`` and fill
           missing ``channelCountry`` values with ``'Unknown'``.

    Args:
        Target_File: DataFrame holding the raw records.

    Returns:
        A new, re-indexed DataFrame with the cleaned English-audio records.

    Raises:
        KeyError: If one of the columns to drop is missing from the input.
    """
    # Dropped the columns
    Target_File = Target_File.drop(['videoDescription','videoLiveBroadcastContent','videoFavoriteCount','videoTags','videoUniqueId','channelIdUnique','channelTitleCheck','channelDescription'],axis=1)

    # Removing Duplicates
    Target_File = Target_File.drop_duplicates(ignore_index=True)

    # Removing the videos whose videoDefaultAudioLanguage is missing or does not start with "en"
    english_mask = Target_File['videoDefaultAudioLanguage'].str.startswith("en",na=False)
    Target_File_EN = Target_File[english_mask].reset_index(drop=True)

    # One translator is reused for every value instead of building one per cell
    translator = GoogleTranslator(source='auto', target='en')
    # The same channel name / title occurs on many rows; clean each distinct value once
    cleaned_values = {}

    def clean_cached(value):
        if not isinstance(value, str):
            return value
        if value not in cleaned_values:
            cleaned_values[value] = _clean_text_value(value, translator)
        return cleaned_values[value]

    # Each column is cleaned independently so a problem in one never skips the other
    for column in ('channelName', 'videoTitle'):
        Target_File_EN[column] = Target_File_EN[column].map(clean_cached)

    # Removing Duplicates (cleaning can make previously distinct rows identical)
    Target_File_EN = Target_File_EN.drop_duplicates(ignore_index=True)
    Target_File_EN = Target_File_EN.drop(['videoDefaultLanguage'],axis=1)
    Target_File_EN['channelCountry'] = Target_File_EN['channelCountry'].fillna('Unknown')
    return Target_File_EN


In [4]:
def videoDurationClassification(videoDurationInSeconds):
    """Map a video duration in seconds to a named duration category.

    The bands are contiguous, so fractional durations such as 60.5 seconds
    fall into the next band instead of being reported as invalid.

    Args:
        videoDurationInSeconds: Duration in seconds (int or float).

    Returns:
        One of "Very Short" (0-60), "Short" (up to 120), "Medium" (up to 300),
        "Long" (up to 600), "Very Long" (up to 3600), "Extended" (up to 10800),
        "Ultra Long" (above 10800), or "Invalid video duration" for negative
        or NaN input.
    """
    # Written as "not >= 0" so NaN (for which every comparison is False) is rejected too
    if not videoDurationInSeconds >= 0:
        return "Invalid video duration"

    # (inclusive upper bound in seconds, label), in ascending order
    duration_bands = (
        (60, "Very Short"),
        (120, "Short"),
        (300, "Medium"),
        (600, "Long"),
        (3600, "Very Long"),
        (10800, "Extended"),
    )
    for upper_bound, label in duration_bands:
        if videoDurationInSeconds <= upper_bound:
            return label
    return "Ultra Long"

# Channel Growth Calculation

## **Overview**
The channel growth metric is designed to assess the growth of a YouTube channel based on key engagement indicators: 
- **Channel View Count**
- **Channel Subscriber Count**
- **Channel Video Count**
- **Channel Age (in years)**

## **Formula**
The channel growth score is computed as:

```python
channel_growth = ((normalized_views * weight_views) + 
                  (normalized_subscribers * weight_subscribers) + 
                  (normalized_videos * weight_videos)) / channel_age_in_years
```

where:
- **Min-Max Normalization** is applied to views, subscribers, and video count:
  ```python
  normalized_value = (value - min_value) / (max_value - min_value)
  ```
- **Weighting factors** determine the relative importance of each metric:
  - `weight_views = 0.5`
  - `weight_subscribers = 0.3`
  - `weight_videos = 0.2`
- **Channel Age (years)** is computed from:
  ```python
  channel_age_in_years = (current_timestamp - channelPublishedOnInSeconds) / (365 * 24 * 60 * 60)
  ```

## **Concepts Used**
1. **Min-Max Normalization**: Scales values between 0 and 1.
   - [Feature Scaling (Wikipedia)](https://en.wikipedia.org/wiki/Feature_scaling)
2. **Weighted Scoring**: Prioritizes key metrics based on impact.
   - [Weighted Scoring Model](https://theproductmanager.com/topics/weighted-scoring-model/)
3. **YouTube Analytics Metrics**: Defines the importance of views, subscribers, and videos.
   - [YouTube Analytics Help](https://support.google.com/youtube/answer/9002587?hl=en)
4. **Channel Age Calculation**: Determines the time span since the channel was created.
   - [Unix Time Conversion](https://www.unixtimestamp.com/)

## **Implementation Example**
```python
import time
import datetime
import pytz

def calculate_channel_growth(view_count, subscriber_count, video_count, channel_published_on):
    """Return the weighted, age-adjusted growth score of a single channel.

    Args:
        view_count: Total channel views.
        subscriber_count: Total channel subscribers.
        video_count: Number of videos on the channel.
        channel_published_on: ``datetime.datetime`` of channel creation.
            A naive value is interpreted as UTC.

    Returns:
        The growth score as a float.
    """
    now_utc = datetime.datetime.now(datetime.timezone.utc)
    # Naive and aware datetimes cannot be subtracted; treat naive input as UTC
    if channel_published_on.tzinfo is None:
        channel_published_on = channel_published_on.replace(tzinfo=datetime.timezone.utc)
    # total_seconds() turns the timedelta into a number before scaling to years
    channel_age_in_years = (now_utc - channel_published_on).total_seconds() / (365 * 24 * 60 * 60)
    
    # Min-Max normalization (Assume predefined min/max values from dataset)
    min_views, max_views = 1000, 10000000
    min_subscribers, max_subscribers = 10, 1000000
    min_videos, max_videos = 1, 10000
    
    normalized_views = (view_count - min_views) / (max_views - min_views)
    normalized_subscribers = (subscriber_count - min_subscribers) / (max_subscribers - min_subscribers)
    normalized_videos = (video_count - min_videos) / (max_videos - min_videos)
    
    # Weighted sum
    growth_score = ((normalized_views * 0.5) + (normalized_subscribers * 0.3) + (normalized_videos * 0.2)) / channel_age_in_years
    
    return growth_score
```

## **Conclusion**
This approach helps in analyzing a YouTube channel's growth potential by factoring in **engagement, longevity, and content volume**. It provides a scalable and adaptable framework for evaluating growth trends over time.


In [5]:
# Min-Max normalization
def normalize(series):
    """Min-max scale *series* into the range [0, 1].

    Args:
        series: Numeric pandas Series.

    Returns:
        A Series with the same index where the minimum maps to 0 and the
        maximum to 1. When every value is identical the range is zero, so the
        result is 0.0 for every entry instead of NaN from a 0/0 division.
    """
    lowest = series.min()
    value_range = series.max() - lowest
    if value_range == 0:
        # Multiplying keeps the index (and any missing values) while avoiding 0/0
        return (series - lowest) * 0.0
    return (series - lowest) / value_range

In [6]:
# Define a function to handle different datetime formats
def parse_datetime(value):
    """Parse a 'Z'-suffixed ISO-8601 timestamp string into a pandas Timestamp.

    Two layouts are accepted: ``YYYY-MM-DDTHH:MM:SSZ`` and the same layout
    with fractional seconds (``YYYY-MM-DDTHH:MM:SS.ffffffZ``).

    Args:
        value: The raw timestamp; anything that is not a string (for example
            ``None`` or NaN) is treated as missing.

    Returns:
        A timezone-naive ``pd.Timestamp``, or ``pd.NaT`` when the value is
        missing or matches neither layout.
    """
    if not isinstance(value, str) or "Z" not in value:
        return pd.NaT  # return NaT if neither format can match
    if "." in value:
        layout = "%Y-%m-%dT%H:%M:%S.%fZ"
    else:
        layout = "%Y-%m-%dT%H:%M:%SZ"
    # errors="coerce" keeps the NaT contract for strings that only look like timestamps
    return pd.to_datetime(value, format=layout, errors="coerce")


In [7]:
def calculate_channel_growth(Cleaned_File):
    """Add channel growth and video engagement metrics to *Cleaned_File*.

    Columns added (the frame is modified in place and also returned):
        channelAgeInYears, channelNormalizedViewCount,
        channelNormalizedSubscriberCount, channelNormalizedVideoCount,
        channelNormalizedChannelAge, channelGrowthScore, videoAgeInDays,
        videoViewsPerDay, videoLikeToViewRatio, videoCommentToViewRatio,
        videoEngagementScore.

    The growth score is the weighted sum of the normalized view, subscriber
    and video counts (weights 0.5 / 0.3 / 0.2) divided by the channel age in
    years. The engagement score is
    ``0.5 * views per day + 0.3 * like ratio % + 0.2 * comment ratio %``.

    Args:
        Cleaned_File: DataFrame with the channel/video count columns and the
            ``channelPublishedOn`` / ``videoPublishedOn`` timestamp strings.

    Returns:
        The same DataFrame with the metric columns added.
    """
    # parse_datetime yields naive timestamps taken from 'Z' (UTC) strings, so the
    # reference "now" must be naive UTC as well; a local wall-clock time would
    # shift every age by the zone offset.
    current_utc_time = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)

    # ---- Channel ----
    channelPublishedOn = pd.to_datetime(Cleaned_File["channelPublishedOn"].apply(parse_datetime))

    seconds_per_year = 365 * 24 * 60 * 60
    Cleaned_File['channelAgeInYears'] = (current_utc_time - channelPublishedOn).dt.total_seconds() / seconds_per_year

    # Min-Max normalization
    Cleaned_File['channelNormalizedViewCount'] = normalize(Cleaned_File['channelViewCount'])
    Cleaned_File['channelNormalizedSubscriberCount'] = normalize(Cleaned_File['channelSubscriberCount'])
    Cleaned_File['channelNormalizedVideoCount'] = normalize(Cleaned_File['channelVideoCount'])
    # Kept as an output column; it is deliberately not used as the divisor below
    Cleaned_File['channelNormalizedChannelAge'] = normalize(Cleaned_File['channelAgeInYears'])

    # Weights
    weight_views = 0.5
    weight_subscribers = 0.3
    weight_videos = 0.2

    # Growth Score Calculation
    weighted_sum = ((Cleaned_File['channelNormalizedViewCount'] * weight_views) +
                    (Cleaned_File['channelNormalizedSubscriberCount'] * weight_subscribers) +
                    (Cleaned_File['channelNormalizedVideoCount'] * weight_videos))
    # Divide by the real age in years: the min-max normalized age is exactly 0
    # for the youngest channel, which would turn its score into inf/NaN.
    Cleaned_File['channelGrowthScore'] = weighted_sum / Cleaned_File['channelAgeInYears']

    # ---- Video ----
    videoPublishedOn = pd.to_datetime(Cleaned_File["videoPublishedOn"].apply(parse_datetime))

    # Compute video age
    seconds_per_day = 24 * 60 * 60
    Cleaned_File["videoAgeInDays"] = (current_utc_time - videoPublishedOn).dt.total_seconds() / seconds_per_day

    # Calculate engagement metrics
    Cleaned_File["videoViewsPerDay"] = Cleaned_File["videoViewCount"] / Cleaned_File["videoAgeInDays"]
    Cleaned_File["videoLikeToViewRatio"] = Cleaned_File["videoLikeCount"] / Cleaned_File["videoViewCount"]
    Cleaned_File["videoCommentToViewRatio"] = Cleaned_File["videoCommentCount"] / Cleaned_File["videoViewCount"]

    # Engagement Score Calculation (ratios are scaled to percentages)
    Cleaned_File["videoEngagementScore"] = (
        (Cleaned_File["videoViewsPerDay"] * 0.5) +
        (Cleaned_File["videoLikeToViewRatio"] * 100 * 0.3) +
        (Cleaned_File["videoCommentToViewRatio"] * 100 * 0.2)
    )

    return Cleaned_File

In [8]:
def FeatureEngineering(Cleaned_File):
    """Derive the analysis features and attach country details.

    Adds ``videoPublishedWeekDay``, ``videoDurationClassification``, the
    metric columns produced by ``calculate_channel_growth``,
    ``channelGrowthScoreRank`` and ``videoEngagementScoreRank``, then
    left-joins the country details on ``channelCountry``.

    Args:
        Cleaned_File: DataFrame returned by the cleaning step.

    Returns:
        The merged DataFrame; every remaining missing value is replaced with
        the string ``'Unknown'``.
    """
    # videoPublishedWeekDay - Derive the day of the week from the videoPublishedOn timestamp.
    # parse_datetime is used (as in calculate_channel_growth) because the column can mix
    # timestamps with and without fractional seconds, which a single inferred format rejects.
    videoPublishedOn = pd.to_datetime(Cleaned_File["videoPublishedOn"].apply(parse_datetime))
    Cleaned_File['videoPublishedWeekDay'] = videoPublishedOn.dt.day_name()

    # videoDurationClassification - Categorize videos into duration segments based on videoDurationInSeconds.
        # Categories:
        #     Very Short (0 - 60 sec), Typically Shorts, Reels, or quick snippets.
        #     Short (61 sec - 2 min), Brief content, short tutorials, or quick explanations.
        #     Medium (2 min 1 sec - 5 min), Standard short-form content, concise videos.
        #     Long (5 min 1 sec - 10 min), In-depth discussions, detailed tutorials.
        #     Very Long (10 min 1 sec - 1 hour), Educational content, podcasts, detailed explainers.
        #     Extended (1 hour 1 sec - 3 hours), Webinars, live sessions, long-form discussions.
        #     Ultra Long (3 hours 1 sec and above), Movie-length content, streams, recorded conferences.
    Cleaned_File['videoDurationClassification'] = Cleaned_File['videoDurationInSeconds'].apply(videoDurationClassification)

    # channelGrowthScore and videoEngagementScore, together with the intermediate
    # normalized / per-day / ratio columns they are built from.
    Cleaned_File = calculate_channel_growth(Cleaned_File)

    # channelGrowthScoreRank - Rank channels based on the channelGrowth metric.
    # NOTE(review): rank() is ascending by default, so rank 1 is the LOWEST score - confirm this is intended.
    Cleaned_File["channelGrowthScoreRank"] = Cleaned_File["channelGrowthScore"].rank()
    # videoEngagementScoreRank - Rank videos based on the videoEngagement metric.
    Cleaned_File["videoEngagementScoreRank"] = Cleaned_File["videoEngagementScore"].rank()

    # Geographic Classification - Assign an upper-level geographical classification.
        # Columns include:
            # - country_name
            # - continent
            # - continent_code
            # - it_hub_country (indicator for whether the country is a major IT hub).
    # The requirement file is transposed so each country code becomes a row
    Country_Details_ISO = Requirement_File_Extraction(repo_url,kaggle_repo_url).transpose()
    Country_Details_ISO = Country_Details_ISO.reset_index()
    Country_Details_ISO.rename(columns={'index': 'country_code'}, inplace=True)

    # Left join keeps every video even when its channelCountry has no matching country code
    resultDataFrame = pd.merge(Cleaned_File, Country_Details_ISO, left_on='channelCountry', right_on='country_code', how='left')
    # NOTE(review): this also writes 'Unknown' into numeric columns that contain NaN - confirm this is intended.
    resultDataFrame.fillna('Unknown', inplace=True)
    return resultDataFrame

In [9]:
def GitHubPush(Target_File_EN):
    """Save the DataFrame as JSON, copy it into the repository and push it.

    The file is named ``DC_<IST timestamp>_<record count>_records.json``,
    written to the current working directory, copied into
    ``destination_path``, committed and pushed to the ``origin`` remote.

    Args:
        Target_File_EN: DataFrame to publish.

    Returns:
        None. Progress and the push outcome are printed.
    """
    record_count = len(Target_File_EN)

    # Generate a timestamp for the file name using the current time in IST (Indian Standard Time).
    timestamp = datetime.datetime.now(ist).strftime("%Y-%m-%d_%H:%M:%S")

    # Create a filename using the generated timestamp to ensure uniqueness with number of records.
    filename = f"DC_{timestamp}_{record_count}_records.json"

    # Save the DataFrame to a JSON file with readable formatting.
    Target_File_EN.to_json(filename, orient="records", indent=4)
    print(f"DataFrame saved as {filename}")

    if not os.path.exists(destination_path):
        # Create the destination directory if it doesn't exist
        os.makedirs(destination_path)
        print('created the destination directory, DataCleaning/Daily')
    else:
        print('Destination directory already exists')

    # Copy from where the file was actually written (the current working
    # directory) rather than assuming a fixed working directory.
    shutil.copyfile(os.path.abspath(filename), f'{destination_path}/{filename}')

    # Initialize the repository for git operations
    repo = Repo(kaggle_repo_url)

    # Add the copied file to the staging area
    repo.index.add([f"{destination_path}/{filename}"])

    # Create a timestamp for the commit message
    timestamp = datetime.datetime.now(ist).strftime("%Y-%m-%d_%H:%M:%S")
    # Commit the changes with a message including the timestamp
    repo.index.commit(f"{timestamp} Added files from Kaggle notebook, {filename}")

    # Push the changes to the remote repository
    origin = repo.remote(name="origin")
    push_result = origin.push()

    # A non-empty result is not proof of success: rejected or failed refs are
    # reported as entries carrying the ERROR flag, and a completely failed
    # push yields an empty result.
    push_failed = (not push_result) or any(info.flags & info.ERROR for info in push_result)
    if push_failed:
        print("Push failed.")
    else:
        print("Push successful.")

In [10]:
def main():
    """Run the pipeline end to end: extract, clean, engineer features, publish."""
    raw_records = Source_File_Extraction(repo_url,kaggle_repo_url)
    # Cleaning feeds straight into feature engineering
    feature_records = FeatureEngineering(DataCleaning(raw_records))
    GitHubPush(feature_records)

In [11]:
# Script entry point: import dependencies, define the configuration globals
# that the functions above read, then run the pipeline.
if __name__ == "__main__":
    # NOTE: the imports live inside this guard, so the functions defined above
    # only find these names (os, git, pd, re, ...) as globals after this block has run.
    import os  
    import git 
    from git import Repo  
    import time
    import datetime  
    from pytz import timezone
    import pytz
    import pandas as pd
    import deep_translator
    from deep_translator import GoogleTranslator
    import shutil
    import emoji
    import re
    import html
    from kaggle_secrets import UserSecretsClient
    # The remote repository URL is read from the Kaggle secret "dataCleanRepoUrl"
    user_secrets = UserSecretsClient()
    secret_value_0 = user_secrets.get_secret("dataCleanRepoUrl")
    repo_url = secret_value_0
    # Timezone used for the timestamps in output file names and commit messages
    ist = timezone('Asia/Kolkata')
    # Local checkout directory and the folders inside it that the pipeline reads from / writes to
    kaggle_repo_url = '/kaggle/working/YouTubeFoodChannelAnalysis'
    destination_path = '/kaggle/working/YouTubeFoodChannelAnalysis/DataCleaning/Daily'
    source_path = '/kaggle/working/YouTubeFoodChannelAnalysis/Source/Daily'
    requirement_path = '/kaggle/working/YouTubeFoodChannelAnalysis/Requirement/Daily'
    # Below script prevents all columns and rows from getting truncated while display
    pd.set_option("display.max_columns", None)
    pd.set_option("display.max_rows",None)
    main()


Successfully cloned the git repo
Already cloned and the repo file exists
Successfully pulled the git repo before push
DataFrame saved as DC_2025-02-05_01:33:35_412_records.json
Destination directory already exists
Push successful.
