---
**Observation:**  

1. Null values are present in the following columns:
   - **`videoDefaultLanguage`**  (dropped at the end of data cleaning)
   - **`videoDefaultAudioLanguage`** 
   - **`channelCountry`**

---

2. The following columns will be dropped as part of data cleaning:
   - **`videoDescription`**: Reserved for analysis in a future NLP project with a broader dataset.  
   - **`videoLiveBroadcastContent`**: All values are `'none'`, providing no variability or insights.  
   - **`videoFavoriteCount`**: All values are `0`, making it redundant.  
   - **`videoTags`**: Reserved for analysis in a future NLP project with a broader dataset.  
   - **`videoUniqueId`**: Identified as a duplicate column.  
   - **`channelIdUnique`**: Identified as a duplicate column.  
   - **`channelTitleCheck`**: Identified as a duplicate column.  
   - **`channelDescription`**: Reserved for analysis in a future NLP project with a broader dataset.
---

3. The columns **`channelName`** and **`videoTitle`** require further processing due to the presence of:
    - Multilingual text.  
    - Emojis and special characters.  

---

In [1]:
!pip install deep_translator
!pip install emoji

Collecting deep_translator
  Downloading deep_translator-1.11.4-py3-none-any.whl.metadata (30 kB)
Downloading deep_translator-1.11.4-py3-none-any.whl (42 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.3/42.3 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: deep_translator
Successfully installed deep_translator-1.11.4


In [2]:
def Source_File_Extraction(repo_url,kaggle_repo_url):
    """Sync the local repository checkout and load the newest source JSON file.

    Args:
        repo_url: Remote URL to clone from when no local checkout exists.
        kaggle_repo_url: Local path of the repository checkout.

    Returns:
        The DataFrame read from the file in the module-level ``source_path``
        directory whose name starts with ``S_``, ends with ``records.json``
        and sorts last lexicographically.

    Raises:
        ValueError: If no file in ``source_path`` matches the naming pattern.
    """
    if not os.path.exists(kaggle_repo_url):
        # No local checkout yet: clone the repository.
        git.Repo.clone_from(repo_url, kaggle_repo_url)
        print("Successfully cloned the git repo")
    else:
        print("Already cloned and the repo file exists")
        # Reuse the existing checkout and bring it up to date with origin.
        git.Repo(kaggle_repo_url).remote(name='origin').pull()
        print("Successfully pulled the git repo before push")

    candidates = [
        entry
        for entry in os.listdir(source_path)
        if entry.startswith("S_") and entry.endswith('records.json')
    ]
    # The lexicographically greatest matching name is the one that gets loaded.
    latest_name = max(candidates)
    return pd.read_json(f'{source_path}/{latest_name}')

---



In [3]:
def _clean_text(text, translator):
    """Return *text* translated to English (if non-ASCII) and reduced to plain ASCII.

    Non-string values (e.g. missing values) are returned unchanged. A failed
    translation is reported and the untranslated text is still cleaned, so
    one bad request never leaves emojis or HTML entities behind.
    """
    if not isinstance(text, str):
        return text

    if not text.isascii():
        # Keep the try body minimal: only the network-backed call can
        # reasonably fail here.
        try:
            translated = translator.translate(text)
        except Exception as e:
            print(e)
        else:
            # Ignore a non-string result and keep the original text.
            if isinstance(translated, str):
                text = translated

    # Remove emojis
    text = emoji.replace_emoji(text, replace='')
    # Decode HTML entities like &amp; and &#39;
    text = html.unescape(text)
    # Remove any remaining non-ASCII characters
    return re.sub(r'[^\x00-\x7F]+', '', text)


def DataCleaning(Target_File):
    """Clean the raw DataFrame and keep only videos with English audio.

    Steps, in order:
      1. Drop the columns that are excluded from this analysis.
      2. Remove duplicate rows.
      3. Keep rows whose ``videoDefaultAudioLanguage`` starts with ``"en"``
         (missing values are excluded).
      4. Translate non-ASCII ``channelName`` / ``videoTitle`` values to
         English, then strip emojis, decode HTML entities and remove any
         remaining non-ASCII characters.
      5. Remove duplicates again, drop ``videoDefaultLanguage`` and fill
         missing ``channelCountry`` values with ``'Unknown'``.

    Args:
        Target_File: DataFrame containing the raw source records.

    Returns:
        A new, cleaned DataFrame with a fresh 0..n-1 index.
    """
    # Dropped the columns
    dropped_columns = [
        'videoDescription',
        'videoLiveBroadcastContent',
        'videoFavoriteCount',
        'videoTags',
        'videoUniqueId',
        'channelIdUnique',
        'channelTitleCheck',
        'channelDescription',
    ]
    Target_File = Target_File.drop(dropped_columns, axis=1)

    # Removing Duplicates
    Target_File = Target_File.drop_duplicates(ignore_index=True)

    # Removing the videos whose videoDefaultAudioLanguage is missing or does not start with "en"
    english_audio = Target_File['videoDefaultAudioLanguage'].str.startswith("en", na=False)
    Target_File_EN = Target_File[english_audio].reset_index(drop=True)

    # One translator instance is reused for every value instead of being
    # rebuilt for each cell.
    translator = GoogleTranslator(source='auto', target='en')
    for column in ('channelName', 'videoTitle'):
        Target_File_EN[column] = Target_File_EN[column].map(
            lambda value: _clean_text(value, translator)
        )

    # Removing Duplicates (cleaning can make previously distinct rows identical)
    Target_File_EN = Target_File_EN.drop_duplicates(ignore_index=True)
    Target_File_EN = Target_File_EN.drop(['videoDefaultLanguage'], axis=1)
    Target_File_EN['channelCountry'] = Target_File_EN['channelCountry'].fillna('Unknown')
    return Target_File_EN


In [4]:
def GitHubPush(Target_File_EN):
    """Save the cleaned DataFrame as JSON, then commit and push it to the repository.

    The file is written to the current working directory, copied into the
    module-level ``destination_path`` inside the repository checkout at
    ``kaggle_repo_url``, committed, and pushed to the ``origin`` remote.

    Args:
        Target_File_EN: Cleaned DataFrame to export.
    """
    record_count = len(Target_File_EN)

    # Generate a timestamp for the file name using the current time in the `ist` timezone.
    timestamp = datetime.datetime.now(ist).strftime("%Y-%m-%d_%H:%M:%S")

    # Create a filename using the generated timestamp to ensure uniqueness with number of records.
    filename = f"DC_{timestamp}_{record_count}_records.json"

    # Save the DataFrame to a JSON file with readable formatting.
    Target_File_EN.to_json(filename, orient="records", indent=4)
    print(f"DataFrame saved as {filename}")

    if not os.path.exists(destination_path):
        # Create the destination directory if it doesn't exist
        os.makedirs(destination_path)
        print('created the destination directory, DataCleaning/Daily')
    else:
        print('Destination directory already exists')

    # Copy from the path the file was actually written to (relative to the
    # current working directory) rather than assuming a fixed directory.
    shutil.copyfile(filename, f'{destination_path}/{filename}')

    # Initialize the repository for git operations
    repo = Repo(kaggle_repo_url)

    # Add the copied file to the staging area
    repo.index.add([f"{destination_path}/{filename}"])

    # Create a timestamp for the commit message
    timestamp = datetime.datetime.now(ist).strftime("%Y-%m-%d_%H:%M:%S")
    # Commit the changes with a message including the timestamp
    repo.index.commit(f"{timestamp} Added files from Kaggle notebook, {filename}")

    # Push the changes to the remote repository
    origin = repo.remote(name="origin")
    push_result = origin.push()

    # A non-empty result only means the remote answered; a rejected or failed
    # ref is reported through the ERROR bit of each entry's flags.
    push_failed = (not push_result) or any(
        info.flags & info.ERROR for info in push_result
    )
    if push_failed:
        print("Push failed.")
    else:
        print("Push successful.")

In [5]:
def main():
    """Run the pipeline: load the source data, clean it, display it and push it."""
    raw_frame = Source_File_Extraction(repo_url, kaggle_repo_url)
    cleaned_frame = DataCleaning(raw_frame)

    # Display the cleaned records ordered from shortest to longest video.
    ordered_view = cleaned_frame.sort_values(by='videoDurationInSeconds', ascending=True)
    display(ordered_view)

    GitHubPush(cleaned_frame)

In [6]:
# Entry point: import dependencies, define the module-level configuration
# that the functions above read as globals, then run the pipeline.
if __name__ == "__main__":
    import os  
    import git 
    from git import Repo  
    import datetime  
    from pytz import timezone
    import pandas as pd
    import deep_translator
    from deep_translator import GoogleTranslator
    # NOTE(review): `timezone` is already imported above; this repeat is redundant.
    from pytz import timezone 
    import shutil
    import emoji
    import re
    import html
    from kaggle_secrets import UserSecretsClient
    # The repository URL is read from the Kaggle secret "dataCleanRepoUrl"
    # rather than being written into the notebook.
    user_secrets = UserSecretsClient()
    secret_value_0 = user_secrets.get_secret("dataCleanRepoUrl")
    repo_url = secret_value_0
    # Timezone used for the timestamps in file names and commit messages.
    ist = timezone('Asia/Kolkata')
    # Local checkout of the repository, and the directories inside it that
    # the cleaned output is copied to and the source files are read from.
    kaggle_repo_url = '/kaggle/working/YouTubeFoodChannelAnalysis'
    destination_path = '/kaggle/working/YouTubeFoodChannelAnalysis/DataCleaning/Daily'
    source_path = '/kaggle/working/YouTubeFoodChannelAnalysis/Source/Daily'
    # Show every column and row when a DataFrame is displayed (no truncation).
    pd.set_option("display.max_columns", None)
    pd.set_option("display.max_rows",None)
    main()

Successfully cloned the git repo


Unnamed: 0,channelId,channelName,videoId,videoTitle,videoPublishYear,videoPublishMonth,videoPublishDay,videoPublishTime,videoPublishedOn,videoPublishedOnInSeconds,videoViewCount,videoLikeCount,videoCommentCount,videoCategoryId,videoDefaultAudioLanguage,videoDuration,videoDurationInSeconds,ContentType,videoDimension,videoDefinition,videoCaption,videoLicensedContent,videoProjection,channelCustomUrl,channelPublishYear,channelPublishMonth,channelPublishDay,channelPublishTime,channelPublishedOn,channelPublishedOnInSeconds,channelCountry,channelViewCount,channelSubscriberCount,channelVideoCount
215,UCiPMh3oIcVee2Q2p8_W-lxg,Blujay Technologies,lLubrB7zt6g,DevOps Zero to Hero,2023,10,29,16:46:48,2023-10-29T16:46:48Z,1698598008,2511,76,0,22,en,PT14S,14,Short,2d,hd,True,False,rectangular,@blujaytech,2012,8,11,08:54:06,2012-08-11T08:54:06Z,1344675246,Unknown,55970,2760,413
406,UCXddspaT5bYIQ9133GPYQdQ,Integrity Vision,KN5kv1Ya2vQ,QA: What is Quality Assurance #qualityassuranc...,2024,8,26,20:00:07,2024-08-26T20:00:07Z,1724702407,153,3,0,28,en-GB,PT26S,26,Short,2d,hd,True,False,rectangular,@integrityvision,2010,9,20,13:59:00,2010-09-20T13:59:00Z,1284991140,UA,28317,93,149
318,UCuVZCbckCp0O49wlvkcDnbA,Glitter Brains,anX7_5iZoWo,The Top 5 Must-Have DevOps Tools You Need for ...,2023,5,2,14:00:02,2023-05-02T14:00:02Z,1683036002,499,17,0,28,en-US,PT32S,32,Short,2d,hd,True,False,rectangular,@glitterbrains2431,2023,3,9,05:14:49.818231,2023-03-09T05:14:49.818231Z,1678338889,Unknown,27474,90,103
307,UCzUEbiVPFhuRvkwYFbGPALg,The Resume Whisperer,OfTCfPylRUc,Cracking the DevOps Code: Courses & Certificat...,2023,8,8,23:53:26,2023-08-08T23:53:26Z,1691538806,573,21,0,27,en,PT33S,33,Short,2d,hd,True,False,rectangular,@airesumebuilder,2021,7,24,12:10:09.321493,2021-07-24T12:10:09.321493Z,1627128609,US,259652,351,185
153,UC4PSydHaXya-VSrgFCdRULA,The Deep Stack Podcast,LO5Uj9o6fYs,It shouldn't be so annoying to run a container...,2024,7,8,16:52:17,2024-07-08T16:52:17Z,1720457537,6318,39,0,28,en-CA,PT35S,35,Short,2d,hd,True,False,rectangular,@thedeepstackpodcast,2023,4,4,02:59:37.024718,2023-04-04T02:59:37.024718Z,1680577177,CA,19586,268,6
395,UCBwSCyzT3GukpVdUdxMjKYw,Red Hat EMEA,n2HGPoSuKfI,Red Hat DevOps Culture and Practices - the sec...,2024,4,26,19:39:35,2024-04-26T19:39:35Z,1714160375,167,1,0,28,en,PT36S,36,Short,2d,hd,True,False,rectangular,@redhatemea,2013,7,15,17:06:36,2013-07-15T17:06:36Z,1373907996,Unknown,1133773,3490,370
412,UCbrXRQHV4TOU4Pqzw325Z1A,Pluralsight,FT-2pkA2zhs,Changing behaviors makes you better... #tech ...,2023,10,5,20:00:04,2023-10-05T20:00:04Z,1696536004,123,4,0,28,en-US,PT39S,39,Short,2d,hd,True,False,rectangular,@pluralsight,2009,3,9,14:47:15,2009-03-09T14:47:15Z,1236610035,US,16892109,91200,3928
325,UCJFBnZFBlHfuSWdLM4jX9bw,OutOfDevOps,1cTRjLrSOGw,What's the goal of Platform Engineering? #shor...,2023,3,22,12:45:00,2023-03-22T12:45:00Z,1679489100,436,13,0,27,en-GB,PT40S,40,Short,2d,hd,True,False,rectangular,@outofdevops,2020,6,12,02:38:47.618757,2020-06-12T02:38:47.618757Z,1591929527,GB,106620,2140,102
329,UCmIIOHKgJnGQruIVD_Zx71g,Sleuth TV,8bQBLmPKGww,Trunk-based dev causes burnout? - State of De...,2023,12,11,15:00:04,2023-12-11T15:00:04Z,1702306804,411,6,0,28,en,PT43S,43,Short,2d,hd,True,False,rectangular,@sleuthtv,2020,5,18,15:02:07.21018,2020-05-18T15:02:07.21018Z,1589814127,US,2591248,1170,167
348,UCaJpQyYXpTlMoHHZuuaTTug,Oleg Shelaev,kMOAq1O8lCE,What are 15-factor apps and environment parity...,2023,8,30,07:46:59,2023-08-30T07:46:59Z,1693381619,319,7,0,28,en,PT43S,43,Short,2d,hd,True,False,rectangular,@shelajev,2009,5,30,12:36:00,2009-05-30T12:36:00Z,1243686960,EE,103577,1140,80


DataFrame saved as DC_2025-01-29_01:18:14_427_records.json
Destination directory already exists
Push successful.
