In [1]:
!pip install deep_translator
!pip install emoji

Collecting deep_translator
  Downloading deep_translator-1.11.4-py3-none-any.whl.metadata (30 kB)
Downloading deep_translator-1.11.4-py3-none-any.whl (42 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.3/42.3 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: deep_translator
Successfully installed deep_translator-1.11.4


In [2]:
import os  
import git 
from git import Repo  
import datetime  
from pytz import timezone
import pandas as pd
import deep_translator
from deep_translator import GoogleTranslator
from pytz import timezone 
import shutil
import emoji
import re
import html
from kaggle_secrets import UserSecretsClient
# Fetch the Git remote URL from Kaggle's secret store (secret name
# "dataCleanRepoUrl") so it does not have to be hard-coded in the notebook.
user_secrets = UserSecretsClient()
secret_value_0 = user_secrets.get_secret("dataCleanRepoUrl")

In [3]:
# --- Notebook configuration -------------------------------------------------

# Local clone of the repository, plus the folders read from / written to
# inside that clone.
kaggle_repo_url = '/kaggle/working/YouTubeFoodChannelAnalysis'
source_path = '/kaggle/working/YouTubeFoodChannelAnalysis/Source/Daily'
destination_path = '/kaggle/working/YouTubeFoodChannelAnalysis/DataCleaning/Daily'

# Remote URL used when the repository has to be cloned (value read from the
# Kaggle secret store).
repo_url = secret_value_0

# Time zone applied to the timestamps generated by this notebook.
ist = timezone('Asia/Kolkata')

In [4]:
# Make sure an up-to-date working copy of the repository is available locally.
if not os.path.exists(kaggle_repo_url):
    # Nothing on disk yet: clone the remote into the working directory.
    repo = git.Repo.clone_from(repo_url, kaggle_repo_url)
    print("Successfully cloned the git repo")
else:
    # A working copy already exists: reuse it and bring it up to date.
    print("Already cloned and the repo file exists")
    repo = git.Repo(kaggle_repo_url)
    origin = repo.remote(name='origin')
    origin.pull()
    print("Successfully pulled the git repo before push")

Successfully cloned the git repo


In [5]:
# Load the newest raw export from the source folder into `Target_File`.
output_files = os.listdir(source_path)

# Only files named 'S_...records.json' are candidates.
source_candidates = [
    name for name in output_files
    if name.startswith("S_") and name.endswith('records.json')
]
if not source_candidates:
    # Fail with an actionable message instead of the bare
    # "max() arg is an empty sequence" ValueError.
    raise FileNotFoundError(
        f"No file matching 'S_*records.json' was found in {source_path}"
    )

# The lexicographically largest name is selected.
# NOTE(review): presumably the names embed a sortable timestamp, so this is
# the most recent export — confirm against the notebook that writes them.
latest_source_file = max(source_candidates)

# Keep the file name and the DataFrame in separate variables so `Target_File`
# always refers to the data.
Target_File = pd.read_json(os.path.join(source_path, latest_source_file))

---
**Observation:**  

1. Null values are present in the following columns:
   - **`videoDefaultLanguage`**  (will be dropped after data cleaning)
   - **`videoDefaultAudioLanguage`** 
   - **`channelCountry`**

---

2. The following columns will be dropped as part of data cleaning:
   - **`videoDescription`**: Reserved for analysis in a future NLP project with a broader dataset.  
   - **`videoLiveBroadcastContent`**: All values are `'none'`, providing no variability or insights. 
   - **`videoFavoriteCount`**: All values are `0`, making it redundant.  
   - **`videoTags`**: Reserved for analysis in a future NLP project with a broader dataset.  
   - **`videoUniqueId`**: Identified as a duplicate column.  
   - **`channelIdUnique`**: Identified as a duplicate column.  
   - **`channelTitleCheck`**: Identified as a duplicate column.  
   - **`channelDescription`**: Reserved for analysis in a future NLP project with a broader dataset.
---

In [6]:
# Verify that the two columns slated for removal hold a single constant value
# by counting how many rows carry that value.
count = int(Target_File['videoLiveBroadcastContent'].eq('none').sum())
print(f'Count of none in videoLiveBroadcastContent: {count}\n')

count = int(Target_File['videoFavoriteCount'].eq(0).sum())
print(f'Count of 0 in videoFavoriteCount: {count}\n')

# Column overview (dtypes and non-null counts) of the raw frame.
Target_File.info()

Count of none in videoLiveBroadcastContent: 508

Count of 0 in videoFavoriteCount: 508

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 508 entries, 0 to 507
Data columns (total 43 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   currentDate                  508 non-null    object
 1   channelId                    508 non-null    object
 2   channelName                  508 non-null    object
 3   videoId                      508 non-null    object
 4   videoTitle                   508 non-null    object
 5   videoPublishYear             508 non-null    int64 
 6   videoPublishMonth            508 non-null    int64 
 7   videoPublishDay              508 non-null    int64 
 8   videoPublishTime             508 non-null    object
 9   videoPublishedOn             508 non-null    object
 10  videoPublishedOnInSeconds    508 non-null    int64 
 11  videoUniqueId                508 non-null    object
 12  vide

In [7]:
# Columns removed before any further cleaning (constant, duplicated, or
# free-text fields that are not used in this notebook).
columns_to_drop = [
    'videoDescription',
    'videoLiveBroadcastContent',
    'videoFavoriteCount',
    'videoTags',
    'videoUniqueId',
    'channelIdUnique',
    'channelTitleCheck',
    'channelDescription',
]
# errors='ignore' keeps the cell re-runnable: `Target_File` is overwritten
# here, so on a second execution the columns are already gone and a plain
# drop would raise KeyError.
Target_File = Target_File.drop(columns=columns_to_drop, errors='ignore')

Target_File.info()

# Prevent pandas from truncating columns or rows when frames are displayed.
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

# Remove exact duplicate rows.
Target_File = Target_File.drop_duplicates(ignore_index=True)

# Keep only videos whose videoDefaultAudioLanguage starts with "en";
# rows where the language is missing are excluded (na=False).
is_english_audio = Target_File['videoDefaultAudioLanguage'].str.startswith("en", na=False)
Target_File_EN = Target_File[is_english_audio].reset_index(drop=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 508 entries, 0 to 507
Data columns (total 35 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   currentDate                  508 non-null    object
 1   channelId                    508 non-null    object
 2   channelName                  508 non-null    object
 3   videoId                      508 non-null    object
 4   videoTitle                   508 non-null    object
 5   videoPublishYear             508 non-null    int64 
 6   videoPublishMonth            508 non-null    int64 
 7   videoPublishDay              508 non-null    int64 
 8   videoPublishTime             508 non-null    object
 9   videoPublishedOn             508 non-null    object
 10  videoPublishedOnInSeconds    508 non-null    int64 
 11  videoViewCount               508 non-null    int64 
 12  videoLikeCount               508 non-null    int64 
 13  videoCommentCount            508 no

---

**Observation:**  

The columns **`channelName`** and **`videoTitle`** require further processing due to the presence of:  
- Multilingual text.  
- Emojis and special characters.  

---

In [8]:
# Matches any run of characters outside the 7-bit ASCII range.
_NON_ASCII_RE = re.compile(r'[^\x00-\x7F]+')


def _translate_to_english(text, translator, cache):
    """Return `text` translated to English, or `text` itself if that is not possible.

    Parameters
    ----------
    text : str
        The string to translate. Pure-ASCII strings are returned unchanged
        without contacting the translation service.
    translator : object
        Any object exposing ``translate(str) -> str`` (here a GoogleTranslator).
    cache : dict
        Maps already-translated inputs to their translation so that repeated
        values (e.g. the same channel name on many rows) are translated once.

    Returns
    -------
    str
        The translation, or the original text when the service raises or
        returns an empty / non-string result.
    """
    if text.isascii():
        return text
    if text in cache:
        return cache[text]
    try:
        translated = translator.translate(text)
    except Exception as exc:
        # Best effort: report which value failed and fall back to the
        # original text so the remaining cleanup steps still run on it.
        print(f"Translation failed for {text!r}: {exc}")
        return text
    if not isinstance(translated, str) or not translated:
        # Guard against an empty/None result overwriting the original value.
        return text
    cache[text] = translated
    return translated


def _clean_text(value, translator, cache):
    """Normalise one channel name / video title to plain ASCII English text.

    Steps, in order: translate non-ASCII text to English, remove emojis,
    decode HTML entities (such as ``&amp;`` and ``&#39;``), then strip any
    remaining non-ASCII characters. Non-string values are returned unchanged.
    """
    if not isinstance(value, str):
        return value
    text = _translate_to_english(value, translator, cache)
    text = emoji.replace_emoji(text, replace='')
    text = html.unescape(text)
    return _NON_ASCII_RE.sub('', text)


# One translator instance and one cache are shared by both columns.
_translator = GoogleTranslator(source='auto', target='en')
_translation_cache = {}

for column in ('channelName', 'videoTitle'):
    Target_File_EN[column] = [
        _clean_text(value, _translator, _translation_cache)
        for value in Target_File_EN[column]
    ]

# Removing duplicates again: rows can become identical once text is normalised.
Target_File_EN = Target_File_EN.drop_duplicates(ignore_index=True)

In [9]:
# Print the column overview (dtypes and non-null counts) of the English-only
# frame after the text cleanup and de-duplication above.
Target_File_EN.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 425 entries, 0 to 424
Data columns (total 35 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   currentDate                  425 non-null    object
 1   channelId                    425 non-null    object
 2   channelName                  425 non-null    object
 3   videoId                      425 non-null    object
 4   videoTitle                   425 non-null    object
 5   videoPublishYear             425 non-null    int64 
 6   videoPublishMonth            425 non-null    int64 
 7   videoPublishDay              425 non-null    int64 
 8   videoPublishTime             425 non-null    object
 9   videoPublishedOn             425 non-null    object
 10  videoPublishedOnInSeconds    425 non-null    int64 
 11  videoViewCount               425 non-null    int64 
 12  videoLikeCount               425 non-null    int64 
 13  videoCommentCount            425 no

In [10]:
# Target_File_EN = Target_File_EN.isnull().any(axis=1)

In [11]:
# Boolean mask that is True for every row containing at least one missing value.
Target_File_EN_NULL = Target_File_EN.isna().any(axis="columns")

# Rows selected by the mask, shown for inspection.
null_rows = Target_File_EN.loc[Target_File_EN_NULL]

display(null_rows)

Unnamed: 0,currentDate,channelId,channelName,videoId,videoTitle,videoPublishYear,videoPublishMonth,videoPublishDay,videoPublishTime,videoPublishedOn,videoPublishedOnInSeconds,videoViewCount,videoLikeCount,videoCommentCount,videoCategoryId,videoDefaultLanguage,videoDefaultAudioLanguage,videoDuration,videoDurationInSeconds,videoDimension,videoDefinition,videoCaption,videoLicensedContent,videoProjection,channelCustomUrl,channelPublishYear,channelPublishMonth,channelPublishDay,channelPublishTime,channelPublishedOn,channelPublishedOnInSeconds,channelCountry,channelViewCount,channelSubscriberCount,channelVideoCount
2,2025-01-28,UCs_tLP3AiwYKwdUHpltJPuA,GOTO Conferences,Qv92qaIGbDg,Expert Talk: DevOps & Software Architecture S...,2022,2,15,13:00:09,2022-02-15T13:00:09Z,1644930009,3109431,8574,47,28,,en,2391000,2391,2d,hd,True,True,rectangular,@goto-,2011,10,3,12:34:19,2011-10-03T12:34:19Z,1317645259,CH,61896889,1050000,3058
3,2025-01-28,UC8butISFwT-Wl7EV0hUK0BQ,freeCodeCamp.org,fqMOX6JJhGo,Docker Tutorial for Beginners - A Full DevOps ...,2019,8,16,13:48:15,2019-08-16T13:48:15Z,1565963295,2696007,45602,1150,27,,en,7819000,7819,2d,hd,True,True,rectangular,@freecodecamp,2014,12,16,21:18:48,2014-12-16T21:18:48Z,1418764728,US,826365715,10400000,1798
8,2025-01-28,UCkw4JCwteGrDHIsyIIKo4tQ,edureka!,a9_oMNSgX2g,Introduction to DevOps | DevOps Tutorial for B...,2018,3,26,14:23:23,2018-03-26T14:23:23Z,1522074203,627578,7070,77,27,,en,1206000,1206,2d,hd,True,True,rectangular,@edurekain,2012,6,29,06:12:26,2012-06-29T06:12:26Z,1340950346,IN,443515789,4330000,11558
11,2025-01-28,UCkw4JCwteGrDHIsyIIKo4tQ,edureka!,3EyT1i0wYUY,What is DevOps? | DevOps Training - DevOps Int...,2016,9,15,13:29:57,2016-09-15T13:29:57Z,1473946197,479022,2739,170,27,,en,534000,534,2d,hd,True,True,rectangular,@edurekain,2012,6,29,06:12:26,2012-06-29T06:12:26Z,1340950346,IN,443515789,4330000,11558
13,2025-01-28,UCkw4JCwteGrDHIsyIIKo4tQ,edureka!,GJQ36pIYbic,DevOps Tutorial For Beginners | What Is DevOps...,2018,4,20,05:33:22,2018-04-20T05:33:22Z,1524202402,359109,3308,94,27,,en,4272000,4272,2d,hd,True,True,rectangular,@edurekain,2012,6,29,06:12:26,2012-06-29T06:12:26Z,1340950346,IN,443515789,4330000,11558
14,2025-01-28,UCH912uDFX3sIaPXMrJa9X_w,Questpond,aonA7Kb7WGE,Azure DevOps Step by Step Tutorial for Beginne...,2023,3,12,08:30:14,2023-03-12T08:30:14Z,1678609814,352259,6929,494,27,,en-GB,6127000,6127,2d,hd,True,True,rectangular,@questpondvideos,2009,9,5,12:02:49,2009-09-05T12:02:49Z,1252152169,IN,20974289,220000,561
15,2025-01-28,UCHIbErciyS3Hs0kjAz-at5Q,Technical Suneja,XvtFppcynYM,DevOps & Cloud Engineer Certifications Get Ce...,2023,8,25,12:30:30,2023-08-25T12:30:30Z,1692966630,342687,9869,312,27,,en,2430000,2430,2d,hd,True,True,rectangular,@technicalsuneja,2016,6,14,18:47:50,2016-06-14T18:47:50Z,1465930070,IN,52059250,523000,919
17,2025-01-28,UCkw4JCwteGrDHIsyIIKo4tQ,edureka!,z6Olg2YRPC4,DevOps Tutorial | DevOps Tutorial for Beginner...,2017,1,5,07:27:25,2017-01-05T07:27:25Z,1483601245,319403,1361,162,27,,en,8375000,8375,2d,hd,True,True,rectangular,@edurekain,2012,6,29,06:12:26,2012-06-29T06:12:26Z,1340950346,IN,443515789,4330000,11558
19,2025-01-28,UC0m-80FnNY2Qb7obvTL_2fA,Microsoft Azure,w6RURa_LPgQ,Use DevOps to build cloud-native apps anywhere...,2021,10,18,16:45:12,2021-10-18T16:45:12Z,1634575512,290925,47,0,28,,en,489000,489,2d,hd,True,False,rectangular,@microsoftazure,2008,11,21,07:09:44,2008-11-21T07:09:44Z,1227251384,US,22050485,333000,1306
22,2025-01-28,UCYeiozh-4QwuC1sjgCmB92w,DevOps Toolbox,Z5uBcczJxUY,Stop Using Docker. Use Open Source Instead,2025,1,24,14:30:17,2025-01-24T14:30:17Z,1737729017,234986,5401,367,28,,en,760000,760,2d,hd,True,True,rectangular,@devopstoolbox,2022,12,2,17:49:54.018545,2022-12-02T17:49:54.018545Z,1670003394,GB,3418125,42100,111


In [12]:
# Drop videoDefaultLanguage entirely. errors='ignore' keeps the cell
# re-runnable: `Target_File_EN` is overwritten here, so on a second execution
# the column is already gone and a plain drop would raise KeyError.
Target_File_EN = Target_File_EN.drop(columns=['videoDefaultLanguage'], errors='ignore')

# Replace missing channel countries with an explicit placeholder.
Target_File_EN['channelCountry'] = Target_File_EN['channelCountry'].fillna('Unknown')

In [13]:
record_count = len(Target_File_EN)

# Timestamp in IST (Asia/Kolkata) used to make the output file name unique.
# NOTE(review): the ':' characters are not valid in Windows file names —
# TODO confirm this repository is never checked out on Windows.
timestamp = datetime.datetime.now(ist).strftime("%Y-%m-%d_%H:%M:%S")

# File name carries the timestamp and the number of records it contains.
filename = f"DC_{timestamp}_{record_count}_records.json"

# Save the DataFrame as pretty-printed JSON in the current working directory.
# The absolute path is resolved from the actual working directory rather than
# assuming it is '/kaggle/working', so the copy below always finds the file.
local_path = os.path.abspath(filename)
Target_File_EN.to_json(local_path, orient="records", indent=4)
print(f"DataFrame saved as {filename}")

# Make sure the destination folder inside the repository exists.
if os.path.exists(destination_path):
    print('Destination directory already exists')
else:
    os.makedirs(destination_path)
    print('created the destination directory, DataCleaning/Daily')

# Copy the file into the repository working tree (once, for both cases).
repo_file_path = os.path.join(destination_path, filename)
shutil.copyfile(local_path, repo_file_path)

# Initialize the repository for git operations.
repo = Repo(kaggle_repo_url)

# Stage the copied file.
repo.index.add([repo_file_path])

# Commit with a message that includes the commit time and the file name.
timestamp = datetime.datetime.now(ist).strftime("%Y-%m-%d_%H:%M:%S")
repo.index.commit(f"{timestamp} Added files from Kaggle notebook, {filename}")

# Push the commit to the remote repository.
origin = repo.remote(name="origin")
push_result = origin.push()

# `push()` returns one PushInfo per pushed ref. A non-empty result does not
# mean success: a rejected or failed ref is reported through the ERROR flag,
# and an empty result means nothing was pushed at all.
failed_refs = [info for info in push_result if info.flags & git.PushInfo.ERROR]
if len(push_result) > 0 and not failed_refs:
    print("Push successful.")
else:
    print("Push failed.")
    for info in failed_refs:
        print(f"  {info.summary.strip()}")

DataFrame saved as DC_2025-01-28_05:12:38_425_records.json
Destination directory already exists
Push successful.
