In [1]:
!pip install deep_translator
!pip install emoji

Collecting deep_translator
  Downloading deep_translator-1.11.4-py3-none-any.whl.metadata (30 kB)
Downloading deep_translator-1.11.4-py3-none-any.whl (42 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.3/42.3 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: deep_translator
Successfully installed deep_translator-1.11.4


In [2]:
import os  
import git 
from git import Repo  
import datetime  
from pytz import timezone
import pandas as pd
import deep_translator
from deep_translator import GoogleTranslator
from pytz import timezone 
import shutil
import emoji
import re
import html
from kaggle_secrets import UserSecretsClient
# Read the Git remote URL from Kaggle's secret store so it is never hardcoded
# in the notebook; the secret is looked up by its label "dataCleanRepoUrl".
user_secrets = UserSecretsClient()
secret_value_0 = user_secrets.get_secret("dataCleanRepoUrl")

In [3]:
# Remote URL of the Git repository (value of the Kaggle secret fetched earlier).
repo_url = secret_value_0
# Timezone used when timestamping output file names and commit messages.
ist = timezone('Asia/Kolkata')
# NOTE: despite its name this is the local filesystem path of the clone, not a URL.
kaggle_repo_url = '/kaggle/working/YouTubeFoodChannelAnalysis'
# Directory inside the clone that receives the cleaned daily files.
destination_path = '/kaggle/working/YouTubeFoodChannelAnalysis/DataCleaning/Daily'
# Directory inside the clone from which the daily source files are read.
source_path = '/kaggle/working/YouTubeFoodChannelAnalysis/Source/Daily'

In [4]:
# Make sure an up-to-date local checkout exists: clone on first run,
# otherwise reuse the existing working copy and pull the latest commits.
if not os.path.exists(kaggle_repo_url):
    # No local checkout yet -> create one from the remote.
    repo = git.Repo.clone_from(repo_url, kaggle_repo_url)
    print("Successfully cloned the git repo")
else:
    print("Already cloned and the repo file exists")
    # Open the existing working copy and sync it with its "origin" remote.
    repo = git.Repo(kaggle_repo_url)
    origin = repo.remote(name='origin')
    origin.pull()
    print("Successfully pulled the git repo before push")

Successfully cloned the git repo


In [5]:
# Load the most recent source extract into `Target_File`.
output_files = os.listdir(source_path)

# Only files named "S_...records.json" are candidate source extracts.
source_candidates = [
    name for name in output_files
    if name.startswith("S_") and name.endswith('records.json')
]
if not source_candidates:
    # Fail with an actionable message instead of max()'s
    # "arg is an empty sequence" ValueError.
    raise FileNotFoundError(
        f"No file matching 'S_*records.json' found in {source_path}"
    )

# max() selects the lexicographically greatest file name
# (presumably the newest one, if the names embed a sortable timestamp - confirm).
latest_source_file = max(source_candidates)

# Keep the file name and the DataFrame in separate variables so `Target_File`
# always refers to the loaded data.
Target_File = pd.read_json(os.path.join(source_path, latest_source_file))

---
**Observation:**  

1. Null values are present in the following columns:
   - **`videoDefaultLanguage`**  (will be dropped after data cleaning)
   - **`videoDefaultAudioLanguage`** 
   - **`channelCountry`**

---

2. The following columns will be dropped as part of data cleaning:
   - **`videoDescription`**: Reserved for analysis in a future NLP project with a broader dataset.  
   - **`videoLiveBroadcastContent`**: All values are `'none'`, providing no variability or insights. 
   - **`videoFavoriteCount`**: All values are `0`, making it redundant.  
   - **`videoTags`**: Reserved for analysis in a future NLP project with a broader dataset.  
   - **`videoUniqueId`**: Identified as a duplicate column.  
   - **`channelIdUnique`**: Identified as a duplicate column.  
   - **`channelTitleCheck`**: Identified as a duplicate column.  
   - **`channelDescription`**: Reserved for analysis in a future NLP project with a broader dataset.
---

In [6]:
# Verify that the two columns slated for removal really are constant:
# count the rows equal to 'none' / 0 and compare with the row total shown by info().
count = int(Target_File['videoLiveBroadcastContent'].eq('none').sum())
print(f'Count of none in videoLiveBroadcastContent: {count}\n')

count = int(Target_File['videoFavoriteCount'].eq(0).sum())
print(f'Count of 0 in videoFavoriteCount: {count}\n')

Target_File.info()

Count of none in videoLiveBroadcastContent: 509

Count of 0 in videoFavoriteCount: 509

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 509 entries, 0 to 508
Data columns (total 43 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   currentDate                  509 non-null    object
 1   channelId                    509 non-null    object
 2   channelName                  509 non-null    object
 3   videoId                      509 non-null    object
 4   videoTitle                   509 non-null    object
 5   videoPublishYear             509 non-null    int64 
 6   videoPublishMonth            509 non-null    int64 
 7   videoPublishDay              509 non-null    int64 
 8   videoPublishTime             509 non-null    object
 9   videoPublishedOn             509 non-null    object
 10  videoPublishedOnInSeconds    509 non-null    int64 
 11  videoUniqueId                509 non-null    object
 12  vide

In [7]:
# Columns removed during cleaning (constant, duplicated, or reserved for later NLP work).
columns_to_drop = [
    'videoDescription',
    'videoLiveBroadcastContent',
    'videoFavoriteCount',
    'videoTags',
    'videoUniqueId',
    'channelIdUnique',
    'channelTitleCheck',
    'channelDescription',
]
Target_File = Target_File.drop(columns=columns_to_drop)

Target_File.info()

# Show every column and every row when a frame is displayed (no truncation).
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

# Remove exact duplicate rows and renumber the index.
Target_File = Target_File.drop_duplicates(ignore_index=True)

# Keep only videos whose default audio language starts with "en";
# rows with a missing language are excluded (na=False).
is_english_audio = Target_File['videoDefaultAudioLanguage'].str.startswith("en", na=False)
Target_File_EN = Target_File[is_english_audio].reset_index(drop=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 509 entries, 0 to 508
Data columns (total 35 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   currentDate                  509 non-null    object
 1   channelId                    509 non-null    object
 2   channelName                  509 non-null    object
 3   videoId                      509 non-null    object
 4   videoTitle                   509 non-null    object
 5   videoPublishYear             509 non-null    int64 
 6   videoPublishMonth            509 non-null    int64 
 7   videoPublishDay              509 non-null    int64 
 8   videoPublishTime             509 non-null    object
 9   videoPublishedOn             509 non-null    object
 10  videoPublishedOnInSeconds    509 non-null    int64 
 11  videoViewCount               509 non-null    int64 
 12  videoLikeCount               509 non-null    int64 
 13  videoCommentCount            509 no

---

**Observation:**  

The columns **`channelName`** and **`videoTitle`** require further processing due to the presence of:  
- Multilingual text.  
- Emojis and special characters.  

---

In [8]:
def _clean_text(text, translator, translation_cache):
    """Return an English, emoji-free, ASCII-only version of ``text``.

    Steps, in order:
      1. Decode HTML entities (e.g. ``&amp;``, ``&#39;``) and strip emojis.
      2. If non-ASCII characters remain, translate the text to English.
      3. Strip any non-ASCII characters that are still present.

    Parameters
    ----------
    text : object
        Cell value to clean. Non-string values (e.g. NaN) are returned unchanged.
    translator : object
        Any object exposing ``translate(str)``; here a ``GoogleTranslator``.
    translation_cache : dict
        Maps already-translated inputs to their translation so that repeated
        values (the same channel name on many rows) cost one request only.

    Returns
    -------
    object
        The cleaned string, or ``text`` itself when it is not a string.
    """
    if not isinstance(text, str):
        return text

    # Clean first: emojis are non-ASCII, so removing them up front avoids a
    # network round-trip for titles that are otherwise plain English.
    cleaned = emoji.replace_emoji(html.unescape(text), replace='')

    if not cleaned.isascii():
        translated = translation_cache.get(cleaned)
        if translated is None:
            try:
                translated = translator.translate(cleaned)
            except Exception as exc:
                # A failed translation must not prevent the remaining cleanup
                # of this value; report it and fall back to the untranslated text.
                print(f"Translation failed for {cleaned!r}: {exc}")
                translated = None
            if translated:
                # Only successful translations are cached, so a transient
                # failure is retried the next time the same text appears.
                translation_cache[cleaned] = translated
        if translated:
            # The translated text gets the same entity/emoji cleanup.
            cleaned = emoji.replace_emoji(html.unescape(translated), replace='')

    # Drop whatever non-ASCII characters are left.
    return re.sub(r'[^\x00-\x7F]+', '', cleaned)


# One translator instance is reused for every request instead of building a
# new one per cell value.
translator = GoogleTranslator(source='auto', target='en')
translation_cache = {}

for column in ('channelName', 'videoTitle'):
    Target_File_EN[column] = Target_File_EN[column].map(
        lambda value: _clean_text(value, translator, translation_cache)
    )

# Cleaning can make previously distinct rows identical, so de-duplicate again.
Target_File_EN = Target_File_EN.drop_duplicates(ignore_index=True)

In [9]:
# Print the column dtypes and non-null counts of the English-only frame
# to see which columns still contain missing values after text cleaning.
Target_File_EN.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 421 entries, 0 to 420
Data columns (total 35 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   currentDate                  421 non-null    object
 1   channelId                    421 non-null    object
 2   channelName                  421 non-null    object
 3   videoId                      421 non-null    object
 4   videoTitle                   421 non-null    object
 5   videoPublishYear             421 non-null    int64 
 6   videoPublishMonth            421 non-null    int64 
 7   videoPublishDay              421 non-null    int64 
 8   videoPublishTime             421 non-null    object
 9   videoPublishedOn             421 non-null    object
 10  videoPublishedOnInSeconds    421 non-null    int64 
 11  videoViewCount               421 non-null    int64 
 12  videoLikeCount               421 non-null    int64 
 13  videoCommentCount            421 no

In [10]:
# print(Target_File_EN)
# Target_File_EN_NULL = Target_File_EN.isnull().any(axis=1)
# null_rows = Target_File_EN[Target_File_EN_NULL]
# display(null_rows)

In [11]:
# Remove the default-language column and label missing channel countries
# explicitly as 'Unknown'.
Target_File_EN = (
    Target_File_EN
    .drop(columns=['videoDefaultLanguage'])
    .assign(channelCountry=lambda frame: frame['channelCountry'].fillna('Unknown'))
)

In [12]:
# Export the cleaned frame to JSON, copy it into the repository checkout,
# then commit and push it to the remote.
record_count = len(Target_File_EN)

# Timestamp (IST) for the file name; together with the record count it makes
# the name unique per run.
# NOTE(review): the ':' characters are not valid in Windows file names, which
# can break checkouts of this repo on Windows. Left unchanged because other
# code may depend on the current naming scheme - confirm before changing.
timestamp = datetime.datetime.now(ist).strftime("%Y-%m-%d_%H:%M:%S")
filename = f"DC_{timestamp}_{record_count}_records.json"

# Resolve the output location once so the write and the copy below always
# refer to the same file, whatever the current working directory is.
local_file = os.path.abspath(filename)
repo_file = f"{destination_path}/{filename}"

# Save the DataFrame as a list of records with readable indentation.
Target_File_EN.to_json(local_file, orient="records", indent=4)
print(f"DataFrame saved as {filename}")

if not os.path.exists(destination_path):
    # Create the destination directory if it doesn't exist
    os.makedirs(destination_path)
    print('created the destination directory, DataCleaning/Daily')
else:
    print('Destination directory already exists')
shutil.copyfile(local_file, repo_file)

# Initialize the repository for git operations
repo = Repo(kaggle_repo_url)

# Add the copied file to the staging area
repo.index.add([repo_file])

# Commit with a message that carries its own (slightly later) timestamp.
timestamp = datetime.datetime.now(ist).strftime("%Y-%m-%d_%H:%M:%S")
repo.index.commit(f"{timestamp} Added files from Kaggle notebook, {filename}")

# Push the changes to the remote repository
origin = repo.remote(name="origin")
push_result = origin.push()

# push() returns one PushInfo per pushed ref. A non-empty result does NOT mean
# success: a rejected push still yields an entry, with error bits set in
# `flags`. Treat an empty result or any error/rejection flag as a failure.
failure_flags = (
    git.PushInfo.ERROR
    | git.PushInfo.REJECTED
    | git.PushInfo.REMOTE_REJECTED
    | git.PushInfo.REMOTE_FAILURE
)
push_failed = (not push_result) or any(
    info.flags & failure_flags for info in push_result
)
if push_failed:
    print("Push failed.")
    for info in push_result:
        # Show git's own explanation for each ref.
        print(info.summary)
else:
    print("Push successful.")

DataFrame saved as DC_2025-01-28_05:26:21_421_records.json
Destination directory already exists
Push successful.
