In [1]:
import os
import shutil
from git import Repo
import pandas as pd
import numpy as np
from datetime import datetime
from pytz import timezone
import json
from kaggle_secrets import UserSecretsClient

In [2]:
def detect_outliers_iqr(dataFrame):
    """Cap outliers in every int64/float64 column using the 1.5 * IQR rule.

    For each selected column the lower fence is ``Q1 - 1.5 * IQR`` and the
    upper fence is ``Q3 + 1.5 * IQR``; each fence is additionally limited to
    the column's own minimum / maximum. Values below the lower fence are
    replaced by it, values above the upper fence are replaced by it.

    Args:
        dataFrame: pandas DataFrame to process. The selected columns are
            overwritten on this object (the frame is modified in place).

    Returns:
        The same DataFrame object that was passed in.

    Raises:
        Exception: any error raised by the underlying pandas/numpy calls is
            printed and then re-raised unchanged.
    """
    # Bound up front so the except handler can always format it; without this,
    # a failure before the loop starts (e.g. in select_dtypes) would raise
    # UnboundLocalError inside the handler and hide the real error.
    column = None
    try:
        numeric_cols = dataFrame.select_dtypes(include=['int64', 'float64']).columns.tolist()
        for column in numeric_cols:
            Q1 = dataFrame[column].quantile(0.25)
            Q3 = dataFrame[column].quantile(0.75)
            IQR = Q3 - Q1
            # Fences never extend beyond the values actually present in the column.
            lower_bound = max(Q1 - 1.5 * IQR, dataFrame[column].min())
            upper_bound = min(Q3 + 1.5 * IQR, dataFrame[column].max())
            dataFrame[column] = np.where(dataFrame[column] < lower_bound, lower_bound, dataFrame[column])
            dataFrame[column] = np.where(dataFrame[column] > upper_bound, upper_bound, dataFrame[column])
        return dataFrame
    except Exception as e:
        failed_at = column if column is not None else "<no column processed>"
        print(f"Failed to detect outliers in {failed_at}: {e}")
        raise

In [3]:
def FeatureEngineering_File_Extraction(repo_url, kaggle_repo_url, FeatureEngineering_path):
    """Sync the git repository and load the newest feature-engineering export.

    If ``kaggle_repo_url`` already exists on disk it is opened and ``origin``
    is pulled; otherwise the repository is cloned from ``repo_url``. In both
    cases the repository's ``user.name`` / ``user.email`` are set from the
    module-level globals ``name`` and ``email``, which must be defined before
    this function is called.

    Args:
        repo_url: Remote URL to clone from when no local checkout exists.
        kaggle_repo_url: Local path of the checkout.
        FeatureEngineering_path: Directory that is searched for files whose
            name starts with ``FE_`` and ends with ``records.json``.

    Returns:
        pandas.DataFrame read from the matching file whose name is the
        greatest in string ordering.

    Raises:
        ValueError: if the directory contains no matching file.
    """
    repo_exists = os.path.exists(kaggle_repo_url)
    if repo_exists:
        print("Repository already exists locally.")
        repo = Repo(kaggle_repo_url)
    else:
        repo = Repo.clone_from(repo_url, kaggle_repo_url)

    # One writer for both values; the context manager releases the config
    # lock even if set_value raises.
    with repo.config_writer() as git_config:
        git_config.set_value("user", "name", name)
        git_config.set_value("user", "email", email)

    if repo_exists:
        repo.remote(name='origin').pull()
        print("Successfully pulled the latest changes.")
    else:
        print("Successfully cloned the repository.")

    candidate_files = [
        file for file in os.listdir(FeatureEngineering_path)
        if file.startswith("FE_") and file.endswith('records.json')
    ]
    if not candidate_files:
        # max() on an empty list would raise ValueError with no hint about
        # what was missing; keep the exception type, improve the message.
        raise ValueError(
            f"No 'FE_*records.json' file found in {FeatureEngineering_path}"
        )

    latest_file_name = max(candidate_files)
    return pd.read_json(os.path.join(FeatureEngineering_path, latest_file_name))

In [4]:
def PushToGithub(filename, destination_path, source_dir='/kaggle/working'):
    """Copy a generated file into the git checkout, commit it and push it.

    Reads the module-level globals ``kaggle_repo_url``, ``repo_url``,
    ``name``, ``email`` and ``ist``; they must be defined before the call.

    Args:
        filename: Name of the file to publish (looked up inside ``source_dir``).
        destination_path: Directory the file is copied to before it is added
            to the index. Created if it does not exist.
        source_dir: Directory that currently holds ``filename``. Defaults to
            ``/kaggle/working``.

    Returns:
        True when the push was accepted. False when the push produced no
        result, when any pushed ref carries the ``ERROR`` flag, or when any
        exception was raised along the way (the exception is printed, not
        propagated).
    """
    try:
        repo_exists = os.path.exists(kaggle_repo_url)
        if repo_exists:
            print("Already cloned and the repo file exists")
            repo = Repo(kaggle_repo_url)
        else:
            repo = Repo.clone_from(repo_url, kaggle_repo_url)

        # One writer for both values; released automatically on exit.
        with repo.config_writer() as git_config:
            git_config.set_value("user", "name", name)
            git_config.set_value("user", "email", email)

        origin = repo.remote(name='origin')
        if repo_exists:
            origin.pull()
            print("Successfully pulled the git repo before push")
        else:
            print("Successfully cloned the git repo")

        os.makedirs(destination_path, exist_ok=True)
        target_file = os.path.join(destination_path, filename)
        shutil.copyfile(os.path.join(source_dir, filename), target_file)

        repo.index.add([target_file])
        timestamp = datetime.now(ist).strftime("%Y-%m-%d_%H:%M:%S")
        repo.index.commit(f"{timestamp} Added files from Kaggle notebook, {filename}")
        push_result = origin.push()

        # An empty result means the push failed outright; a rejected ref is
        # reported as a non-empty result whose flags carry the ERROR bit, so
        # plain truthiness of the list is not enough.
        push_failed = len(push_result) == 0 or any(
            info.flags & info.ERROR for info in push_result
        )
        if push_failed:
            print("Output files pushed to GitHub failed:(")
            return False

        print("Output files successfully pushed to GitHub!")
        return True

    except Exception as e:
        # Deliberate best-effort: report the problem and signal failure.
        print(f"An error occurred at git automation code: {e}")
        return False

In [5]:
def pre_eda_validation(dataFrame):
    """Build a pre-EDA validation report, save it as JSON and push it to git.

    The report contains:
      * ``Pre_EDA``: one entry per column with missing-value count, the
        frame-wide duplicate-row count, dtype and number of unique values;
      * ``inconsistent_records_channelLevel``: rows whose ``channelId``
        occurs more than once;
      * ``inconsistent_records_videolevel``: rows whose ``videoId`` occurs
        more than once;
      * ``Dataframe``: all rows after ``detect_outliers_iqr`` was applied.

    The file is written to the current working directory as
    ``PEDA_<timestamp>_<row count>_records.json`` and handed to
    ``PushToGithub``. Relies on the module-level global ``ist`` for the
    timestamp's time zone.

    Args:
        dataFrame: pandas DataFrame holding at least the columns channelId,
            channelName, channelCustomUrl, channelGrowthScoreRank, videoId,
            videoTitle and videoEngagementScoreRank. It is not modified; the
            outlier step works on a copy.

    Returns:
        False when the frame is empty (nothing is written or pushed);
        otherwise the truthiness of ``PushToGithub``'s result.
    """
    # The per-column summary has one row per column, so it is non-empty even
    # for a frame with zero rows; check the frame itself instead.
    if dataFrame.empty:
        print("No data to save since empty DataFrame returned.")
        return False

    report_df = pd.DataFrame({
        "missing_values": dataFrame.isnull().sum(),
        "duplicates": [dataFrame.duplicated().sum()] * len(dataFrame.columns),
        "data_types": dataFrame.dtypes.astype(str),
        "cardinality": dataFrame.nunique()
    }).reset_index().rename(columns={"index": "columns"})

    # Rows sharing a channelId with at least one other row.
    inconsistent_group1 = dataFrame[dataFrame.duplicated(subset=["channelId"], keep=False)][
        ["channelId", "channelName", "channelCustomUrl", "channelGrowthScoreRank"]]

    # Rows sharing a videoId with at least one other row.
    inconsistent_group2 = dataFrame[dataFrame.duplicated(subset=["videoId"], keep=False)][
        ["videoId", "videoTitle", "videoEngagementScoreRank"]]

    # detect_outliers_iqr overwrites columns on the frame it receives, so pass
    # a copy to keep the caller's data intact.
    DataFrameHandelledOutliers = detect_outliers_iqr(dataFrame.copy())

    report = {
        "Pre_EDA": report_df.to_dict(orient="records"),
        "inconsistent_records_channelLevel": inconsistent_group1.to_dict(orient="records"),
        "inconsistent_records_videolevel": inconsistent_group2.to_dict(orient="records"),
        "Dataframe": DataFrameHandelledOutliers.to_dict(orient="records")
    }
    record_count = len(DataFrameHandelledOutliers)
    timestamp = datetime.now(ist).strftime("%Y-%m-%d_%H_%M_%S")
    filename = f"PEDA_{timestamp}_{record_count}_records.json"

    with open(filename, "w") as json_file:
        json.dump(report, json_file, indent=4)
    print(f"DataFrame validation report saved as {filename}")

    destination_path = '/kaggle/working/DevOps-YouTube-Trends/ExploratoryDataAnalysis/PEDA/Daily'
    # Surface the push outcome instead of reporting success unconditionally.
    return bool(PushToGithub(filename, destination_path))


In [6]:
def main(repo_url, kaggle_repo_url, FeatureEngineering_path, ExploratoryDataAnalysis_path):
    """Load the newest feature-engineering export and run pre-EDA validation on it.

    Args:
        repo_url: Forwarded to FeatureEngineering_File_Extraction.
        kaggle_repo_url: Forwarded to FeatureEngineering_File_Extraction.
        FeatureEngineering_path: Forwarded to FeatureEngineering_File_Extraction.
        ExploratoryDataAnalysis_path: Accepted but not used by this function.

    Returns:
        Always True; the result of pre_eda_validation is not inspected.
    """
    pre_eda_validation(
        FeatureEngineering_File_Extraction(
            repo_url,
            kaggle_repo_url,
            FeatureEngineering_path,
        )
    )
    return True

In [7]:
if __name__ == "__main__":
    # Git identity and remote URL are read from Kaggle user secrets rather
    # than being hard-coded in the notebook.
    user_secrets = UserSecretsClient()
    secret_value_0, secret_value_1, secret_value_2 = (
        user_secrets.get_secret("EDARepoOwner"),
        user_secrets.get_secret("EDARepoOwnerMail"),
        user_secrets.get_secret("EDARepoURL"),
    )

    # These module-level names are read as globals by the git helper functions.
    name, email, repo_url = secret_value_0, secret_value_1, secret_value_2

    # Local checkout location and the sub-directories used by the pipeline.
    kaggle_repo_url = '/kaggle/working/DevOps-YouTube-Trends'
    FeatureEngineering_path = f'{kaggle_repo_url}/FeatureEngineering/Daily'
    ExploratoryDataAnalysis_path = f'{kaggle_repo_url}/ExploratoryDataAnalysis'

    # Time zone used for the timestamps in report file names and commit messages.
    ist = timezone("Asia/Kolkata")

    main(
        repo_url,
        kaggle_repo_url,
        FeatureEngineering_path,
        ExploratoryDataAnalysis_path,
    )

Successfully cloned the repository.
DataFrame validation report saved as PEDA_2025-03-06_18_26_19_403_records.json
Already cloned and the repo file exists
Successfully pulled the git repo before push
Output files successfully pushed to GitHub!
