In [1]:
import os
import shutil
from git import Repo
import pandas as pd
import numpy as np
from datetime import datetime
from pytz import timezone
import json
from kaggle_secrets import UserSecretsClient
import plotly.express as px
import plotly.graph_objects as go
from pandas.api.types import CategoricalDtype
import time
import io
import plotly.io as pio
pio.renderers.default = "iframe_connected" #"iframe_connected"
pd.set_option('future.no_silent_downcasting', True)


# **📊 Choosing the Right Univariate Analysis Plot**  

## **🔍 Step 1: Key Statistical Measures (Default)**
Before visualizing, analyze the core statistics to understand the data distribution.  

| **Measure** | **Description** | **Why It Matters?** |
|------------|---------------|--------------------|
| **Count** | Number of non-null values | Checks data completeness |
| **Mean** | Average of values | Measures central tendency |
| **Median** | Middle value (50th percentile) | Less affected by outliers |
| **Mode** | Most frequent value | Useful for categorical or discrete data |
| **Min** | Smallest value | Defines range |
| **Max** | Largest value | Defines range |
| **Standard Deviation (std)** | Spread of data | Higher std = more variation |
| **Variance** | Squared standard deviation | Measures dispersion |
| **25th Percentile (Q1)** | Lower quartile | 25% of data falls below this |
| **75th Percentile (Q3)** | Upper quartile | 75% of data falls below this |
| **Interquartile Range (IQR)** | Q3 - Q1 | Helps detect outliers |
| **Skewness** | Asymmetry of data distribution | 0 = symmetric, >0 = right-skewed, <0 = left-skewed |
| **Kurtosis** | Measures tail weight | High = heavy tails, low = light tails |

---

## **📊 Step 2: Automated Graph Selection for Numerical Data**

### **1️⃣ Histogram (For Discrete Data)**
- **When to use?**  
  * To check **buckets (bins) and counts** in numeric data.  
  * Ideal for **discrete or grouped numerical data** (e.g., age groups, salaries).
  * Helps in detecting **skewness** and approximate **distribution**.  

- **Best Approach:**  
  - Use **Histogram + Key Statistics** if the data has **discrete values** (e.g., `10, 20, 30` bins).  
  - Example: Salary ranges, test scores, age groups.  

---

### **2️⃣ Kernel Density Estimation (KDE) Plot (For Continuous Data)**
- **When to use?**  
  * For **continuous numerical data** to visualize the **smooth density curve**.
  * Ideal for datasets with **many unique values** (e.g., height, weight, income).  
  * Useful when **understanding distribution trends** is more important than individual counts.  

- **Best Approach:**  
  - Use **KDE + Key Statistics** for **highly continuous** data.  
  - Example: Temperature, stock prices, blood pressure levels.  

---

### **📌 Step 3: BoxPlot?**
- ❌ **Not needed if outliers are already cleaned.**  
- ✅ **Use BoxPlot only when checking for remaining outliers.**  
- It helps **visualize skewness, quartiles, and anomalies.**

---

## **Summary Table for Numerical Data**  

| Data Type       | Recommended Plots |
|----------------|-----------------|
| Discrete Data (10, 20, 30, etc.) | **Histogram + Key Statistics** |
| Continuous Data (e.g., 0.1, 0.2, 0.3, etc.) | **KDE + Key Statistics** |
| Outlier Detection (if needed) | **BoxPlot** |

---

### **🚀 Final Approach:**
1️⃣ **Start with Key Statistical Measures.**  
2️⃣ **Use Histogram (Discrete) or KDE (Continuous).**  
3️⃣ **BoxPlot is optional (only if outliers exist).**  



## **Automated Graph Selection for Categorical Data**

| **Condition** | **Best Visualization** | **Reason** |
|--------------|-----------------|----------|
| **Unique categories > 10** | **Bar Chart** | Too many categories make Pie Charts cluttered. Bar charts handle large categories better. |
| **Unique categories ≤ 10** | **Pie Chart** | Pie charts are useful when comparing a few categories. |
| **Unique categories ≤ 5** | **Pie Chart (default)** | Clear and easy to interpret. |
| **For quick insights** | **Count Plot** | Seaborn’s `countplot()` provides a cleaner bar chart. |


In [2]:
def univariate(column: pd.Series):
    """
    Automated Univariate Analysis for a given DataFrame.
    Displays key statistics and corresponding graph with fixed Plotly rendering.
    """
    columnType = column.dtype
    total_values = len(column)
    unique_values = column.nunique()
    unique_ratio = unique_values / total_values
    col_name = column.name

    # Print Key Statistics
    if pd.api.types.is_numeric_dtype(column):
        print(f"📊 **Key Statistical Measures for {col_name}**:\n")
        key_stats = column.describe().to_frame().T
        print(key_stats.to_string(index=False), "\n")

    # Create a figure variable outside the conditions
    fig = None
    
    if pd.api.types.is_numeric_dtype(column):
        col_min = column.min()
        col_max = column.max()

        if unique_values <= 31:
            print(f"🔍 Using Bar Chart for Discrete Numeric Data: {col_name}")
            count_df = column.value_counts().reset_index()
            count_df.columns = [col_name, "Count"]
            fig = px.bar(count_df, x=col_name, y="Count", text="Count",
                        title=f"Univariate Analysis: {col_name}")
        
        elif unique_values < 50:
            print(f"🔍 Using Histogram for Discrete Numeric Data: {col_name}")
            bin_edges = np.histogram_bin_edges(column.dropna(), bins='auto')
            fig = px.histogram(column, x=col_name, nbins=len(bin_edges)-1, 
                             title=f"Univariate Analysis: {col_name}")
        
        else:
            print(f"🔍 Using KDE-like Histogram for Continuous Data: {col_name}")
            bin_edges = np.histogram_bin_edges(column.dropna(), bins='auto')
            fig = px.histogram(column, x=col_name, nbins=len(bin_edges)-1,
                             histnorm='density', opacity=0.7,
                             marginal="rug", title=f"Univariate Analysis: {col_name}")
    
    elif isinstance(columnType, CategoricalDtype) or columnType == object:
        if unique_ratio > 0.5:
            print(f"⚠️ Skipping `{col_name}`: High cardinality ({unique_values} unique out of {total_values})")
            print("⚠️ Skipped for visualization due to high cardinality.\n")
        else:
            category_counts = column.value_counts(normalize=True) * 100
            category_table = pd.DataFrame({"Category": category_counts.index, 
                                         "Percentage": category_counts.values.round(2)})
            print(f"📊 **Category Frequency Counts for {col_name}**:\n")
            print(category_table.to_string(index=False), "\n")

            if unique_values > 10:
                print(f"🔍 Using Bar Chart for {col_name} (too many categories for pie chart)")
                fig = px.bar(category_table, y='Percentage', x='Category', orientation='v',
                            title=f"Univariate Analysis: {col_name}", text='Percentage')
            else:
                print(f"🔍 Using Pie Chart for {col_name} (categories: {unique_values})")
                fig = px.pie(category_table, names='Category', values='Percentage',
                            title=f"Univariate Analysis: {col_name}", hole=0.3)

    else:
        print(f"⚠️ Unsupported Data Type for {col_name}: {columnType}")
    
    # Display the figure at the end if it exists
    if fig:
        # Update layout for better display
        # fig.update_layout(
        #     margin=dict(l=20, r=20, t=40, b=20),
        #     height=500,
        #     width=800
        # )
        fig.show()
        time.sleep(1)
    
    return True

In [3]:
# def univariate(column):
#     """
#     Automated Univariate Analysis: Selects the best visualization 
#     based on data type and characteristics.
    
#     Parameters:
#     - graphName: str -> The preferred visualization type (optional, default logic applies).
#     - column: pandas Series -> The column to analyze.
    
#     Returns:
#     - Displays the appropriate visualization.
#     - Prints Key Statistical Measures in a tabular format for numerical data.
#     """
    
#     columnType = column.dtype  # Get the column data type

#     if pd.api.types.is_numeric_dtype(column):  # Check if numeric
#         # Step 1: Key Statistical Measures (Default)
#         key_stats = column.describe().to_frame().T  # Convert to DataFrame for tabular format
#         key_stats.rename(index={0: "Statistics"}, inplace=True)
#         print("📊 **Key Statistical Measures**:\n")
#         print(key_stats.to_string(index=False), "\n")  # Print neatly formatted table

#         # Step 2: Automated Graph Selection
#         unique_values = column.nunique()

#         plt.figure(figsize=(8, 4))  # Set figure size
        
#         if unique_values < 20:  # Discrete Data
#             print("🔍 Using Histogram for Discrete Data")
#             sns.histplot(column, bins=10, kde=False, color="skyblue")
        
#         else:  # Continuous Data
#             print("🔍 Using KDE for Continuous Data")
#             sns.kdeplot(column, fill=True, color="orange")

#         plt.title(f"Univariate Analysis of {column.name}")
#         plt.xlabel(column.name)
#         plt.ylabel("Frequency")
#         plt.show()

#     elif pd.api.types.is_categorical_dtype(column) or column.nunique() < 10:  # Categorical Data
#         category_counts = column.value_counts(normalize=True) * 100
#         category_table = pd.DataFrame({"Category": category_counts.index, "Percentage": category_counts.values})
#         print("📊 **Category Frequency Counts**:\n")
#         print(category_table.to_string(index=False), "\n")  # Print neatly formatted category table

#         plt.figure(figsize=(8, 4))
#         print("🔍 Using Bar Chart for Category Distribution")
#         sns.countplot(y=column, palette="pastel")

#         plt.title(f"Univariate Analysis of {column.name}")
#         plt.xlabel("Count")
#         plt.ylabel(column.name)
#         plt.show()

#     else:
#         print("⚠️ Unsupported Data Type")

#     return True


In [4]:
def detect_outliers_iqr(dataFrame):
    try:
        numeric_cols = dataFrame.select_dtypes(include=['int64', 'float64']).columns.tolist()
        for column in numeric_cols:
            Q1 = dataFrame[column].quantile(0.25)
            Q3 = dataFrame[column].quantile(0.75)
            IQR = Q3 - Q1
            lower_bound = max(Q1 - 1.5 * IQR, dataFrame[column].min())
            upper_bound = min(Q3 + 1.5 * IQR, dataFrame[column].max())
            dataFrame[column] = np.where(dataFrame[column] < lower_bound, lower_bound, dataFrame[column])
            dataFrame[column] = np.where(dataFrame[column] > upper_bound, upper_bound, dataFrame[column])
            # print(f"Outliers handled in column: {column}")
        return dataFrame
    except Exception as e:
        print(f"Failed to detect outliers in {column}: {e}")
        raise

In [5]:
def pre_eda_validation(dataFrame):
    
    # Basic stats
    report_df = pd.DataFrame({
        "missing_values": dataFrame.isnull().sum(),
        "duplicates": [dataFrame.duplicated().sum()] * len(dataFrame.columns),
        "data_types": dataFrame.dtypes.astype(str),
        "cardinality": dataFrame.nunique(),
        "non_null_count": dataFrame.notnull().sum()
    }).reset_index().rename(columns={"index": "columns"})

    # Extract .info() as summary
    buffer = io.StringIO()
    dataFrame.info(buf=buffer)
    info_summary = buffer.getvalue()

    # Add .shape
    shape_summary = {"rows": dataFrame.shape[0], "columns": dataFrame.shape[1]}

    # Safe describe summary
    describe_df = dataFrame.describe(include='all').T
    describe_df = describe_df.fillna("").infer_objects(copy=False)
    describe_summary = describe_df.to_dict(orient='index')

    # Inconsistency checks
    inconsistent_group1 = dataFrame[dataFrame.duplicated(subset=["channelId"], keep=False)][
        ["channelId", "channelName", "channelCustomUrl"]]

    inconsistent_group2 = dataFrame[dataFrame.duplicated(subset=["videoId"], keep=False)][
        ["videoId", "videoTitle"]]

    DataFrameHandelledOutliers = detect_outliers_iqr(dataFrame)

    # Final report
    report = {
        "Pre_EDA_Summary": report_df.to_dict(orient="records"),
        "DataFrame_Info": info_summary,
        "DataFrame_Shape": shape_summary,
        "Describe_Summary": describe_summary,
        "inconsistent_records_channelLevel": inconsistent_group1.to_dict(orient="records"),
        "inconsistent_records_videolevel": inconsistent_group2.to_dict(orient="records"),
        "Dataframe": DataFrameHandelledOutliers.to_dict(orient="records")
    }

    record_count = len(DataFrameHandelledOutliers)
    timestamp = datetime.now(ist).strftime("%Y-%m-%d_%H_%M_%S")
    filename = f"PEDA_{timestamp}_{record_count}_records.json"

    if report["Pre_EDA_Summary"]:
        with open(filename, "w") as json_file:
            json.dump(report, json_file, indent=4)
        print(f"DataFrame validation report saved as {filename}")
    else:
        print("No data to save since empty DataFrame returned.")

    destination_path = '/kaggle/working/YouTube-Trends/ExploratoryDataAnalysis/PEDA/Daily'
    PushToGithub(filename, destination_path)
        
    return True


In [6]:
# def pre_eda_validation(dataFrame):

#     report_df = pd.DataFrame({
#         "missing_values": dataFrame.isnull().sum(),
#         "duplicates": [dataFrame.duplicated().sum()] * len(dataFrame.columns),
#         "data_types": dataFrame.dtypes.astype(str),
#         "cardinality": dataFrame.nunique()
#     }).reset_index().rename(columns={"index": "columns"})

#     # Extract inconsistent records for group1
#     inconsistent_group1 = dataFrame[dataFrame.duplicated(subset=["channelId"], keep=False)][
#         ["channelId", "channelName", "channelCustomUrl"]]

#     # Extract inconsistent records for group2
#     inconsistent_group2 = dataFrame[dataFrame.duplicated(subset=["videoId"], keep=False)][
#         ["videoId", "videoTitle"]]
#     DataFrameHandelledOutliers = detect_outliers_iqr(dataFrame)
#     report = {
#         "Pre_EDA": report_df.to_dict(orient="records"),
#         "inconsistent_records_channelLevel": inconsistent_group1.to_dict(orient="records"),
#         "inconsistent_records_videolevel": inconsistent_group2.to_dict(orient="records"),
#         "Dataframe": DataFrameHandelledOutliers.to_dict(orient="records")
#     }
#     record_count = len(DataFrameHandelledOutliers)
#     timestamp = datetime.now(ist).strftime("%Y-%m-%d_%H_%M_%S")
#     filename = f"PEDA_{timestamp}_{record_count}_records.json"

#     if report["Pre_EDA"]:
#         with open(filename, "w") as json_file:
#             json.dump(report, json_file, indent=4)
#         print(f"DataFrame validation report saved as {filename}")
#     else:
#         print("No data to save since empty DataFrame returned.")

#     destination_path = '/kaggle/working/DevOps-YouTube-Trends/ExploratoryDataAnalysis/PEDA/Daily'
#     PushToGithub(filename, destination_path)
        
#     return True


In [7]:
def DataCleaning_File_Extraction(repo_url, kaggle_repo_url, DataCleaning_path):
    if os.path.exists(kaggle_repo_url):
        print("Repository already exists locally.")
        repo = Repo(kaggle_repo_url)  
        repo.config_writer().set_value("user", "name", name).release()
        repo.config_writer().set_value("user", "email", email).release()
        origin = repo.remote(name='origin')  
        origin.pull() 
        print("Successfully pulled the latest changes.")
    else:
        repo = Repo.clone_from(repo_url, kaggle_repo_url)
        repo.config_writer().set_value("user", "name", name).release()
        repo.config_writer().set_value("user", "email", email).release()
        print("Successfully cloned the repository.")

   
    output_files = os.listdir(DataCleaning_path)
    DataCleaning_File = max(
        [file for file in output_files if file.startswith("DC_") and file.endswith('records.json')]
    )

   
    DataCleaning_File = pd.read_json(os.path.join(DataCleaning_path, DataCleaning_File))

    return DataCleaning_File

In [8]:
def PushToGithub(filename,destination_path):
    try:
        if os.path.exists(kaggle_repo_url):
            print("Already cloned and the repo file exists")
            repo = Repo(kaggle_repo_url)
            repo.config_writer().set_value("user", "name", name).release()
            repo.config_writer().set_value("user", "email", email).release()
            origin = repo.remote(name='origin')
            origin.pull()
            print("Successfully pulled the git repo before push")
        else:
            repo = Repo.clone_from(repo_url, kaggle_repo_url)
            repo.config_writer().set_value("user", "name", name).release()
            repo.config_writer().set_value("user", "email", email).release()
            print("Successfully cloned the git repo")
        
        if os.path.exists(destination_path):
            shutil.copyfile(f'/kaggle/working/{filename}', f'{destination_path}/{filename}')
        else:
            os.makedirs(destination_path)
            shutil.copyfile(f'/kaggle/working/{filename}', f'{destination_path}/{filename}')
        
        repo = Repo(kaggle_repo_url)
        repo.index.add([f"{destination_path}/{filename}"])
        timestamp = datetime.now(ist).strftime("%Y-%m-%d_%H:%M:%S")
        repo.index.commit(f"{timestamp} Added files from Kaggle notebook, {filename}")
        origin = repo.remote(name="origin")
        # push_result = origin.push()
        push_result = origin.push(refspec=f"HEAD:refs/heads/eda")
        
        if push_result:
            print("Output files successfully pushed to GitHub!")
        else:
            print("Output files pushed to GitHub failed:(")
        return True
    
    except Exception as e:
        print(f"An error occurred at git automation code: {e}")
        return False

In [9]:
def main(repo_url, kaggle_repo_url, DataCleaning_path, ExploratoryDataAnalysis_path):
    DataCleaning_File = DataCleaning_File_Extraction(repo_url, kaggle_repo_url, DataCleaning_path)
    DataCleaning_File['videoCategoryId'] = DataCleaning_File['videoCategoryId'].astype('category')
    pre_eda_validation(DataCleaning_File)
    univariateTargetColumns = ['']
     
    ChannelLevel_DataCleaning_File = DataCleaning_File.drop(['videoId', 'videoTitle', 'videoPublishYear',
       'videoPublishMonth', 'videoPublishDay', 'videoPublishTime',
       'videoPublishedOn', 'videoPublishedOnInSeconds', 'videoViewCount',
       'videoLikeCount', 'videoCommentCount', 'videoCategoryId',
       'videoDefaultAudioLanguage', 'videoDuration', 'videoDurationInSeconds',
       'videoContentType', 'videoDimension', 'videoDefinition', 'videoCaption',
       'videoLicensedContent', 'videoProjection'], axis=1)
    ChannelLevel_DataCleaning_File = ChannelLevel_DataCleaning_File.drop_duplicates()
    VideoLevel_DataCleaning_File = DataCleaning_File.drop( ['channelId', 'channelName','channelCustomUrl',
       'channelPublishYear', 'channelPublishMonth', 'channelPublishDay',
       'channelPublishTime', 'channelPublishedOn',
       'channelPublishedOnInSeconds', 'channelCountry', 'channelViewCount',
       'channelSubscriberCount', 'channelVideoCount'], axis=1)
    columns = ChannelLevel_DataCleaning_File.columns
    print(columns)
    for col in columns:
        univariate(ChannelLevel_DataCleaning_File[col])

    columns = VideoLevel_DataCleaning_File.columns
    print(columns)
    for col in columns:
        univariate(VideoLevel_DataCleaning_File[col])
    return True

In [10]:
if __name__ == "__main__":    
    user_secrets = UserSecretsClient()
    secret_value_0 = user_secrets.get_secret("EDARepoOwner")
    secret_value_1 = user_secrets.get_secret("EDARepoOwnerMail")
    secret_value_2 = user_secrets.get_secret("EDARepoURL")
    
    name = secret_value_0
    email = secret_value_1
    repo_url = secret_value_2
    
    kaggle_repo_url = '/kaggle/working/YouTube-Trends'
    DataCleaning_path = '/kaggle/working/YouTube-Trends/DataCleaning/Daily'
    ExploratoryDataAnalysis_path = '/kaggle/working/YouTube-Trends/ExploratoryDataAnalysis'

    ist = timezone("Asia/Kolkata")
    
    main(repo_url, kaggle_repo_url, DataCleaning_path, ExploratoryDataAnalysis_path)

Successfully cloned the repository.
DataFrame validation report saved as PEDA_2025-05-01_20_55_18_309_records.json
Already cloned and the repo file exists
Successfully pulled the git repo before push
Output files successfully pushed to GitHub!
Index(['channelId', 'channelName', 'channelCustomUrl', 'channelPublishYear',
       'channelPublishMonth', 'channelPublishDay', 'channelPublishTime',
       'channelPublishedOn', 'channelPublishedOnInSeconds', 'channelCountry',
       'channelViewCount', 'channelSubscriberCount', 'channelVideoCount'],
      dtype='object')
⚠️ Skipping `channelId`: High cardinality (185 unique out of 185)
⚠️ Skipped for visualization due to high cardinality.

⚠️ Skipping `channelName`: High cardinality (185 unique out of 185)
⚠️ Skipped for visualization due to high cardinality.

⚠️ Skipping `channelCustomUrl`: High cardinality (185 unique out of 185)
⚠️ Skipped for visualization due to high cardinality.

📊 **Key Statistical Measures for channelPublishYear**:

 co

📊 **Key Statistical Measures for channelPublishMonth**:

 count     mean      std  min  25%  50%  75%  max
 185.0 6.497297 3.495532  1.0  4.0  7.0 10.0 12.0 

🔍 Using Bar Chart for Discrete Numeric Data: channelPublishMonth


📊 **Key Statistical Measures for channelPublishDay**:

 count      mean      std  min  25%  50%  75%  max
 185.0 16.308108 8.728837  1.0  9.0 16.0 24.0 31.0 

🔍 Using Bar Chart for Discrete Numeric Data: channelPublishDay


⚠️ Skipping `channelPublishTime`: High cardinality (185 unique out of 185)
⚠️ Skipped for visualization due to high cardinality.

⚠️ Skipping `channelPublishedOn`: High cardinality (185 unique out of 185)
⚠️ Skipped for visualization due to high cardinality.

📊 **Key Statistical Measures for channelPublishedOnInSeconds**:

 count         mean          std          min          25%          50%          75%          max
 185.0 1.462636e+09 1.539761e+08 1138069141.0 1355280989.0 1451740727.0 1596272639.0 1740278826.0 

🔍 Using KDE-like Histogram for Continuous Data: channelPublishedOnInSeconds


📊 **Category Frequency Counts for channelCountry**:

Category  Percentage
      US       34.05
      IN       22.70
 Unknown       12.97
      GB        4.86
      CA        4.32
      AU        2.70
      DE        2.16
      NL        2.16
      PK        1.62
      CH        1.08
      IE        1.08
      AE        0.54
      JP        0.54
      SG        0.54
      SK        0.54
      RS        0.54
      ES        0.54
      UA        0.54
      HU        0.54
      FI        0.54
      ID        0.54
      BR        0.54
      CZ        0.54
      PH        0.54
      PL        0.54
      SA        0.54
      BE        0.54
      ZA        0.54
      AT        0.54
      SE        0.54 

🔍 Using Bar Chart for channelCountry (too many categories for pie chart)


📊 **Key Statistical Measures for channelViewCount**:

 count         mean          std   min     25%      50%       75%        max
 185.0 7.533246e+06 1.322276e+07 136.0 75383.0 515731.0 7674163.0 42380397.5 

🔍 Using KDE-like Histogram for Continuous Data: channelViewCount


📊 **Key Statistical Measures for channelSubscriberCount**:

 count         mean           std  min   25%    50%     75%      max
 185.0 68910.762162 120129.325945  3.0 882.0 5710.0 73500.0 362270.0 

🔍 Using KDE-like Histogram for Continuous Data: channelSubscriberCount


📊 **Key Statistical Measures for channelVideoCount**:

 count       mean       std  min  25%   50%   75%    max
 185.0 479.481081 628.49409  1.0 58.0 194.0 591.0 2207.5 

🔍 Using KDE-like Histogram for Continuous Data: channelVideoCount


Index(['videoId', 'videoTitle', 'videoPublishYear', 'videoPublishMonth',
       'videoPublishDay', 'videoPublishTime', 'videoPublishedOn',
       'videoPublishedOnInSeconds', 'videoViewCount', 'videoLikeCount',
       'videoCommentCount', 'videoCategoryId', 'videoDefaultAudioLanguage',
       'videoDuration', 'videoDurationInSeconds', 'videoContentType',
       'videoDimension', 'videoDefinition', 'videoCaption',
       'videoLicensedContent', 'videoProjection'],
      dtype='object')
⚠️ Skipping `videoId`: High cardinality (309 unique out of 309)
⚠️ Skipped for visualization due to high cardinality.

⚠️ Skipping `videoTitle`: High cardinality (308 unique out of 309)
⚠️ Skipped for visualization due to high cardinality.

📊 **Key Statistical Measures for videoPublishYear**:

 count        mean      std    min    25%    50%    75%    max
 309.0 2022.576052 1.748341 2019.0 2022.0 2023.0 2024.0 2025.0 

🔍 Using Bar Chart for Discrete Numeric Data: videoPublishYear


📊 **Key Statistical Measures for videoPublishMonth**:

 count    mean      std  min  25%  50%  75%  max
 309.0 6.31068 3.400809  1.0  3.0  6.0  9.0 12.0 

🔍 Using Bar Chart for Discrete Numeric Data: videoPublishMonth


📊 **Key Statistical Measures for videoPublishDay**:

 count      mean     std  min  25%  50%  75%  max
 309.0 16.378641 8.63899  1.0 10.0 17.0 24.0 31.0 

🔍 Using Bar Chart for Discrete Numeric Data: videoPublishDay


⚠️ Skipping `videoPublishTime`: High cardinality (304 unique out of 309)
⚠️ Skipped for visualization due to high cardinality.

⚠️ Skipping `videoPublishedOn`: High cardinality (308 unique out of 309)
⚠️ Skipped for visualization due to high cardinality.

📊 **Key Statistical Measures for videoPublishedOnInSeconds**:

 count         mean          std          min          25%          50%          75%          max
 309.0 1.673448e+09 5.647030e+07 1533640534.5 1644332418.0 1686918937.0 1718127007.0 1745963478.0 

🔍 Using KDE-like Histogram for Continuous Data: videoPublishedOnInSeconds


📊 **Key Statistical Measures for videoViewCount**:

 count         mean          std  min   25%    50%     75%     max
 309.0 14276.223301 19350.270133  0.0 580.0 4098.0 22025.0 54192.5 

🔍 Using KDE-like Histogram for Continuous Data: videoViewCount


📊 **Key Statistical Measures for videoLikeCount**:

 count       mean        std  min  25%  50%   75%   max
 309.0 222.433657 301.546519  0.0  9.0 56.0 338.0 831.5 

🔍 Using KDE-like Histogram for Continuous Data: videoLikeCount


📊 **Key Statistical Measures for videoCommentCount**:

 count      mean       std  min  25%  50%  75%  max
 309.0 13.423948 18.292844  0.0  0.0  3.0 20.0 50.0 

🔍 Using Histogram for Discrete Numeric Data: videoCommentCount


📊 **Category Frequency Counts for videoCategoryId**:

Category  Percentage
      28       56.63
      27       30.74
      22        9.39
      29        1.29
      24        0.97
      26        0.97 

🔍 Using Pie Chart for videoCategoryId (categories: 6)


📊 **Category Frequency Counts for videoDefaultAudioLanguage**:

Category  Percentage
      en       68.93
   en-US       19.09
   en-GB        7.12
   en-IN        4.21
   en-CA        0.32
   en-AU        0.32 

🔍 Using Pie Chart for videoDefaultAudioLanguage (categories: 6)


⚠️ Skipping `videoDuration`: High cardinality (273 unique out of 309)
⚠️ Skipped for visualization due to high cardinality.

📊 **Key Statistical Measures for videoDurationInSeconds**:

 count        mean        std  min   25%   50%    75%    max
 309.0 1000.242718 994.043119 14.0 242.0 609.0 1381.0 3089.5 

🔍 Using KDE-like Histogram for Continuous Data: videoDurationInSeconds


📊 **Category Frequency Counts for videoContentType**:

Category  Percentage
   Video       90.29
   Short        9.71 

🔍 Using Pie Chart for videoContentType (categories: 2)


📊 **Category Frequency Counts for videoDimension**:

Category  Percentage
      2d       100.0 

🔍 Using Pie Chart for videoDimension (categories: 1)


📊 **Category Frequency Counts for videoDefinition**:

Category  Percentage
      hd       99.35
      sd        0.65 

🔍 Using Pie Chart for videoDefinition (categories: 2)


📊 **Category Frequency Counts for videoCaption**:

Category  Percentage
    true       100.0 

🔍 Using Pie Chart for videoCaption (categories: 1)


📊 **Key Statistical Measures for videoLicensedContent**:

count unique   top freq
  309      2 False  202 

🔍 Using Bar Chart for Discrete Numeric Data: videoLicensedContent


📊 **Category Frequency Counts for videoProjection**:

   Category  Percentage
rectangular       100.0 

🔍 Using Pie Chart for videoProjection (categories: 1)
