# YouTube Analytics - Production Pipeline Demo

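This notebook demonstrates the production-ready data pipeline, including its transformations and analytics modules, through five demos: single-country processing, analytics on the processed data, multi-country analysis, advanced analytics, and data quality validation.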

In [1]:
# Set up the notebook environment using the project-local notebook_setup helper.
# NOTE(review): presumably this cell has to run before the project imports in the
# next cell can resolve, since it is what extends the Python path — confirm.
from notebook_setup import setup_notebook_environment, test_imports

# setup_notebook_environment() reports the project root and the directories it adds
# to the Python path (see the recorded output); its return value is kept as project_root.
project_root = setup_notebook_environment()
# Left as the bare last expression so the cell displays its result
# (True in the recorded run, after the per-module import checks are printed).
test_imports()

✅ Project root: e:\Study Space\Analytics Enginerring\Data Engineering\Azure Databricks\ADB_Practice\YouTube Analytics
✅ Added to Python path:
   - e:\Study Space\Analytics Enginerring\Data Engineering\Azure Databricks\ADB_Practice\YouTube Analytics
   - e:\Study Space\Analytics Enginerring\Data Engineering\Azure Databricks\ADB_Practice\YouTube Analytics\src
✅ Config import successful
✅ SparkUtils import successful
✅ YouTubeDataReader import successful
✅ All imports working correctly!


True

In [2]:
# Import production modules
from config.settings import Config
from src.utils.spark_utils import SparkUtils
from src.data_processing.pipeline import YouTubeDataPipeline
from src.analytics.trending_analysis import TrendingAnalyzer
from src.data_processing.transformers import YouTubeDataTransformer
from src.data_processing.cleaners import YouTubeDataCleaner
from src.data_ingestion.processed_data_loader import ProcessedDataLoader

In [3]:
# Create the shared objects used by every demo below: the Spark session,
# the data pipeline built on top of it, and the trending analyzer.
spark = SparkUtils.get_spark_session()
pipeline = YouTubeDataPipeline(spark)
analyzer = TrendingAnalyzer()

# Report the runtime version and the country codes the configuration exposes.
print(
    f"Spark version: {spark.version}",
    f"Available countries: {Config.COUNTRIES}",
    sep="\n",
)

Spark version: 3.5.6
Available countries: ['CA', 'DE', 'FR', 'GB', 'IN', 'JP', 'KR', 'MX', 'RU', 'US']


## Demo 1: Single Country Processing with Full Pipeline

In [4]:
# Process US data with full pipeline
print("Processing US data with full pipeline...")
# Cache the result: us_df is reused by most of the later demo cells, and Spark
# evaluates lazily, so without caching every action (count, show, ...) would
# recompute the pipeline's output from scratch.
us_df = pipeline.process_single_country_data('US', apply_cleaning=True).cache()

# count() is the first action on us_df, so it also materializes the cache.
print(f"Processed US data: {us_df.count()} rows")
us_df.printSchema()

Processing US data with full pipeline...
Processed US data: 40899 rows
root
 |-- video_id: string (nullable = true)
 |-- trending_date: string (nullable = true)
 |-- title: string (nullable = true)
 |-- channel_title: string (nullable = true)
 |-- category_id: string (nullable = true)
 |-- publish_time: string (nullable = true)
 |-- tags: string (nullable = true)
 |-- views: integer (nullable = true)
 |-- likes: integer (nullable = true)
 |-- dislikes: integer (nullable = true)
 |-- comment_count: integer (nullable = true)
 |-- thumbnail_link: string (nullable = true)
 |-- comments_disabled: boolean (nullable = true)
 |-- ratings_disabled: boolean (nullable = true)
 |-- video_error_or_removed: boolean (nullable = true)
 |-- description: string (nullable = true)
 |-- country: string (nullable = false)
 |-- category_name: string (nullable = true)
 |-- publish_time_parsed: timestamp (nullable = true)



In [None]:
# Preview five rows of the processed US data, restricted to a handful of columns
# and printed without truncating long values such as titles.
preview_columns = ["video_id", "title", "channel_title", "views", "likes", "category_name", "country"]
us_df.select(*preview_columns).show(n=5, truncate=False)

## Demo 2: Analytics on Processed Data

In [None]:
# Generate the summary report for the processed US data and print one line per entry.
summary = analyzer.generate_trending_summary_report(us_df)
print("US Trending Summary:")
for key, value in summary.items():
    # Numbers get thousands separators. bool is a subclass of int, so it is excluded
    # explicitly: formatting True with "," would print "1" instead of "True".
    is_number = isinstance(value, (int, float)) and not isinstance(value, bool)
    if is_number:
        print(f"  {key}: {value:,}")
    else:
        print(f"  {key}: {value}")

In [None]:
# Ask the analyzer for the top videos by views (limit of 10) in the US data
# and print them with full-width columns.
print("Top 10 Videos by Views:")
top_videos = analyzer.top_videos_by_views(us_df, limit=10)
top_videos.show(n=10, truncate=False)

In [None]:
# Run the analyzer's category performance analysis on the US data
# and display up to 15 rows of the result without truncation.
print("Category Performance Analysis:")
category_performance = analyzer.category_performance_analysis(us_df)
category_performance.show(n=15, truncate=False)

In [None]:
# Ask the analyzer for the top channels by trending count (limit of 10)
# in the US data and print them with full-width columns.
print("Top Channels by Trending Video Count:")
top_channels = analyzer.top_channels_by_trending_count(us_df, limit=10)
top_channels.show(n=10, truncate=False)

## Demo 3: Multi-Country Analysis

In [None]:
# Process data for multiple countries
print("Processing data for US, CA, and GB...")
# save_output=False keeps this demo in memory only. The result is cached because
# it feeds several Spark actions (the row count, the per-country breakdown below,
# and the country comparison cell); without caching each action would recompute
# the pipeline for all three countries.
multi_country_df = pipeline.run_full_pipeline(countries=['US', 'CA', 'GB'], save_output=False).cache()

# count() is the first action, so it also materializes the cache.
print(f"Multi-country data: {multi_country_df.count()} rows")
print("Countries in dataset:")
multi_country_df.groupBy("country").count().show()

In [None]:
# Country comparison analysis: run the analyzer on the combined multi-country
# DataFrame built in the previous cell and display the result.
print("Country Comparison Analysis:")
country_comparison = analyzer.country_comparison_analysis(multi_country_df)
# No row count is passed, so show() uses its default limit; truncate=False keeps
# long cell values intact.
country_comparison.show(truncate=False)

## Demo 4: Advanced Analytics

In [None]:
# Run the analyzer's engagement rate analysis on the US data and show ten rows
# of the rate columns next to the identifying video fields.
# NOTE(review): this cell does not sort; the "Top 10" heading presumably relies on
# engagement_rate_analysis returning rows already ordered by engagement — confirm.
print("Top 10 Videos by Engagement Rate:")
engagement_analysis = analyzer.engagement_rate_analysis(us_df)
engagement_columns = ["title", "channel_title", "views", "engagement_rate", "like_rate", "comment_rate"]
engagement_analysis.select(*engagement_columns).show(n=10, truncate=False)

In [None]:
# Run the analyzer's trending duration analysis on the US data and show ten rows.
# NOTE(review): no ordering is applied here; the "Most Trending Days" heading
# presumably relies on trending_duration_analysis returning sorted rows — confirm.
print("Videos with Most Trending Days:")
trending_duration = analyzer.trending_duration_analysis(us_df)
trending_duration.show(n=10, truncate=False)

## Demo 5: Data Quality Validation

In [None]:
# Validate the processed US data with the cleaner's quality check and list
# every metric it reports, one per line.
print("Data Quality Metrics:")
quality_metrics = YouTubeDataCleaner.validate_data_quality(us_df)
for metric_name, metric_value in quality_metrics.items():
    metric_line = f"  {metric_name}: {metric_value}"
    print(metric_line)

In [None]:
# Stop the Spark session now that all demos have run.
# Cells above that use `spark`, `pipeline`, or the DataFrames derived from them
# cannot be re-run after this point without re-executing the initialization cell.
SparkUtils.stop_spark_session()
print("Spark session stopped.")