# PHASE 1: DATA PREPROCESSING AND VISUALIZATION
YouTube Video NLP Analysis Project


In [1]:
# =============================================================================
# CELL 1: SETUP & IMPORTS
# =============================================================================
!pip install youtube-transcript-api==0.6.1
!pip install gensim
# Basic libraries
import warnings
warnings.filterwarnings('ignore')

import os
import re
import time
import json
import pandas as pd
import numpy as np
from collections import Counter
from typing import List, Dict, Tuple
from pathlib import Path

# YouTube & Text Processing
from youtube_transcript_api import YouTubeTranscriptApi
from youtube_transcript_api._errors import TranscriptsDisabled, NoTranscriptFound

# NLP Libraries
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
import spacy

# Feature Extraction
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from gensim.models import Word2Vec
try:
    from sentence_transformers import SentenceTransformer
except:
    print("Installing sentence-transformers...")
    !pip install sentence-transformers

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import plotly.express as px
import plotly.graph_objects as go

# Set style
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

# Download NLTK data
print("Downloading NLTK resources...")
for resource in ['punkt', 'stopwords', 'wordnet', 'averaged_perceptron_tagger']:
    nltk.download(resource, quiet=True)

# Load spaCy
print("Loading spaCy model...")
try:
    nlp = spacy.load('en_core_web_sm')
except:
    print("Installing spaCy model...")
    !python -m spacy download en_core_web_sm
    nlp = spacy.load('en_core_web_sm')

print("‚úì All imports successful!")

Downloading NLTK resources...
Loading spaCy model...
‚úì All imports successful!


In [2]:
# =============================================================================
# CELL 2: CONFIGURATION
# =============================================================================

# Project configuration
CONFIG = {
    'output_dir': 'phase1_outputs',
    'max_videos': 10,
    'embedding_dim': 100,
    'tfidf_max_features': 1000,
    'w2v_window': 5,
    'w2v_min_count': 2,
    'visualization_top_n': 20,
    'sentence_transformer_model': 'all-MiniLM-L6-v2'
}

# Create output directory
Path(CONFIG['output_dir']).mkdir(exist_ok=True)

print("Configuration loaded:")
for key, value in CONFIG.items():
    print(f"  {key}: {value}")

Configuration loaded:
  output_dir: phase1_outputs
  max_videos: 10
  embedding_dim: 100
  tfidf_max_features: 1000
  w2v_window: 5
  w2v_min_count: 2
  visualization_top_n: 20
  sentence_transformer_model: all-MiniLM-L6-v2


In [3]:
# =============================================================================
# CELL 3: YOUTUBE VIDEO LIST
# =============================================================================

# Educational YouTube Videos Dataset
# Add your video URLs here
VIDEO_URLS = [
    # Machine Learning & AI
    "https://www.youtube.com/watch?v=aircAruvnKk",  # 3Blue1Brown: Neural Networks
    "https://www.youtube.com/watch?v=rfscVS0vtbw",  # Python for Beginners

    #more URLs from:
    # - Khan Academy
    # - MIT OpenCourseWare
    # - CrashCourse
    # - 3Blue1Brown
    # - StatQuest
    # - Computerphile
]

print(f"Dataset: {len(VIDEO_URLS)} videos queued for processing")
print("Video IDs:")
for url in VIDEO_URLS:
    try:
        video_id = re.search(r'(?:v=|\/)([0-9A-Za-z_-]{11})', url).group(1)
        print(f"  - {video_id}")
    except:
        print(f"  - Invalid URL: {url}")


Dataset: 2 videos queued for processing
Video IDs:
  - aircAruvnKk
  - rfscVS0vtbw


In [4]:
# =============================================================================
# DIAGNOSTIC CELL: RUN THIS FIRST TO CHECK YOUR SETUP
# =============================================================================

print("="*80)
print("DIAGNOSTIC CHECK: YouTube Transcript API")
print("="*80)

# Check 1: Import test
print("\n[1/5] Checking import...")
try:
    from youtube_transcript_api import YouTubeTranscriptApi
    print("‚úì youtube_transcript_api imported successfully")
except ImportError as e:
    print("‚úó FAILED: youtube_transcript_api not installed")
    print("   Fix: pip install youtube-transcript-api")
    print(f"   Error: {e}")

# Check 2: Version check
print("\n[2/5] Checking version...")
try:
    import youtube_transcript_api
    version = youtube_transcript_api.__version__
    print(f"‚úì Version: {version}")

    # Parse version
    major, minor = map(int, version.split('.')[:2])
    if major == 0 and minor < 5:
        print(f"‚ö†Ô∏è  WARNING: Old version detected (v{version})")
        print("   Recommendation: pip install --upgrade youtube-transcript-api")
        print("   Some features may not be available")
    else:
        print(f"‚úì Version is recent (v{version})")
except:
    print("‚ö†Ô∏è  Cannot determine version")

# Check 3: Test simple extraction
print("\n[3/5] Testing simple extraction method...")
try:
    test_id = "aircAruvnKk"  # 3Blue1Brown video
    transcript = YouTubeTranscriptApi.get_transcript(test_id)
    print(f"‚úì Simple method works! Extracted {len(transcript)} segments")
    print(f"   Sample: {transcript[0]['text'][:50]}...")
except Exception as e:
    print(f"‚úó Simple method failed: {e}")

# Check 4: Test advanced extraction (if available)
print("\n[4/5] Testing advanced extraction method...")
try:
    test_id = "aircAruvnKk"
    transcript_list = YouTubeTranscriptApi.list_transcripts(test_id)
    print("‚úì Advanced method available!")
    print(f"   Available transcripts: {[t.language_code for t in transcript_list]}")
except AttributeError:
    print("‚ö†Ô∏è  Advanced method not available (old version)")
    print("   This is OK - simple method will be used")
except Exception as e:
    print(f"‚ö†Ô∏è  Advanced method test failed: {e}")

# Check 5: Test with your video URLs
print("\n[5/5] Testing with your video URLs...")
print(f"Found {len(VIDEO_URLS)} URLs to test")

test_results = []
for i, url in enumerate(VIDEO_URLS[:3], 1):  # Test first 3 only
    try:
        # Extract video ID
        import re
        match = re.search(r'(?:v=|\/)([0-9A-Za-z_-]{11})', url)
        if not match:
            print(f"  [{i}] ‚úó Invalid URL format: {url}")
            continue

        video_id = match.group(1)

        # Try extraction
        transcript = YouTubeTranscriptApi.get_transcript(video_id)
        words = sum(len(entry['text'].split()) for entry in transcript)
        print(f"  [{i}] ‚úì {video_id}: {words} words")
        test_results.append(True)

    except Exception as e:
        print(f"  [{i}] ‚úó {video_id}: {str(e)[:60]}...")
        test_results.append(False)

# Summary
print("\n" + "="*80)
print("DIAGNOSTIC SUMMARY")
print("="*80)

if all(test_results):
    print("‚úì ALL TESTS PASSED!")
    print("   Your setup is ready. Proceed with data extraction.")
elif any(test_results):
    print("‚ö†Ô∏è  PARTIAL SUCCESS")
    print(f"   {sum(test_results)}/{len(test_results)} videos extracted successfully")
    print("   Some videos may not have transcripts. Consider replacing failed URLs.")
else:
    print("‚úó SETUP ISSUES DETECTED")
    print("\nTroubleshooting steps:")
    print("1. Check internet connection")
    print("2. Verify video URLs are correct")
    print("3. Ensure videos have English transcripts (check on YouTube)")
    print("4. Try: pip install --upgrade youtube-transcript-api")
    print("5. Restart kernel after installing/upgrading")

print("="*80)

# Recommended action
print("\nüìã RECOMMENDED NEXT STEPS:")
if all(test_results):
    print("‚Üí Proceed to Cell 4 (Data Extraction)")
else:
    print("‚Üí Fix the issues above before proceeding")
    print("‚Üí Replace any failed video URLs with alternatives")
    print("‚Üí Ensure videos have captions/subtitles enabled")

DIAGNOSTIC CHECK: YouTube Transcript API

[1/5] Checking import...
‚úì youtube_transcript_api imported successfully

[2/5] Checking version...
‚ö†Ô∏è  Cannot determine version

[3/5] Testing simple extraction method...
‚úó Simple method failed: no element found: line 1, column 0

[4/5] Testing advanced extraction method...
‚úì Advanced method available!
   Available transcripts: ['ar', 'bn', 'zh', 'zh-CN', 'zh-TW', 'cs', 'en', 'fil', 'fr', 'de', 'el', 'iw', 'hi', 'hu', 'it', 'ja', 'ko', 'mr', 'fa', 'fa-IR', 'pl', 'pt', 'pt-BR', 'ro', 'ru', 'es', 'th', 'tr', 'uk', 'ur', 'en']

[5/5] Testing with your video URLs...
Found 2 URLs to test
  [1] ‚úó aircAruvnKk: no element found: line 1, column 0...
  [2] ‚úó rfscVS0vtbw: no element found: line 1, column 0...

DIAGNOSTIC SUMMARY
‚úó SETUP ISSUES DETECTED

Troubleshooting steps:
1. Check internet connection
2. Verify video URLs are correct
3. Ensure videos have English transcripts (check on YouTube)
4. Try: pip install --upgrade youtube-trans

In [8]:
# =============================================================================
# EMERGENCY FIX: Reinstall youtube-transcript-api
# RUN THIS CELL FIRST, THEN RESTART KERNEL
# =============================================================================

print("="*80)
print("FIXING youtube-transcript-api INSTALLATION")
print("="*80)

# Step 1: Uninstall old version
print("\n[1/3] Uninstalling old/broken version...")
import sys
import subprocess

try:
    subprocess.check_call([sys.executable, "-m", "pip", "uninstall", "-y", "youtube-transcript-api"])
    print("‚úì Old version removed")
except:
    print("‚ö†Ô∏è  No previous installation found (this is OK)")

# Step 2: Install fresh version
print("\n[2/3] Installing fresh version...")
try:
    subprocess.check_call([sys.executable, "-m", "pip", "install", "youtube-transcript-api==0.6.1"])
    print("‚úì youtube-transcript-api v0.6.1 installed successfully")
except Exception as e:
    print(f"‚úó Installation failed: {e}")
    print("\nManual fix required:")
    print("1. Open terminal/command prompt")
    print("2. Run: pip uninstall youtube-transcript-api")
    print("3. Run: pip install youtube-transcript-api==0.6.1")
    print("4. Restart Jupyter kernel")

# Step 3: Verify installation
print("\n[3/3] Verifying installation...")
try:
    # Force reload
    if 'youtube_transcript_api' in sys.modules:
        del sys.modules['youtube_transcript_api']

    from youtube_transcript_api import YouTubeTranscriptApi
    import youtube_transcript_api

    print(f"‚úì Import successful")
    print(f"‚úì Version: {youtube_transcript_api.__version__}")

    # Test extraction
    print("\n[TEST] Trying to extract a sample video...")
    test_transcript = YouTubeTranscriptApi.get_transcript("aircAruvnKk")
    print(f"‚úì SUCCESS! Extracted {len(test_transcript)} segments")
    print(f"‚úì Sample text: {test_transcript[0]['text'][:60]}...")

    print("\n" + "="*80)
    print("‚úÖ INSTALLATION FIXED!")
    print("="*80)
    print("\n‚ö†Ô∏è  IMPORTANT: RESTART YOUR JUPYTER KERNEL NOW")
    print("   Kernel ‚Üí Restart & Clear Output")
    print("\nThen proceed with Cell 4 (Data Extraction)")

except Exception as e:
    print(f"\n‚úó Verification failed: {e}")
    print("\n" + "="*80)
    print("MANUAL INSTALLATION REQUIRED")
    print("="*80)
    print("\nPlease follow these steps:")
    print("\n1. Stop this notebook")
    print("\n2. Open terminal/Anaconda Prompt and run:")
    print("   pip uninstall youtube-transcript-api")
    print("   pip install youtube-transcript-api==0.6.1")
    print("\n3. Restart Jupyter:")
    print("   jupyter notebook")
    print("\n4. Reopen this notebook")
    print("   Kernel ‚Üí Restart & Clear Output")

FIXING youtube-transcript-api INSTALLATION

[1/3] Uninstalling old/broken version...
‚úì Old version removed

[2/3] Installing fresh version...
‚úì youtube-transcript-api v0.6.1 installed successfully

[3/3] Verifying installation...

‚úó Verification failed: cannot import name 'TooManyRequests' from 'youtube_transcript_api._errors' (/usr/local/lib/python3.12/dist-packages/youtube_transcript_api/_errors.py)

MANUAL INSTALLATION REQUIRED

Please follow these steps:

1. Stop this notebook

2. Open terminal/Anaconda Prompt and run:
   pip uninstall youtube-transcript-api
   pip install youtube-transcript-api==0.6.1

3. Restart Jupyter:
   jupyter notebook

4. Reopen this notebook
   Kernel ‚Üí Restart & Clear Output


In [5]:
# =============================================================================
# CELL 4: DATA EXTRACTION
# =============================================================================

class YouTubeTranscriptExtractor:
    """Extract YouTube video transcripts with metadata"""

    def __init__(self):
        self.extracted_data = []

    @staticmethod
    def extract_video_id(url: str) -> str:
        """Extract video ID from YouTube URL"""
        patterns = [
            r'(?:v=|\/)([0-9A-Za-z_-]{11}).*',
            r'(?:embed\/)([0-9A-Za-z_-]{11})',
            r'^([0-9A-Za-z_-]{11})$'
        ]
        for pattern in patterns:
            match = re.search(pattern, url)
            if match:
                return match.group(1)
        raise ValueError(f"Invalid YouTube URL: {url}")

    def extract_transcript(self, video_url: str, language: str = 'en') -> Dict:
        """Extract transcript with full metadata"""
        try:
            video_id = self.extract_video_id(video_url)
            transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)

            # Prefer manual transcripts
            try:
                transcript = transcript_list.find_manually_created_transcript([language])
                is_auto = False
            except:
                transcript = transcript_list.find_generated_transcript([language])
                is_auto = True

            entries = transcript.fetch()
            full_text = ' '.join([entry['text'] for entry in entries])

            return {
                'video_id': video_id,
                'video_url': video_url,
                'transcript': full_text,
                'num_segments': len(entries),
                'duration_seconds': entries[-1]['start'] + entries[-1]['duration'] if entries else 0,
                'language': language,
                'is_auto_generated': is_auto,
                'word_count': len(full_text.split()),
                'char_count': len(full_text),
                'entries': entries  # Keep raw data
            }
        except Exception as e:
            raise Exception(f"Extraction failed for {video_url}: {str(e)}")

# Extract all transcripts
print("="*80)
print("STEP 1: EXTRACTING TRANSCRIPTS")
print("="*80)

extractor = YouTubeTranscriptExtractor()
transcripts_data = []
failed_extractions = []

for i, url in enumerate(VIDEO_URLS, 1):
    print(f"\n[{i}/{len(VIDEO_URLS)}] Processing: {url}")
    try:
        data = extractor.extract_transcript(url)
        transcripts_data.append(data)

        print(f"‚úì Success!")
        print(f"  Video ID: {data['video_id']}")
        print(f"  Duration: {data['duration_seconds']:.0f}s ({data['duration_seconds']/60:.1f}min)")
        print(f"  Words: {data['word_count']:,}")
        print(f"  Auto-generated: {data['is_auto_generated']}")

        time.sleep(1)  # Respectful rate limiting

    except Exception as e:
        print(f"‚úó Failed: {e}")
        failed_extractions.append({'url': url, 'error': str(e)})

print(f"\n{'='*80}")
print(f"EXTRACTION COMPLETE: {len(transcripts_data)}/{len(VIDEO_URLS)} successful")
print(f"{'='*80}")

# Summary statistics
if transcripts_data:
    total_words = sum(d['word_count'] for d in transcripts_data)
    total_duration = sum(d['duration_seconds'] for d in transcripts_data)

    print(f"\nDataset Summary:")
    print(f"  Total videos: {len(transcripts_data)}")
    print(f"  Total words: {total_words:,}")
    print(f"  Total duration: {total_duration/60:.1f} minutes")
    print(f"  Average words/video: {total_words/len(transcripts_data):.0f}")
    print(f"  Manual transcripts: {sum(not d['is_auto_generated'] for d in transcripts_data)}")
    print(f"  Auto-generated: {sum(d['is_auto_generated'] for d in transcripts_data)}")

# Save raw data
with open(f"{CONFIG['output_dir']}/raw_transcripts.json", 'w') as f:
    json.dump(transcripts_data, f, indent=2)
print(f"\n‚úì Saved: {CONFIG['output_dir']}/raw_transcripts.json")

STEP 1: EXTRACTING TRANSCRIPTS

[1/2] Processing: https://www.youtube.com/watch?v=aircAruvnKk
‚úó Failed: Extraction failed for https://www.youtube.com/watch?v=aircAruvnKk: no element found: line 1, column 0

[2/2] Processing: https://www.youtube.com/watch?v=rfscVS0vtbw
‚úó Failed: Extraction failed for https://www.youtube.com/watch?v=rfscVS0vtbw: no element found: line 1, column 0

EXTRACTION COMPLETE: 0/2 successful

‚úì Saved: phase1_outputs/raw_transcripts.json


In [6]:
# =============================================================================
# CELL 5: TEXT PREPROCESSING
# =============================================================================

class TextPreprocessor:
    """Comprehensive NLP preprocessing pipeline"""

    def __init__(self, remove_stopwords=True, apply_lemmatization=True):
        self.remove_stopwords = remove_stopwords
        self.apply_lemmatization = apply_lemmatization
        self.stop_words = set(stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()
        self.stemmer = PorterStemmer()

    def clean_text(self, text: str) -> str:
        """Clean text: URLs, timestamps, special chars"""
        text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
        text = re.sub(r'\[\d{2}:\d{2}\]', '', text)
        text = re.sub(r'[^\w\s.,!?-]', '', text)
        text = re.sub(r'\s+', ' ', text).strip()
        text = re.sub(r'([.!?])\1+', r'\1', text)
        return text

    def tokenize(self, text: str) -> List[str]:
        """NLTK tokenization with lowercasing"""
        return word_tokenize(text.lower())

    def remove_stopwords_func(self, tokens: List[str]) -> List[str]:
        """Remove English stopwords"""
        return [t for t in tokens if t not in self.stop_words]

    def remove_punctuation(self, tokens: List[str]) -> List[str]:
        """Remove non-alphanumeric tokens"""
        return [t for t in tokens if t.isalnum()]

    def lemmatize_tokens(self, tokens: List[str]) -> List[str]:
        """WordNet lemmatization"""
        return [self.lemmatizer.lemmatize(t) for t in tokens]

    def stem_tokens(self, tokens: List[str]) -> List[str]:
        """Porter stemming"""
        return [self.stemmer.stem(t) for t in tokens]

    def preprocess(self, text: str) -> Dict:
        """Complete preprocessing pipeline"""
        cleaned = self.clean_text(text)
        tokens = self.tokenize(cleaned)
        original_tokens = tokens.copy()

        tokens = self.remove_punctuation(tokens)
        tokens_no_stop = self.remove_stopwords_func(tokens) if self.remove_stopwords else tokens
        lemmatized = self.lemmatize_tokens(tokens_no_stop) if self.apply_lemmatization else tokens_no_stop
        stemmed = self.stem_tokens(tokens_no_stop)

        return {
            'original_text': text,
            'cleaned_text': cleaned,
            'tokens': original_tokens,
            'tokens_no_stopwords': tokens_no_stop,
            'lemmatized': lemmatized,
            'stemmed': stemmed,
            'processed_text': ' '.join(lemmatized),
            'word_count': len(lemmatized),
            'unique_words': len(set(lemmatized)),
            'lexical_diversity': len(set(lemmatized)) / len(lemmatized) if lemmatized else 0
        }

# Preprocess all transcripts
print("="*80)
print("STEP 2: TEXT PREPROCESSING")
print("="*80)

preprocessor = TextPreprocessor(remove_stopwords=True, apply_lemmatization=True)
processed_data = []

for i, data in enumerate(transcripts_data, 1):
    print(f"\n[{i}/{len(transcripts_data)}] Preprocessing: {data['video_id']}")

    result = preprocessor.preprocess(data['transcript'])
    result['video_id'] = data['video_id']
    result['video_url'] = data['video_url']
    result['duration_seconds'] = data['duration_seconds']
    result['is_auto_generated'] = data['is_auto_generated']

    processed_data.append(result)

    print(f"  Original: {data['word_count']:,} words")
    print(f"  After cleaning: {len(result['tokens_no_stopwords']):,} tokens")
    print(f"  After lemmatization: {result['word_count']:,} tokens")
    print(f"  Unique words: {result['unique_words']:,}")
    print(f"  Lexical diversity: {result['lexical_diversity']:.3f}")

print(f"\n{'='*80}")
print(f"PREPROCESSING COMPLETE")
print(f"{'='*80}")

# Preprocessing statistics
preprocessing_stats = pd.DataFrame([
    {
        'video_id': d['video_id'],
        'original_words': next(t['word_count'] for t in transcripts_data if t['video_id'] == d['video_id']),
        'processed_tokens': d['word_count'],
        'unique_tokens': d['unique_words'],
        'lexical_diversity': d['lexical_diversity'],
        'retention_rate': d['word_count'] / next(t['word_count'] for t in transcripts_data if t['video_id'] == d['video_id'])
    }
    for d in processed_data
])

print("\nPreprocessing Summary:")
print(preprocessing_stats.describe())

# Save processed data
with open(f"{CONFIG['output_dir']}/processed_transcripts.json", 'w') as f:
    json.dump(processed_data, f, indent=2)
print(f"\n‚úì Saved: {CONFIG['output_dir']}/processed_transcripts.json")

# =============================================================================
# CELL 6: TEXT ANALYSIS & STATISTICS
# =============================================================================

class TextAnalyzer:
    """Comprehensive text statistics and linguistic analysis"""

    @staticmethod
    def compute_statistics(text: str, tokens: List[str]) -> Dict:
        """Calculate text statistics"""
        sentences = sent_tokenize(text)

        stats = {
            'num_sentences': len(sentences),
            'num_tokens': len(tokens),
            'num_unique_tokens': len(set(tokens)),
            'avg_word_length': np.mean([len(w) for w in tokens]) if tokens else 0,
            'avg_sentence_length': len(tokens) / len(sentences) if sentences else 0,
            'lexical_diversity': len(set(tokens)) / len(tokens) if tokens else 0,
            'most_common_words': Counter(tokens).most_common(50)
        }

        sentence_lengths = [len(word_tokenize(s)) for s in sentences]
        stats['sentence_length_mean'] = np.mean(sentence_lengths)
        stats['sentence_length_std'] = np.std(sentence_lengths)
        stats['sentence_length_min'] = np.min(sentence_lengths)
        stats['sentence_length_max'] = np.max(sentence_lengths)

        return stats

    @staticmethod
    def pos_analysis(text: str) -> Dict:
        """POS tagging with spaCy"""
        doc = nlp(text[:1000000])  # Limit for performance
        pos_counts = Counter([token.pos_ for token in doc])

        return {
            'pos_distribution': dict(pos_counts),
            'num_nouns': pos_counts.get('NOUN', 0),
            'num_verbs': pos_counts.get('VERB', 0),
            'num_adjectives': pos_counts.get('ADJ', 0),
            'num_adverbs': pos_counts.get('ADV', 0)
        }

    @staticmethod
    def extract_entities(text: str) -> Dict:
        """Named Entity Recognition"""
        doc = nlp(text[:1000000])
        entities = [(ent.text, ent.label_) for ent in doc.ents]
        entity_counts = Counter([label for _, label in entities])

        return {
            'entities': entities[:100],
            'entity_distribution': dict(entity_counts),
            'num_entities': len(entities)
        }

# Analyze all texts
print("="*80)
print("STEP 3: TEXT ANALYSIS & STATISTICS")
print("="*80)

analyzer = TextAnalyzer()
analysis_results = []

for i, data in enumerate(processed_data, 1):
    print(f"\n[{i}/{len(processed_data)}] Analyzing: {data['video_id']}")

    stats = analyzer.compute_statistics(data['cleaned_text'], data['lemmatized'])
    pos_analysis = analyzer.pos_analysis(data['cleaned_text'][:100000])
    entities = analyzer.extract_entities(data['cleaned_text'][:100000])

    result = {
        'video_id': data['video_id'],
        'statistics': stats,
        'pos_analysis': pos_analysis,
        'entities': entities
    }

    analysis_results.append(result)

    print(f"  Sentences: {stats['num_sentences']}")
    print(f"  Tokens: {stats['num_tokens']:,}")
    print(f"  Lexical diversity: {stats['lexical_diversity']:.3f}")
    print(f"  Avg sentence length: {stats['avg_sentence_length']:.1f} words")
    print(f"  Named entities: {entities['num_entities']}")

# Save analysis results
with open(f"{CONFIG['output_dir']}/analysis_results.json", 'w') as f:
    json.dump(analysis_results, f, indent=2, default=str)

print(f"\n‚úì Analysis complete!")

# =============================================================================
# CELL 7: WORD EMBEDDINGS - TF-IDF
# =============================================================================

print("="*80)
print("STEP 4A: GENERATING TF-IDF EMBEDDINGS")
print("="*80)

# Prepare documents
documents = [d['processed_text'] for d in processed_data]

# TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer(
    max_features=CONFIG['tfidf_max_features'],
    ngram_range=(1, 2),
    min_df=2,
    max_df=0.8
)

tfidf_matrix = tfidf_vectorizer.fit_transform(documents)
feature_names = tfidf_vectorizer.get_feature_names_out()

print(f"‚úì TF-IDF Matrix Shape: {tfidf_matrix.shape}")
print(f"‚úì Vocabulary Size: {len(feature_names)}")
print(f"‚úì Sparsity: {(1.0 - tfidf_matrix.nnz / (tfidf_matrix.shape[0] * tfidf_matrix.shape[1])):.2%}")

# Top TF-IDF terms per document
print("\nTop 5 TF-IDF terms per video:")
for i, data in enumerate(processed_data):
    doc_tfidf = tfidf_matrix[i].toarray()[0]
    top_indices = doc_tfidf.argsort()[-5:][::-1]
    top_terms = [(feature_names[idx], doc_tfidf[idx]) for idx in top_indices]

    print(f"\n{data['video_id']}:")
    for term, score in top_terms:
        print(f"  {term}: {score:.4f}")

# =============================================================================
# CELL 8: WORD EMBEDDINGS - WORD2VEC
# =============================================================================

print("\n" + "="*80)
print("STEP 4B: TRAINING WORD2VEC EMBEDDINGS")
print("="*80)

# Prepare tokenized documents
tokenized_docs = [d['lemmatized'] for d in processed_data]

# Train Word2Vec
w2v_model = Word2Vec(
    sentences=tokenized_docs,
    vector_size=CONFIG['embedding_dim'],
    window=CONFIG['w2v_window'],
    min_count=CONFIG['w2v_min_count'],
    workers=4,
    sg=1,  # Skip-gram
    epochs=10
)

print(f"‚úì Word2Vec Model Trained")
print(f"‚úì Vocabulary Size: {len(w2v_model.wv):,}")
print(f"‚úì Vector Dimensions: {w2v_model.vector_size}")

# Test semantic relationships
test_words = ['learn', 'model', 'data', 'algorithm', 'neural']
available_test_words = [w for w in test_words if w in w2v_model.wv]

if available_test_words:
    print(f"\nSemantic Similarities:")
    for word in available_test_words[:3]:
        similar = w2v_model.wv.most_similar(word, topn=5)
        print(f"\n'{word}' is similar to:")
        for sim_word, score in similar:
            print(f"  {sim_word}: {score:.3f}")

# Save model
w2v_model.save(f"{CONFIG['output_dir']}/word2vec.model")
print(f"\n‚úì Saved: {CONFIG['output_dir']}/word2vec.model")

# =============================================================================
# CELL 9: SENTENCE EMBEDDINGS
# =============================================================================

print("\n" + "="*80)
print("STEP 4C: GENERATING SENTENCE EMBEDDINGS")
print("="*80)

# Initialize Sentence Transformer
sentence_model = SentenceTransformer(CONFIG['sentence_transformer_model'])

# Extract sentences from first video (demo)
sample_sentences = []
for data in processed_data[:2]:  # First 2 videos
    sents = sent_tokenize(data['cleaned_text'])[:20]  # First 20 sentences
    sample_sentences.extend(sents)

print(f"Encoding {len(sample_sentences)} sentences...")
sentence_embeddings = sentence_model.encode(sample_sentences, show_progress_bar=True)

print(f"‚úì Sentence Embeddings Shape: {sentence_embeddings.shape}")
print(f"‚úì Embedding Dimensions: {sentence_embeddings.shape[1]}")

# Save embeddings
np.save(f"{CONFIG['output_dir']}/sentence_embeddings.npy", sentence_embeddings)
with open(f"{CONFIG['output_dir']}/sentences.json", 'w') as f:
    json.dump(sample_sentences, f, indent=2)

print(f"‚úì Saved sentence embeddings")

# =============================================================================
# CELL 10: VISUALIZATION - WORD FREQUENCY
# =============================================================================

print("\n" + "="*80)
print("STEP 5: GENERATING VISUALIZATIONS")
print("="*80)

# Use first video for detailed visualization
first_video = analysis_results[0]
first_processed = processed_data[0]

# Word Frequency Bar Chart
fig, ax = plt.subplots(figsize=(14, 7))
words, counts = zip(*first_video['statistics']['most_common_words'][:CONFIG['visualization_top_n']])
bars = ax.bar(range(len(words)), counts, color='steelblue', edgecolor='navy', linewidth=1.2)
ax.set_xticks(range(len(words)))
ax.set_xticklabels(words, rotation=45, ha='right')
ax.set_xlabel('Words', fontsize=12, fontweight='bold')
ax.set_ylabel('Frequency', fontsize=12, fontweight='bold')
ax.set_title(f'Top {CONFIG["visualization_top_n"]} Most Frequent Words - {first_video["video_id"]}',
            fontsize=14, fontweight='bold')
ax.grid(axis='y', alpha=0.3)

# Add value labels on bars
for bar in bars:
    height = bar.get_height()
    ax.text(bar.get_x() + bar.get_width()/2., height,
           f'{int(height)}', ha='center', va='bottom', fontsize=9)

plt.tight_layout()
plt.savefig(f"{CONFIG['output_dir']}/word_frequency.png", dpi=300, bbox_inches='tight')
plt.show()

print("‚úì Saved: word_frequency.png")

# =============================================================================
# CELL 11: VISUALIZATION - WORD CLOUD
# =============================================================================

# Word Cloud
wordcloud = WordCloud(
    width=1600,
    height=800,
    background_color='white',
    colormap='viridis',
    max_words=100,
    relative_scaling=0.5,
    min_font_size=10
).generate(first_processed['processed_text'])

plt.figure(figsize=(16, 8))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title(f'Word Cloud - {first_video["video_id"]}', fontsize=16, fontweight='bold', pad=20)
plt.tight_layout()
plt.savefig(f"{CONFIG['output_dir']}/wordcloud.png", dpi=300, bbox_inches='tight')
plt.show()

print("‚úì Saved: wordcloud.png")

# =============================================================================
# CELL 12: VISUALIZATION - TEXT STATISTICS
# =============================================================================

# Text Statistics Dashboard
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

stats = first_video['statistics']

# 1. Basic Statistics
ax1 = axes[0, 0]
metrics = ['Sentences', 'Tokens', 'Unique\nTokens', 'Avg Word\nLength']
values = [
    stats['num_sentences'],
    stats['num_tokens'],
    stats['num_unique_tokens'],
    stats['avg_word_length']
]
colors = ['#FF6B6B', '#4ECDC4', '#45B7D1', '#FFA07A']
bars = ax1.barh(metrics, values, color=colors, edgecolor='black', linewidth=1.5)
ax1.set_xlabel('Count / Value', fontsize=11, fontweight='bold')
ax1.set_title('Text Statistics Overview', fontsize=13, fontweight='bold')
ax1.grid(axis='x', alpha=0.3)

for i, (bar, val) in enumerate(zip(bars, values)):
    ax1.text(val, i, f'  {val:.1f}', va='center', fontsize=10, fontweight='bold')

# 2. Lexical Diversity
ax2 = axes[0, 1]
diversity_val = stats['lexical_diversity']
ax2.bar(['Lexical Diversity'], [diversity_val], color='#95E1D3', edgecolor='black', linewidth=2)
ax2.set_ylim(0, 1)
ax2.set_ylabel('Ratio', fontsize=11, fontweight='bold')
ax2.set_title('Lexical Diversity (Unique/Total)', fontsize=13, fontweight='bold')
ax2.axhline(y=diversity_val, color='red', linestyle='--', linewidth=2, alpha=0.7)
ax2.text(0, diversity_val + 0.05, f'{diversity_val:.3f}', ha='center', fontsize=14, fontweight='bold')
ax2.grid(axis='y', alpha=0.3)

# 3. Sentence Length Statistics
ax3 = axes[1, 0]
sent_metrics = ['Mean', 'Std Dev', 'Min', 'Max']
sent_values = [
    stats['sentence_length_mean'],
    stats['sentence_length_std'],
    stats['sentence_length_min'],
    stats['sentence_length_max']
]
colors3 = ['#A8E6CF', '#FFD3B6', '#FFAAA5', '#FF8B94']
ax3.bar(sent_metrics, sent_values, color=colors3, edgecolor='black', linewidth=1.5)
ax3.set_ylabel('Words per Sentence', fontsize=11, fontweight='bold')
ax3.set_title('Sentence Length Statistics', fontsize=13, fontweight='bold')
ax3.grid(axis='y', alpha=0.3)

for i, val in enumerate(sent_values):
    ax3.text(i, val, f'{val:.1f}', ha='center', va='bottom', fontsize=10, fontweight='bold')

# 4. Top Words Pie Chart
ax4 = axes[1, 1]
top_5 = stats['most_common_words'][:5]
labels, sizes = zip(*top_5)
colors4 = ['#FF6B6B', '#4ECDC4', '#45B7D1', '#FFA07A', '#95E1D3']
explode = (0.05, 0, 0, 0, 0)
ax4.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=90,
       colors=colors4, explode=explode, shadow=True, textprops={'fontsize': 10, 'fontweight': 'bold'})
ax4.set_title('Top 5 Words Distribution', fontsize=13, fontweight='bold')

plt.suptitle(f'Comprehensive Text Analysis - {first_video["video_id"]}',
            fontsize=16, fontweight='bold', y=1.00)
plt.tight_layout()
plt.savefig(f"{CONFIG['output_dir']}/text_statistics.png", dpi=300, bbox_inches='tight')
plt.show()

print("‚úì Saved: text_statistics.png")

# =============================================================================
# CELL 13: VISUALIZATION - POS DISTRIBUTION
# =============================================================================

# POS Distribution
pos_dist = first_video['pos_analysis']['pos_distribution']

if pos_dist:
    pos_tags = list(pos_dist.keys())
    counts = list(pos_dist.values())

    plt.figure(figsize=(14, 7))
    bars = plt.bar(pos_tags, counts, color='coral', edgecolor='darkred', linewidth=1.5)
    plt.xlabel('Part-of-Speech Tags', fontsize=12, fontweight='bold')
    plt.ylabel('Count', fontsize=12, fontweight='bold')
    plt.title(f'Part-of-Speech Distribution - {first_video["video_id"]}',
             fontsize=14, fontweight='bold')
    plt.xticks(rotation=45, ha='right')
    plt.grid(axis='y', alpha=0.3)

    # Add value labels
    for bar in bars:
        height = bar.get_height()
        plt.text(bar.get_x() + bar.get_width()/2., height,
                f'{int(height)}', ha='center', va='bottom', fontsize=9)

    plt.tight_layout()
    plt.savefig(f"{CONFIG['output_dir']}/pos_distribution.png", dpi=300, bbox_inches='tight')
    plt.show()

    print("‚úì Saved: pos_distribution.png")

# =============================================================================
# CELL 14: VISUALIZATION - NAMED ENTITIES
# =============================================================================

# Named Entities Distribution
entity_dist = first_video['entities']['entity_distribution']

if entity_dist:
    # Filter to top entity types
    sorted_entities = sorted(entity_dist.items(), key=lambda x: x[1], reverse=True)[:10]
    labels, counts = zip(*sorted_entities)

    plt.figure(figsize=(12, 7))
    bars = plt.barh(range(len(labels)), counts, color='teal', edgecolor='darkslategray', linewidth=1.5)
    plt.yticks(range(len(labels)), labels)
    plt.xlabel('Count', fontsize=12, fontweight='bold')
    plt.ylabel('Entity Type', fontsize=12, fontweight='bold')
    plt.title(f'Named Entity Distribution - {first_video["video_id"]}',
             fontsize=14, fontweight='bold')
    plt.grid(axis='x', alpha=0.3)

    # Add value labels
    for i, (bar, val) in enumerate(zip(bars, counts)):
        plt.text(val, i, f'  {val}', va='center', fontsize=10, fontweight='bold')

    plt.tight_layout()
    plt.savefig(f"{CONFIG['output_dir']}/entity_distribution.png", dpi=300, bbox_inches='tight')
    plt.show()

    print("‚úì Saved: entity_distribution.png")

    # Sample entities
    print("\nSample Named Entities:")
    for entity, label in first_video['entities']['entities'][:15]:
        print(f"  {entity} ({label})")

# =============================================================================
# CELL 15: VISUALIZATION - PCA EMBEDDINGS
# =============================================================================

# PCA Visualization of Word Embeddings
print("\nGenerating PCA visualization...")

if len(w2v_model.wv) > 50:
    # Get top words for visualization
    top_words = [word for word, _ in first_video['statistics']['most_common_words'][:100]
                if word in w2v_model.wv][:50]

    word_vectors = np.array([w2v_model.wv[word] for word in top_words])

    # Apply PCA
    pca = PCA(n_components=2, random_state=42)
    embeddings_2d = pca.fit_transform(word_vectors)

    # Create visualization
    plt.figure(figsize=(16, 12))
    scatter = plt.scatter(embeddings_2d[:, 0], embeddings_2d[:, 1],
                         c=range(len(embeddings_2d)), cmap='viridis',
                         alpha=0.7, s=150, edgecolors='black', linewidth=1.5)

    # Add word labels
    for i, word in enumerate(top_words):
        plt.annotate(word, (embeddings_2d[i, 0], embeddings_2d[i, 1]),
                   fontsize=9, alpha=0.8, fontweight='bold',
                   bbox=dict(boxstyle='round,pad=0.3', facecolor='yellow', alpha=0.3))

    plt.colorbar(scatter, label='Word Index')
    plt.xlabel(f'PC1 ({pca.explained_variance_ratio_[0]:.1%} variance)',
              fontsize=12, fontweight='bold')
    plt.ylabel(f'PC2 ({pca.explained_variance_ratio_[1]:.1%} variance)',
              fontsize=12, fontweight='bold')
    plt.title('Word Embeddings Visualization using PCA (Word2Vec)',
             fontsize=14, fontweight='bold')
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.savefig(f"{CONFIG['output_dir']}/embeddings_pca.png", dpi=300, bbox_inches='tight')
    plt.show()

    print(f"‚úì Saved: embeddings_pca.png")
    print(f"‚úì Total variance explained: {sum(pca.explained_variance_ratio_):.1%}")

# =============================================================================
# CELL 16: VISUALIZATION - t-SNE EMBEDDINGS
# =============================================================================

# t-SNE Visualization of Word Embeddings
print("\nGenerating t-SNE visualization...")

if len(w2v_model.wv) > 50:
    # Use same top words
    top_words_tsne = [word for word, _ in first_video['statistics']['most_common_words'][:80]
                     if word in w2v_model.wv][:60]

    word_vectors_tsne = np.array([w2v_model.wv[word] for word in top_words_tsne])

    # Apply t-SNE
    tsne = TSNE(n_components=2, perplexity=20, random_state=42, n_iter=1000,
                learning_rate=200)
    embeddings_tsne = tsne.fit_transform(word_vectors_tsne)

    # Create visualization
    plt.figure(figsize=(18, 14))
    scatter = plt.scatter(embeddings_tsne[:, 0], embeddings_tsne[:, 1],
                         c=range(len(embeddings_tsne)), cmap='plasma',
                         alpha=0.7, s=180, edgecolors='black', linewidth=1.5)

    # Add word labels with better positioning
    for i, word in enumerate(top_words_tsne):
        plt.annotate(word, (embeddings_tsne[i, 0], embeddings_tsne[i, 1]),
                   fontsize=10, alpha=0.9, fontweight='bold',
                   bbox=dict(boxstyle='round,pad=0.4', facecolor='lightblue',
                            edgecolor='navy', alpha=0.6))

    plt.colorbar(scatter, label='Word Index')
    plt.xlabel('t-SNE Dimension 1', fontsize=12, fontweight='bold')
    plt.ylabel('t-SNE Dimension 2', fontsize=12, fontweight='bold')
    plt.title('Word Embeddings Visualization using t-SNE (Word2Vec)',
             fontsize=14, fontweight='bold')
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.savefig(f"{CONFIG['output_dir']}/embeddings_tsne.png", dpi=300, bbox_inches='tight')
    plt.show()

    print("‚úì Saved: embeddings_tsne.png")

# =============================================================================
# CELL 17: VISUALIZATION - WORD SIMILARITY HEATMAP
# =============================================================================

# Word Similarity Heatmap
print("\nGenerating word similarity heatmap...")

if len(w2v_model.wv) > 20:
    # Select important words for similarity analysis
    important_words = ['learn', 'model', 'data', 'algorithm', 'neural',
                      'train', 'network', 'function', 'system', 'problem']

    # Filter to words in vocabulary
    available_words = [w for w in important_words if w in w2v_model.wv]

    if len(available_words) >= 5:
        # Compute similarity matrix
        similarity_matrix = np.zeros((len(available_words), len(available_words)))

        for i, word1 in enumerate(available_words):
            for j, word2 in enumerate(available_words):
                similarity_matrix[i, j] = w2v_model.wv.similarity(word1, word2)

        # Create heatmap
        plt.figure(figsize=(12, 10))
        sns.heatmap(similarity_matrix,
                   xticklabels=available_words,
                   yticklabels=available_words,
                   annot=True,
                   fmt='.3f',
                   cmap='coolwarm',
                   center=0.5,
                   vmin=0,
                   vmax=1,
                   cbar_kws={'label': 'Cosine Similarity'},
                   linewidths=1,
                   linecolor='gray')

        plt.title('Word Similarity Heatmap (Word2Vec Cosine Similarity)',
                 fontsize=14, fontweight='bold', pad=15)
        plt.xlabel('Words', fontsize=12, fontweight='bold')
        plt.ylabel('Words', fontsize=12, fontweight='bold')
        plt.xticks(rotation=45, ha='right')
        plt.yticks(rotation=0)
        plt.tight_layout()
        plt.savefig(f"{CONFIG['output_dir']}/similarity_heatmap.png", dpi=300, bbox_inches='tight')
        plt.show()

        print("‚úì Saved: similarity_heatmap.png")

# =============================================================================
# CELL 18: VISUALIZATION - SENTENCE EMBEDDINGS (t-SNE)
# =============================================================================

# Sentence Embeddings Visualization
print("\nGenerating sentence embeddings visualization...")

if len(sentence_embeddings) >= 10:
    # Apply t-SNE to sentence embeddings
    perplexity_val = min(5, len(sentence_embeddings) - 1)
    tsne_sent = TSNE(n_components=2, perplexity=perplexity_val,
                    random_state=42, n_iter=1000)
    sent_embeddings_2d = tsne_sent.fit_transform(sentence_embeddings)

    # Create visualization
    plt.figure(figsize=(16, 12))

    # Color by sentence index (proxy for temporal order)
    colors = np.arange(len(sent_embeddings_2d))
    scatter = plt.scatter(sent_embeddings_2d[:, 0], sent_embeddings_2d[:, 1],
                         c=colors, cmap='rainbow', alpha=0.6, s=100,
                         edgecolors='black', linewidth=1)

    # Add sentence preview labels for first 15
    for i in range(min(15, len(sample_sentences))):
        preview = sample_sentences[i][:40] + "..." if len(sample_sentences[i]) > 40 else sample_sentences[i]
        plt.annotate(f"{i+1}: {preview}",
                   (sent_embeddings_2d[i, 0], sent_embeddings_2d[i, 1]),
                   fontsize=7, alpha=0.7,
                   bbox=dict(boxstyle='round,pad=0.3', facecolor='white',
                            edgecolor='gray', alpha=0.7))

    plt.colorbar(scatter, label='Sentence Position in Video')
    plt.xlabel('t-SNE Dimension 1', fontsize=12, fontweight='bold')
    plt.ylabel('t-SNE Dimension 2', fontsize=12, fontweight='bold')
    plt.title('Sentence Embeddings Visualization (Sentence-BERT + t-SNE)',
             fontsize=14, fontweight='bold')
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.savefig(f"{CONFIG['output_dir']}/sentence_embeddings_tsne.png", dpi=300, bbox_inches='tight')
    plt.show()

    print("‚úì Saved: sentence_embeddings_tsne.png")

# =============================================================================
# CELL 19: COMPARATIVE ANALYSIS ACROSS VIDEOS
# =============================================================================

print("\n" + "="*80)
print("COMPARATIVE ANALYSIS ACROSS ALL VIDEOS")
print("="*80)

# Create comprehensive comparison DataFrame
comparison_df = pd.DataFrame([
    {
        'Video ID': r['video_id'],
        'Sentences': r['statistics']['num_sentences'],
        'Tokens': r['statistics']['num_tokens'],
        'Unique Tokens': r['statistics']['num_unique_tokens'],
        'Lexical Diversity': r['statistics']['lexical_diversity'],
        'Avg Word Length': r['statistics']['avg_word_length'],
        'Avg Sent Length': r['statistics']['avg_sentence_length'],
        'Named Entities': r['entities']['num_entities'],
        'Nouns': r['pos_analysis']['num_nouns'],
        'Verbs': r['pos_analysis']['num_verbs']
    }
    for r in analysis_results
])

print("\nDataset Overview:")
print(comparison_df.to_string(index=False))

print("\nStatistical Summary:")
print(comparison_df.describe())

# Save comparison
comparison_df.to_csv(f"{CONFIG['output_dir']}/video_comparison.csv", index=False)
print(f"\n‚úì Saved: video_comparison.csv")

# Comparative visualizations
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# 1. Token counts comparison
ax1 = axes[0, 0]
ax1.bar(range(len(comparison_df)), comparison_df['Tokens'],
       color='steelblue', edgecolor='navy', linewidth=1.5)
ax1.set_xticks(range(len(comparison_df)))
ax1.set_xticklabels(comparison_df['Video ID'], rotation=45, ha='right')
ax1.set_ylabel('Token Count', fontsize=11, fontweight='bold')
ax1.set_title('Token Count per Video', fontsize=12, fontweight='bold')
ax1.grid(axis='y', alpha=0.3)

# 2. Lexical diversity comparison
ax2 = axes[0, 1]
ax2.bar(range(len(comparison_df)), comparison_df['Lexical Diversity'],
       color='coral', edgecolor='darkred', linewidth=1.5)
ax2.set_xticks(range(len(comparison_df)))
ax2.set_xticklabels(comparison_df['Video ID'], rotation=45, ha='right')
ax2.set_ylabel('Lexical Diversity', fontsize=11, fontweight='bold')
ax2.set_title('Lexical Diversity per Video', fontsize=12, fontweight='bold')
ax2.grid(axis='y', alpha=0.3)

# 3. Named entities comparison
ax3 = axes[1, 0]
ax3.bar(range(len(comparison_df)), comparison_df['Named Entities'],
       color='teal', edgecolor='darkslategray', linewidth=1.5)
ax3.set_xticks(range(len(comparison_df)))
ax3.set_xticklabels(comparison_df['Video ID'], rotation=45, ha='right')
ax3.set_ylabel('Entity Count', fontsize=11, fontweight='bold')
ax3.set_title('Named Entities per Video', fontsize=12, fontweight='bold')
ax3.grid(axis='y', alpha=0.3)

# 4. POS distribution comparison (Nouns vs Verbs)
ax4 = axes[1, 1]
x = np.arange(len(comparison_df))
width = 0.35
ax4.bar(x - width/2, comparison_df['Nouns'], width, label='Nouns',
       color='#95E1D3', edgecolor='black', linewidth=1)
ax4.bar(x + width/2, comparison_df['Verbs'], width, label='Verbs',
       color='#FFD3B6', edgecolor='black', linewidth=1)
ax4.set_xticks(x)
ax4.set_xticklabels(comparison_df['Video ID'], rotation=45, ha='right')
ax4.set_ylabel('Count', fontsize=11, fontweight='bold')
ax4.set_title('Nouns vs Verbs per Video', fontsize=12, fontweight='bold')
ax4.legend()
ax4.grid(axis='y', alpha=0.3)

plt.suptitle('Comparative Analysis Across Videos', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.savefig(f"{CONFIG['output_dir']}/comparative_analysis.png", dpi=300, bbox_inches='tight')
plt.show()

print("‚úì Saved: comparative_analysis.png")

# =============================================================================
# CELL 20: FINAL SUMMARY & EXPORT
# =============================================================================

print("\n" + "="*80)
print("FINAL SUMMARY & REPORT GENERATION")
print("="*80)

# Generate comprehensive summary
final_summary = {
    'project_info': {
        'phase': 'Phase 1 - Data Preprocessing and Visualization',
        'videos_processed': len(transcripts_data),
        'total_words': sum(d['word_count'] for d in transcripts_data),
        'total_duration_minutes': sum(d['duration_seconds'] for d in transcripts_data) / 60
    },
    'preprocessing_summary': {
        'total_tokens_after_processing': sum(d['word_count'] for d in processed_data),
        'unique_vocabulary': len(set(word for d in processed_data for word in d['lemmatized'])),
        'average_lexical_diversity': np.mean([d['lexical_diversity'] for d in processed_data])
    },
    'embeddings_generated': {
        'tfidf_dimensions': tfidf_matrix.shape,
        'word2vec_vocabulary': len(w2v_model.wv),
        'word2vec_dimensions': w2v_model.vector_size,
        'sentence_embeddings_count': len(sentence_embeddings),
        'sentence_embedding_dimensions': sentence_embeddings.shape[1]
    },
    'visualizations_created': [
        'word_frequency.png',
        'wordcloud.png',
        'text_statistics.png',
        'pos_distribution.png',
        'entity_distribution.png',
        'embeddings_pca.png',
        'embeddings_tsne.png',
        'similarity_heatmap.png',
        'sentence_embeddings_tsne.png',
        'comparative_analysis.png'
    ],
    'key_findings': {
        'most_common_words': [word for word, _ in first_video['statistics']['most_common_words'][:10]],
        'average_sentence_length': np.mean([r['statistics']['avg_sentence_length'] for r in analysis_results]),
        'total_named_entities': sum(r['entities']['num_entities'] for r in analysis_results)
    }
}

# Save final summary
with open(f"{CONFIG['output_dir']}/phase1_final_summary.json", 'w') as f:
    json.dump(final_summary, f, indent=2, default=str)

print("\nPhase 1 Summary:")
print(json.dumps(final_summary, indent=2, default=str))

print("\n" + "="*80)
print("‚úì PHASE 1 COMPLETED SUCCESSFULLY!")
print("="*80)

print("\nDeliverables:")
print(f"  1. Processed Transcripts: {CONFIG['output_dir']}/processed_transcripts.json")
print(f"  2. Analysis Results: {CONFIG['output_dir']}/analysis_results.json")
print(f"  3. Word2Vec Model: {CONFIG['output_dir']}/word2vec.model")
print(f"  4. Sentence Embeddings: {CONFIG['output_dir']}/sentence_embeddings.npy")
print(f"  5. Visualizations: {CONFIG['output_dir']}/*.png (10 files)")
print(f"  6. Comparison Data: {CONFIG['output_dir']}/video_comparison.csv")
print(f"  7. Final Summary: {CONFIG['output_dir']}/phase1_final_summary.json")

print("\nNext Steps:")
print("  ‚Üí Prepare Phase 1 Report (max 2 pages)")
print("  ‚Üí Include key visualizations in report")
print("  ‚Üí Document preprocessing decisions and observations")
print("  ‚Üí Proceed to Phase 2: Model Selection & Training")

# =============================================================================
# CELL 21: EXPORT FOR PHASE 2
# =============================================================================

print("\n" + "="*80)
print("PREPARING DATA FOR PHASE 2")
print("="*80)

# Package data for Phase 2
phase2_data = {
    'processed_texts': [d['processed_text'] for d in processed_data],
    'lemmatized_tokens': [d['lemmatized'] for d in processed_data],
    'video_ids': [d['video_id'] for d in processed_data],
    'tfidf_matrix': tfidf_matrix.toarray().tolist(),  # Convert sparse to dense
    'tfidf_features': feature_names.tolist(),
    'word2vec_model_path': f"{CONFIG['output_dir']}/word2vec.model",
    'sentence_embeddings_path': f"{CONFIG['output_dir']}/sentence_embeddings.npy"
}

# Save for Phase 2
with open(f"{CONFIG['output_dir']}/phase2_input_data.json", 'w') as f:
    json.dump(phase2_data, f, indent=2)

print(f"‚úì Phase 2 input data prepared: {CONFIG['output_dir']}/phase2_input_data.json")

print("\n" + "="*80)
print("ALL TASKS COMPLETED!")
print("="*80)

STEP 2: TEXT PREPROCESSING

PREPROCESSING COMPLETE

Preprocessing Summary:


ValueError: Cannot describe a DataFrame without columns