## Download datasets to analyze
The Kaggle-hosted datasets are downloaded with the `kagglehub` package (the IMDb TSV files are fetched directly over HTTPS). Install it with:
```bash
pip install "kagglehub[pandas-datasets]"
```

In [6]:
import shutil
import gzip
import requests
from pathlib import Path

# Local directories that receive the downloaded files. The IMDB folder is
# derived from the root so the two paths cannot drift apart.
datasets_dir = Path("Datasets")
imdb_dir = datasets_dir / "IMDB"

# Headers to mimic a browser request.
# 'br' is deliberately not advertised in Accept-Encoding: requests/urllib3 can
# only decode Brotli when an optional brotli package is installed, so offering
# it could leave an undecoded response body written to disk.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'Accept': '*/*',
    'Accept-Encoding': 'gzip, deflate',
}


def download_file_with_progress(url: str, dest_path: Path):
    """Stream ``url`` into ``dest_path`` while printing download progress.

    The body is fetched in 1 MB chunks so large files are never held in
    memory. The HTTP session and the streamed response are both closed when
    the function exits, whether the download succeeded or failed.

    Args:
        url: Address of the file to download.
        dest_path: Local file to (over)write with the downloaded bytes.

    Returns:
        True when the download completed.

    Raises:
        requests.exceptions.Timeout: The connect (10s) or read (300s) timeout
            elapsed.
        requests.exceptions.HTTPError: The server answered with an error
            status.
        requests.exceptions.ConnectionError: The connection could not be
            established or was lost.
        Each of the above is reported on stdout and then re-raised. Any other
        exception propagates without a message. A partially written
        ``dest_path`` is left for the caller to clean up.
    """
    print(f"  Downloading from: {url}")

    try:
        # Context managers release the pooled connections and the streamed
        # response even when the transfer fails part-way through.
        with requests.Session() as session:
            session.headers.update(HEADERS)

            # 10s connect, 300s read timeout
            with session.get(url, stream=True, timeout=(10, 300)) as response:
                response.raise_for_status()

                total_size = int(response.headers.get('content-length', 0))
                if total_size > 0:
                    print(f"  File size: {total_size / (1024*1024):.1f} MB (compressed)")
                else:
                    # Without Content-Length a percentage cannot be computed.
                    print("  File size: unknown (no Content-Length header)")
                downloaded = 0

                with open(dest_path, 'wb') as f:
                    for chunk in response.iter_content(chunk_size=1024 * 1024):  # 1MB chunks
                        if not chunk:
                            continue
                        f.write(chunk)
                        downloaded += len(chunk)
                        downloaded_mb = downloaded / (1024*1024)
                        if total_size > 0:
                            progress = (downloaded / total_size) * 100
                            print(f"\r  Progress: {progress:.1f}% ({downloaded_mb:.1f} MB)", end="", flush=True)
                        else:
                            print(f"\r  Downloaded: {downloaded_mb:.1f} MB", end="", flush=True)

        print()  # New line after progress
        return True

    except requests.exceptions.Timeout:
        print("\n  ERROR: Download timed out")
        raise
    except requests.exceptions.HTTPError as e:
        print(f"\n  ERROR: HTTP error {e.response.status_code}: {e.response.reason}")
        raise
    except requests.exceptions.ConnectionError as e:
        print(f"\n  ERROR: Connection failed: {e}")
        raise


def download_from_kaggle(kaggle_dataset: str, target_filename: str, dest_path: Path):
    """Fetch a Kaggle dataset via kagglehub and copy one CSV out of it.

    The CSV is chosen by, in order: an exact file-name match, a
    case-insensitive file-name match, or — when the dataset holds exactly one
    CSV — that single file.

    Args:
        kaggle_dataset: Dataset handle passed to ``kagglehub.dataset_download``.
        target_filename: Name of the CSV file wanted from the dataset.
        dest_path: Where the selected CSV is copied to.

    Returns:
        True once the file has been copied.

    Raises:
        ImportError: kagglehub is not installed (it is imported lazily here).
        FileNotFoundError: No CSV in the dataset could be selected.
    """
    import kagglehub

    print(f"  Downloading dataset: {kaggle_dataset}")

    # kagglehub hands back the directory of its local cache for this dataset.
    cache_path = Path(kagglehub.dataset_download(kaggle_dataset))
    print(f"  Downloaded to cache: {cache_path}")

    # Collect every CSV anywhere below the cache directory.
    candidates = [
        entry
        for entry in list(cache_path.rglob("*"))
        if entry.is_file() and entry.suffix.lower() == '.csv'
    ]
    print(f"  Found {len(candidates)} CSV file(s) in dataset")

    # 1) exact name, 2) same name ignoring case, 3) the lone CSV if there is one.
    chosen = next((c for c in candidates if c.name == target_filename), None)
    if chosen is None:
        chosen = next(
            (c for c in candidates if c.name.lower() == target_filename.lower()),
            None,
        )
    if chosen is None and len(candidates) == 1:
        chosen = candidates[0]
        print(f"  Using only CSV file found: {chosen.name}")

    if chosen is None:
        preview = [c.name for c in candidates[:5]]  # Show first 5
        raise FileNotFoundError(
            f"Could not find '{target_filename}' in dataset. "
            f"Available CSV files: {preview}"
        )

    print(f"  Copying {chosen.name} -> {dest_path}")
    shutil.copy2(chosen, dest_path)
    return True


def download_all_datasets():
    """Make sure every required dataset file exists locally.

    Creates the dataset folders when needed, then walks a fixed list of
    datasets. A file that already exists is skipped (only its presence is
    checked, not its content). A missing file is downloaded either from
    Kaggle or from a direct URL; ``.gz`` URL downloads flagged with
    ``decompress`` are extracted and the temporary archive is removed.
    A failure for one dataset is reported, its partial files are deleted, and
    processing continues with the next dataset.

    Returns:
        dict with the integer counts ``"total"``, ``"skipped"``,
        ``"downloaded"`` and ``"failed"``.
    """
    print("Dataset downloading started.")

    # mkdir(exist_ok=True) alone would be enough; the exists() checks only
    # decide which status message is printed.
    if not datasets_dir.exists():
        print(f"Creating Datasets folder: {datasets_dir}")
        datasets_dir.mkdir(parents=True, exist_ok=True)
    else:
        print(f"Datasets folder exists: {datasets_dir}")

    if not imdb_dir.exists():
        print(f"Creating IMDB folder: {imdb_dir}")
        imdb_dir.mkdir(parents=True, exist_ok=True)
    else:
        print(f"IMDB folder exists: {imdb_dir}")

    print()

    total_downloaded = 0
    total_skipped = 0
    total_failed = 0

    # Define all datasets with their download methods
    datasets_to_check = [
        {
            "name": "actorfilms.csv",
            "path": imdb_dir / "actorfilms.csv",
            "source": "kaggle",
            "kaggle_dataset": "darinhawley/imdb-films-by-actor-for-10k-actors",
            "kaggle_filename": "actorfilms.csv",
        },
        {
            "name": "Celebrity.csv",
            "path": datasets_dir / "Celebrity.csv",
            "source": "kaggle",
            "kaggle_dataset": "madhuripanchakshri/top-10000-celebrities-dataset",
            "kaggle_filename": "Celebrity.csv",
        },
        {
            "name": "IMDb movies.csv",
            "path": datasets_dir / "IMDb movies.csv",
            "source": "kaggle",
            "kaggle_dataset": "simhyunsu/imdbextensivedataset",
            "kaggle_filename": "IMDb movies.csv",
        },
        {
            "name": "title.crew.tsv",
            "path": imdb_dir / "title.crew.tsv",
            "source": "url",
            # Must be an absolute URL: a scheme-less value is rejected by
            # requests before any network access happens.
            "url": "https://datasets.imdbws.com/title.crew.tsv.gz",
            "decompress": True,
        },
        {
            "name": "name.basics.tsv",
            "path": imdb_dir / "name.basics.tsv",
            "source": "url",
            "url": "https://datasets.imdbws.com/name.basics.tsv.gz",
            "decompress": True,
        },
    ]

    # Check and download each dataset
    for dataset_info in datasets_to_check:
        name = dataset_info["name"]
        path = dataset_info["path"]

        print(f"Checking: {name}")

        # Check if dataset already exists
        if path.exists():
            file_size = path.stat().st_size / (1024 * 1024)
            print(f"Already exists ({file_size:.2f} MB)")
            print("Skipping download")
            total_skipped += 1
            continue

        # Dataset is missing, attempt download
        print("Missing - attempting download...")

        try:
            # Create parent directory if needed
            path.parent.mkdir(parents=True, exist_ok=True)

            if dataset_info["source"] == "kaggle":
                # Download from Kaggle using kagglehub
                try:
                    print(f"  Source: Kaggle ({dataset_info['kaggle_dataset']})")

                    download_from_kaggle(
                        dataset_info['kaggle_dataset'],
                        dataset_info['kaggle_filename'],
                        path
                    )

                    file_size = path.stat().st_size / (1024 * 1024)
                    print(f"Successfully downloaded ({file_size:.2f} MB)")
                    total_downloaded += 1

                except ImportError:
                    # kagglehub is imported lazily inside download_from_kaggle,
                    # so a missing package surfaces here with an install hint.
                    print("Failed: kagglehub not installed")
                    print("Install with: pip install kagglehub")
                    total_failed += 1

            elif dataset_info["source"] == "url":
                # Download from URL
                url = dataset_info["url"]
                decompress = dataset_info.get("decompress", False)

                if decompress and url.endswith('.gz'):
                    # Download compressed file first, next to the final path
                    temp_gz = path.with_suffix(path.suffix + '.gz')

                    download_file_with_progress(url, temp_gz)

                    # Verify the downloaded file exists and has content
                    if not temp_gz.exists() or temp_gz.stat().st_size == 0:
                        raise Exception(f"Downloaded file is empty or missing: {temp_gz}")

                    # Decompress the file in a streaming fashion
                    print(f"  Extracting {temp_gz.name} -> {path.name}...")
                    with gzip.open(temp_gz, 'rb') as f_in, open(path, 'wb') as f_out:
                        shutil.copyfileobj(f_in, f_out)

                    # Remove temporary compressed file
                    temp_gz.unlink()
                    print(f"  Removed temporary file: {temp_gz.name}")
                else:
                    # Direct download
                    download_file_with_progress(url, path)

                file_size = path.stat().st_size / (1024 * 1024)
                print(f"Successfully downloaded ({file_size:.2f} MB)")
                total_downloaded += 1

            else:
                # Count a misconfigured entry as failed instead of silently
                # leaving it out of every counter.
                raise ValueError(f"Unknown dataset source: {dataset_info['source']!r}")

        except Exception as exc:
            # Top-level boundary for one dataset: report it, clean up, and
            # carry on with the remaining datasets.
            print(f"Failed: {type(exc).__name__}: {exc}")
            # Clean up any partial downloads
            if path.exists():
                path.unlink()
            temp_gz = path.with_suffix(path.suffix + '.gz')
            if temp_gz.exists():
                temp_gz.unlink()
            total_failed += 1

    # Final summary
    print(f"\n\nTotal datasets checked: {len(datasets_to_check)}")
    print(f"Already present:        {total_skipped}")
    print(f"Downloaded:             {total_downloaded}")
    print(f"Failed:                 {total_failed}")

    if total_failed == 0 and total_skipped + total_downloaded == len(datasets_to_check):
        print(f"\nAll {len(datasets_to_check)} datasets are now available!")
    elif total_failed > 0:
        print(f"\n Warning: {total_failed} dataset(s) failed to download")

    return {
        "total": len(datasets_to_check),
        "skipped": total_skipped,
        "downloaded": total_downloaded,
        "failed": total_failed
    }

# Run the pipeline: download whichever datasets are missing and keep the
# summary counts ("total", "skipped", "downloaded", "failed") in `result`.
result = download_all_datasets()

Dataset downloading started.
Creating Datasets folder: Datasets
Creating IMDB folder: Datasets/IMDB

Checking: actorfilms.csv
Missing - attempting download...
  Source: Kaggle (darinhawley/imdb-films-by-actor-for-10k-actors)
  Downloading dataset: darinhawley/imdb-films-by-actor-for-10k-actors
  Downloaded to cache: /Users/kostyalbalint/.cache/kagglehub/datasets/darinhawley/imdb-films-by-actor-for-10k-actors/versions/1
  Found 1 CSV file(s) in dataset
  Copying actorfilms.csv -> Datasets/IMDB/actorfilms.csv
Successfully downloaded (11.96 MB)
Checking: Celebrity.csv
Missing - attempting download...
  Source: Kaggle (madhuripanchakshri/top-10000-celebrities-dataset)
  Downloading dataset: madhuripanchakshri/top-10000-celebrities-dataset
  Downloaded to cache: /Users/kostyalbalint/.cache/kagglehub/datasets/madhuripanchakshri/top-10000-celebrities-dataset/versions/1
  Found 1 CSV file(s) in dataset
  Copying Celebrity.csv -> Datasets/Celebrity.csv
Successfully downloaded (0.59 MB)
Checking