## Download datasets to analyze
Downloading the Kaggle-hosted datasets requires kagglehub. Install it with:
```bash
pip install "kagglehub[pandas-datasets]"
```

In [None]:
import shutil
import gzip
import urllib


def _size_mb(path):
    """Return the size of the file at *path* in mebibytes."""
    return path.stat().st_size / (1024 * 1024)


def _remove_if_present(path):
    """Delete *path* if it exists; do nothing otherwise."""
    if path.exists():
        path.unlink()


def _default_datasets():
    """Build the default dataset list from the notebook-level directories."""
    return [
        {
            "name": "actorfilms.csv",
            "path": imdb_dir / "actorfilms.csv",
            "source": "kaggle",
            "kaggle_dataset": "darinhawley/imdb-films-by-actor-for-10k-actors",
        },
        {
            "name": "Celebrity.csv",
            "path": datasets_dir / "Celebrity.csv",
            "source": "kaggle",
            "kaggle_dataset": "madhuripanchakshri/top-10000-celebrities-dataset",
        },
        {
            "name": "IMDb movies.csv",
            "path": datasets_dir / "IMDb movies.csv",
            "source": "kaggle",
            "kaggle_dataset": "simhyunsu/imdbextensivedataset",
        },
        {
            "name": "title.crew.tsv",
            "path": imdb_dir / "title.crew.tsv",
            "source": "url",
            "url": "https://datasets.imdbws.com/title.crew.tsv.gz",
            "decompress": True,
        },
        {
            "name": "name.basics.tsv",
            "path": imdb_dir / "name.basics.tsv",
            "source": "url",
            "url": "https://datasets.imdbws.com/name.basics.tsv.gz",
            "decompress": True,
        },
    ]


def _download_from_kaggle(dataset_info, path):
    """Load one file of a Kaggle dataset with kagglehub and save it as CSV.

    Raises ImportError when kagglehub (or its pandas adapter) is unavailable.
    """
    import kagglehub
    from kagglehub import KaggleDatasetAdapter

    handle = dataset_info["kaggle_dataset"]
    print(f" Source: Kaggle ({handle})")

    # kagglehub needs the name of a file inside the dataset; an empty string
    # is only the placeholder from its template. Default to the dataset's
    # own file name, overridable through an optional "kaggle_file" key.
    file_in_dataset = dataset_info.get("kaggle_file", dataset_info["name"])

    # Prefer the current entry point; fall back to the older name for
    # kagglehub releases that predate it.
    load = getattr(kagglehub, "dataset_load", None) or kagglehub.load_dataset
    df = load(KaggleDatasetAdapter.PANDAS, handle, file_in_dataset)

    # Write to a temporary sibling and rename, so an interrupted write never
    # leaves a truncated file that a later run would treat as "already exists".
    partial = path.with_name(path.name + ".part")
    try:
        # NOTE(review): index=True also writes the DataFrame index as an
        # extra leading column; kept as in the original - confirm that
        # downstream readers expect it.
        df.to_csv(partial, index=True)
        partial.replace(path)
    finally:
        _remove_if_present(partial)


def _download_from_url(dataset_info, path):
    """Download a file from a URL, gunzipping it when requested."""
    # `import urllib` alone does not make the `urllib.request` submodule
    # available, so import it explicitly here.
    import urllib.request

    url = dataset_info["url"]
    decompress = dataset_info.get("decompress", False)

    print(f"  Source: {url}")

    partial = path.with_name(path.name + ".part")
    temp_files = [partial]
    try:
        if decompress and url.endswith(".gz"):
            # Download the archive next to the target, then decompress it.
            temp_gz = path.with_name(path.name + ".gz")
            temp_files.append(temp_gz)
            urllib.request.urlretrieve(url, temp_gz)
            with gzip.open(temp_gz, "rb") as f_in, open(partial, "wb") as f_out:
                shutil.copyfileobj(f_in, f_out)
        else:
            # Direct download
            urllib.request.urlretrieve(url, partial)
        # Only a complete file is ever moved to the final location.
        partial.replace(path)
    finally:
        for leftover in temp_files:
            _remove_if_present(leftover)


def download_all_datasets(datasets=None):
    """Ensure every required dataset file is present, downloading missing ones.

    Files that already exist are skipped. Missing files are fetched either
    from Kaggle (via kagglehub) or from a plain URL, depending on the
    entry's "source" key. Progress and a final summary are printed.

    Args:
        datasets: Optional list of dataset descriptions. Each entry is a dict
            with "name", "path" (a pathlib.Path) and "source" ("kaggle" or
            "url"), plus "kaggle_dataset" (and optionally "kaggle_file") for
            Kaggle entries, or "url" (and optionally "decompress") for URL
            entries. Defaults to the built-in list stored under the
            notebook-level ``datasets_dir`` / ``imdb_dir``.

    Returns:
        A dict with the counts "total", "skipped", "downloaded" and "failed".
    """
    print("Dataset downloading started.")

    if not datasets_dir.exists():
        print(f"Creating Datasets folder: {datasets_dir}")
        datasets_dir.mkdir(parents=True, exist_ok=True)
    else:
        print(f"Datasets folder exists: {datasets_dir}")

    if not imdb_dir.exists():
        print(f"Creating IMDB folder: {imdb_dir}")
        imdb_dir.mkdir(parents=True, exist_ok=True)
    else:
        print(f"IMDB folder exists: {imdb_dir}")

    datasets_to_check = _default_datasets() if datasets is None else list(datasets)

    total_downloaded = 0
    total_skipped = 0
    total_failed = 0

    # Check and download each dataset
    for dataset_info in datasets_to_check:
        name = dataset_info["name"]
        path = dataset_info["path"]

        print(f"Checking: {name}")

        # Check if dataset already exists
        if path.exists():
            print(f"Already exists ({_size_mb(path):.2f} MB)")
            print("Skipping download")
            total_skipped += 1
            continue

        # Dataset is missing, attempt download
        print("Missing - attempting download...")

        try:
            # Create parent directory if needed
            path.parent.mkdir(parents=True, exist_ok=True)

            source = dataset_info["source"]
            if source == "kaggle":
                try:
                    _download_from_kaggle(dataset_info, path)
                except ImportError:
                    print("Failed: kagglehub not installed")
                    print("Install with: pip install kagglehub[pandas-datasets]")
                    total_failed += 1
                    continue
            elif source == "url":
                _download_from_url(dataset_info, path)
            else:
                # Previously an unknown source was silently ignored and
                # counted as neither downloaded nor failed.
                raise ValueError(f"unknown source {source!r}")

            print(f"Successfully downloaded ({_size_mb(path):.2f} MB)")
            total_downloaded += 1

        except Exception as e:
            # Best-effort pipeline: report the failure and continue with
            # the remaining datasets.
            print(f"Failed: {e}")
            total_failed += 1

    # Final summary
    print(f"\n\nTotal datasets checked: {len(datasets_to_check)}")
    print(f"Already present:        {total_skipped}")
    print(f"Downloaded:             {total_downloaded}")
    print(f"Failed:                 {total_failed}")

    if total_failed == 0 and total_skipped + total_downloaded == len(datasets_to_check):
        print(f"\nAll {len(datasets_to_check)} datasets are now available!")
    elif total_failed > 0:
        print(f"\n Warning: {total_failed} dataset(s) failed to download")

    return {
        "total": len(datasets_to_check),
        "skipped": total_skipped,
        "downloaded": total_downloaded,
        "failed": total_failed,
    }

# Run the pipeline when this cell executes. `result` holds the summary dict
# returned by download_all_datasets(), with the keys "total", "skipped",
# "downloaded" and "failed".
result = download_all_datasets()