# Install Requirements

In [None]:
!pip install kaggle selenium webdriver_manager

Collecting selenium
  Downloading selenium-4.33.0-py3-none-any.whl.metadata (7.5 kB)
Collecting webdriver_manager
  Downloading webdriver_manager-4.0.2-py2.py3-none-any.whl.metadata (12 kB)
Collecting trio~=0.30.0 (from selenium)
  Downloading trio-0.30.0-py3-none-any.whl.metadata (8.5 kB)
Collecting trio-websocket~=0.12.2 (from selenium)
  Downloading trio_websocket-0.12.2-py3-none-any.whl.metadata (5.1 kB)
Collecting typing_extensions~=4.13.2 (from selenium)
  Downloading typing_extensions-4.13.2-py3-none-any.whl.metadata (3.0 kB)
Collecting python-dotenv (from webdriver_manager)
  Downloading python_dotenv-1.1.0-py3-none-any.whl.metadata (24 kB)
Collecting outcome (from trio~=0.30.0->selenium)
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl.metadata (2.6 kB)
Collecting wsproto>=0.14 (from trio-websocket~=0.12.2->selenium)
  Downloading wsproto-1.2.0-py3-none-any.whl.metadata (5.6 kB)
Downloading selenium-4.33.0-py3-none-any.whl (9.4 MB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ

# Set up the Kaggle API and fetch datasets

In [None]:
# Step 1: Create the correct folder
!mkdir -p /root/.config/kaggle

# Step 2: Move kaggle.json to that location
!mv kaggle.json /root/.config/kaggle/

# Step 3: Set correct permissions
!chmod 600 /root/.config/kaggle/kaggle.json

In [None]:
import time
from kaggle.api.kaggle_api_extended import KaggleApi

# Open an authenticated Kaggle API session.
api = KaggleApi()
api.authenticate()

N = 100  # desired number of datasets
all_datasets = []
page = 1

# Pull the listing one page at a time. Stop as soon as N entries are in hand,
# a page comes back empty, or a request raises.
while True:
    if len(all_datasets) >= N:
        break
    try:
        datasets = api.dataset_list(page=page)
        if not datasets:
            break
        all_datasets.extend(datasets)
        print(f"Fetched page {page}, total datasets collected: {len(all_datasets)}")
        page += 1
        time.sleep(2)  # wait 2 seconds between requests to avoid rate limiting
    except Exception as e:
        print(f"Error fetching page {page}: {e}")
        break

# The last page may overshoot the target; drop everything past the first N.
del all_datasets[N:]

print(f"\nTotal datasets fetched: {len(all_datasets)}\n")

# Numbered listing: title followed by the public dataset URL.
for position, entry in enumerate(all_datasets, 1):
    print(f"{position}. {entry.title} - https://www.kaggle.com/datasets/{entry.ref}")


Fetched page 1, total datasets collected: 20
Fetched page 2, total datasets collected: 40
Fetched page 3, total datasets collected: 60
Fetched page 4, total datasets collected: 80
Fetched page 5, total datasets collected: 99
Fetched page 6, total datasets collected: 117

Total datasets fetched: 100

1. Extrovert vs. Introvert Behavior Data - https://www.kaggle.com/datasets/rakeshkapilavai/extrovert-vs-introvert-behavior-data
2. Students' Social Media Addiction - https://www.kaggle.com/datasets/adilshamim8/social-media-addiction-vs-relationships
3. Global AI Job Market & Salary Trends 2025 - https://www.kaggle.com/datasets/bismasajjad/global-ai-job-market-and-salary-trends-2025
4. üåÆ Taco Sales Dataset (2024‚Äì2025) - https://www.kaggle.com/datasets/atharvasoundankar/taco-sales-dataset-20242025
5. Video Game Sales - https://www.kaggle.com/datasets/siddharth0935/video-game-sales
6. Student Habits vs Academic Performance - https://www.kaggle.com/datasets/jayaantanaath/student-habits-vs-

# Build a metadata dataset from Kaggle dataset pages

In [None]:
import time
from kaggle.api.kaggle_api_extended import KaggleApi
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import requests
import pandas as pd

# ---------------------------------------------------------------------------
# Settings (everything a reader might want to tune lives here)
# ---------------------------------------------------------------------------
N = 2400  # Desired number of datasets
API_PAGE_DELAY_S = 2  # Pause between Kaggle API pages to avoid rate limiting
REQUEST_TIMEOUT_S = 30  # Abort the plain HTTP fetch instead of hanging forever
PAGE_LOAD_WAIT_S = 12  # Fixed wait for the JavaScript-rendered dataset page
LAZY_LOAD_WAIT_S = 2  # Wait after scrolling or expanding content
TARGET_SECTION = "about dataset"  # Lower-case heading text that marks the description
OUTPUT_CSV = "kaggle_datasets_metadata.csv"
MISSING = "no"  # Sentinel stored when a field could not be extracted
CSV_COLUMNS = ["dataset_name", "dataset_url", "header", "about_dataset", "meta_description"]


def _preview(text, limit=100):
    """Return `text` cut to `limit` characters (with a trailing "...") for log output."""
    return text[:limit] + "..." if len(text) > limit else text


def _fetch_meta_description(url):
    """Return the page's <meta name="description"> content, or MISSING.

    Uses a plain HTTP request (no browser). Commas are replaced by spaces, as
    for every other text field in this notebook. Any failure (network error,
    timeout, non-200 status, missing tag) yields MISSING rather than raising.
    """
    try:
        # The timeout keeps one stalled connection from blocking the whole run.
        response = requests.get(url, timeout=REQUEST_TIMEOUT_S)
        if response.status_code != 200:
            return MISSING
        meta_tag = BeautifulSoup(response.text, "html.parser").find(
            "meta", attrs={"name": "description"}
        )
        content = meta_tag.get("content") if meta_tag is not None else None
        return content.replace(",", " ") if content else MISSING
    except Exception as e:
        print(f"Error fetching meta description: {e}")
        return MISSING


def _extract_about_dataset(soup):
    """Return the text of the "About Dataset" section in `soup`, or MISSING.

    The section is located by the first h1/h2/h3/div whose (short) text
    contains TARGET_SECTION. Its text is the <p> elements of the enclosing
    <div>, or, when there is no enclosing <div>, the <p> elements that follow
    the heading up to the next h1/h2/h3.
    """
    about_section = None
    for candidate in soup.find_all(["h1", "h2", "h3", "div"]):
        candidate_text = candidate.get_text(strip=True).lower()
        # The length cap skips large container divs that merely include the heading.
        if TARGET_SECTION in candidate_text and len(candidate_text) < 30:
            about_section = candidate
            break

    if about_section is None:
        return MISSING

    print(f"{TARGET_SECTION.title()} section found with tag:", about_section.name)
    parent = about_section.find_parent("div")
    if parent is not None:
        texts = (p.get_text(strip=True) for p in parent.find_all("p"))
        pieces = [text.replace(",", " ") for text in texts if text]
    else:
        pieces = []
        for node in about_section.find_all_next():
            if node.name in ("h1", "h2", "h3"):
                break
            if node.name == "p":
                pieces.append(node.get_text(strip=True).replace(",", " "))
    return " ".join(pieces).strip() or MISSING


def _scrape_dataset_page(driver, url):
    """Load `url` in the Selenium `driver` and return (headers, about_dataset).

    Both values fall back to MISSING when they cannot be extracted, including
    when an error occurs part-way through scraping; errors are logged, not raised.
    """
    headers = MISSING
    about_dataset = MISSING
    try:
        driver.get(url)
        time.sleep(PAGE_LOAD_WAIT_S)

        # Scroll to the bottom to load lazy content
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(LAZY_LOAD_WAIT_S)

        # Click "View more" if present
        try:
            view_more = driver.find_element(
                By.XPATH,
                "//button[contains(text(), 'View more')] | //a[contains(text(), 'View more')]",
            )
            view_more.click()
            time.sleep(LAZY_LOAD_WAIT_S)
        except Exception:
            # `except Exception` (not a bare except) so Ctrl-C / kernel interrupt
            # still stops the run.
            print("No 'View more' button found")

        soup = BeautifulSoup(driver.page_source, "html.parser")

        header_elements = soup.find_all(["h1", "h2", "h3"])
        if header_elements:
            joined = " ".join(h.get_text(strip=True).replace(",", " ") for h in header_elements)
            headers = joined or MISSING
        print("Headers found:", _preview(headers))

        about_dataset = _extract_about_dataset(soup)
        print(f"{TARGET_SECTION.title()}:", _preview(about_dataset))
    except Exception as e:
        print(f"Error scraping {url}: {e}")
    return headers, about_dataset


# ---------------------------------------------------------------------------
# Step A: collect dataset names and URLs from the Kaggle API
# ---------------------------------------------------------------------------
api = KaggleApi()
api.authenticate()

print("Fetching datasets from Kaggle API...")
all_datasets = []
page = 1

while len(all_datasets) < N:
    try:
        dataset_list = api.dataset_list(page=page)
        if not dataset_list:
            print("No more datasets available.")
            break
        all_datasets.extend(dataset_list)
        print(f"Fetched page {page}, total datasets collected: {len(all_datasets)}")
        page += 1
        time.sleep(API_PAGE_DELAY_S)
    except Exception as e:
        print(f"Error fetching page {page}: {e}")
        break

all_datasets = all_datasets[:N]  # The last page may overshoot; keep at most N

# Convert API datasets to the required format
datasets = [
    {
        "dataset_name": str(dataset.title).replace(",", " "),
        "dataset_url": f"https://www.kaggle.com/datasets/{dataset.ref}",
    }
    for dataset in all_datasets
]

print(f"\nTotal datasets fetched: {len(datasets)}\n")
for i, dataset in enumerate(datasets, start=1):
    print(f"{i}. {dataset['dataset_name']} - {dataset['dataset_url']}")

# ---------------------------------------------------------------------------
# Step B: scrape each dataset page
# ---------------------------------------------------------------------------
# Set up Selenium for dynamic content
options = Options()
options.add_argument("--headless")
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")
driver = webdriver.Chrome(options=options)

# One record per processed dataset
metadata_list = []

try:
    for dataset in datasets:
        dataset_name = dataset["dataset_name"]
        url = dataset["dataset_url"]
        print(f"\nProcessing: {dataset_name} ({url})")

        # Step 1: Get Meta Description using requests
        meta_description = _fetch_meta_description(url)

        # Step 2: Scrape page with Selenium
        headers, about_dataset = _scrape_dataset_page(driver, url)

        # Store metadata. meta_description was previously fetched but dropped;
        # it is appended as the last column so existing columns keep their order.
        metadata_list.append({
            "dataset_name": dataset_name,
            "dataset_url": url,
            "header": headers,
            "about_dataset": about_dataset,
            "meta_description": meta_description,
        })

        # Progress tracking after each dataset
        print(f"Progress: {len(metadata_list)} out of {len(datasets)} datasets processed")

finally:
    # Save whatever has been collected even if the (hours-long) loop was
    # interrupted, then always release the browser.
    try:
        df = pd.DataFrame(metadata_list, columns=CSV_COLUMNS)
        df.to_csv(OUTPUT_CSV, index=False)
        print(f"\nMetadata saved to {OUTPUT_CSV}")
    finally:
        driver.quit()

[1;30;43mStreaming output truncated to the last 5000 lines.[0m

Processing: ALERT - Linguistic Extremism in Religious Text (https://www.kaggle.com/datasets/orvile/alert-linguistic-extremism-in-religious-text)
No 'View more' button found
Headers found: ALERT - Linguistic Extremism in Religious Text ALERT - Linguistic Extremism in Religious Text About ...
About Dataset section found with tag: div
About Dataset: The widespread dissemination of religiously aggressive content on social media platforms poses signi...
Progress: 1687 out of 2400 datasets processed

Processing: Road Sign Detection (https://www.kaggle.com/datasets/andrewmvd/road-sign-detection)
No 'View more' button found
Headers found: Road Sign Detection Road Sign Detection About Dataset About this Dataset How to Cite this Dataset Bi...
About Dataset section found with tag: div
About Dataset: This dataset contains877images of4 distinct classesfor the objective ofroad sign detection.Bounding ...
Progress: 1688 out of 2400 dat

In [None]:
from google.colab import files

# Download the CSV file: hand kaggle_datasets_metadata.csv (the file written by
# the scraping cell above) to Colab's `files.download` helper.
# NOTE(review): presumably this triggers a browser download of the file — confirm.
files.download("kaggle_datasets_metadata.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>