# Development and testing ideas

by Edgar Bermudez

June, 2025.

In [1]:
import os
from dotenv import load_dotenv

# Load variables from a local .env file (if any) into the process environment.
load_dotenv()

KAGGLE_API_KEY = os.getenv("KAGGLE_API_KEY")
KAGGLE_USERNAME = os.getenv("KAGGLE_USERNAME")

# Report exactly which variable(s) are missing or empty so the error is actionable.
missing_vars = [
    name
    for name, value in (
        ("KAGGLE_API_KEY", KAGGLE_API_KEY),
        ("KAGGLE_USERNAME", KAGGLE_USERNAME),
    )
    if not value
]
if missing_vars:
    raise ValueError(f"Missing environment variable(s): {', '.join(missing_vars)}")

# The Kaggle client documents KAGGLE_USERNAME / KAGGLE_KEY as its credential
# variables; KAGGLE_API_KEY is this project's own name. Mirror the validated key
# so the value checked above is the one the client sees. setdefault() keeps an
# explicitly provided KAGGLE_KEY untouched.
os.environ.setdefault("KAGGLE_KEY", KAGGLE_API_KEY)



In [2]:
from kaggle.api.kaggle_api_extended import KaggleApi

# Create the Kaggle API client and authenticate it; `api` stays available to later cells.
# NOTE(review): authenticate() presumably reads credentials from the environment or a
# kaggle.json file — confirm against the Kaggle API documentation.
api = KaggleApi()
api.authenticate()


In [6]:
from kaggle.api.kaggle_api_extended import KaggleApi


def download_dataset(dataset_path, output_path):
    """
    Download a Kaggle dataset and extract it into a local directory.

    Args:
        dataset_path (str): Identifier handed to the Kaggle API as `dataset`
            (expected form: owner/dataset-name).
        output_path (str): Directory that receives the files; it is created
            when it does not exist yet.

    Returns:
        None
    """
    # Make sure the destination exists before anything is fetched
    os.makedirs(output_path, exist_ok=True)

    # A fresh client is built and authenticated on every call
    client = KaggleApi()
    client.authenticate()

    print(f"Downloading dataset: {dataset_path}")
    download_options = {
        "dataset": dataset_path,  # Full path: owner/dataset-name
        "path": output_path,
        "unzip": True,
    }
    client.dataset_download_files(**download_options)
    print(f"Dataset downloaded and extracted to: {output_path}")


In [7]:
# test the function

# dataset example and path
# NOTE(review): the recorded run of download_dataset with "titanic/titanic" ends in
# a 403 Forbidden, while the same data downloads fine as the "titanic" competition
# further down. This identifier presumably does not name a regular dataset —
# confirm and swap in a valid owner/dataset-name.
dataset_name = "titanic/titanic"
download_path = "data/raw"


In [8]:
# Test with Titanic dataset, reusing the example identifier and destination
# defined in the previous cell instead of repeating the literals here.
download_dataset(dataset_path=dataset_name, output_path=download_path)

Downloading dataset: titanic/titanic
Dataset URL: https://www.kaggle.com/datasets/titanic/titanic


HTTPError: 403 Client Error: Forbidden for url: https://www.kaggle.com/api/v1/datasets/download/titanic/titanic?raw=false

In [15]:
from dotenv import load_dotenv
import os
from pathlib import Path
load_dotenv()

from kaggle.api.kaggle_api_extended import KaggleApi

def download_competition_dataset(competition_name, output_path, repo_root=None):
    """
    Download the data archive of a Kaggle competition.

    The archive is only downloaded; this function does not extract it.

    Args:
        competition_name (str): The name of the competition.
        output_path (str): Output directory. It is joined onto `repo_root`, so a
            relative path lands inside the repository; an absolute path is used
            unchanged.
        repo_root (str or Path, optional): Directory that `output_path` is
            resolved against. Defaults to the parent of the current working
            directory, which is the repository root when the notebook runs from
            a sub-directory such as `notebooks/`.

    Returns:
        None
    """

    # Default root: parent of the working directory (parent of the notebooks directory)
    if repo_root is None:
        repo_root = Path(os.getcwd()).parent

    # Build the final destination; os.path.join keeps an absolute output_path as-is
    output_path = os.path.join(repo_root, output_path)

    # Create the output directory if it doesn't exist
    os.makedirs(output_path, exist_ok=True)

    # Initialize and authenticate the API
    api = KaggleApi()
    api.authenticate()

    print(f"Downloading competition dataset: {competition_name}")
    api.competition_download_files(
        competition=competition_name,
        path=output_path,
        quiet=False
    )
    print(f"Dataset downloaded to: {output_path}")


In [16]:
# Test with Titanic competition
# output_path is relative: download_competition_dataset joins it onto the parent
# of the current working directory, not onto the working directory itself.
download_competition_dataset(competition_name="titanic", output_path="data/raw")


Downloading competition dataset: titanic
Downloading titanic.zip to /home/edgar/projects/kaggle_explorer/data/raw


100%|██████████| 34.1k/34.1k [00:00<00:00, 24.4MB/s]


Dataset downloaded to: /home/edgar/projects/kaggle_explorer/data/raw





In [17]:
# unzip the dataset and explore the files

from pathlib import Path
import zipfile

# parameterize the dataset paths
# download_competition_dataset() stores the archive under <repo root>/data/raw,
# where the repo root is the parent of the working directory. Resolve against the
# same root here; a path relative to the working directory would point somewhere
# else and fail on a fresh run.
raw_data_dir = Path.cwd().parent / 'data' / 'raw'
dataset_zip_path = raw_data_dir / 'titanic.zip'
extraction_dir = raw_data_dir / 'titanic'

# unzip the dataset (extractall creates the target directory when needed)
with zipfile.ZipFile(dataset_zip_path, 'r') as zip_ref:
    zip_ref.extractall(extraction_dir)

# explore the files; sorted so the listing is stable across file systems
for file in sorted(extraction_dir.iterdir()):
    print(file.name)

gender_submission.csv
test.csv
train.csv


In [None]:
# find out if the dataset is already divided into train and test sets

import os
import pandas as pd
from sklearn.model_selection import train_test_split

# get the dataset path
# The competition archive is downloaded under <repo root>/data/raw, with the repo
# root taken as the parent of the working directory, so the extracted dataset is
# looked up under that same root rather than relative to the working directory.
dataset_path = os.path.join(os.path.dirname(os.getcwd()), "data", "raw", "titanic")

# get the files in the dataset path
files = os.listdir(dataset_path)

# the dataset counts as pre-divided only when both split files are present
divided = "train.csv" in files and "test.csv" in files

if divided:
    print("The dataset is already divided into train and test sets")

    train_file = os.path.join(dataset_path, "train.csv")
    test_file = os.path.join(dataset_path, "test.csv")

    # read the train and test files
    train_df = pd.read_csv(train_file)
    test_df = pd.read_csv(test_file)

else:
    print("The dataset is not already divided into train and test sets")

    # Choose the file to split: train.csv when it exists, otherwise the only CSV
    # in the directory. Anything else is ambiguous, so fail with a clear message
    # instead of guessing (or blindly reading a train.csv that may not exist).
    csv_files = sorted(name for name in files if name.lower().endswith(".csv"))
    if "train.csv" in csv_files:
        source_file = "train.csv"
    elif len(csv_files) == 1:
        source_file = csv_files[0]
    else:
        raise FileNotFoundError(
            f"Cannot pick a single CSV to split in {dataset_path}: found {csv_files}"
        )

    # read the dataset
    df = pd.read_csv(os.path.join(dataset_path, source_file))

    # split the dataset into train and test (fixed seed for reproducibility)
    train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

print(train_df.head())
print(test_df.head())