# Ingest Kaggle
Example of ingesting from Kaggle.com.  This ingestion has a couple of unique factors:
- It uses a client library that wraps the REST calls in an easier-to-use API.
- It typically provides the data in zipped CSV format.  At a minimum, we want to unzip the data before saving it.

For documentation on the Kaggle API, go to https://www.kaggle.com/docs/api.  This page includes the following important authentication information.  You will need the downloaded file and will use it below.

"In order to use the Kaggle’s public API, you must first authenticate using an API token. Go to the 'Account' tab of your user profile and select 'Create New Token'. This will trigger the download of kaggle.json, a file containing your API credentials."

In [0]:
# Inputs for the spark.conf.set() call below, named individually for clarity.
my_scope = "Fall2025SecretScope"  # Databricks secret scope.
my_key = "assign1store"  # Key vault secret containing the storage account access key.
storage_end_point = "assign1store.dfs.core.windows.net"  # Storage account endpoint.
container_name = "misc"  # Container name.

# Hand Spark the storage account access key, read from the secret scope above.
spark.conf.set(
    f"fs.azure.account.key.{storage_end_point}",
    dbutils.secrets.get(scope=my_scope, key=my_key),
)

# Base URI for the code below: the container name followed by the storage end point.
uri = f"abfss://{container_name}@{storage_end_point}/"
print(uri)


In [0]:
# The Kaggle library is not installed on the Databricks cluster by default, so
# we install it here.
%pip install kaggle

In [0]:
# When your register with Kaggle, a JSON file containing your account key is provided.  
# Kaggle expects this to be in a well known location in the file system.
# On a PC, this is in c:\users\<YourUserName>\.kaggle\kaggle.json
# For Databricks, we'll put it in the following location.

# Create the directory for Kaggle JSON
dbutils.fs.mkdirs("file:/root/.kaggle")

# Copy the Kaggle JSON file from Azure storage to the expected location
source_location = uri + "AccessInfo/kaggle.json"
dbutils.fs.cp(source_location, "file:/root/.kaggle/kaggle.json")

# Set the access to kaggle.json to 600 (owner: read/write, group: none, other: none)
!chmod 600 /root/.kaggle/kaggle.json


In [0]:
from kaggle.api.kaggle_api_extended import KaggleApi

# Create the Kaggle client and authenticate.  Authentication defaults to the
# config file in the predefined location.
api = KaggleApi()
api.authenticate()

# Fetch and print the competitions list as a quick check that the API works.
competitions = api.competitions_list()
print(competitions)

In [0]:
# Kaggle dataset to fetch and the Azure storage folder that should receive it.
dataset = 'uciml/iris'
out_path = uri + "Bronze/Kaggle/Zip"

# The Kaggle client writes through the driver's local file system, so it cannot
# write to an abfss:// URI directly.  Download to a local folder first.
local_path = '/tmp/kaggle/iris'

# Download a single file from the dataset.
api.dataset_download_file(dataset, 'Iris.csv', path=local_path)

# Download every file in the dataset (left zipped, since unzip is not requested).
api.dataset_download_files(dataset, path=local_path)

# Copy everything that was downloaded into Azure storage.
dbutils.fs.cp("file:" + local_path, out_path, recurse=True)


In [0]:
from kaggle.api.kaggle_api_extended import KaggleApi

# Create an authenticated Kaggle API client.
api = KaggleApi()
api.authenticate()

# Dataset to fetch and the folder the downloaded files are written to.
dataset = 'uciml/iris'
out_path = '/dbfs/mnt/assign1store/misc/Bronze/Kaggle/Zip'

# Fetch the dataset files into out_path, unzipping them as part of the download.
api.dataset_download_files(dataset, path=out_path, unzip=True)

# Recursively copy the downloaded folder to Bronze/Kaggle/Iris in Azure storage.
dbutils.fs.cp(f"file:{out_path}", f"{uri}Bronze/Kaggle/Iris", recurse=True)

In [0]:
# Download the data from Kaggle, unzipping it so the folder holds CSV data
# rather than a zip archive that Spark cannot parse as CSV.
api.dataset_download_files(dataset, path=out_path, unzip=True)

# out_path is a driver-local /dbfs/... path, but Spark addresses the same
# location as dbfs:/...; translate it before handing it to the reader.
spark_path = out_path.replace("/dbfs/", "dbfs:/", 1) if out_path.startswith("/dbfs/") else out_path

# Read only Iris.csv into a dataframe so that any other files in the download
# folder are not parsed as CSV.
price_df = spark.read.csv(spark_path + "/Iris.csv", header=True)
display(price_df)

In [0]:
from pyspark.sql.types import DoubleType, DateType

# Convert columns to appropriate types.  Strings are fine for every column
# except "price" (cast to double) and "period" (cast to date).
# NOTE(review): these column names look carried over from a different dataset;
# confirm they exist in the dataframe loaded above.
price_types_df = (
    price_df
    .withColumn("price", price_df["price"].cast(DoubleType()))
    .withColumn("period", price_df["period"].cast(DateType()))
)

display(price_types_df)


In [0]:
import json

# Parse the result and convert the monthly price list to a dataframe.
# NOTE(review): api_response is not defined in any cell visible here; confirm
# where it comes from before running this cell.
json_data = json.loads(api_response.content)

# The parsed body is read as {'response': {'total': ..., 'data': [...]}}.
response_json = json_data['response']  
total_data_points = response_json['total']
# Rebinds price_df, replacing any dataframe previously held under that name.
price_df = spark.createDataFrame(response_json['data'])

print(total_data_points)
display(price_df)



In [0]:
# Persist the typed dataframe in delta format, overwriting any existing data.
# NOTE(review): the target folder is Bronze/EIA; confirm this is the intended
# destination for this notebook's data.
(
    price_types_df.write
    .format("delta")
    .mode("overwrite")
    .save(f"{uri}Bronze/EIA")
)

In [0]:
# Load the delta data that was just saved to check the round trip.
price_read_df = (
    spark.read
    .format("delta")
    .load(f"{uri}Bronze/EIA")
)

display(price_read_df)