# Exploration notebook
This notebook is used for ad-hoc exploration of data

## Setup
Run the cells in this section to set up your environment.

In [12]:
# Set up module autoreload so that edits to imported modules are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
# Load environment variables using dotenv.
# NOTE(review): presumably these variables carry the Databricks connection settings
# needed by the Spark session created later in this section — confirm.

from dotenv import load_dotenv

# No explicit path is passed, so the library's default .env lookup is used.
load_dotenv()

In [None]:
# Create a Spark session for the Databricks compute environment
from pyspark.sql import SparkSession
from ncaa_tournament_predictor.databricks import get_databricks_spark_session

# The session is explicitly annotated as SparkSession to help editor IntelliSense;
# DatabricksSession IntelliSense isn't very good. In the author's exploration so far,
# DatabricksSession has been compatible with SparkSession.
# NOTE(review): presumably get_databricks_spark_session returns a DatabricksSession — confirm.
spark: SparkSession = get_databricks_spark_session()

In [None]:
# Run all cells above this one to set up your environment

## Exploration
This section contains various exploratory cells for getting data, transforming data, etc.

### Proof-of-connectivity

In [None]:
# Proof-of-connectivity: query an existing Databricks table through Spark SQL
# and print a handful of rows.
hurricane_data_snippet = spark.sql(
    "select * from object_computing.parametric_insurance.hurricane_data limit 10;"
)
hurricane_data_snippet.show(n=5)

In [None]:
# Same connectivity check, this time through the DataFrame reader API rather than SQL.
hurricane_data_snippet_read = spark.read.table(
    "object_computing.parametric_insurance.hurricane_data"
)
hurricane_data_snippet_read.show(n=5)

### Men's college basketball schema setup
Set up the schema and objects where the NCAA men's college basketball data will land.

In [None]:
# Ensure the ncaa_mens_basketball schema exists; the statement is guarded with
# "if not exists".
spark.sql(
    "create schema if not exists object_computing.ncaa_mens_basketball;"
)

In [None]:
# Ensure a volume exists to hold the raw Kaggle data; the statement is guarded with
# "if not exists".
spark.sql(
    "create volume if not exists object_computing.ncaa_mens_basketball.raw_kaggle_stats"
)

In [None]:
# Copy raw data into the raw_kaggle_stats volume

import os
import posixpath

# The dataset directory is resolved relative to the kernel's working directory, so this
# cell assumes the kernel was started from the notebook's own folder.
notebook_dir = os.path.abspath(os.getcwd())
kaggle_dataset_path = os.path.abspath(
    os.path.join(notebook_dir, "../datasets/kaggle_ncaa_stats")
)
volume_spark_path = "/Volumes/object_computing/ncaa_mens_basketball/raw_kaggle_stats/"

# Fail early with an actionable message when the working directory is not what we expect.
if not os.path.isdir(kaggle_dataset_path):
    raise FileNotFoundError(
        f"Kaggle dataset directory not found: {kaggle_dataset_path} "
        f"(resolved relative to working directory {notebook_dir})"
    )

# Sorted so the upload order is the same on every run.
for filename in sorted(os.listdir(kaggle_dataset_path)):
    local_file = os.path.join(kaggle_dataset_path, filename)
    # os.listdir also returns sub-directories; only regular files are uploaded because
    # each copyFromLocalToFs call is given a single local path and a single destination.
    if not os.path.isfile(local_file):
        continue
    spark.copyFromLocalToFs(
        local_path=local_file,
        # The volume path is a remote, forward-slash path; build it with posixpath so
        # the result does not depend on the local operating system's separator.
        dest_path=posixpath.join(volume_spark_path, filename),
    )

In [None]:
# Read the Kaggle stats dataset
# NOTE(review): `transformation` is imported but not used in the visible part of this cell.
from ncaa_tournament_predictor import transformation

# Read the contents of the raw_kaggle_stats volume with the CSV reader, with the header,
# inferSchema and mergeSchema options enabled.
raw_kaggle_stats = (
    spark.read.format("csv")
        .options(header=True, inferSchema=True, mergeSchema=True)
        .load("dbfs:/Volumes/object_computing/ncaa_mens_basketball/raw_kaggle_stats/")
)
# NOTE(review): `cleaned_ncaa_data` is not assigned anywhere in the visible notebook, so on
# a fresh kernel this bare expression would raise NameError. Presumably a transformation
# step that produces it is missing, or the cell continues beyond what is shown — confirm.
cleaned_ncaa_data