In [0]:
"""
This script prepares data for inference in the Student Success Tool (SST) pipeline.

It reads validated course and cohort data from Delta Lake tables, creates a student-term
dataset, builds the inference dataset for the 'failure to earn enough credits in time
from enrollment' target, and saves the processed dataset back to a Delta Lake table.
It's designed to run within a Databricks environment.

"""

import logging

import mlflow
from databricks.connect import DatabricksSession
from databricks.sdk.runtime import dbutils

from student_success_tool.analysis import pdp
from student_success_tool import configs

# mlflow autologging is switched off because of Databricks issues during
# feature selection.
mlflow.autolog(disable=True)

# Request INFO-level output from the root logger, and restrict the "py4j"
# logger to WARNING and above so its lower-level messages are suppressed.
logging.basicConfig(level=logging.INFO)
py4j_logger = logging.getLogger("py4j")
py4j_logger.setLevel(logging.WARNING)

# Attempt to create a Spark session. Failure is tolerated here and signalled
# by `spark_session = None`; the code that reads the input tables checks for it.
try:
    spark_session = DatabricksSession.builder.getOrCreate()
except Exception:
    # Keep the traceback in the log so the underlying cause is not lost.
    logging.warning(
        "Unable to create Spark session; are you in a Databricks runtime?",
        exc_info=True,
    )
    spark_session = None

# Job parameters read from Databricks widgets: the workspace identifier, the
# institution id, and the SST job id (read in that order).
DB_workspace, institution_id, sst_job_id = (
    dbutils.widgets.get(widget_name)
    for widget_name in ("DB_workspace", "institution_id", "sst_job_id")
)

# Delta Lake locations derived from the widget values: the workspace name is
# used as the catalog, and the schemas are the institution id suffixed with
# `_bronze` (read side) and `_silver` (write side).
catalog = DB_workspace
read_schema = f"{institution_id}_bronze"
write_schema = f"{institution_id}_silver"

# Read the course and cohort DataFrames from Delta Lake and pass each through
# its raw PDP schema. Nothing below can run without a Spark session, so abort
# with a non-zero status when it is missing.
if not spark_session:
    logging.warning("Spark session not initialized. Cannot read dataframes.")
    # `exit()` is an interactive-shell helper injected by the `site` module
    # and exits with status 0. Raising SystemExit directly does not depend on
    # that helper and reports the failure to whatever launched the script.
    raise SystemExit(1)

df_course = pdp.schemas.RawPDPCourseDataSchema(
    pdp.dataio.read_data_from_delta_table(
        f"{catalog}.{read_schema}.{sst_job_id}_course_dataset_validated",
        spark_session=spark_session,
    )
)

df_cohort = pdp.schemas.RawPDPCohortDataSchema(
    pdp.dataio.read_data_from_delta_table(
        f"{catalog}.{read_schema}.{sst_job_id}_cohort_dataset_validated",
        spark_session=spark_session,
    )
)


# Load the institution's TOML configuration file from the bronze volume,
# parsed as a PDPProjectConfigV2.
cfg = configs.load_config(
    f"/Volumes/{DB_workspace}/{institution_id}_bronze/pdp_pipeline_internal/configuration_files/{institution_id}.toml",
    configs.PDPProjectConfigV2,
)

# Feature settings from the `preprocessing.features` section
_feature_cfg = cfg.preprocessing.features
min_passing_grade = _feature_cfg.min_passing_grade
min_num_credits_full_time = _feature_cfg.min_num_credits_full_time
course_level_pattern = _feature_cfg.course_level_pattern
key_course_subject_areas = _feature_cfg.key_course_subject_areas
key_course_ids = _feature_cfg.key_course_ids

# Target settings from the `preprocessing.target.params` mapping
_target_params = cfg.preprocessing.target.params
min_num_credits_checkin = _target_params["min_num_credits_checkin"]
min_num_credits_target = _target_params["min_num_credits_target"]

# Build the student-term dataset from the cohort and course data, passing the
# feature settings read from the institution config as keyword arguments.
student_term_options = {
    "min_passing_grade": min_passing_grade,
    "min_num_credits_full_time": min_num_credits_full_time,
    "course_level_pattern": course_level_pattern,
    "key_course_subject_areas": key_course_subject_areas,
    "key_course_ids": key_course_ids,
}
df_student_terms = pdp.dataops.make_student_term_dataset(
    df_cohort, df_course, **student_term_options
)


# Criteria handed to the target logic as `student_criteria`; presumably these
# select which students are in scope (confirm against the library).
# Other keys that could be added, shown here as examples only:
#   "enrollment_intensity_first_term": ["FULL-TIME", "PART-TIME"]
#   "credential_type_sought_year_1": "Associate's Degree"
student_criteria = {"enrollment_type": ["FIRST-TIME", "RE-ADMIT", "TRANSFER-IN"]}

# One entry per enrollment intensity; the remaining fields look like
# (amount, unit) of a time limit -- confirm against the library.
intensity_time_limits = [
    ("FULL-TIME", 1.0, "year"),
    ("PART-TIME", 1.0, "year"),
]

# Build the inference dataset from the student-term data using the
# "failure to earn enough credits in time from enrollment" target module.
credits_target = pdp.targets.failure_to_earn_enough_credits_in_time_from_enrollment
df_processed = credits_target.make_inference_dataset(
    df_student_terms,
    min_num_credits_checkin=min_num_credits_checkin,
    min_num_credits_target=min_num_credits_target,
    student_criteria=student_criteria,
    intensity_time_limits=intensity_time_limits,
)


# # Earlier workaround, kept for reference: build the labeled dataset and drop its
# # 'target' column. The `make_inference_dataset` call above is used instead.
# df_processed = pdp.targets.failure_to_earn_enough_credits_in_time_from_enrollment.make_labeled_dataset(
#     df_student_terms,
#     min_num_credits_checkin=min_num_credits_checkin,
#     min_num_credits_target=min_num_credits_target,
#     student_criteria=student_criteria,
#     intensity_time_limits=intensity_time_limits,
# )
# df_processed.drop("target", axis=1, inplace=True)  # Drop the temporary target column.

# Save the processed dataset to the write (silver) schema in Delta Lake.
# A missing Spark session already aborts the script before the input tables
# are read, so the session is known to be available here and no further
# check is needed.
write_table_path = f"{catalog}.{write_schema}.{sst_job_id}_processed_dataset"
pdp.dataio.write_data_to_delta_table(
    df_processed, write_table_path, spark_session=spark_session
)
logging.info("Processed dataset written to: %s", write_table_path)