# Functions Shared Across Notebooks


In [None]:
from pyspark.sql import functions as F
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, FloatType
from pyspark.ml.classification import NaiveBayes, DecisionTreeClassifier, LogisticRegression
from enum import Enum

In [None]:

class MLAlgorithm(Enum):
    """
    Closed set of classifier choices understood by the helpers in this file.

    Every member is identified by a short code (its name) and carries the
    algorithm's human-readable label as its value.
    """
    # Short code = display label
    NB = "Naive Bayes"
    DT = "Decision Tree"
    LR = "Logistic Regression"

# Schema of the DataFrame used for storing ML algorithm results.
# "feature_cols" is the only non-nullable column (a string); every other
# column is a nullable float, declared in the order listed below.
schema = StructType(
    [StructField("feature_cols", StringType(), False)]
    + [
        StructField(metric_name, FloatType(), True)
        for metric_name in (
            "accuracy",
            "precision",
            "recall",
            "f_measure",
            "fp_rate",
            "auc_roc",
            "conf_matrix_tp",
            "conf_matrix_fp",
            "conf_matrix_fn",
            "conf_matrix_tn",
        )
    ]
)



In [None]:

def load_csv(filesystem_endpoint, path):
    """
    Read a CSV file with a header row into a DataFrame.

    The source location is assembled as ``abfss://{filesystem_endpoint}/{path}``.
    Only the "header" option is set on the reader; no schema is supplied here.

    Note: this function uses a ``spark`` name that is not defined in this
    section, so a session object must already exist in the calling scope.

    Args:
        filesystem_endpoint (str): The endpoint of the filesystem, placed
            directly after the ``abfss://`` scheme.
        path (str): Location of the CSV file underneath that endpoint.

    Returns:
        DataFrame: The contents of the CSV file.
    """
    source_uri = f"abfss://{filesystem_endpoint}/{path}"
    csv_reader = spark.read.format("csv").option("header", "true")
    return csv_reader.load(source_uri)

def save_df_to_csv(df, filesystem_endpoint, path):
    """
    Write a DataFrame out as comma-delimited CSV with a header row.

    The destination is assembled as ``abfss://{filesystem_endpoint}/{path}``.
    The DataFrame is coalesced to a single partition before writing, and the
    write uses "overwrite" mode.

    Args:
        df (DataFrame): The data to write.
        filesystem_endpoint (str): The endpoint of the filesystem, placed
            directly after the ``abfss://`` scheme.
        path (str): Destination location underneath that endpoint.

    Returns:
        None
    """
    target_uri = f"abfss://{filesystem_endpoint}/{path}"

    # Collapse to one partition first so the write is done from a single partition.
    single_partition_df = df.coalesce(1)

    csv_writer = single_partition_df.write.format("csv").option("header", True)
    csv_writer = csv_writer.mode("overwrite").option("delimiter", ",")
    csv_writer.save(target_uri)

def run_ml_algorithm(algorithm, feature_cols, label_col, iter_df, train_pct):
    """
    Split the input data into train/test sets and run the chosen classifier.

    The split uses weights ``[train_pct, 1 - train_pct]`` and a fixed seed
    (1234). The selected algorithm is trained on the first part and applied
    to the second.

    Args:
        algorithm (MLAlgorithm): Which classifier to run.
        feature_cols (list): The list of feature columns. This argument is
            accepted but not read anywhere in this function.
        label_col (str): The label column.
        iter_df (DataFrame): The input DataFrame to split.
        train_pct (float): Weight of the training split; the test split gets
            ``1 - train_pct`` (so this is presumably a fraction between 0
            and 1 - confirm with callers).

    Returns:
        DataFrame: The predictions produced for the test split.

    Raises:
        ValueError: If ``algorithm`` is not one of the MLAlgorithm members
            handled below.
    """
    train_df, test_df = iter_df.randomSplit([train_pct, 1 - train_pct], seed=1234)

    # Hand off to the matching runner; each branch returns immediately.
    if algorithm == MLAlgorithm.NB:
        return run_naive_bayes_algorithm(train_df, test_df, label_col)

    if algorithm == MLAlgorithm.DT:
        return run_decision_tree_algorithm(train_df, test_df, label_col)

    if algorithm == MLAlgorithm.LR:
        return run_logistic_regression_algorithm(train_df, test_df, label_col)

    raise ValueError("Invalid algorithm.")


def run_naive_bayes_algorithm(train, test, label_col):
    """
    Fit a multinomial Naive Bayes classifier and score the test data.

    The classifier reads its inputs from the "features" column and its
    targets from ``label_col``.

    Args:
        train (DataFrame): Data the model is fitted on.
        test (DataFrame): Data the fitted model is applied to.
        label_col (str): The label column.

    Returns:
        DataFrame: The result of applying the fitted model to ``test``.
    """
    classifier = NaiveBayes(
        featuresCol="features",
        labelCol=label_col,
        modelType="multinomial",
    )
    # Train on the training split, then apply the model to the test split.
    return classifier.fit(train).transform(test)

def run_decision_tree_algorithm(train, test, label_col):
    """
    Fit a Decision Tree classifier and score the test data.

    The classifier reads its inputs from the "features" column and its
    targets from ``label_col``; ``maxBins`` is fixed at 500.

    Args:
        train (DataFrame): Data the model is fitted on.
        test (DataFrame): Data the fitted model is applied to.
        label_col (str): The label column.

    Returns:
        DataFrame: The result of applying the fitted model to ``test``.
    """
    classifier = DecisionTreeClassifier(
        featuresCol="features",
        labelCol=label_col,
        maxBins=500,
    )
    # Train on the training split, then apply the model to the test split.
    return classifier.fit(train).transform(test)

def run_logistic_regression_algorithm(train, test, label_col):
    """
    Fit a Logistic Regression classifier and score the test data.

    The classifier reads its inputs from the "features" column and its
    targets from ``label_col``; all other settings are left at their defaults.

    Args:
        train (DataFrame): Data the model is fitted on.
        test (DataFrame): Data the fitted model is applied to.
        label_col (str): The label column.

    Returns:
        DataFrame: The result of applying the fitted model to ``test``.
    """
    classifier = LogisticRegression(featuresCol="features", labelCol=label_col)
    # Train on the training split, then apply the model to the test split.
    return classifier.fit(train).transform(test)