# In [1]:
from heapq import merge
import os
import tempfile
import statistics
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

class Utils:
    """Collection of stateless helpers for file access, data loading, plotting and dataset splitting.

    Every method is a ``@staticmethod``; the class is only used as a namespace.
    """

    @staticmethod
    def get_files_in_directory(basedir: str) -> list[str]:
        """
        List the regular files (not sub-directories) directly inside ``basedir``.
        :param basedir: directory to scan
        :return: list of file names (not full paths); an empty list when
                 ``basedir`` does not exist (a message is printed in that case)
        """
        try:
            entries = os.listdir(basedir)
        except FileNotFoundError:
            print('file is not located')
            # Honour the declared return type instead of implicitly returning None.
            return []
        return [entry for entry in entries if os.path.isfile(os.path.join(basedir, entry))]

    @staticmethod
    def get_path_to_file(name: str, format: str, basedir: str) -> str:
        """
        Build the path ``basedir/<name><format>``.
        :param name: file name without extension
        :param format: extension appended verbatim to ``name`` (no separator is inserted)
        :param basedir: directory the path is rooted at
        :return: the joined path; the file system is not consulted
        """
        # os.path.join is pure string manipulation and never raises
        # FileNotFoundError, so no exception handling is needed here.
        return os.path.join(basedir, f'{name}{format}')

    @staticmethod
    def reindex(df: pd.DataFrame, column: str):
        """
        Make ``column`` the index of ``df``. The frame is modified in place.
        :param df: frame to re-index
        :param column: name of the column that becomes the index
        :return: None
        """
        df.set_index(column, inplace=True)

    @staticmethod
    def load_data(file_path: str, filetype: str) -> pd.DataFrame:
        """
        Read a file into a DataFrame using the pandas reader registered for ``filetype``.
        :param file_path: path of the file to read
        :param filetype: ``'csv'`` or ``'parquet'``
        :return: the loaded DataFrame
        :raises ValueError: if ``filetype`` is not one of the supported types
        :raises FileNotFoundError: if the reader reports that the file is missing
        :raises Exception: for any other error raised by the reader
        """
        file_readers = {
            'csv': pd.read_csv,
            'parquet': pd.read_parquet,
        }

        if filetype not in file_readers:
            raise ValueError("Unsupported file type")

        reader = file_readers[filetype]
        try:
            return reader(file_path)
        except FileNotFoundError as err:
            raise FileNotFoundError(f"File not found at path: {file_path}") from err
        except Exception as err:
            # Same exception type as before; the original error is chained explicitly.
            raise Exception(f"Error reading file: {err}") from err

    @staticmethod
    def merge_frames(data_frames: list[pd.DataFrame], mapping: dict, dir: str, index: str) -> pd.DataFrame:
        """
        Placeholder: merging is not implemented.
        All arguments are currently ignored and ``None`` is always returned.
        :param data_frames: unused
        :param mapping: unused
        :param dir: unused
        :param index: unused
        :return: None
        """
        # TODO: implement; the intended merge semantics are not defined in this file.
        merged_frame = None
        return merged_frame

    @staticmethod
    def plot_graphs(x_data, y_data, x_label, y_label, title):
        """
        Use matplotlib to draw a line plot of the given data points with axis labels and a title.
        :param x_data: Data for x-axis
        :param y_data: Data for y-axis
        :param x_label: Label for x-axis
        :param y_label: Label for y-axis
        :param title: Title for the plot
        :return: tuple (fig, ax)
        """
        # Clears the figure that is current in pyplot before a new one is created below.
        plt.clf()

        fig, ax = plt.subplots()
        ax.plot(x_data, y_data)
        ax.set_xlabel(x_label)
        ax.set_ylabel(y_label)
        ax.set_title(title)

        return (fig, ax)

    @staticmethod
    def plot_residual_graphs(y_hat, y_test, lowess: bool = False):
        """
        Draw a seaborn residual plot with ``y_hat`` passed as x and ``y_test`` as y.
        seaborn regresses y on x and draws a scatterplot of the residuals; a lowess
        smoother can optionally be fitted to help reveal structure in the residuals.
        :param y_hat: values passed to ``sns.residplot`` as ``x``
        :param y_test: values passed to ``sns.residplot`` as ``y``
        :param lowess: whether to fit a lowess smoother to the residual plot
        :return: tuple (plt, fig, ax) where ``plt`` is the pyplot module
        """
        fig, ax = plt.subplots(figsize=(16, 8))
        # Target the new axes explicitly rather than relying on pyplot's current axes.
        sns.residplot(x=y_hat, y=y_test, lowess=lowess, ax=ax)

        return (plt, fig, ax)

    @staticmethod
    def get_mlflow_directory_path(*paths, create_dir=True):
        """
        Build the path ``<cwd>/mlruns/<paths...>`` and optionally create it.
        The path is based on the current working directory at call time. This
        method is used for artifacts, such as images, where we want to store plots.
        :param paths: directories below mlruns, e.g. experiment id, mlflow run id
        :param create_dir: default is True; create the directory if nothing exists at the path
        :return: path to directory.
        """
        target = os.path.join(os.getcwd(), "mlruns", *paths)
        if create_dir and not os.path.exists(target):
            # makedirs also creates missing parents (os.mkdir would fail for
            # nested paths); exist_ok covers a concurrent creation of the
            # directory between the check above and this call.
            os.makedirs(target, mode=0o755, exist_ok=True)
        return target

    @staticmethod
    def get_temporary_directory_path(prefix, suffix):
        """
        Create a named temporary file for artifacts.
        Despite the method name, this returns a file object, not a directory path.
        :param prefix: prefix of the generated file name
        :param suffix: suffix of the generated file name, e.g. .csv, .txt, .png
        :return: the open ``tempfile.NamedTemporaryFile`` object; the caller is
                 responsible for closing it
        """
        return tempfile.NamedTemporaryFile(prefix=prefix, suffix=suffix)

    @staticmethod
    def print_pandas_dataset(d):
        """
        Print the row/column counts of ``d`` followed by ``d.head()``.
        :param d: object exposing ``shape`` and ``head()`` (e.g. a DataFrame)
        :return: None
        """
        n_rows, n_cols = d.shape[0], d.shape[1]
        print(f"rows = {n_rows:d}; columns={n_cols:d}")
        print(d.head())

    @staticmethod
    def get_feature_target_subsets(df: pd.DataFrame, target: str):
        """
        Split a frame into features and target.
        :param df: source frame (not modified)
        :param target: name of the target column
        :return: tuple (X, y): ``df`` without the target column, and the target column
        """
        # ``columns=`` already selects the axis; an extra ``axis=1`` is redundant.
        X = df.drop(columns=target)
        y = df[target]
        return X, y

    @staticmethod
    def get_train_test_validation_split(df: pd.DataFrame, random_state=None) -> list[pd.DataFrame]:
        """
        Shuffle the rows of ``df`` and cut them into three consecutive parts
        holding the first 60%, the next 20% and the remaining rows.
        :param df: frame to split (not modified)
        :param random_state: optional seed forwarded to ``DataFrame.sample`` so the
                             shuffle can be reproduced; default None (not seeded)
        :return: list of three DataFrames in the order [60% part, 20% part, remainder]
        """
        shuffled = df.sample(frac=1, random_state=random_state)
        n_rows = len(df)
        first_cut, second_cut = int(.6 * n_rows), int(.8 * n_rows)
        # Positional slicing yields the same pieces as np.split without going
        # through numpy's array machinery on a DataFrame.
        return [
            shuffled.iloc[:first_cut],
            shuffled.iloc[first_cut:second_cut],
            shuffled.iloc[second_cut:],
        ]
