In [None]:
import itertools
import re
import string

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.base import BaseEstimator, TransformerMixin, ClassifierMixin
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.utils.validation import check_is_fitted
from tqdm import tqdm

# Model Evaluation
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import KFold
from sklearn import feature_selection
from sklearn import model_selection
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import confusion_matrix, accuracy_score 
from sklearn.metrics import f1_score, precision_score, recall_score, fbeta_score
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn import metrics
from sklearn.metrics import classification_report, precision_recall_curve
from sklearn.metrics import auc, roc_auc_score, roc_curve
from sklearn.metrics import make_scorer, recall_score, log_loss
from sklearn.metrics import average_precision_score
from sklearn.model_selection import train_test_split

# Time Series Analysis
from statsmodels.tsa.seasonal import seasonal_decompose
import plotly.graph_objects as go


from sklearn.model_selection import RandomizedSearchCV
import joblib

import warnings
warnings.filterwarnings('ignore')
pd.options.mode.chained_assignment = None

In [None]:
def perform_seasonal_decomposition(df, col=None, model_=None):
    """
    Decompose a time series into trend, seasonal and residual parts and plot them.

    Args:
        df (pd.DataFrame): A DataFrame containing the time series data.
        col (str, optional): Column to decompose. Defaults to the first column of ``df``.
        model_ (str, optional): Decomposition model, 'additive' or 'multiplicative'.
            ``None`` (the default) is treated as 'additive'.

    Returns:
        dict: ``{'plot': <the matplotlib.pyplot module>, 'decomposition': <result of
        seasonal_decompose>}``. Note that 'plot' is the ``pyplot`` module itself, with the
        newly drawn figure as its current figure, not a Figure object.

    Raises:
        ValueError: If ``col`` is not a column of ``df``.

    Example:
        >>> import pandas as pd
        >>> data = pd.read_csv('time_series_data.csv')
        >>> result = perform_seasonal_decomposition(data, col='Value', model_='additive')
    """
    if col is None:
        col = df.columns[0]  # If 'col' is not specified, use the first column.

    if col not in df.columns:
        raise ValueError(f"Column '{col}' not found in the DataFrame.")

    # seasonal_decompose expects a string for `model`; forwarding the None default
    # would fail, so fall back to the library's own default instead.
    if model_ is None:
        model_ = 'additive'

    series = df[col]
    decomposition = seasonal_decompose(series, model=model_)

    # One stacked panel per component: (values, legend label, panel title).
    panels = [
        (series, 'Original', 'Original Time Series'),
        (decomposition.trend, 'Trend', 'Trend Component'),
        (decomposition.seasonal, 'Seasonal', 'Seasonal Component'),
        (decomposition.resid, 'Residuals', 'Residuals'),
    ]

    plt.figure(figsize=(12, 8))
    for position, (values, label, title) in enumerate(panels, start=1):
        plt.subplot(len(panels), 1, position)
        plt.plot(values, label=label)
        plt.legend(loc='upper left')
        plt.title(title)
    plt.tight_layout()

    return {'plot': plt, 'decomposition': decomposition}

# Model Evaluation

In [None]:
import pandas as pd
from sklearn.metrics import recall_score, precision_score, f1_score, accuracy_score, roc_auc_score, balanced_accuracy_score
import joblib

def train_and_score_classifiers(X_train, y_train, X_test, y_test, classifiers, pipeline):
    """
    Train and score multiple classifiers and return performance metrics in a DataFrame.

    The pipeline's 'Preprocessor' step is applied to the training and test data once. Then,
    for every entry in ``classifiers``, the pipeline's 'RandomSearchCV' step is fitted on the
    preprocessed training data and used to predict on the preprocessed test data.

    Parameters:
    -----------
    X_train : pd.DataFrame
        The training data.

    y_train : pd.Series
        The target labels for the training data.

    X_test : pd.DataFrame
        The test data for evaluation.

    y_test : pd.Series
        The target labels for the test data.

    classifiers : dict
        A dictionary containing classifiers, their names as keys, and their corresponding parameter grids.
        Example: {'RandomForest': {'model': RandomForestClassifier(), 'parameters': {'n_estimators': [10, 100, 500]}}}

    pipeline : sklearn.pipeline.Pipeline
        A pipeline exposing two named steps:
        - 'Preprocessor': its ``fit_transform(X, y)`` must return a 5-tuple
          ``(X_pre, X_selected, y, groups, selected_columns)``.
        - 'RandomSearchCV': its ``fit(X, y, groups=..., base_estimator=..., parameters=...)``
          trains the search and its ``predict(X, y, selected_columns)`` returns a dict with
          the keys 'X_test', 'y_test', 'predictions', 'prob_predictions' and 'model_fit'.

    Returns:
    --------
    df : pd.DataFrame
        One row per classifier with the following columns (scores rounded to 3 decimals):
        - 'Model': The classifier's name.
        - 'Recall': The recall score.
        - 'Precision': The precision score.
        - 'F1': The F1 score.
        - 'Accuracy': The accuracy score.
        - 'AUC ROC': ROC AUC computed from the predicted labels (not from probabilities).
        - 'Balanced Accuracy': The balanced accuracy score.

    class_dict : dict
        A dictionary containing information about each trained classifier and their predictions.
        The structure is {'Classifier Name': {'model': classifier, 'parameters': params, 'X_test': test_data,
        'y_test': test_labels, 'predictions': predicted_labels, 'prob_predictions': probability_predictions,
        'model_fit': fitted_classifier}}.

    Side effects:
    -------------
    Writes ``models/<name>_hospitalization_prediction.sav`` for every classifier via joblib.
    """
    metric_columns = ['Model', 'Recall', 'Precision', 'F1', 'Accuracy', 'AUC ROC', 'Balanced Accuracy']
    dec = 3  # decimal places kept for every reported metric
    class_dict = {}
    score_rows = []

    preprocessor = pipeline.named_steps['Preprocessor']
    search_step = pipeline.named_steps['RandomSearchCV']

    # Preprocess training and test datasets
    _, X_train_s, y_train_s, groups_train_s, selected_train_cols = preprocessor.fit_transform(X_train, y_train)
    # NOTE(review): the test set is also passed through fit_transform, so any state the
    # preprocessor learns is re-learned on test data. Left as-is because the preprocessor's
    # transform-only contract is not visible here; the training-time column selection is
    # what gets applied at prediction time below.
    _, X_test_s, y_test_s, _, _ = preprocessor.fit_transform(X_test, y_test)

    for name, spec in classifiers.items():
        classifier = spec['model']
        params = spec['parameters']

        print('Training and scoring: ' + str(name))

        # Fit model and test with holdout data
        fit_result = search_step.fit(X_train_s, y_train_s, groups=groups_train_s,
                                     base_estimator=classifier, parameters=params)
        results = search_step.predict(X_test_s, y_test_s, selected_train_cols)
        eval_labels = results['y_test']
        predicted = results['predictions']
        fitted = results['model_fit']

        # The search step's fit() may return None; in that case persist the fitted
        # model reported by predict() so the per-classifier file is never an empty pickle.
        filename = f'models/{name}_hospitalization_prediction.sav'
        joblib.dump(fit_result if fit_result is not None else fitted, filename)

        # Save classifier, params, evaluation data and predictions for return
        class_dict[name] = {
            'model': classifier,
            'parameters': params,
            'X_test': results['X_test'],
            'y_test': eval_labels,
            'predictions': predicted,
            'prob_predictions': results['prob_predictions'],
            'model_fit': fitted,
        }

        # Score model
        score_rows.append({
            'Model': name,
            'Recall': round(recall_score(eval_labels, predicted), dec),
            'Precision': round(precision_score(eval_labels, predicted), dec),
            'F1': round(f1_score(eval_labels, predicted), dec),
            'Accuracy': round(accuracy_score(eval_labels, predicted), dec),
            'AUC ROC': round(roc_auc_score(eval_labels, predicted), dec),
            'Balanced Accuracy': round(balanced_accuracy_score(eval_labels, predicted), dec),
        })

    # Build the frame once; DataFrame.append no longer exists in pandas >= 2.0.
    df = pd.DataFrame(score_rows, columns=metric_columns)
    return df, class_dict

# Sklearn Custom Classes

In [None]:
class CustomSelectFromModel(BaseEstimator, ClassifierMixin):
    """
    Pipeline step that splits a DataFrame into features/target/groups and keeps only
    the features chosen by scikit-learn's ``SelectFromModel``.

    Parameters:
    -----------
    estimator : object, optional
        The estimator handed to ``SelectFromModel``. Default is None.

    threshold : float, optional
        The threshold handed to ``SelectFromModel``. Default is None.

    Methods:
    --------
    fit(self, X, y=None, base_estimator=None, parameters=None):
        No-op that returns ``self``; all the work happens in ``transform``.

    transform(self, X):
        Split ``X`` with ``split_data_by_groups``, fit a ``SelectFromModel`` on the
        resulting features/target and return the selection.

    Returns (from ``transform``):
    -----------------------------
    X_pre : pd.DataFrame
        The input ``X`` exactly as it was passed in.
    X_split : pd.DataFrame
        The feature frame restricted to the selected columns.
    y_split :
        The target produced by ``split_data_by_groups``.
    groups_split :
        The group identifiers produced by ``split_data_by_groups``.
    selected_train_cols : list
        List of column names that were selected.

    Notes:
    ------
    The feature selector is re-fitted on every ``transform`` call, so the selected
    columns depend on the data passed to that call.
    NOTE(review): the class derives from ``ClassifierMixin`` although it behaves as a
    transformer; confirm that this is intentional.

    Example:
    --------
    >>> from sklearn.ensemble import RandomForestClassifier
    >>> custom_selector = CustomSelectFromModel(estimator=RandomForestClassifier(), threshold=0.2)
    >>> X_pre, X_selected, y, groups, selected_cols = custom_selector.transform(X)
    >>> print(X_selected.head())
    >>> print(selected_cols)
    """

    def __init__(self, estimator=None, threshold=None):
        self.threshold = threshold
        self.estimator = estimator

    def fit(self, X, y=None, base_estimator=None, parameters=None):
        # Nothing to learn here; the selector is fitted inside transform().
        return self

    def transform(self, X):
        features, target, groups = split_data_by_groups(X)

        selector = SelectFromModel(self.estimator, threshold=self.threshold)
        selector.fit(features, target)

        # Boolean mask over the feature columns -> list of kept column names.
        keep_mask = selector.get_support()
        selected_train_cols = features.columns[keep_mask].to_list()

        return X, features[selected_train_cols], target, groups, selected_train_cols

In [None]:
# class CustomRandomizedSearchCV(BaseEstimator, ClassifierMixin):
#     """
#     Customized Randomized Search Cross-Validation for hyperparameter tuning of a scikit-learn classifier.

#     This class extends scikit-learn's BaseEstimator and ClassifierMixin to create a custom Randomized Search CV
#     that fits a classifier to the data, performs hyperparameter tuning, and provides predictions.

#     Parameters:
#     scoring (str or callable, optional): A scoring strategy to evaluate the model's performance during tuning.
#         Default is None.
#     n_splits (int, optional): The number of cross-validation splits to use. Default is None.

#     Attributes:
#     scoring (str or callable): The scoring strategy used during hyperparameter tuning.
#     n_splits (int): The number of cross-validation splits used for model evaluation.

#     Methods:
#     fit(X, y=None, base_estimator=None, parameters=None):
#         Fit the model to the data, perform hyperparameter tuning, and save the best model.

#     predict(X):
#         Make predictions on new data using the best-fitted model.

#     """
#     def __init__(self, scoring=None, n_splits=None):
#         self.scoring = scoring
#         self.n_splits = n_splits
    
#     def fit(self, X, y=None, base_estimator=None, parameters=None):
#         X_train, y_train, groups_train = split_data_by_groups(X)
#         cv_splits = GroupTimeSeriesSplit(n_splits=self.n_splits).split(X=X_train, y=y_train, groups=groups_train)
#         rscv = RandomizedSearchCV(base_estimator, parameters,scoring=self.scoring,cv=cv_splits)
#         rscv.fit(X=X_train, y=y_train)
#         filename = 'models/hospitalization_prediction.sav'
#         joblib.dump(rscv, filename)
    
#     def predict(self, X):     
#         X_test = X
#         X, y, groups = split_data_by_groups(X_test)
#         y_test = y.to_numpy()
#         filename = 'models/hospitalization_prediction.sav'
#         rs_cv = joblib.load(filename)
#         predictions = rs_cv.predict(X)
#         prob_predictions = rs_cv.predict_proba(X)
#         results = {
#                     'X_test': X_test, 
#                     'y_test': y_test, 
#                     'predictions':predictions, 
#                     'prob_predictions':prob_predictions, 
#                     'fitted_rscv': rs_cv
#         }
#         return results

In [None]:
class CustomRandomizedSearchCV(BaseEstimator, ClassifierMixin):
    """
    A custom class for performing randomized hyperparameter tuning and prediction using scikit-learn models.

    ``fit`` runs a ``RandomizedSearchCV`` over group-aware time-series splits and persists the
    fitted search to disk; ``predict`` reloads that file and predicts with it.

    Parameters:
    -----------
    scoring : str, optional
        The scoring metric for hyperparameter optimization.
        Default is None.

    n_splits : int, optional
        The number of splits to be used in cross-validation.
        Default is None.

    Methods:
    --------
    fit(self, X, y=None, groups=None, base_estimator=None, parameters=None):
        Fit the CustomRandomizedSearchCV to the data by optimizing hyperparameters using RandomizedSearchCV.

    predict(self, X, y=None, selected_columns=None):
        Make predictions on new data using the trained model.

    Notes:
    ------
    The fitted search is stored in a single fixed file ('models/hospitalization_prediction.sav'),
    so every call to ``fit`` overwrites the previous model and ``predict`` always uses the most
    recently saved one. ``GroupTimeSeriesSplit`` and ``append_new_columns`` are expected to be
    defined or imported elsewhere.

    Example:
    --------
    >>> from sklearn.ensemble import RandomForestClassifier
    >>> from sklearn.metrics import accuracy_score
    >>> custom_rscv = CustomRandomizedSearchCV(scoring='accuracy', n_splits=5)
    >>> custom_rscv.fit(X_train, y_train, groups=group_labels, base_estimator=RandomForestClassifier(), parameters=param_grid)
    >>> results = custom_rscv.predict(X_test, y_test, selected_columns)
    >>> accuracy = accuracy_score(results['y_test'], results['predictions'])
    >>> print(f"Accuracy: {accuracy}")
    """

    # Single location where fit() persists the tuned search and predict() reloads it.
    _MODEL_PATH = 'models/hospitalization_prediction.sav'

    def __init__(self, scoring=None, n_splits=None):
        self.scoring = scoring
        self.n_splits = n_splits

    def fit(self, X, y=None, groups=None, base_estimator=None, parameters=None):
        """
        Fit the CustomRandomizedSearchCV to the data by optimizing hyperparameters using RandomizedSearchCV.

        Parameters:
        -----------
        X : pd.DataFrame
            The input data.

        y : pd.Series, optional
            The target variable (labels).

        groups : pd.Series, optional
            A group identifier for data grouping or split purposes.

        base_estimator : object, optional
            The base estimator for hyperparameter optimization.

        parameters : dict, optional
            Hyperparameter search space.

        Returns:
        --------
        self : CustomRandomizedSearchCV
            This instance, following the scikit-learn ``fit`` convention. The fitted
            search itself is written to disk rather than kept on the instance.
        """
        cv_splits = GroupTimeSeriesSplit(n_splits=self.n_splits).split(X=X, y=y, groups=groups)
        rscv = RandomizedSearchCV(base_estimator, parameters, scoring=self.scoring, cv=cv_splits)
        rscv.fit(X=X, y=y)
        joblib.dump(rscv, self._MODEL_PATH)
        return self

    def predict(self, X, y=None, selected_columns=None):
        """
        Make predictions on new data using the trained model.

        Parameters:
        -----------
        X : pd.DataFrame
            The test data for prediction.

        y : pd.Series, optional
            The ground truth test labels. When omitted, 'y_test' in the result is None.

        selected_columns : list, optional
            List of selected columns used for prediction. ``X`` is passed through
            ``append_new_columns`` with this list and then restricted to these columns.

        Returns:
        --------
        results : dict
            A dictionary containing prediction results and data:
            - 'X_test': The test data restricted to ``selected_columns``.
            - 'y_test': The ground truth test labels as a NumPy array (or None).
            - 'predictions': Predicted labels.
            - 'prob_predictions': Predicted probabilities.
            - 'model_fit': The fitted RandomizedSearchCV model loaded from disk.
        """
        # y is optional in the signature, so do not assume it is a Series.
        y_test = y.to_numpy() if y is not None else None

        appended_df = append_new_columns(X, selected_columns)
        X_test = appended_df[selected_columns]

        # joblib.load unpickles the file; only load model files this project wrote itself.
        rs_cv = joblib.load(self._MODEL_PATH)

        return {
            'X_test': X_test,
            'y_test': y_test,
            'predictions': rs_cv.predict(X_test),
            'prob_predictions': rs_cv.predict_proba(X_test),
            'model_fit': rs_cv,
        }

In [None]:
class DataFrameMerger(BaseEstimator, TransformerMixin):
    """
    Merge DataFrames using specified parameters.

    Args:
        right_df (pandas.DataFrame): The DataFrame to merge with.
        join_on (str or list): The column(s) to join on.
        how_join (str): The type of merge operation (e.g., 'inner', 'left', 'right', 'outer').

    Attributes:
        right_df, join_on, how_join: The constructor arguments, stored under their own
            names so that ``get_params()``, ``clone()`` and ``repr()`` work.
        on, how: Aliases of ``join_on`` / ``how_join`` kept for code that used the
            previous attribute names.

    Returns:
        pandas.DataFrame: Merged DataFrame.
    """

    def __init__(self, right_df=None, join_on=None, how_join=None):
        # BaseEstimator.get_params looks attributes up by constructor-argument name,
        # so each argument must be stored under exactly that name.
        self.right_df = right_df
        self.join_on = join_on
        self.how_join = how_join

    @property
    def on(self):
        # Backward-compatible alias for ``join_on``.
        return self.join_on

    @on.setter
    def on(self, value):
        self.join_on = value

    @property
    def how(self):
        # Backward-compatible alias for ``how_join``.
        return self.how_join

    @how.setter
    def how(self, value):
        self.how_join = value

    def fit(self, X, y=None):
        # Stateless transformer: nothing is learned from the data.
        return self

    def transform(self, X, y=None):
        """
        Transform the input DataFrame by merging it with another DataFrame.

        Args:
            X (pandas.DataFrame): The input DataFrame to merge.
            y: Ignored.

        Returns:
            pandas.DataFrame: Transformed DataFrame after merging.

        Raises:
            Exception: If a required parameter is missing or the merge itself fails;
                the original error is attached as the cause.
        """
        try:
            if self.right_df is None or self.join_on is None or self.how_join is None:
                raise ValueError("Missing required parameters: right_df, join_on, and how_join.")
            return X.merge(self.right_df, on=self.join_on, how=self.how_join)
        except Exception as e:
            raise Exception(f"An error occurred while merging DataFrames: {str(e)}") from e

In [None]:
class GetDummiesTransformer(BaseEstimator, TransformerMixin):
    """
    One-hot encode DataFrame columns with ``pandas.get_dummies``.

    Parameters:
    -----------
    columns : list-like, optional (default=None)
        Forwarded to ``pd.get_dummies`` as its ``columns`` argument.
    prefix : str, list or dict, optional (default=None)
        Forwarded to ``pd.get_dummies`` as its ``prefix`` argument.

    Methods:
    --------
    fit(self, X, y=None):
        Does nothing and returns the transformer itself; no categories are learned.

    transform(self, X, y=None):
        Return ``pd.get_dummies(X, columns=self.columns, prefix=self.prefix)``.

    Notes:
    ------
    Because nothing is remembered from ``fit``, the output columns depend only on the
    categories present in the frame passed to ``transform``.
    """

    def __init__(self, columns=None, prefix=None):
        self.prefix = prefix
        self.columns = columns

    def fit(self, X, y=None):
        # Stateless: encoding is computed from scratch in transform().
        return self

    def transform(self, X, y=None):
        encoded = pd.get_dummies(X, columns=self.columns, prefix=self.prefix)
        return encoded

In [None]:
class DataFrameAggregator(BaseEstimator, TransformerMixin):
    """
    Group a DataFrame and aggregate it with a caller-supplied aggregation spec.

    Args:
        grouping_cols (list or str): The column(s) by which data should be grouped.
        aggregation (dict): Aggregation spec passed to ``DataFrameGroupBy.agg``.

    Returns:
        pandas.DataFrame: DataFrame with aggregated data.
    """

    def __init__(self, grouping_cols=None, aggregation=None):
        self.aggregation = aggregation
        self.grouping_cols = grouping_cols

    def fit(self, X, y=None):
        # Stateless transformer: nothing is learned from the data.
        return self

    def transform(self, X, y=None):
        """
        Group ``X`` by ``grouping_cols``, aggregate, and flatten the result.

        Args:
            X (pandas.DataFrame): The input DataFrame to be transformed.
            y: Ignored.

        Returns:
            pandas.DataFrame: Aggregated frame with the group keys restored as columns
            and only the first occurrence of any duplicated column label kept.

        Raises:
            Exception: If a required parameter is missing or the aggregation fails.
        """
        try:
            # Guard clause first; it is raised inside the try so that it is wrapped
            # in the same generic error as any pandas failure.
            if self.grouping_cols is None or self.aggregation is None:
                raise ValueError("Missing required parameters: grouping_cols and aggregation.")

            aggregated = X.groupby(self.grouping_cols).agg(self.aggregation)
            flat = aggregated.reset_index()
            first_occurrence = ~flat.columns.duplicated()
            return flat.loc[:, first_occurrence].copy()
        except Exception as err:
            raise Exception(f"An error occurred while aggregating the DataFrame: {str(err)}")

In [None]:
class ColumnsSelector(BaseEstimator, TransformerMixin):
    """
    A scikit-learn compatible transformer for selecting specific columns from a DataFrame.

    Requested columns that are absent from the input are created and filled with 0, so
    the output always has exactly ``columns``, in that order.

    Parameters:
    -----------
    columns : list
        A list of column names to select from the input DataFrame.

    Attributes:
    -----------
    columns : list
        The list of column names to be selected.

    Methods:
    --------
    fit(X, y=None):
        Fit the transformer to the data. This method does nothing and is included to maintain
        scikit-learn compatibility.

    transform(X, y=None):
        Transform the input DataFrame by selecting the specified columns.

        Parameters:
        -----------
        X : pandas DataFrame
            The input DataFrame to select columns from. It is not modified.

        Returns:
        --------
        selected_columns : pandas DataFrame
            A new DataFrame containing only the specified columns from the input DataFrame,
            with any column missing from the input filled with 0.
    """
    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        missing = [column for column in self.columns if column not in X.columns]
        if missing:
            # Add the placeholders to a copy so the caller's DataFrame is left untouched.
            X = X.copy()
            for column in missing:
                X[column] = 0  # default value for columns absent from the input
        return X[self.columns]

In [None]:
class DataframeFunctionTransformer:
    """
    Wrap a plain function so it exposes the ``fit`` / ``transform`` pair of methods.

    Parameters:
    -----------
    func : callable
        Function called with the input passed to ``transform``; its return value is
        returned unchanged.

    Attributes:
    -----------
    func : callable
        The wrapped function.

    Methods:
    --------
    fit(X, y=None, **fit_params):
        Does nothing and returns the transformer itself; all arguments are ignored.

    transform(input_df, **transform_params):
        Return ``func(input_df)``. Extra keyword arguments are accepted but not
        forwarded to ``func``.
    """

    def __init__(self, func):
        self.func = func

    def fit(self, X, y=None, **fit_params):
        # Nothing to learn: the wrapped function is applied as-is in transform().
        return self

    def transform(self, input_df, **transform_params):
        transformed = self.func(input_df)
        return transformed

# Pipeline Functions

In [None]:
def impute_missing_values_by_group(df, group_key=None, sort_cols=None):
    """
    Fill missing values inside each group, forward first and then backward.

    Args:
        df (pandas.DataFrame): Input DataFrame. It is not modified; the work is done on a copy
            that keeps only the first occurrence of any duplicated column label.
        group_key (str): The column name used for grouping data. Must be a column of ``df``.
        sort_cols (list, optional): List of columns by which the DataFrame should be sorted before imputation.

    Returns:
        pandas.DataFrame: The (optionally sorted) copy with missing values filled within
        groups and fully duplicated rows dropped.
    """
    # Work on a de-duplicated copy so column labels are unique from here on.
    unique_labels = ~df.columns.duplicated()
    df = df.loc[:, unique_labels].copy()

    if sort_cols:
        df = df.sort_values(by=sort_cols)

    # Column order used for the fill: the group key first, then every other column.
    other_cols = df.columns.to_list()
    other_cols.remove(group_key)
    ordered_cols = [group_key] + other_cols

    # Group by the key *values* (a Series) for both passes: forward fill, then backward fill.
    keys = df[group_key]
    forward_filled = df[ordered_cols].groupby(keys).ffill()
    fully_filled = forward_filled.groupby(keys).bfill()
    df[ordered_cols] = fully_filled

    df.drop_duplicates(inplace=True)

    return df

In [None]:
def hosp_rate_cluster(df):
    """
    Cluster patients based on their health statistics, especially hospitalization rates.

    Per-patient statistics come from ``summarize_patient_health_stats(df)`` (defined
    elsewhere); the result is expected to contain the columns 'uid', 'age', 'bp_sys',
    'bp_dia' and 'hosp_per_visit'. Missing values are replaced with 0, the four numeric
    features are standardized, and KMeans is run for 2 to 6 clusters. The clustering with
    the highest Calinski-Harabasz score is kept (the smallest cluster count wins ties).
    Clusters are then renumbered so that 0 is the cluster with the highest mean
    'hosp_per_visit', 1 the next highest, and so on.

    Args:
        df (pandas.DataFrame): Input DataFrame with patient health statistics; must contain 'uid'.

    Returns:
        pandas.DataFrame: ``df`` inner-merged on 'uid' with the per-patient 'cluster' and
        'hosp_per_visit' columns.

    Notes:
        KMeans is created without a ``random_state``, so cluster assignments can differ
        between runs.
    """
    # Summarize patient health statistics
    patient_stats_df = summarize_patient_health_stats(df)
    patient_stats_df = patient_stats_df.replace(np.nan, 0)

    feature_cols = ['age', 'bp_sys', 'bp_dia', 'hosp_per_visit']
    X = StandardScaler().fit_transform(patient_stats_df[feature_cols])

    # Track the best clustering directly instead of accumulating rows in a DataFrame
    # (DataFrame.append no longer exists in pandas >= 2.0).
    best_score = None
    best_labels = None
    for n_clusters in range(2, 7):
        k_means = KMeans(n_clusters=n_clusters)
        k_means.fit(X)
        score = metrics.calinski_harabasz_score(X, k_means.labels_)
        # Strict '>' keeps the first (smallest) cluster count when scores tie.
        if best_score is None or score > best_score:
            best_score = score
            best_labels = k_means.predict(X)

    # Rows of X are in patient_stats_df order, so attach labels by position rather
    # than through an index join, which would misalign on a non-default index.
    clusters_df = patient_stats_df[['uid', 'hosp_per_visit']].copy()
    clusters_df['cluster'] = best_labels

    # Rank clusters by mean hospitalization rate (highest first). Only the one column
    # is averaged so non-numeric columns such as 'uid' cannot break the mean.
    mean_rate = clusters_df.groupby('cluster')['hosp_per_visit'].mean().sort_values(ascending=False)
    relabel = {old_label: new_label for new_label, old_label in enumerate(mean_rate.index)}
    clusters_df['cluster'] = clusters_df['cluster'].map(relabel)

    clusters_df = clusters_df[['uid', 'cluster', 'hosp_per_visit']]

    df = df.merge(clusters_df, left_on='uid', right_on='uid')

    return df

In [None]:
def add_seasonality(df):
    """
    Attach the precomputed seasonal component to each row of the given DataFrame.

    The seasonal data is read from 'data/seasonal_feature.parquet' and inner-merged onto
    ``df`` by matching ``df['visit_date_month']`` with the file's 'month' column. The
    file's 'seasonal' column is exposed as 'seasonal_decomp', the 'month' column is
    dropped, infinite values are replaced with 0 and only the first occurrence of any
    duplicated column label is kept.

    Parameters:
    df (DataFrame): The input DataFrame; must contain a 'visit_date_month' column.

    Returns:
    DataFrame: A new DataFrame with the seasonal decomposition data added.

    Raises:
    FileNotFoundError: If the seasonal decomposition data file is not found.
    """
    try:
        # Load seasonal decomposition data
        seasonal_df = pd.read_parquet('data/seasonal_feature.parquet')

        # Merge the seasonal data into the input DataFrame
        merged = df.merge(seasonal_df, left_on='visit_date_month', right_on='month')
        merged = merged.rename(columns={'seasonal':'seasonal_decomp'})
        merged = merged.drop('month', axis=1)

        # Infinite values are zeroed before duplicated column labels are removed.
        merged = merged.replace([np.inf, -np.inf], 0)
        first_occurrence = ~merged.columns.duplicated()
        return merged.loc[:, first_occurrence].copy()
    except FileNotFoundError:
        raise FileNotFoundError("Seasonal decomposition data file 'data/seasonal_feature.parquet' not found.")

In [None]:
def shift_hospitalizations_one_week(df):
    """
    Add "previous row" versions of the weekly hospitalization columns.

    Each source column is shifted down by one row (the first row is filled with 0) and
    stored under a ``*_prev_week`` name. 'hosp_per_visit_prev_week' is the ratio of the
    shifted hospitalization count to the shifted visit count, with undefined (NaN)
    ratios set to 0. Finally, rows that still contain a NaN in any column are dropped.

    Parameters:
        df (pandas.DataFrame): Frame with 'ttl_hosp_week', 'ttl_hosp_count' and
            'ttl_visits' columns. It is modified in place.

    Returns:
        pandas.DataFrame: The same ``df`` object, with the four new columns added.

    Notes:
        The shift is applied over the whole frame in its current row order.
        NOTE(review): this presumably expects one row per week, already sorted - confirm
        against the caller.
    """
    # (source column, shifted column) pairs, in the order the new columns are created.
    shifted_pairs = (
        ('ttl_hosp_week', 'ttl_hosp_prev_week'),
        ('ttl_hosp_count', 'ttl_hosp_count_prev_week'),
        ('ttl_visits', 'ttl_visits_prev_week'),
    )
    for source_col, shifted_col in shifted_pairs:
        df[shifted_col] = df[source_col].shift(1)
        df[shifted_col] = df[shifted_col].fillna(0)

    ratio = df['ttl_hosp_count_prev_week'] / df['ttl_visits_prev_week']
    df['hosp_per_visit_prev_week'] = ratio
    # 0/0 yields NaN, which is mapped to 0 here.
    df['hosp_per_visit_prev_week'] = df['hosp_per_visit_prev_week'].fillna(0)

    df.dropna(inplace=True)

    return df

# Utilities

## General

In [None]:
def percentage_of_missing_values(df):
    """
    Report, per column, what percentage of its values are missing.

    The percentage is ``null count * 100 / number of rows``, rounded to two decimals.
    The result has one row per input column and is sorted from the most to the least
    incomplete column.

    Parameters:
    -----------
    df : pandas.DataFrame
        The input DataFrame for which the percentage of missing values should be calculated.

    Returns:
    --------
    pandas.DataFrame
        A DataFrame with two columns: 'column_name' and 'percent_missing', sorted in
        descending order of 'percent_missing', with a fresh 0..n-1 index.

    Example:
    --------
    >>> import pandas as pd
    >>> data = {'A': [1, 2, None, 4, 5], 'B': [None, 2, 3, None, 5]}
    >>> df = pd.DataFrame(data)
    >>> result = percentage_of_missing_values(df)
    >>> print(result)
      column_name  percent_missing
    0           B             40.0
    1           A             20.0
    """
    null_share = df.isnull().sum() * 100 / len(df)

    report = pd.DataFrame({'column_name': df.columns,
                           'percent_missing': null_share.round(2)})
    report = report.sort_values('percent_missing', ascending=False)
    # reset_index moves the old (column-name) index into a helper column that is
    # dropped again by the final column selection.
    report = report.reset_index()

    return report[['column_name','percent_missing']]

In [None]:
def log_transform(df, features=None):
    """
    Apply a natural logarithm transformation to specified features in the DataFrame.

    The transformation is written back into ``df`` itself, so the caller's DataFrame is
    modified and the same object is returned.

    Args:
        df (pandas.DataFrame): Input DataFrame.
        features (str or iterable of str): Name or names of the feature(s) to be
            log-transformed. Any iterable of column names (list, tuple, Index) is accepted.

    Returns:
        pandas.DataFrame: The input DataFrame with the specified feature(s) log-transformed.

    Raises:
        KeyError: If ``features`` is left as None or names a column that is not in ``df``.

    Notes:
        ``numpy.log`` yields ``-inf`` for zeros and NaN for negative values; no clipping or
        offset is applied here.
    """
    # Imported locally so the function does not depend on a notebook-level `np` alias.
    import numpy as np

    if isinstance(features, str):
        features = [features]
    elif features is not None:
        # A tuple would otherwise be interpreted as a single column key.
        features = list(features)

    df[features] = np.log(df[features])
    return df

In [None]:
def standardize_column_names(df):
    """
    Standardizes the column names of a DataFrame by converting them to lowercase,
    replacing punctuation and whitespace with underscores, and removing
    leading/trailing underscores.

    Runs of punctuation/whitespace collapse into a single underscore. Non-string column
    labels are converted with ``str()`` first. The columns are renamed on ``df`` itself
    and the same object is returned.

    Args:
        df (pd.DataFrame): The DataFrame whose column names need to be standardized.

    Returns:
        pd.DataFrame: The DataFrame with standardized column names.

    Example:
        >>> df = pd.DataFrame({'First Name': [1, 2], ' Last-Name! ': [3, 4]})
        >>> df = standardize_column_names(df)
        >>> print(df.columns)
        Index(['first_name', 'last_name'], dtype='object')
    """
    # Every punctuation character (including '_') becomes a space, so that a plain
    # whitespace split both collapses separator runs and trims the ends.
    translator = str.maketrans(string.punctuation, ' ' * len(string.punctuation))

    df.columns = [
        '_'.join(str(col).lower().translate(translator).split())
        for col in df.columns
    ]
    return df

In [None]:
def get_null_counts(df):
    """
    Calculate the total count of missing (NaN and None) values in a DataFrame.

    Args:
        df (pd.DataFrame): The DataFrame for which missing values are counted.

    Returns:
        int: The total count of missing values in the DataFrame.

    Example:
        >>> import pandas as pd
        >>> data = {'A': [1, 2, None], 'B': [3, None, 5], 'C': [None, None, None]}
        >>> df = pd.DataFrame(data)
        >>> missing_count = get_null_counts(df)
        >>> print(missing_count)
        5
    """
    # `isna` and `isnull` are aliases in pandas; summing both would count every
    # missing cell twice, so a single pass is used.
    return int(df.isna().sum().sum())

In [None]:
def get_similar_value_cols(df, percent=90):
    """
    Get column names in a DataFrame where a majority of values are the same, excluding binary encoded columns.

    For each column the share of its most frequent non-null value is computed relative to the
    total number of rows. Columns with two or fewer distinct non-null values are skipped
    (this is what leaves binary encoded columns out), as are columns with no non-null values.

    Parameters:
    - df (pandas.DataFrame): The DataFrame to analyze.
    - percent (int, optional): The threshold percentage for considering a column as having similar values.
      Columns with more than 'percent' percentage of the same value will be included. Default is 90.

    Returns:
    - sim_val_cols (list): A list of column names with a majority of similar values.

    Side effects:
    - Prints the number of columns found.
    """
    sim_val_cols = []
    n_rows = len(df)
    for col in df.columns:
        # value_counts() is sorted descending, so position 0 holds the dominant value's share.
        percent_vals = (df[col].value_counts() / n_rows * 100).values
        # Check the number of distinct values first: it both excludes binary encoded columns
        # and guards the [0] lookup against columns that have no non-null values at all.
        if len(percent_vals) > 2 and percent_vals[0] > percent:
            sim_val_cols.append(col)
    print("Total columns with majority singular value shares: ", len(sim_val_cols))
    return sim_val_cols

In [None]:
def dataframe_to_lower(df):
    """
    Convert all string values in a DataFrame to lowercase while preserving non-string values.

    Parameters:
    df (pandas.DataFrame): The input DataFrame containing mixed data types.

    Returns:
    pandas.DataFrame: A new DataFrame with all string values converted to lowercase, while non-string values remain unchanged.
    """
    def _lower_if_str(value):
        return value.lower() if isinstance(value, str) else value

    # `DataFrame.applymap` is deprecated in favour of `DataFrame.map`; prefer the new
    # name when this pandas version provides it and fall back otherwise.
    if hasattr(pd.DataFrame, "map"):
        return df.map(_lower_if_str)
    return df.applymap(_lower_if_str)

In [None]:
def detect_outliers(df, columns):
    """
    Detect rows that are outliers in more than one column and remove them.

    For every column in ``columns`` a row is flagged when its value lies outside
    [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]. Missing values fail both comparisons and are
    therefore flagged as well. Only rows whose index label is flagged more than once
    across all checked columns are dropped; a row that is extreme in a single column
    is kept.

    Parameters:
    - df (pandas.DataFrame): The DataFrame to analyze and clean.
    - columns (list): A list of column names in the DataFrame to check for outliers.

    Returns:
    - df_clean (pandas.DataFrame): The DataFrame without the rows flagged in more than one column.

    Side effects:
    - Prints "Processed outliers" when done.

    Example Usage:
    cleaned_data = detect_outliers(data_df, ['column1', 'column2'])
    """
    from collections import Counter

    leave_cols = []  # Columns you may want to leave out
    flag_counts = Counter()  # index label -> number of times it was flagged

    # For each feature, find the data points with extreme high or low values
    for feature in columns:
        if feature in leave_cols:
            continue

        Q1 = df[feature].quantile(0.25)
        Q3 = df[feature].quantile(0.75)
        step = 1.5 * (Q3 - Q1)

        within_bounds = (df[feature] >= Q1 - step) & (df[feature] <= Q3 + step)
        flag_counts.update(df.index[(~within_bounds).to_numpy()])

    # A single counting pass replaces the quadratic `list.count` lookup per flagged row.
    dup_outliers = [label for label, hits in flag_counts.items() if hits > 1]
    df_clean = df.loc[~df.index.isin(dup_outliers)]
    print("Processed outliers")
    return df_clean

## Project Specific

In [1]:
def load_health_stats_data():
    """
    Read the patient, visit and health statistics CSV files and combine them into one DataFrame.

    Each file is loaded from the local ``./data`` directory and passed through its
    dtype-casting helper. Health statistics are inner-joined to patients on 'uid', and the
    visit rows are then concatenated underneath that joined frame.

    NOTE(review): the visits frame is stacked with ``pd.concat`` (row-wise) rather than
    joined on a key; confirm that this is the intended way to combine it.

    Returns:
    pandas.DataFrame: The health-statistics/patient join followed by the visit rows.
    """
    patients = pd.read_csv('./data/diagnosis_patients.csv')
    patients = cast_dtypes_patients(patients)

    visits = pd.read_csv('./data/diagnosis_visits.csv')
    visits = cast_dtypes_visits(visits)

    health_stats = pd.read_csv('./data/diagnosis_health_stats.csv')
    health_stats = cast_dtypes_health_stats(health_stats)

    stats_with_patients = pd.merge(health_stats, patients, how='inner', on=['uid'])
    return pd.concat([stats_with_patients, visits])

In [None]:
def plot_cross_val(n_splits: int,
                   splitter_func,
                   df: pd.DataFrame,
                   title_text: str) -> None:

    """Plot which sample indices land in the train and test part of each split.

    ``splitter_func`` is iterated directly and must yield
    ``(train_index, valid_index)`` pairs; splits are numbered from 1 in the
    order they are yielded. ``n_splits`` and ``df`` are accepted but not used
    by the body. The figure is displayed via ``fig.show()``.
    """

    records = []
    for split_no, (train_index, valid_index) in enumerate(splitter_func, start=1):
        records.append([train_index, 'Train', f'{split_no}'])
        records.append([valid_index, 'Test', f'{split_no}'])

    # One row per (sample index, dataset, split) after exploding the index arrays.
    plot_df = pd.DataFrame(records, columns=['Index', 'Dataset', 'Split'])
    plot_df = plot_df.explode('Index')

    series_styles = (('Train', "blue"), ('Test', "goldenrod"))

    fig = go.Figure()
    for _, split_rows in plot_df.groupby('Split'):
        for label, colour in series_styles:
            subset = split_rows[split_rows['Dataset'] == label]
            fig.add_trace(go.Scatter(x=subset['Index'],
                                     y=subset['Split'],
                                     name=label,
                                     line=dict(color=colour, width=10)
                                     ))

    fig.update_layout(template="simple_white", font=dict(size=20),
                      title_text=title_text, title_x=0.5, width=850,
                      height=450, xaxis_title='Index', yaxis_title='Split')

    # Keep a single legend entry per trace name; later traces with a name
    # that was already seen are hidden from the legend.
    seen_names = set()

    def _dedupe_legend(trace):
        if trace.name in seen_names:
            trace.update(showlegend=False)
        else:
            seen_names.add(trace.name)

    fig.for_each_trace(_dedupe_legend)

    return fig.show()

In [2]:
def train_test_split_by_year(df, year):
    """
    Partition a DataFrame into training and testing rows using its 'visit_date_year' column.

    Rows from years after ``year`` end up in neither set.

    Parameters:
    df (pandas.DataFrame): The input DataFrame; must expose a 'visit_date_year' column.
    year (int): The cut-off year.

    Returns:
    X_train (pandas.DataFrame): Rows whose 'visit_date_year' is strictly before ``year``.
    X_test (pandas.DataFrame): Rows whose 'visit_date_year' equals ``year``.
    y_train (None): Placeholder; no target is extracted here.
    y_test (None): Placeholder; no target is extracted here.
    """
    visit_years = df.visit_date_year
    is_before = visit_years < year
    is_same = visit_years == year
    return df[is_before], df[is_same], None, None

In [None]:
def bi_results(health_stats_df, X_test, y_test, best_classifier):
    """
    Prepare results for visualizing health statistics and medications and performing
    categorical statistics analysis in a Business Intelligence (BI) tool.

    This function processes health statistics data, test data, model predictions, and
    probabilities generated by the best classifier to provide structured data for
    visualizing health statistics and medications and for performing categorical
    statistics analysis in a BI tool.

    Besides its parameters, the body reads several names that are not passed in:
    `pipeline`, `visit_cols`, `health_stats_num_cols`, `np` and `SimpleImputer`.
    They must already be defined in the surrounding (notebook) scope.

    Parameters:
    - health_stats_df (DataFrame): Unprocessed health statistics data.
    - X_test (DataFrame): Test data with health statistics features.
    - y_test (Series): True labels for the test data.
    - best_classifier (dict): A dictionary containing the best classifier with
      'model_fit', 'predictions', and 'prob_predictions' keys.

    Returns:
    - health_stats_results_df (DataFrame): Results for visualizing health statistics,
      including predictions, probabilities, and risk classification.
    - health_stats_results_meds_df (DataFrame): Results for analyzing medications,
      including medication descriptions.
    - stats_risk_imp_df (DataFrame): Results for categorical statistics analysis,
      imputed for both low and high-risk patient categories.

    Example:
    health_stats_data = load_health_statistics_data()
    X_test, y_test, best_classifier = load_test_data_and_classifier()
    health_results, meds_results, stats_results = bi_results(health_stats_data, X_test, y_test, best_classifier)
    (the two loader names above are illustrative; they are not defined in the visible code)

    Notes:
    - The 'health_stats_results_df' is structured for visualizing health
      statistics and risk classification.
    - The 'health_stats_results_meds_df' provides medication information for analysis.
    - The 'stats_risk_imp_df' is suitable for analyzing categorical statistics
      with imputed values for both low and high-risk patients.
    - Medication descriptions are read from 'data/health_stats_results_meds_desc_l.csv'.
    """
    
    # Provide results for all datapoints other than medications and categorical values
    # NOTE(review): `pipeline` comes from the enclosing scope, and `fit_transform` re-fits the
    # 'Preprocessor' step on the test data - confirm that `transform` is not what is intended.
    X_test_pre, X_test_s, y_test_s, groups_test_s, selected_train_cols = pipeline.named_steps['Preprocessor'].fit_transform(X_test, y_test)
    # NOTE(review): `classes` is assigned but never used below.
    classes = best_classifier['model_fit'].best_estimator_.classes_
    predicted = best_classifier['predictions']
    prob_predicted = best_classifier['prob_predictions']
    predicted_df = pd.DataFrame(predicted, columns = ['predicted'])
    # Two probability columns are expected; presumably column 0 is "not hospitalized" and
    # column 1 is "hospitalized" - verify against the classifier's class order.
    prob_predicted_df = pd.DataFrame(prob_predicted, columns = ['no_hosp_prob', 'hosp_prob'])
    # The joins align on index; assumes X_test_pre carries the same 0..n-1 index as the
    # freshly built prediction frames - TODO confirm.
    health_stats_results_df = X_test_pre.join(predicted_df).join(prob_predicted_df)
    # Rows in cluster 0 or 1 are labelled high risk, every other cluster low risk.
    health_stats_results_df['risk'] = np.where(health_stats_results_df.cluster.isin([0,1]), 'high_risk', 'low_risk')
    # `visit_cols` and `health_stats_num_cols` are column-name lists defined outside this function.
    health_stats_results_df = health_stats_results_df[visit_cols+health_stats_num_cols+['risk','no_hosp_prob', 'hosp_prob','hosp_per_visit', 'predicted']]
    health_stats_results_df.age = health_stats_results_df.age.astype(int)
    # Round the vitals to one decimal for presentation.
    health_stats_results_df[['bp_sys', 'bp_dia', 'weight_lbs', 'temp_f', 'pulse',
           'height_val', 'resp_val']] = health_stats_results_df[['bp_sys', 'bp_dia', 'weight_lbs', 'temp_f', 'pulse',
           'height_val', 'resp_val']].apply(lambda x: x.round(1))
    
    # Provide results for analysis of medications
    # Restrict the raw health statistics to the patients present in the results.
    uids_results = health_stats_results_df.uid.unique().tolist()
    health_stats_in_results_df = health_stats_df[health_stats_df.uid.isin(uids_results)]
    health_stats_results_meds_df = health_stats_in_results_df[['uid', 'med_profile_medication']].drop_duplicates()
    health_stats_results_meds_df = health_stats_results_meds_df[~health_stats_results_meds_df.med_profile_medication.isna()]
    health_stats_results_meds_df = dataframe_to_lower(health_stats_results_meds_df)
    med_descriptions_df = pd.read_csv('data/health_stats_results_meds_desc_l.csv')
    # The merge uses the default join type, so medications without a matching 'medication'
    # row in the descriptions file do not appear in the output.
    health_stats_results_meds_df = health_stats_results_meds_df.merge(med_descriptions_df, left_on='med_profile_medication', right_on='medication')[['uid', 'med_profile_medication','description']]
    health_stats_results_meds_df.description = health_stats_results_meds_df.description.fillna('no description available')
    
    # Provide results for categorical statistics based patient risk category
    # Only object-dtype columns are kept (with 'med_profile_medication' removed first).
    health_stats_results_stats_df = health_stats_in_results_df.drop('med_profile_medication', axis=1).select_dtypes(object).drop_duplicates()
    health_stats_results_stats_df = dataframe_to_lower(health_stats_results_stats_df)
    stats_risk_df = health_stats_results_stats_df[health_stats_results_stats_df.uid.isin(uids_results)].merge(health_stats_results_df[['uid','risk']], on='uid', how='left')
    stats_risk_lr_df = stats_risk_df[stats_risk_df.risk == 'low_risk']
    stats_risk_hr_df = stats_risk_df[stats_risk_df.risk == 'high_risk']
    
    # Create a SimpleImputer for low risk data
    # NOTE(review): `SimpleImputer` is not among the imports visible at the top of the
    # notebook - confirm it is imported before this function is called.
    low_risk_imputer = SimpleImputer(strategy='most_frequent')
    low_risk_imputer.fit(stats_risk_lr_df)

    # Create a SimpleImputer for high risk data
    high_risk_imputer = SimpleImputer(strategy='most_frequent')
    high_risk_imputer.fit(stats_risk_hr_df)

    # Impute missing values in the low risk data
    imputed_low_risk_data = low_risk_imputer.transform(stats_risk_lr_df)

    # Impute missing values in the high risk data
    imputed_high_risk_data = high_risk_imputer.transform(stats_risk_hr_df)

    # Rebuild DataFrames from the imputed arrays (each risk group is imputed with its own
    # most-frequent values), de-duplicate, and stack low risk above high risk.
    imputed_low_risk_data_df = pd.DataFrame(imputed_low_risk_data, columns=stats_risk_lr_df.columns.to_list()).drop_duplicates()
    imputed_high_risk_data_df = pd.DataFrame(imputed_high_risk_data, columns=stats_risk_hr_df.columns.to_list()).drop_duplicates()
    stats_risk_imp_df = pd.concat([imputed_low_risk_data_df, imputed_high_risk_data_df])
    
    return health_stats_results_df, health_stats_results_meds_df, stats_risk_imp_df

In [None]:
def append_new_columns(df, new_columns):
    """
    Add zero-filled columns so a DataFrame exposes every expected column name.

    Intended for keeping training and prediction datasets consistent. The columns are
    added to ``df`` itself, and that same object is returned.

    Parameters:
    -----------
    df : pd.DataFrame
        The DataFrame that receives the missing columns.

    new_columns : list
        Column names that should exist on the DataFrame.

    Returns:
    --------
    pd.DataFrame
        The input DataFrame; names that were absent now exist and hold 0, names that
        were already present are left untouched.

    Example:
    --------
    >>> import pandas as pd
    >>> data = pd.DataFrame({'feature1': [1, 2, 3], 'feature2': [4, 5, 6]})
    >>> new_columns = ['feature1', 'feature3']
    >>> result = append_new_columns(data, new_columns)
    >>> print(result)
       feature1  feature2  feature3
    0         1         4         0
    1         2         5         0
    2         3         6         0
    """
    absent = [name for name in new_columns if name not in df.columns]
    for name in absent:
        df[name] = 0
    return df

In [None]:
def split_data_by_groups(df):
    """
    Split the input DataFrame into feature matrix (X), target variable (y), and groups for cross-validation.

    Rows are ordered by year, month and week date. Every distinct (year, month) pair
    receives a consecutive integer group id starting at 0, in chronological order.
    The input DataFrame is not modified.

    Parameters:
    df (DataFrame): The input DataFrame with the data to be split. It must contain
        'uid', 'hospitalized', 'visit_date_year', 'visit_date_month',
        'visit_date_week_dt', 'visit_date_week_no', 'ttl_visits', 'ttl_hosp_week'
        and 'ttl_hosp_count'.

    Returns:
    tuple: ``(X, y, groups)`` where
        - X (DataFrame): the remaining feature columns, indexed by group id;
        - y (Series): 'hospitalized' cast to int, with a 0..n-1 index in sorted row order;
        - groups (numpy.ndarray): the group id of every row, aligned with X and y.

    Raises:
    KeyError: If any required columns are missing in the input DataFrame.
    """
    period_cols = ['visit_date_year', 'visit_date_month']
    try:
        # Drop unnecessary columns
        df = df.drop(columns=['visit_date_week_no', 'ttl_visits', 'ttl_hosp_week', 'ttl_hosp_count'])

        # Sort chronologically and discard the old index so X, y and groups share one row order
        df = df.sort_values(by=period_cols + ['visit_date_week_dt']).reset_index(drop=True)

        # Number the (year, month) pairs in order of appearance. Doing this directly avoids
        # relying on helper column names created by resetting the index, which breaks when
        # the incoming index is named.
        groups = df.groupby(period_cols, sort=False, dropna=False).ngroup().to_numpy()

        # Extract features, target, and groups
        X = df.drop(columns=['uid', 'hospitalized', 'visit_date_week_dt'] + period_cols)
        y = df['hospitalized'].astype(int)
        X = X.set_index(groups)

        return X, y, groups
    except KeyError as e:
        raise KeyError(f"Required columns are missing in the input DataFrame: {e}") from e

In [None]:
def split_by_year_month_groups(df):
    """
    Split the input DataFrame into feature matrix (X), target variable (y), and groups for cross-validation.

    This is an alias of `split_data_by_groups`: both functions perform the same
    year/month grouping, so the work is delegated instead of being duplicated.

    Parameters:
    df (DataFrame): The input DataFrame with the data to be split.

    Returns:
    tuple: ``(X, y, groups)`` - the feature DataFrame indexed by group id, the integer
        'hospitalized' target, and the array of per-row group ids.

    Raises:
    KeyError: If any required columns are missing in the input DataFrame.
    """
    return split_data_by_groups(df)

In [None]:
def cast_dtypes_health_stats(df):
    """
    Cast the vitals and visit-date columns of a health statistics DataFrame to their working dtypes.

    The following conversions are applied, in this order:

    1. 'bp_sys', 'bp_dia', 'weight_lbs', 'temp_f', 'pulse', 'height_val' and 'resp_val' -> float.
    2. 'visit_date_year', 'visit_date_month' and 'visit_date_week_no' -> int.
    3. 'visit_date_week_dt' and 'visit_date' -> datetime.

    No rows or columns are added or removed. The conversions are written back into the
    DataFrame that was passed in.

    Parameters:
    -----------
    df : pandas DataFrame
        The input DataFrame containing health statistics data.

    Returns:
    --------
    preprocessed_df : pandas DataFrame
        The same DataFrame object with the columns above converted.

    Raises:
    -------
    KeyError
        If one of the expected columns is missing.
    """
    vital_cols = ['bp_sys', 'bp_dia', 'weight_lbs', 'temp_f', 'pulse', 'height_val', 'resp_val']
    date_part_cols = ['visit_date_year', 'visit_date_month', 'visit_date_week_no']
    datetime_cols = ['visit_date_week_dt', 'visit_date']

    # Cast the vitals to float
    df[vital_cols] = df[vital_cols].astype('float')

    # Bracket indexing is used throughout: it always addresses a column, whereas
    # dotted access can be shadowed by DataFrame attributes.
    for col in date_part_cols:
        df[col] = df[col].astype(int)
    for col in datetime_cols:
        df[col] = pd.to_datetime(df[col])

    return df

In [None]:
def cast_dtypes_patients(df):
    """
    Cast the columns of a patient DataFrame to their working dtypes:

    1. Convert the 'age' column to integer data type.
    2. Convert the 'uid' and 'gender' columns to string data type.

    No rows are removed or filtered. The conversions are written back into the
    DataFrame that was passed in.

    Parameters:
    -----------
    df : pandas DataFrame
        The input DataFrame containing patient data.

    Returns:
    --------
    preprocessed_df : pandas DataFrame
        The same DataFrame object with 'age' converted to integer and 'uid' and
        'gender' converted to string.

    Raises:
    -------
    KeyError
        If 'age', 'uid' or 'gender' is missing.
    """
    # Convert 'age' to integer
    df['age'] = df['age'].astype(int)

    # Convert 'uid' and 'gender' to string
    text_cols = ['uid', 'gender']
    df[text_cols] = df[text_cols].astype(str)

    return df

In [None]:
def cast_dtypes_visits(df):
    """
    Preprocess a DataFrame containing visit data by performing several operations:

    1. Remove duplicate rows from the DataFrame.
    2. Convert the 'visit_date_year', 'visit_date_month', and 'visit_date_week_no' columns to integers.
    3. Convert the 'visit_date_week_dt' column to a datetime field.
    4. Convert 'ttl_visits', 'ttl_hosp_week', 'ttl_hosp_count', and 'hospitalized' columns to integers.

    All changes, including the duplicate removal, are applied to the DataFrame that was
    passed in.

    Parameters:
    -----------
    df : pandas DataFrame
        The input DataFrame containing visit data.

    Returns:
    --------
    preprocessed_df : pandas DataFrame
        The same DataFrame object with duplicate rows removed, integer date-part columns,
        a datetime week column, and integer conversion for the count/flag columns.

    Raises:
    -------
    KeyError
        If one of the expected columns is missing.
    """
    date_part_cols = ['visit_date_year', 'visit_date_month', 'visit_date_week_no']
    count_cols = ['ttl_visits', 'ttl_hosp_week', 'ttl_hosp_count', 'hospitalized']

    # Remove duplicate rows
    df.drop_duplicates(inplace=True)

    # Convert year, month, and week_no to integers
    for col in date_part_cols:
        df[col] = df[col].astype(int)

    # Convert 'visit_date_week_dt' to datetime
    df['visit_date_week_dt'] = pd.to_datetime(df['visit_date_week_dt'])

    # Convert the count and flag columns to integers
    df[count_cols] = df[count_cols].astype(int)

    return df

In [None]:
def summarize_patient_health_stats(df):
    """
    Summarize patient health-related statistics and hospitalization information for each patient.

    For every 'uid' this function computes the mean of age, systolic and diastolic blood
    pressure, weight, temperature, pulse rate, height and respiration rate, plus the ratio
    of hospitalizations to visits. The ratio uses the per-patient maximum of
    'ttl_hosp_count' and 'ttl_visits'; when it is undefined or infinite (zero or missing
    visits) it is reported as 0.

    Parameters:
    - df (pandas.DataFrame): The DataFrame containing patient health data, including 'uid', 'age',
      'bp_sys' (systolic blood pressure), 'bp_dia' (diastolic blood pressure), 'weight_lbs', 'temp_f'
      (temperature in Fahrenheit), 'pulse' (pulse rate), 'height_val' (height value), 'resp_val' (respiration value),
      'ttl_hosp_count' (total hospitalizations), and 'ttl_visits' (total visits).

    Returns:
    - df_patient_stats (pandas.DataFrame): One row per patient with 'uid', the mean values of the
      health metrics above, and 'hosp_per_visit' (hospitalizations per visit).

    Example Usage:
    df_patient_stats = summarize_patient_health_stats(patient_data_df)
    """
    # Imported locally so the function does not depend on a notebook-level `np` alias.
    import numpy as np

    metric_cols = ['age', 'bp_sys', 'bp_dia', 'weight_lbs',
                   'temp_f', 'pulse', 'height_val', 'resp_val']
    df_mean_health_stats = df[['uid'] + metric_cols].groupby('uid').mean().reset_index()

    df_hosp_count = df[['uid', 'ttl_hosp_count', 'ttl_visits']].groupby('uid').max().reset_index()
    ratio = df_hosp_count['ttl_hosp_count'] / df_hosp_count['ttl_visits']
    # x / 0 yields +/-inf and 0 / 0 yields NaN; both are reported as 0. The clean-up is
    # done once on the Series instead of through chained in-place replaces.
    df_hosp_count['hosp_per_visit'] = ratio.replace([np.inf, -np.inf], 0).fillna(0)

    df_patient_stats = df_mean_health_stats.merge(df_hosp_count[['uid', 'hosp_per_visit']], on='uid')
    return df_patient_stats