In [1]:
import pandas as pd
import matplotlib as plt
import seaborn as sns

In [459]:
import os

modules_dir = "./modules"
os.makedirs(modules_dir, exist_ok=True)

In [2]:
%%writefile {modules_dir}/data_prep.py
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
import logging
import os

# Configure logging
logging.basicConfig(level=logging.INFO)
log = logging.getLogger(__name__)


import matplotlib.pyplot as plt
import seaborn as sns

# Directory to save the plots
plot_save_dir = 'plots'

def create_directory_if_not_exists(directory):
    if not os.path.exists(directory):
        os.makedirs(directory)

def check_missing_values(df, artifact_save_dir='artefacts'):
    """
    Check and visualize missing values in a DataFra: str=me.

    Parameters:
        df (pd.DataFrame): The DataFrame to check for missing values.
        artifact_save_dir (str): Directory to save the heatmap plot and other artifacts.

    Returns:
        None
    """
    log = logging.getLogger(__name__)

    # Check for missing values and compute the count of missing values in each column
    missing_values = df.isnull().sum()

    # Plot a heatmap of missing values
    plt.figure(figsize=(12, 6))
    sns.heatmap(df.isnull(), cbar=False, cmap='viridis')
    plt.title('Missing Values Heatmap')
    plt.xlabel('Columns')
    plt.ylabel('Rows')

    # List the number of missing values in each column
    log.info("Number of missing values in each column:")
    for column, count in missing_values.items():
        if count > 0:
            log.info(f"Column '{column}' had {count} missing values.")

    # Create the artifact_save_dir directory if it doesn't exist
    if not os.path.exists(artifact_save_dir):
        os.makedirs(artifact_save_dir)

    # Save the heatmap plot as an image in the specified directory
    plot_name = 'missing_values_heatmap'
    plot_save_path = os.path.join(artifact_save_dir, f"{plot_name}.png")
    plt.savefig(plot_save_path)
    log.info(f'{plot_name} saved at: {plot_save_path}')

    plt.show()

    

def replace_missing_values(df, ms_threshold: int, artifact_save_dir='artefacts'):
    """
    Replace missing values in a DataFrame using interpolation and iterative imputation.

    Parameters:
        df (pd.DataFrame): The DataFrame containing missing values.
        ms_threshold (int): Threshold to switch between interpolation and iterative imputer.
        artifact_save_dir (str, optional): Directory to save artifacts (e.g., logs) (default: None).

    Returns:
        pd.DataFrame: DataFrame with missing values replaced.
    """
    # Create a logger
    log = logging.getLogger(__name__)

    # If an artifact_save_dir is specified, configure the logger to save logs to that directory
    if artifact_save_dir:
        log_filename = 'replace_missing_values.log'
        log_filepath = os.path.join(artifact_save_dir, log_filename)

        # Configure the logger
        logging.basicConfig(filename=log_filepath, level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

    # Threshold to switch between interpolation and iterative imputer
    interpolation_threshold = ms_threshold

    # Count the missing values in each column
    missing_values = df.isnull().sum()

    # List to store column names that need imputation
    columns_to_impute = []

    # Identify columns where the gap between missing values is less than the threshold
    for column, count in missing_values.items():
        if count > 0:
            indices = df[column].index[df[column].isnull()]
            differences = np.diff(indices)
            if all(diff <= interpolation_threshold for diff in differences):
                columns_to_impute.append(column)

    # Separate columns for interpolation and iterative imputer
    columns_to_interpolate = [col for col in columns_to_impute if col not in columns_to_impute]
    columns_to_iterative_impute = [col for col in columns_to_impute if col in columns_to_impute]

    # Replace missing values with interpolation
    if len(columns_to_interpolate) > 0:
        imputer = SimpleImputer(strategy='nearest')
        df[columns_to_interpolate] = imputer.fit_transform(df[columns_to_interpolate])
        for column in columns_to_interpolate:
            log.info(f"Imputed '{column}' using 'nearest' strategy.")

    # Replace missing values with iterative imputer
    if len(columns_to_iterative_impute) > 0:
        imputer = IterativeImputer()
        df[columns_to_iterative_impute] = imputer.fit_transform(df[columns_to_iterative_impute])
        for column in columns_to_iterative_impute:
            log.info(f"Imputed '{column}' using 'iterative' strategy.")

    return df

    
def drop_highly_correlated_features(df, corr_threshold=0.8, plot_heatmaps=True, artifact_save_dir='artefacts'):
    """
    Perform feature selection based on Spearman correlation coefficient.

    Parameters:
    - df: pandas DataFrame containing the dataset.
    - corr_threshold: The threshold for correlation above which features will be dropped (default is 0.8).
    - plot_heatmaps: Whether to plot heatmaps before and after dropping (default is True).
    - artifact_save_dir: Directory to save the correlation heatmap plots (default is None).

    Returns:
    - A DataFrame with the highly correlated features dropped.
    """
    # Create a logger
    log = logging.getLogger(__name__)

    if artifact_save_dir and not os.path.exists(artifact_save_dir):
        os.makedirs(artifact_save_dir)
    
    # Calculate the correlation matrix (Spearman by default in pandas)
    corr_matrix = df.corr(method='spearman')
    
    if plot_heatmaps:
        # Plot the correlation heatmap before dropping
        fig_before = plt.figure(figsize=(8, 6))
        plt.title("Correlation Heatmap (Before Dropping)")
        sns_plot_before = sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f")
        
        # Save the plot as an image file
        if artifact_save_dir:
            plt.savefig(os.path.join(artifact_save_dir, "correlation_heatmap_before.png"))
            log.info("Correlation heatmap (Before Dropping): %s", os.path.join(artifact_save_dir, "correlation_heatmap_before.png"))
        
        plt.show()
    
    # Create a set to store the columns to drop
    columns_to_drop = set()
    
    # Create a list to store the names of the dropped columns
    dropped_columns = []
    
    # Iterate through the columns and identify highly correlated features
    for col1 in corr_matrix.columns:
        for col2 in corr_matrix.columns:
            if col1 != col2 and abs(corr_matrix.loc[col1, col2]) >= corr_threshold:
                # Check if col1 or col2 should be dropped based on their mean correlation
                mean_corr_col1 = corr_matrix.loc[col1, :].drop(col1).abs().mean()
                mean_corr_col2 = corr_matrix.loc[col2, :].drop(col2).abs().mean()
                
                if mean_corr_col1 > mean_corr_col2:
                    columns_to_drop.add(col1)
                    dropped_columns.append(col1)
                else:
                    columns_to_drop.add(col2)
                    dropped_columns.append(col2)
    
    # Drop the highly correlated features from the DataFrame
    df = df.drop(columns=columns_to_drop)
    
    if plot_heatmaps:
        # Calculate the correlation matrix after dropping
        corr_matrix_after_drop = df.corr(method='spearman')
        
        # Plot the correlation heatmap after dropping
        fig_after = plt.figure(figsize=(8, 6))
        plt.title("Correlation Heatmap (After Dropping)")
        sns_plot_after = sns.heatmap(corr_matrix_after_drop, annot=True, cmap='coolwarm', fmt=".2f")
        
        # Save the plot as an image file
        if artifact_save_dir:
            plt.savefig(os.path.join(artifact_save_dir, "correlation_heatmap_after.png"))
            log.info("Correlation heatmap (After Dropping): %s", os.path.join(artifact_save_dir, "correlation_heatmap_after.png"))
        
        plt.show()
    
    # Log the names of the dropped columns
    log.info("Dropped columns: %s", dropped_columns)

    return df

def drop_columns(data, columns_to_drop):
    """
    Drop selected columns from a DataFrame.

    Parameters:
    - data: pandas DataFrame containing the dataset.
    - columns_to_drop: Single column name or a list of column names to be dropped.

    Returns:
    - A DataFrame with the specified columns dropped.
    """
    if isinstance(columns_to_drop, str):
        # If a single column name is provided, convert it to a list
        columns_to_drop = [columns_to_drop]

    # Drop the specified columns from the DataFrame
    df = data.drop(columns=columns_to_drop, errors='ignore')
    
    # Log the names of the dropped columns
    log.info("bad_columns_dropped are/is: %s", columns_to_drop)
    
    return df

def drop_high_cardinality_features(df, max_unique_threshold=0.9):
    """
    Drop high cardinality features (columns) from a DataFrame based on a threshold.

    Parameters:
        df (pd.DataFrame): The input DataFrame.
        max_unique_threshold (float): The maximum allowed fraction of unique values in a column (default is 0.9).

    Returns:
        pd.DataFrame: The DataFrame with high cardinality columns dropped.
    """
    if df is None:
        raise ValueError("Input DataFrame 'df' cannot be None.")
        
    # Calculate the maximum number of allowed unique values for each column
    max_unique_values = len(df) * max_unique_threshold
    
    # Identify and drop columns with unique values exceeding the threshold
    high_cardinality_columns = [col for col in df.columns if df[col].nunique() > max_unique_values]
    
    # Log the names of the dropped columns
    if high_cardinality_columns:
        log.info(f"Dropped high cardinality columns: {', '.join(high_cardinality_columns)}")
    
    df_dropped = df.drop(columns=high_cardinality_columns)
    
    return df_dropped

def select_categorical_columns(data):
    """
    Select categorical columns from a DataFrame.

    Parameters:
    - data: pandas DataFrame containing the dataset.

    Returns:
    - A list of column names that are categorical.
    """
    categorical_columns = data.select_dtypes(include=['object', 'category']).columns.tolist()
    return categorical_columns


if __name__ == "__main__":
    main()

Writing ./modules/data_prep.py


In [493]:
import sys
sys.path.insert(0, "./src/modules")
import data_prep_functions as dp

In [27]:
!pip install scikit-optimize



In [460]:
%%writefile {modules_dir}/__init__.py



Writing ./modules/__init__.py


In [49]:
%%writefile {modules_dir}/tune_train_test.py
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, TimeSeriesSplit
from sklearn.metrics import accuracy_score
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import logging
import os
import json
import joblib

# Configure logging
logging.basicConfig(level=logging.INFO)
log = logging.getLogger(__name__)



def custom_train_test_split(data, target_column, test_size=0.2, random_state=101, time_series=False):
    """
    Split the dataset into training and testing sets.

    Parameters:
    - data: pandas DataFrame containing the dataset.
    - target_column: Name of the target column.
    - test_size: Proportion of the dataset to include in the test split (default is 0.2).
    - random_state: Seed for random number generation (optional).
    - time_series: Set to True if the data is time series data (default is False).

    Returns:
    - X_train, X_test, y_train, y_test: The split datasets.
    """
    if time_series:
        # For time series data, split by a specific time point
        data = data.sort_index()  # Sort by time index if not already sorted
        n = len(data)
        split_index = int((1 - test_size) * n)
        X_train, X_test = data.iloc[:split_index, :-1], data.iloc[split_index:, :-1]
        y_train, y_test = data.iloc[:split_index][target_column], data.iloc[split_index:][target_column]
    else:
        # For regular (cross-sectional) data, use train_test_split
        X = data.drop(columns=[target_column])
        y = data[target_column]
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

    return X_train, X_test, y_train, y_test



def hyperparameter_tuning(X_train, y_train, model_prefix:str, param_grid=None, random_search=False, bayesian_search=False, n_iter=10, random_seed=101):
    """
    Train a Histogram Gradient Boosting Classifier and tune its hyperparameters.

    Parameters:
    - X_train, y_train: Training data and labels.
    - X_test, y_test: Testing data and labels.
    - model_prefix: Prefix for model artifacts.
    - param_grid: Hyperparameter grid to search (default is None).
    - random_search: Whether to use random search instead of grid search (default is False).
    - bayesian_search: Whether to use Bayesian hyperparameter search (default is False).
    - n_iter: Number of parameter settings that are sampled (only for random_search or bayesian_search).

    Returns:
    - Trained model, best hyperparameters, and test accuracy.
    """
    # Identify categorical columns
    categorical_features = list(X_train.select_dtypes(include=['category', 'object']).columns)
    # Create a ColumnTransformer to apply one-hot encoding to categorical columns
    preprocessor = ColumnTransformer(
        transformers=[
            ('cat', OneHotEncoder(), categorical_features)
        ],
        remainder='passthrough'  # Keep non-categorical columns as-is
    )
    # Create a Histogram Gradient Boosting Classifier
    clf = HistGradientBoostingClassifier(random_state=42)
    
    # Combine preprocessing and classifier into a single pipeline
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', clf)
    ])

    if not bayesian_search:
        # Define hyperparameters for grid search or random search
        hyperparameters = {
            'classifier__max_iter': [100, 200, 300],  # Adjust the values as needed
            'classifier__learning_rate': [0.001, 0.01, 0.1],  # Adjust the values as needed
            'classifier__max_depth': [3, 4, 5],  # Adjust the values as needed
            'classifier__l2_regularization': [0.0, 0.1, 0.2]  # Adjust the values as needed
        }

        if random_search:
            # Use RandomizedSearchCV
            search = RandomizedSearchCV(pipeline, param_distributions=hyperparameters, n_iter=n_iter, scoring='accuracy', n_jobs=-1, random_state=random_seed)
        else:
            # Use GridSearchCV
            search = GridSearchCV(pipeline, param_grid=hyperparameters, scoring='accuracy', n_jobs=-1, random_state=random_seed)
    else:
        # Use Bayesian hyperparameter search with BayesSearchCV
        param_grid = {
            'classifier__max_iter': (100, 300),
            'classifier__learning_rate': (0.001, 0.1),
            'classifier__max_depth': (3, 5),
            'classifier__l2_regularization': (0.0, 0.2)
        }

        search = BayesSearchCV(pipeline, param_grid, n_iter=n_iter, cv=TimeSeriesSplit(n_splits=3), scoring='accuracy', n_jobs=-1, random_state=random_seed)

    # Fit the search to the training data
    search.fit(X_train, y_train)

    # Get the best hyperparameters and the best estimator (trained model)
    best_params = search.best_params_
    best_estimator = search.best_estimator_
    
    log.info('Parameters chosen are:')
    log.info(best_params)
    
    log.info('The best estimator is:')
    log.info(best_estimator)
    
    # Evaluate the best model on the test data
   # y_pred = best_estimator.predict(X_test)
   # test_accuracy = accuracy_score(y_test, y_pred)
   # log.info(f'Test Accuracy: {test_accuracy:.2f}')
    
    # Save the best model to a file
    model_filename = f'{model_prefix}_best_model.joblib'
    joblib.dump(best_estimator, model_filename)
    
    # Save best hyperparameters to a JSON file
    hyperparameters_filename = f'{model_prefix}_hyperparameters.json'
    log.info(f'Saving best hyperparameters for {model_prefix} as {hyperparameters_filename}')
    with open(hyperparameters_filename, 'w') as f:
        json.dump(best_params, f)
        
    return best_params, hyperparameters_filename

def train_model(X_train, y_train, model_name:str, hyperparam: dict=None, hyperparam_filename: str=None):
    # Identify categorical columns
    categorical_features = list(X_train.select_dtypes(include=['category', 'object']).columns)
    # Create a ColumnTransformer to apply one-hot encoding to categorical columns
    preprocessor = ColumnTransformer(
        transformers=[
            ('cat', OneHotEncoder(), categorical_features)
        ],
        remainder='passthrough'  # Keep non-categorical columns as-is
    )
    X_train_transformed = preprocessor.fit_transform(X_train)
    if hyperparam_filename is not None:
        log.info(f'Loading in hyperparameters: {hyperparam_filename}')
        with open(hyperparam_filename, 'r') as f:
            best_params = json.load(f)
    elif hyperparam is not None:
        best_params = hyperparam
    else:
        raise ValueError('Either hyperparam or hyperparam_filename must be assigned')
    
    # Create and train the model with the specified hyperparameters
    log.info('Training Model')
    trained_model = HistGradientBoostingClassifier(class_weight='balanced',
        max_iter=best_params['classifier__max_iter'],
        learning_rate=best_params['classifier__learning_rate'],
        max_depth=best_params['classifier__max_depth'],
        l2_regularization=best_params['classifier__l2_regularization'],
        random_state=10
    )
    trained_model.fit(X_train_transformed, y_train)
    
    # Save the trained model to a file
    log.info(f'Saving {model_name}')
    joblib.dump(trained_model, model_name)
    
    return trained_model

def predict_model(trained_model, X_test, inference_col_name):
    """
    Predict using a trained machine learning model.

    Parameters:
    - trained_model: The trained machine learning model.
    - X_test: The test dataset on which to make predictions.
    - inference_col_name: The name of the column to store predictions in the inference DataFrame.

    Returns:
    - inference_df: The DataFrame containing predictions.
    - inference_col_name: The name of the column where predictions are stored.
    - predictions: The predictions made by the model.
    """
    from sklearn.compose import ColumnTransformer
    from sklearn.preprocessing import OneHotEncoder
    import pandas as pd
    
    # Identify categorical columns
    categorical_features = list(X_test.select_dtypes(include=['category', 'object']).columns)
    
    # Create a ColumnTransformer to apply one-hot encoding to categorical columns
    preprocessor = ColumnTransformer(
        transformers=[
            ('cat', OneHotEncoder(), categorical_features)
        ],
        remainder='passthrough'  # Keep non-categorical columns as-is
    )
    
    # Fit and transform the test data
    X_test_transformed = preprocessor.fit_transform(X_test)
    
    # Get the one-hot encoded feature names
    ohe = preprocessor.named_transformers_['cat']
    cat_feature_names = list(ohe.get_feature_names_out(input_features=categorical_features))
    
    # Combine the one-hot encoded feature names and non-categorical column names
    all_column_names = cat_feature_names + list(X_test.select_dtypes(exclude=['category', 'object']).columns)
    
    # Convert X_test_transformed to a DataFrame with appropriate column names
    inference_df = pd.DataFrame(X_test_transformed, columns=all_column_names)
    
    # Make predictions using the trained model
    predictions = trained_model.predict(X_test_transformed)
    
    # Add predictions to the DataFrame with the specified column name
    inference_df[inference_col_name] = predictions
    
    return inference_df, inference_col_name, predictions


if __name__ == "__main__":
    main()

Overwriting ./modules/tune_train_test.py


In [10]:
%%writefile {modules_dir}/evaluation.py
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import shap
from sklearn.metrics import confusion_matrix
from sklearn.utils.multiclass import unique_labels
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
import logging
import os
import json
import joblib

# Configure logging
logging.basicConfig(level=logging.INFO)
log = logging.getLogger(__name__)

# Directory to save the plots
plot_save_dir = 'plots'

def create_directory_if_not_exists(directory):
    if not os.path.exists(directory):
        os.makedirs(directory)

import os
import logging
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
import seaborn as sns

def plot_confusion_matrix(y_true, y_pred, classes,
                          model_name: str,
                          normalize=False,
                          title=None,
                          cmap=plt.cm.Blues,
                          artifact_save_dir: str):
    """
    This function plots the confusion matrix.

    :param y_true: True labels of the data.
    :param y_pred: Predicted labels of the data.
    :param classes: List of class labels (e.g., ['Class 0', 'Class 1']).
    :param model_name: Name of the model for plot naming.
    :param normalize: If True, normalize the confusion matrix.
    :param title: Title of the plot.
    :param cmap: Colormap for the plot.
    :param artifact_save_dir: Directory where artifacts including plots will be saved.
    """
    if not title:
        if normalize:
            title = 'Normalized Confusion Matrix'
        else:
            title = 'Confusion Matrix'

    # Compute confusion matrix
    cm = confusion_matrix(y_true, y_pred)

    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        fmt = '.2f'
    else:
        fmt = 'd'

    # Create figure and axis
    plt.figure(figsize=(8, 6))
    ax = plt.gca()

    # Plot the confusion matrix
    sns.heatmap(cm, annot=True, fmt=fmt, cmap=cmap,
                xticklabels=classes, yticklabels=classes)
    
    # Customize plot labels and appearance
    ax.set_xlabel('Predicted labels')
    ax.set_ylabel('True labels')
    ax.set_title(title)
    plt.xticks(rotation=45)
    plt.yticks(rotation=0)

    # Display the plot
    plt.tight_layout()
    
    # Artifact save directory for plots
    if not os.path.exists(artifact_save_dir):
        os.makedirs(artifact_save_dir)
    
    # Save the heatmap plot as an image
    plot_name = f'{model_name}_confusion_matrix'  # Set the desired plot name
    plot_save_path = os.path.join(artifact_save_dir, f"{plot_name}.png")  # Format the file name
    plt.savefig(plot_save_path)
    log.info(f'{plot_name} saved')
    
    plt.show()


def shap_feature_importance(X_test, model_name, n_features, artifact_save_dir: str):
    
    model_prefix = os.path.splitext(model_name)[0]
    if not os.path.exists(artifact_save_dir):
        log.info(f'Creating {artifact_save_dir} directory in {os.getcwd()}')
        os.makedirs(artifact_save_dir)
        
    log.info(f'Loading {model_name}')
    model = joblib.load(model_name)
    
    log.info('Generating SHAP Values')
    
    # Convert X_test to a pandas DataFrame
    X_test_df = pd.DataFrame(X_test)  # Assuming X_test is a 2D array
    
    # Identify categorical columns
    categorical_features = list(X_test_df.select_dtypes(include=['category', 'object']).columns)
    
    # Create a ColumnTransformer to apply one-hot encoding to categorical columns
    preprocessor = ColumnTransformer(
        transformers=[
            ('cat', OneHotEncoder(), categorical_features)
        ],
        remainder='passthrough'  # Keep non-categorical columns as-is
    )
    
    # Fit and transform the test data
    X_test_transformed = preprocessor.fit_transform(X_test_df)
    
    # Generate SHAP values using the transformed data
    explainer = shap.Explainer(model)
    shap_values = explainer.shap_values(X_test_transformed)
    
    log.info('Saving SHAP Plot to Artifact Directory')
    plt.clf()
    feature_names = (
        list(preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_features)) +
        list(X_test_df.select_dtypes(exclude=['category', 'object']).columns)
    )
    shap.summary_plot(shap_values, X_test_transformed, feature_names=feature_names, show=False)
    fig = plt.gcf()
    
    # Artifact save directory for plots
    plot_save_dir = os.path.join(artifact_save_dir, 'shap_summary_plots')
    if not os.path.exists(plot_save_dir):
        os.makedirs(plot_save_dir)
    
    plot_name = os.path.join(plot_save_dir, f'{model_prefix}_SHAP_summary.png')
    plt.savefig(plot_name)
    log.info(f'{plot_name} saved.')
    
    log.info(f'Extracting Top {n_features} Important features for the model')
    feature_importances = np.abs(shap_values).mean(axis=0)
    feature_importances_dict = dict(zip(feature_names, feature_importances))
    sorted_features = sorted(feature_importances_dict.items(), key=lambda x: x[1], reverse=True)
    
    log.info(f'Top {n_features} Important features are:')
    top_n_feature = sorted_features[:n_features]
    top_n_feature_names = [feature[0] for feature in top_n_feature]  # Extract feature names
    top_n_feature_importances = [feature[1] for feature in top_n_feature]  # Extract importances
    
    log.info(', '.join([f'{feature}: {importance}' for feature, importance in top_n_feature]))
    
    return plot_name, shap_values, top_n_feature_names, top_n_feature_importances


def shap_dependence_plots(X_test: pd.DataFrame, features: list, model_name: str, shap_values: np.array, artifact_save_dir: str):
    model_prefix = os.path.splitext(model_name)[0]
    if not os.path.exists(artifact_save_dir):
        log.info(f'Creating {artifact_save_dir} directory in {os.getcwd()}')
        os.makedirs(artifact_save_dir)
    
    log.info(f'Loading {model_name}')
    model = joblib.load(model_name)
    
    log.info('Generating SHAP Values')
    
    # Convert X_test to a pandas DataFrame
    X_test_df = pd.DataFrame(X_test)  # Assuming X_test is a 2D array
    
    # Identify categorical columns
    categorical_features = list(X_test_df.select_dtypes(include=['category', 'object']).columns)
    
    # Create a ColumnTransformer to apply one-hot encoding to categorical columns
    preprocessor = ColumnTransformer(
        transformers=[
            ('cat', OneHotEncoder(), categorical_features)
        ],
        remainder='passthrough'  # Keep non-categorical columns as-is
    )
    
    # Fit and transform the test data
    X_test_transformed = preprocessor.fit_transform(X_test_df)
    
    # Generate feature names
    feature_names = (
        list(preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_features)) +
        list(X_test_df.select_dtypes(exclude=['category', 'object']).columns)
    )

    for index, feature_name in enumerate(features):
        # Find the corresponding integer index for the feature name
        try:
            feature_index = feature_names.index(feature_name)
        except ValueError:
            log.warning(f'Feature {feature_name} not found in feature_names. Skipping.')
            continue
            
        # Artifact save directory for plots
        plot_save_dir = os.path.join(artifact_save_dir, 'shap_dependence_plots')
        if not os.path.exists(plot_save_dir):
            os.makedirs(plot_save_dir)
        
        plot_name = os.path.join(plot_save_dir, f'{model_prefix}_feature{index}_SHAP_dependence.png') 
        
        # Clear the previous plot
        plt.clf()
        shap.dependence_plot(feature_index, shap_values, X_test_transformed, 
                             feature_names=feature_names, show=False)  # Pass feature names for labeling
        # Save the SHAP dependence plot as an image
        fig = plt.gcf()
        plt.savefig(plot_name, dpi=150, bbox_inches='tight')
        log.info(f'{plot_name} saved.')
        
    log.info(f'All SHAP dependence plots saved in {plot_save_dir}')
    
import os
import logging
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    confusion_matrix,
    classification_report,
)

def evaluate_classification_models(model_name, predictions, true_labels, target_names, plot_classification_report=False, artifact_save_dir: str = 'artifacts'):
    """
    Evaluate a classification model and generate a classification report.
    
    Parameters:
        model_name (str): Name of the model for plot naming.
        predictions (array-like): Model predictions (predicted class labels).
        true_labels (array-like): True class labels.
        target_names (list): e.g ['Class 0', 'Class 1']
        plot_classification_report (bool): Whether to plot the classification report (default: False).
        artifact_save_dir (str): Directory where artifacts including plots will be saved.

    Returns:
        dict: A dictionary containing the computed metrics.
    """
    evaluation_results = {}
    
    accuracy = accuracy_score(true_labels, predictions)
    evaluation_results['accuracy'] = accuracy
    
    precision = precision_score(true_labels, predictions)
    evaluation_results['precision'] = precision

    recall = recall_score(true_labels, predictions)
    evaluation_results['recall'] = recall

    f1 = f1_score(true_labels, predictions)
    evaluation_results['f1'] = f1

    try:
        roc_auc = roc_auc_score(true_labels, predictions)
        evaluation_results['roc_auc'] = roc_auc
    except ValueError:
        # roc_auc_score may not work for multiclass classification
        evaluation_results['roc_auc'] = None

    confusion = confusion_matrix(true_labels, predictions)
    evaluation_results['confusion_matrix'] = confusion

    if plot_classification_report:
        # Generate the classification report
        report = classification_report(
            true_labels, predictions, target_names=target_names, output_dict=True
        )

        # Convert the classification report to a DataFrame for easy plotting
        report_df = pd.DataFrame(report).transpose()

        # Plot the classification report as a heatmap
        plt.figure(figsize=(8, 6))
        sns.heatmap(report_df.iloc[:-3, :-1], annot=True, cmap='Blues', fmt=".2f", cbar=False)
        plt.title('Classification Report')
        plt.xlabel('Classes')
        plt.ylabel('Metrics')
        
        fig = plt.gcf()
        model_prefix = os.path.splitext(model_name)[0]
        plot_name = os.path.join(artifact_save_dir, f'{model_prefix}_Classification_Report.png')
        plt.savefig(plot_name)
        log.info(f'{plot_name} saved.')
    
        plt.show()

        # Add classification report metrics to the evaluation results
        for metric, values in report.items():
            if isinstance(values, dict):
                for class_name, value in values.items():
                    metric_name = f"{metric}_{class_name}"
                    evaluation_results[metric_name] = value
    log.info(evaluation_results)
    return evaluation_results

Writing ./modules/evaluation.py


In [153]:
!pip install --upgrade azureml-sdk


Collecting azureml-sdk
  Downloading azureml_sdk-1.53.0-py3-none-any.whl (2.7 kB)
Collecting azureml-train-automl-client~=1.53.0
  Downloading azureml_train_automl_client-1.53.0-py3-none-any.whl (137 kB)
     |████████████████████████████████| 137 kB 4.2 MB/s eta 0:00:01
[?25hCollecting azureml-dataset-runtime[fuse]~=1.53.0
  Downloading azureml_dataset_runtime-1.53.0-py3-none-any.whl (2.3 kB)
Collecting azureml-train-core~=1.53.0
  Downloading azureml_train_core-1.53.0-py3-none-any.whl (8.6 MB)
     |████████████████████████████████| 8.6 MB 1.7 MB/s eta 0:00:01
[?25hCollecting azureml-core~=1.53.0
  Downloading azureml_core-1.53.0-py3-none-any.whl (3.3 MB)
     |████████████████████████████████| 3.3 MB 3.8 MB/s eta 0:00:01
[?25hCollecting azureml-pipeline~=1.53.0
  Downloading azureml_pipeline-1.53.0-py3-none-any.whl (2.4 kB)
Collecting azure-mgmt-network==21.0.1
  Downloading azure_mgmt_network-21.0.1-py3-none-any.whl (8.9 MB)
     |████████████████████████████████| 8.9 MB 8.1 MB/

Collecting azureml-dataprep-rslex~=2.19.5dev0
  Downloading azureml_dataprep_rslex-2.19.5-cp39-cp39-macosx_10_9_x86_64.whl (18.3 MB)
     |████████████████████████████████| 18.3 MB 638 kB/s eta 0:00:01
Collecting azureml-pipeline-core~=1.53.0
  Downloading azureml_pipeline_core-1.53.0-py3-none-any.whl (313 kB)
     |████████████████████████████████| 313 kB 2.7 MB/s eta 0:00:01
[?25hCollecting azureml-pipeline-steps~=1.53.0
  Downloading azureml_pipeline_steps-1.53.0-py3-none-any.whl (69 kB)
     |████████████████████████████████| 69 kB 5.3 MB/s eta 0:00:011
[?25hCollecting azureml-telemetry~=1.53.0
  Downloading azureml_telemetry-1.53.0-py3-none-any.whl (30 kB)
Collecting azureml-automl-core~=1.53.0
  Downloading azureml_automl_core-1.53.0-py3-none-any.whl (248 kB)
     |████████████████████████████████| 248 kB 4.0 MB/s eta 0:00:01
Collecting azureml-train-restclients-hyperdrive~=1.53.0
  Downloading azureml_train_restclients_hyperdrive-1.53.0-py3-none-any.whl (18 kB)
Installing coll

  Attempting uninstall: azureml-train-restclients-hyperdrive
    Found existing installation: azureml-train-restclients-hyperdrive 1.48.0
    Uninstalling azureml-train-restclients-hyperdrive-1.48.0:
      Successfully uninstalled azureml-train-restclients-hyperdrive-1.48.0
  Attempting uninstall: azureml-telemetry
    Found existing installation: azureml-telemetry 1.48.0
    Uninstalling azureml-telemetry-1.48.0:
      Successfully uninstalled azureml-telemetry-1.48.0
  Attempting uninstall: azureml-dataset-runtime
    Found existing installation: azureml-dataset-runtime 1.48.0
    Uninstalling azureml-dataset-runtime-1.48.0:
      Successfully uninstalled azureml-dataset-runtime-1.48.0
  Attempting uninstall: azureml-train-core
    Found existing installation: azureml-train-core 1.48.0
    Uninstalling azureml-train-core-1.48.0:
      Successfully uninstalled azureml-train-core-1.48.0
  Attempting uninstall: azureml-automl-core
    Found existing installation: azureml-automl-core 1.4

In [13]:
%%writefile {modules_dir}/aml_config.py
import os
import sys

from azure.identity import AzureCliCredential
from azureml.core.compute import AmlCompute
from azureml.core.compute_target import ComputeTargetException
from azureml.core import Workspace
import json
from azure.ai.ml import MLClient


#def create_ml_client(subscription_id: str, resource_group: str, workspace_name: str, tenant_id: str = None):
def create_ml_client():
    """
    Create an Azure Machine Learning workspace client.

    This function attempts to create an Azure Machine Learning workspace client using the provided parameters. If it fails
    to create a client, it generates a new configuration file with the provided parameters and tries again.

    Parameters:
        subscription_id (str): Azure subscription ID.
        resource_group (str): Azure resource group name.
        workspace_name (str): Azure Machine Learning workspace name.
        tenant_id (str, optional): Azure Active Directory tenant ID. Default is None.

    Returns:
        azureml.core.Workspace: An Azure Machine Learning workspace client.
    """
    # Create an Azure CLI credential
    credentials = AzureCliCredential(tenant_id='6aa8da55-4c6f-496e-8fc1-de0f7819b03b')
    
    try:
        # Try to create the Azure Machine Learning workspace client using provided parameters
        ml_client = Workspace.from_config(auth=credentials)
    except Exception as ex:
        print("An error occurred while creating the AML client:", str(ex))
        print("Creating a new configuration file...")

        # Define the workspace configuration based on the provided parameters
        client_config = {
            "subscription_id": "1ebe1808-a398-4ab0-b17c-1e3649ea39d5",
            "resource_group": "practice_resource",
            "workspace_name": "practice_workspace",
        }

        # Write the configuration to a JSON file
        config_path = "../config.json"
        os.makedirs(os.path.dirname(config_path), exist_ok=True)
        with open(config_path, "w") as fo:
            json.dump(client_config, fo)
        
        # Try to create the Azure Machine Learning workspace client again
        ml_client = MLClient.from_config(credential=credentials, path=config_path)
        # Try to create the Azure Machine Learning workspace client again
        #ml_client = Workspace.from_config(path=config_path)
    return ml_client
   


def get_compute(ml_client, compute_name:str, vm_size:str, min_instance:int, max_instances:int):
    ml_client = create_ml_client()
    # specify aml compute name.
    cpu_compute_target = compute_name
    
    try:
        cpu_cluster = ml_client.compute.get(cpu_compute_target)
        print(f'Using existing compute target: {cpu_compute_target}')
    except KeyError:
        print(f"Creating a new cpu compute target: {cpu_compute_target}...")
        cpu_cluster = AmlCompute(
            name = cpu_compute_target,
            size=vm_size,
            min_nodes=min_instance,
            max_nodes=max_instances
        )
        ml_client.compute.begin_create_or_update(compute).result()
        
    return cpu_compute_target, cpu_cluster   


Overwriting ./modules/aml_config.py


In [15]:
%%writefile {modules_dir}/data_ingestion.py
from azure.ai.ml import MLClient
from azure.identity import DefaultAzureCredential
from azureml.core import Workspace, Experiment, Run, Dataset, Datastore
import os
import logging

# Configure logging
logging.basicConfig(level=logging.INFO)
log = logging.getLogger(__name__)

from azure.identity import DefaultAzureCredential
from azureml.core import Workspace


def load_data():
    run = Run.get_context()
    
    try:
        # pipeline run
        ws = run.experiment.workspace
    except:
        ws = Workspace.from_config()
    # Make sure that 'ds' is a valid datastore name
    #if ds is None:
        #raise ValueError("The 'ds' parameter cannot be None.")
        
    
    #def_ds = Datastore.get(ws, ds)
    def_ds = Datastore.get(ws, 'workspaceblobstore')
    #load_data('workspaceblobstore', 'ai4i2020.csv')
    raw_df = Dataset.Tabular.from_delimited_files([(def_ds,'ai4i2020.csv')]).to_pandas_dataframe()
    log.info('Data Loaded')
    
    return raw_df

def set_cwd_path(path: str):
    os.chdir(path)
    log.info(f'Current Directory set to: {os.getcwd()}')

Writing ./modules/data_ingestion.py


### Create custom environment

In [17]:
import os
dependencies_dir = "./dependencies"
os.makedirs(dependencies_dir, exist_ok=True)

In [18]:
%%writefile {dependencies_dir}/conda.yaml
name: general_env
channels:
  - conda-forge
dependencies:
  - python=3.8.*
  - pip=23.2.*
  - pip:
    - numpy==1.22.*
    - mlflow==2.4.1
    - azureml-mlflow==1.53.0
    - azureml-core==1.53.*
    - azureml-defaults==1.53.*
    - scikit-learn==1.3.*
    - azure-ai-ml==1.9.0
    - requests==2.31.*
    - azure-identity==1.14.0
    - scipy==1.7.1
    - pandas==1.4.4
    - shap==0.42.1
    - joblib==1.3.2
    - seaborn==0.11.2
    - matplotlib==3.4.*
    - shapely==2.0.*
    - scikit-optimize==0.9.*
    - mldesigner==0.1.0b4

Writing ./dependencies/conda.yaml


In [19]:
%%writefile {dependencies_dir}/environment_register.py
from azure.ai.ml.entities import Environment
import os
import sys

current_dir = os.path.dirname(os.path.abspath(__file__))
src_dir = os.path.join(current_dir, '..', 'modules')
sys.path.append(src_dir)


import aml_config as aml 

custom_env_name = "general_environment"

ml_client = aml.create_ml_client()
env_docker_conda = Environment(
    name=custom_env_name,
    description="Custom environment for classification and regression tasks",
    conda_file="conda.yaml",
    image="mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu20.04:latest",
    version="0.4.0",
)
ml_client.environments.create_or_update(env_docker_conda)

Writing ./dependencies/environment_register.py


In [None]:
pip install azure-storage-file-datalake azure-identity

In [16]:
%%writefile data_prep.yaml
# <component>
$schema: https://azuremlschemas.azureedge.net/latest/commandComponent.schema.json
name: data_prep
display_name: data_preparation
# version: 1 # Not specifying a version will automatically update the version
type: command
inputs:
  raw_data: 
    type: uri_folder 
outputs:
  prep_data:
    type: uri_folder
code: ./prep_src
environment:
  # for this step, we'll use an AzureML curate environment
  azureml:general_environment:0.4.0
command: >-
  python prep.py 
  --raw_data ${{inputs.raw_data}} 
  --prep_data ${{outputs.prep_data}}  # Reference 'output_data' as an input
# </component>

Overwriting data_prep.yaml


In [17]:
!pip install mldesigner



In [209]:
!pip install azure-storage-file-datalake azure-identity



In [210]:
import os
from azure.storage.filedatalake import (
    DataLakeServiceClient,
    DataLakeDirectoryClient,
    FileSystemClient
)
from azure.identity import DefaultAzureCredential

In [225]:
def get_service_client_token_credential(account_name) -> DataLakeServiceClient:
    account_url = f"https://{account_name}.dfs.core.windows.net"
    token_credential = DefaultAzureCredential()

    service_client = DataLakeServiceClient(account_url, credential=token_credential)

    return service_client

In [226]:
service_client = get_service_client_token_credential('practiceworksp0600133129')

In [227]:
def create_file_system(service_client: DataLakeServiceClient, file_system_name: str) -> FileSystemClient:
    file_system_client = service_client.create_file_system(file_system=file_system_name)

    return file_system_client

In [229]:
file_system_client = create_file_system(service_client, 'machinedata')

In [230]:
def create_directory(file_system_client: FileSystemClient, directory_name: str) -> DataLakeDirectoryClient:
    directory_client = file_system_client.create_directory(directory_name)

    return directory_client

In [231]:
directory_client = create_directory(file_system_client, 'raw_data')

In [232]:
def upload_file_to_directory(directory_client: DataLakeDirectoryClient, local_path: str, file_name: str):
    file_client = directory_client.get_file_client(file_name)

    with open(file=os.path.join(local_path, file_name), mode="rb") as data:
        file_client.upload_data(data, overwrite=True)

In [235]:
upload_file_to_directory(directory_client, '/Users/ejenamvictor/Desktop/project_new/data/', 'ai4i2020.csv')


In [244]:
from azure.storage.blob import BlobServiceClient
import pandas as pd
import time

STORAGEACCOUNTURL= 'https://practiceworksp0600133129.blob.core.windows.net/'
STORAGEACCOUNTKEY= 'n6nfy8OrxQyqJ/qesdDFgQqzqY8BD/zjxjKDHzdAoMmw+yCq2nZGz5DJvg5FkAJedaxHKE3ORQ3B+ASttIXhFA=='
LOCALFILENAME= 'ai4i2020.csv'
CONTAINERNAME= 'machinedata'
BLOBNAME= 'raw_data'

#download from blob
t1=time.time()
blob_service_client_instance = BlobServiceClient(account_url=STORAGEACCOUNTURL, credential=STORAGEACCOUNTKEY)
blob_client_instance = blob_service_client_instance.get_blob_client(CONTAINERNAME, BLOBNAME, snapshot=None)
with open(LOCALFILENAME, "wb") as my_blob:
    blob_data = blob_client_instance.download_blob()
    blob_data.readinto(my_blob)
t2=time.time()
print(my_blob)
print(blob_data)
print(("It takes %s seconds to download "+BLOBNAME) % (t2 - t1))

<_io.BufferedWriter name='ai4i2020.csv'>
<azure.storage.blob._download.StorageStreamDownloader object at 0x7ff46e3b9fa0>
It takes 0.7104520797729492 seconds to download raw_data


In [260]:
from azure.storage.blob import BlobServiceClient
import pandas as pd
import time

STORAGEACCOUNTURL = 'https://practiceworksp0600133129.blob.core.windows.net/'
STORAGEACCOUNTKEY = 'U79kyC7XigWJ6nROgYY5xjfaNDj82allYrfay0oPHRmomPdTkBPviebxqGceM5kdp58X1xcEZNiV+AStwp3wiQ=='
LOCALFILENAME = 'ai4i2020'
CONTAINERNAME = 'machinedata'
BLOBNAME = 'raw_data'
https://practiceworksp0600133129.blob.core.windows.net/machinedata/raw_data/ai4i2020.csv
# Download from blob
t1 = time.time()
blob_service_client_instance = BlobServiceClient(account_url=STORAGEACCOUNTURL, credential=STORAGEACCOUNTKEY)
blob_client_instance = blob_service_client_instance.get_blob_client(CONTAINERNAME, BLOBNAME, snapshot=None)
with open(LOCALFILENAME, "wb") as my_blob:
    blob_data = blob_client_instance.download_blob()
    blob_data.readinto(my_blob)
    print(blob_data.readall())
t2 = time.time()

# Print the first few rows of the DataFrame
#print(("It takes %s seconds to download " + BLOBNAME) % (t2 - t1))

b''


In [261]:
# LOCALFILE is the file path
#dataframe_blobdata = pd.read_csv(blob_data)

In [274]:
from azure.ai.ml.entities import AzureDataLakeGen2Datastore
from azure.ai.ml import MLClient
import os
import sys

sys.path.insert(1, '/Users/ejenamvictor/Desktop/project_new/modules')
#current_dir = os.path.dirname(os.path.abspath(__file__))
#src_dir = os.path.join(current_dir, '..', 'modules')
#sys.path.append(src_dir)

from aml_config_functions import *
#from pipeline import *

ml_client = create_ml_client()

#ml_client = MLClient.from_config()

store = AzureDataLakeGen2Datastore(
    name="machinedatas",
    description="this data store contains data for a machine failures",
    account_name="practiceworksp0600133129",
    filesystem="machine"
)

ml_client.create_or_update(store)

Found the config file in: ../config.json


An error occurred while creating the AML client: 'AzureCliCredential' object has no attribute '_get_service_client'
Creating a new configuration file...


AzureDataLakeGen2Datastore({'type': <DatastoreType.AZURE_DATA_LAKE_GEN2: 'AzureDataLakeGen2'>, 'name': 'machinedatas', 'description': 'this data store contains data for a machine failures', 'tags': {}, 'properties': {}, 'print_as_yaml': True, 'id': '/subscriptions/1ebe1808-a398-4ab0-b17c-1e3649ea39d5/resourceGroups/practice_resource/providers/Microsoft.MachineLearningServices/workspaces/practice_workspace/datastores/machinedatas', 'Resource__source_path': None, 'base_path': '/Users/ejenamvictor/Desktop/project_new', 'creation_context': None, 'serialize': <msrest.serialization.Serializer object at 0x7ff48b556dc0>, 'credentials': <azure.ai.ml.entities._credentials.NoneCredentialConfiguration object at 0x7ff48b556f10>, 'account_name': 'practiceworksp0600133129', 'filesystem': 'machine', 'endpoint': 'core.windows.net', 'protocol': 'https'})

In [276]:
from azureml.core import Workspace, Datastore

# Define your Azure Machine Learning workspace
workspace = Workspace.from_config(path="config.json")

# Retrieve the data store you want to set as default
datastore_name = "workspaceblobstore"
datastore = Datastore.get(workspace, datastore_name)

# Set the data store as the default
datastore.set_as_default()

# Verify the default data store
default_datastore = Datastore.get_default(workspace)
print(f"Default Datastore: {default_datastore.name}")

Default Datastore: workspaceblobstore


In [None]:
from azureml.core import Datastore

# Specify the name of the Datastore where you want to upload data
datastore_name = "machinedata"

# Get a reference to the Datastore
datastore = Datastore.get(ml_client, datastore_name)

# Specify the local path to the file or folder you want to upload
local_path = "path/to/local/file_or_folder"

# Specify the target path in the Datastore where you want to upload the data
target_path = "target/path/in/datastore"

# Upload the data to the Datastore
datastore.upload(src_dir=local_path, target_path=target_path, overwrite=True)


In [445]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import os
import shutil

df = pd.read_csv('/Users/ejenamvictor/Desktop/project_new/data/ai4i2020.csv')
df.head()

Unnamed: 0,UDI,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,TWF,HDF,PWF,OSF,RNF
0,1,M14860,M,298.1,308.6,1551,42.8,0,0,0,0,0,0,0
1,2,L47181,L,298.2,308.7,1408,46.3,3,0,0,0,0,0,0
2,3,L47182,L,298.1,308.5,1498,49.4,5,0,0,0,0,0,0
3,4,L47183,L,298.2,308.6,1433,39.5,7,0,0,0,0,0,0
4,5,L47184,L,298.2,308.7,1408,40.0,9,0,0,0,0,0,0


In [446]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import os
import shutil

df = pd.read_csv('/Users/ejenamvictor/Desktop/project_new/data/ai4i2020.csv')

# Split the data into train and test sets
train_df, test_df = train_test_split(df, test_size=0.2, stratify=df['Machine failure'], random_state=101)

# Define local directories for train and test data
train_data_dir = 'train_data'
test_data_dir = 'test_data'

# Create the directories if they don't exist
os.makedirs(train_data_dir, exist_ok=True)
os.makedirs(test_data_dir, exist_ok=True)



# Save the train and test data to the respective directories as CSV files
train_df.to_csv(os.path.join(train_data_dir, 'train_df.csv'), index=False)
test_df.to_csv(os.path.join(test_data_dir, 'test_df.csv'), index=False)



In [18]:
from azure.ai.ml import Input

machine_ds = Input(
    path='/Users/ejenamvictor/Desktop/project_CAS/ai4i2020.csv'
)

In [503]:
from azure.identity import DefaultAzureCredential, InteractiveBrowserCredential

from azure.ai.ml import MLClient, Input
from azure.ai.ml.dsl import pipeline
from azure.ai.ml import load_component

parent_dir = ""

# 1. Load components
prepare_data = load_component(source=parent_dir + "./data_prep.yaml")
train_model = load_component(source=parent_dir + "./train.yaml")
test_model = load_component(source=parent_dir + "./test.yaml")
#modules = load_component(source=parent_dir + "./modules.yaml")

# 2. Construct pipeline
@pipeline()
def machine_failure_classification(pipeline_job_input,
                                  train_input,
                                  test_input,
                                  inference_output, 
                                  ):
    """Machine failure classification."""
    prepare_sample_data = prepare_data(raw_data=pipeline_job_input)
    train_with_sample_data = train_model(train_data=train_input)
    test_model_performance = test_model(test_data=test_input,
                                       model_input=train_with_sample_data.outputs.model_output,)
    #modules_pipeline = modules()
    return {
        "pipeline_job_prepped_data": prepare_sample_data.outputs.prep_data,
        "pipeline_job_model": train_with_sample_data.outputs.model_output,
        "pipeline_inference_data": test_model_performance.outputs.inference_df,
    }


pipeline_jobs = machine_failure_classification(
    pipeline_job_input = Input(type="uri_folder", path=parent_dir + "./data/"),
    train_input = Input(type="uri_folder", path=parent_dir + "./train_data/"),
    test_input = Input(type="uri_folder", path=parent_dir + "./test_data/"),
    inference_output = 'inference_data'
)
# demo how to change pipeline output settings
#pipeline_jobs.outputs.pipeline_job_prepped_data.mode = "rw_mount"
#pipeline_jobs.outputs.pipeline_job_train_data.mode = "rw_mount"
#pipeline_jobs.outputs.pipeline_job_test_data.mode = "rw_mount"

# set pipeline level compute
pipeline_jobs.settings.default_compute = "cpu-cluster"
# set pipeline level datastore
pipeline_jobs.settings.default_datastore = "workspaceblobstore"

In [504]:
from azure.ai.ml import load_component
from azure.ai.ml import dsl, Input, Output
import os
import sys

sys.path.insert(1, '/Users/ejenamvictor/Desktop/project_new/modules')
#current_dir = os.path.dirname(os.path.abspath(__file__))
#src_dir = os.path.join(current_dir, '..', 'modules')
#sys.path.append(src_dir)

from aml_config_functions import *
#from pipeline import *

ml_client = create_ml_client()

# submit job to workspace
pipeline_jobs = ml_client.jobs.create_or_update(
    pipeline_jobs, experiment_name="machine_failure_pipeline_recent"
)
pipeline_jobs

INFO:azureml.core.workspace:Found the config file in: /Users/ejenamvictor/Desktop/project_new/config.json


An error occurred while creating the AML client: 'AzureCliCredential' object has no attribute '_get_service_client'
Creating a new configuration file...


Found the config file in: ../config.json
Uploading project_new (1.59 MBs): 100%|█| 1594125/1594125 [00:01<00:00, 813684.5




Experiment,Name,Type,Status,Details Page
machine_failure_pipeline_recent,orange_loquat_c9sth1zwbb,pipeline,Preparing,Link to Azure Machine Learning studio


In [498]:
#sys.path.insert(0, '/Users/ejenamvictor/Desktop/project_new/modules')
#sys.path.append(function_dir)
sys.path.insert(0, '/Users/ejenamvictor/Desktop/project_new/src/modules')
#sys.path.insert(0, "./src/modules")
# Now you can import modules from function_dir
#from modules.split_data import *
#from tune_train_test import *
#from aml_config_functions import *
#from modules import split_data as sd
#from modules import tune_train_test as tt
#from modules import aml_config_functions as acf
#import split_data as sd
import tune_train_test as tt

In [None]:
def evaluate_classification_models(model_name, predictions, true_labels, target_names, plot_classification_report=False, artifact_save_dir: str = 'artifacts'):
    """
    Evaluate a classification model and generate a classification report.
    
    Parameters:
        model_name (str): Name of the model for plot naming.
        predictions (array-like): Model predictions (predicted class labels).
        true_labels (array-like): True class labels.
        target_names (list): e.g ['Class 0', 'Class 1']
        plot_classification_report (bool): Whether to plot the classification report (default: False).
        artifact_save_dir (str): Directory where artifacts including plots will be saved.

    Returns:
        dict: A dictionary containing the computed metrics.
    """
    evaluation_results = {}
    
    accuracy = accuracy_score(true_labels, predictions)
    evaluation_results['accuracy'] = accuracy
    
    precision = precision_score(true_labels, predictions)
    evaluation_results['precision'] = precision

    recall = recall_score(true_labels, predictions)
    evaluation_results['recall'] = recall

    f1 = f1_score(true_labels, predictions)
    evaluation_results['f1'] = f1

    try:
        roc_auc = roc_auc_score(true_labels, predictions)
        evaluation_results['roc_auc'] = roc_auc
    except ValueError:
        # roc_auc_score may not work for multiclass classification
        evaluation_results['roc_auc'] = None

    confusion = confusion_matrix(true_labels, predictions)
    evaluation_results['confusion_matrix'] = confusion

    if plot_classification_report:
        # Generate the classification report
        report = classification_report(
            true_labels, predictions, target_names=target_names, output_dict=True
        )

        # Convert the classification report to a DataFrame for easy plotting
        report_df = pd.DataFrame(report).transpose()

        # Plot the classification report as a heatmap
        plt.figure(figsize=(8, 6))
        sns.heatmap(report_df.iloc[:-3, :-1], annot=True, cmap='Blues', fmt=".2f", cbar=False)
        plt.title('Classification Report')
        plt.xlabel('Classes')
        plt.ylabel('Metrics')
        
        fig = plt.gcf()
        model_prefix = os.path.splitext(model_name)[0]
        plot_name = os.path.join(artifact_save_dir, f'{model_prefix}_Classification_Report.png')
        plt.savefig(plot_name)
        log.info(f'{plot_name} saved.')
    
        plt.show()

        # Add classification report metrics to the evaluation results
        for metric, values in report.items():
            if isinstance(values, dict):
                for class_name, value in values.items():
                    metric_name = f"{metric}_{class_name}"
                    evaluation_results[metric_name] = value
    log.info(evaluation_results)
    return evaluation_results


In [282]:
import os

test_dir = "./test_src"
os.makedirs(test_dir, exist_ok=True)

In [447]:
%%writefile test.yaml
# <component>
$schema: https://azuremlschemas.azureedge.net/latest/commandComponent.schema.json
name: test
display_name: test_model
# version: 1 # Not specifying a version will automatically update the version
type: command
inputs:
  test_data: 
    type: uri_folder
  model_input:
    type: mlflow_model
outputs:
  inference_df:
    type: uri_folder
#code: ./test_src
code: ./
environment:
  # for this step, we'll use an AzureML curate environment
  azureml:general_environment:0.4.0
command: >-
  python 
  test_src/test.py
  --test_data ${{inputs.test_data}}
  --model_input ${{inputs.model_input}}
  --inference_df ${{outputs.inference_df}}  # Reference 'output_data' as an input
# </component>

Overwriting test.yaml


In [448]:
%%writefile train.yaml
# <component>
$schema: https://azuremlschemas.azureedge.net/latest/commandComponent.schema.json
name: train
display_name: train_model
# version: 1 # Not specifying a version will automatically update the version
type: command
inputs:
  train_data: 
    type: uri_folder
outputs:
  model_output:
    type: mlflow_model
#code: ./train_src
code: ./
environment:
  # for this step, we'll use an AzureML curate environment
  azureml:general_environment:0.4.0
command: >-
  python 
  train_src/train.py
  --train_data ${{inputs.train_data}}
  --model_output ${{outputs.model_output}} # Reference 'output_data' as an input
# </component>

Overwriting train.yaml


In [351]:
%%writefile modules.yaml
# <component>
$schema: https://azuremlschemas.azureedge.net/latest/commandComponent.schema.json
name: modules
display_name: functions
# version: 1 # Not specifying a version will automatically update the version
type: command
code: ./modules
environment:
  # for this step, we'll use an AzureML curate environment
  azureml:general_environment:0.4.0
command: >-
  python 
  aml_config_functions.py
# </component>

Overwriting modules.yaml


In [168]:
%%writefile {test_dir}/test.py
import argparse
from pathlib import Path
from typing_extensions import Concatenate
from uuid import uuid4
from datetime import datetime
import os
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import pickle
import sys
import mlflow

current_dir = os.path.dirname(os.path.abspath(__file__))
src_dir = os.path.join(current_dir, '.', 'modules')
sys.path.append(src_dir)


#sys.path.append(function_dir)

# Now you can import modules from function_dir
import split_data as sd
import tune_train_test as tt

def select_first_file(path):
    """Selects first file in folder, use under assumption there is only one file in folder
    Args:
        path (str): path to directory or file to choose
    Returns:
        str: full path of selected file
    """
    files = os.listdir(path)
    return os.path.join(path, files[0])

#from modules import data_prep_functions

#from aml_config_functions import *
#sys.path.insert(1, '/Users/ejenamvictor/Desktop/project_new/modules')
#from data_prep_functions import *
#from split_data import *
#from tune_train_test import *


parser = argparse.ArgumentParser("train")
parser.add_argument("--test_data", type=str, help="Path to raw data")
parser.add_argument("--model_input", type=str, help="trained model")
parser.add_argument("--inference_df", type=str, help="Path to store inference data")


args = parser.parse_args()

test_df = pd.read_csv(select_first_file(args.test_data), index=False)

print(test_df.shape)
print(test_df.columns)

# Assuming 'test_data' contains both features and the target column
X_test = test_df.drop(columns=['Machine failure'])
y_test = test_df['Machine failure']

# load mlflow model
model = mlflow.sklearn.load_model(args.model_input)

inference_df, inference_col_name, predictions = predict_model(model, X_test, inference_col_name)

inference_dir = Path(args.inference_df)
inference_dir.mkdir(parents=True, exist_ok=True)
inference_df.to_csv(inference_dir / "inference_df.csv", index=False)


Overwriting ./test_src/test.py


In [2]:
import os

train_dir = "./train_src"
os.makedirs(train_dir, exist_ok=True)

In [281]:
%%writefile train.yaml
# <component>
$schema: https://azuremlschemas.azureedge.net/latest/commandComponent.schema.json
name: train_test
display_name: training_data
# version: 1 # Not specifying a version will automatically update the version
type: command
inputs:
  train_data: 
    type: uri_folder
outputs:
  model_output:
    type: mlflow_model
code: 
    - ./train_src
    - ./modules
environment:
  # for this step, we'll use an AzureML curate environment
  azureml:general_environment:0.4.0
command: >-
  python 
  train.py
  --input_data ${{inputs.input_data}} 
  --model_output ${{outputs.model_input}}  # Reference 'output_data' as an input
  --train_output ${{inputs.train_input}}
  --test_output ${{inputs.test_output}}
# </component>

Overwriting train.yaml


In [106]:
#%%writefile {train_dir}/train.py
import argparse
from pathlib import Path
from typing_extensions import Concatenate
from uuid import uuid4
from datetime import datetime
import os
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import pickle
import sys
import mlflow

current_dir = os.path.dirname(os.path.abspath(__file__))
src_dir = os.path.join(current_dir, '.', 'test')
sys.path.append(src_dir)


#sys.path.append(function_dir)

# Now you can import modules from function_dir
import split_data as sd
import tune_train_test as tt

def select_first_file(path):
    """Selects first file in folder, use under assumption there is only one file in folder
    Args:
        path (str): path to directory or file to choose
    Returns:
        str: full path of selected file
    """
    files = os.listdir(path)
    return os.path.join(path, files[0])

#from modules import data_prep_functions

#from aml_config_functions import *
#sys.path.insert(1, '/Users/ejenamvictor/Desktop/project_new/modules')
#from data_prep_functions import *
#from split_data import *
#from tune_train_test import *


parser = argparse.ArgumentParser("train")
parser.add_argument("--input_data", type=str, help="Path to raw data")
parser.add_argument("--HG_model_output", type=str, help="Path of prepped data")
parser.add_argument("--X_train_output", type=str, help="Path of prepped data")
parser.add_argument("--X_test_output", type=str, help="Path of prepped data")
parser.add_argument("--y_train_output", type=str, help="Path of prepped data")
parser.add_argument("--X_test_output", type=str, help="Path of prepped data")

args = parser.parse_args()

train_df = pd.read_csv(select_first_file(args.input_data))

X_train, X_test, y_train, y_test = sd.custom_train_test_split(train_df, target_column='Machine failure', test_size=0.2, random_state=101, time_series=False)

X_train_data = X_train.to_csv((Path(args.X_train_output) / "X_train.csv"), index=False)
X_test_data = X_test.to_csv((Path(args.X_test_output) / "X_test.csv"), index=False)
y_train_data = y_train.to_csv((Path(args.y_train_output) / "y_train.csv"), index=False)
y_train_data = y_train.to_csv((Path(args.y_train_output) / "y_train.csv"), index=False)

best_params, hyperparameters_filename= tt.hyperparameter_tuning(X_train, y_train, 'test', bayesian_search=True, n_iter=30, random_seed=42)
trained_model = tt.train_model(X_train, y_train, 'HGBR.pkl', hyperparam_filename=hyperparameters_filename)


mlflow.sklearn.save_model(trained_model, args.HG_model_output)
    
# Registering the model to the workspace
print("Registering the model via MLFlow")
mlflow.sklearn.log_model(
    sk_model=trained_model,
    registered_model_name='HGBC',
    artifact_path='HGBC',
)

# Saving the model to a file
mlflow.sklearn.save_model(
    sk_model=trained_model,
    path=os.path.join(args.HG_model_output, 'trained_model'),
)
    


Overwriting ./train_src/train.py


In [None]:
from azure.identity import DefaultAzureCredential, InteractiveBrowserCredential

from azure.ai.ml import MLClient, Input
from azure.ai.ml.dsl import pipeline
from azure.ai.ml import load_component

parent_dir = ""

# 1. Load components
prepare_data = load_component(source=parent_dir + "./data_prep.yaml")
train_model = load_component(source=parent_dir + "./train_test.yaml")

# 2. Construct pipeline
@pipeline()
def machine_failure_classification(pipeline_job_input):
    """Machine failure classification."""
    prepare_sample_data = prepare_data(raw_data=pipeline_job_input)
    train_with_sample_data = train_model(
        input_data=prepare_sample_data
    )
    return {
        "pipeline_job_prepped_data": prepare_sample_data.outputs.prep_data,
    }


pipeline_jobs = machine_failure_classification(
    Input(type="uri_folder", path=parent_dir + "./data/"),
    X_train_output = Output(type='uri_folder', path=parent_dir + './test/') 
)
# demo how to change pipeline output settings
pipeline_jobs.outputs.pipeline_job_prepped_data.mode = "rw_mount"


# set pipeline level compute
pipeline_jobs.settings.default_compute = "cpu-cluster"
# set pipeline level datastore
pipeline_jobs.settings.default_datastore = "workspaceblobstore"

In [54]:
#%%writefile {prep_src}/prep.py
import argparse
from pathlib import Path
from uuid import uuid4
from datetime import datetime
import os

parser = argparse.ArgumentParser("data_preparation")
parser.add_argument("--input_data", type=str, help="Path to training data")
parser.add_argument("--ms_threshold", type=int, help="Max # of epochs for the training")
parser.add_argument("--corr_threshold", type=float, help="Learning rate")
parser.add_argument("--plot_heatmaps", type=str, help="Learning rate schedule")
parser.add_argument("--max_unique_threshold", type=str, help="Path of output model")
parser.add_argument("--output_data", type=str, help="Path of output model")

args = parser.parse_args()

print("hello your data data is cooking...")

lines = [
    f"input_data: {args.input_data}",
    f"ms_threshold: {args.ms_threshold}",
    f"corr_threshold: {args.corr_threshold}",
    f"plot_heatmaps: {args.plot_heatmaps}",
    f"max_unique_threshold: {args.max_unique_threshold}",
    f"output_data: {args.output_data}",
]

for line in lines:
    print(line)

print("mounted_path files: ")
arr = os.listdir(args.input_data)
print(arr)

for filename in arr:
    print("reading file: %s ..." % filename)
    with open(os.path.join(args.input_data, filename), "r") as handle:
        print(handle.read())

Writing {prep_src}/prep.py


FileNotFoundError: [Errno 2] No such file or directory: '{prep_src}/prep.py'

In [39]:
%%writefile {modules_dir}/data_component_registration.py

from azure.ai.ml import load_component
import os
import sys

# Get the current working directory
current_directory = os.getcwd()

# Specify the relative paths to your directories and files
aml_config_relative_path = 'modules'  # Adjust this to your aml_config directory
#components_relative_path = 'src'  # Adjust this to your components directory
data_prep_yaml_file = 'data_prep.yaml'  # Adjust this to your data_prep.yaml file

data_prep_yaml_path = os.path.abspath(os.path.join(current_directory, data_prep_yaml_file))

current_dir = os.path.dirname(os.path.abspath(__file__))
src_dir = os.path.join(current_dir, '..', 'modules')
sys.path.append(src_dir)

import aml_config as aml

# Loading the component from the yaml file
loaded_component_prep = load_component(source=data_prep_yaml_path)

ml_client = aml.create_ml_client()

# Now we register the component to the workspace
data_prep_component = ml_client.create_or_update(loaded_component_prep)

# Create (register) the component in your workspace
print(
    f"Component {data_prep_component.name} with Version {data_prep_component.version} is registered"
)

Overwriting ./modules/data_component_registration.py


In [47]:
%%writefile {modules_dir}/pipeline.py
# the dsl decorator tells the sdk that we are defining an Azure Machine Learning pipeline
from azure.ai.ml import dsl, Input, Output, load_component
import os
import mlflow
import sys
import pandas as pd
import logging
# Configure logging
logging.basicConfig(level=logging.INFO)
log = logging.getLogger(__name__)

#aml_config_dir = os.path.abspath(os.path.join(current_directory, aml_config_relative_path))

#current_directory = os.getcwd()
current_dir = os.path.dirname(os.path.abspath(__file__))
src_dir = os.path.join(current_dir, '..', 'modules')
sys.path.append(src_dir)

from aml_config import *

ml_client = create_ml_client()


cpu_compute_target, cpu_cluster = get_compute(ml_client, compute_name="cpu-cluster", vm_size="STANDARD_E16S_V3", min_instance=0, max_instances=4)

parent_directory = '../modules/'  # Adjust this to your components directory

data_prep = load_component(source=parent_directory + 'data_prep.yaml')

@dsl.pipeline(
    compute=cpu_compute_target
    if (cpu_cluster)
    else "serverless",  # "serverless" value runs pipeline on serverless compute
    description="first pipeline",
)
def classification_pipeline(
    input_data,
    ms_threshold,
    corr_threshold,
    plot_heatmaps,
    max_unique_threshold,
    output_data,
):
    # using data_prep_function like a python call with its own inputs
    data_prep_job = data_prep(
        input_data = input_data,
        ms_threshold = ms_threshold,
        corr_threshold = corr_threshold,
        plot_heatmaps = plot_heatmaps,
        max_unique_threshold = max_unique_threshold
    )
    
    #data_prep_job.outputs.output_data = Output(type='uri_folder', path=output_data, mode='rw_mount')
    # keys will code for the pipeline output identifier
    return {
        "pipeline_job_train_data": data_prep_job.outputs.output_data,
    }

parent_dir = "."

pipeline = classification_pipeline(
    input_data=Input(type="uri_folder", path= parent_dir + "/data/"),
    output_data=Output(type="uri_folder", path="processed_data"),
    ms_threshold = 10,
    corr_threshold = 0.8,
    plot_heatmaps = True,
    max_unique_threshold = 0.9,
)

pipeline_job = ml_client.jobs.create_or_update(
    pipeline,
    # Project's name
    experiment_name="data_prep_component",
)
ml_client.jobs.stream(pipeline_job.name)

Overwriting ./modules/pipeline.py


In [40]:
#%%writefile {modules_dir}/data_prep_component.yaml
import os
from pathlib import Path
from mldesigner import command_component, Input, Output


@command_component(
    name="prep_data",
    version="1",
    display_name="Prep Data",
    description="load the data",
    environment=dict(
        conda_file=Path(__file__).parent / "conda.yaml",
        image="mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu20.04",
    ),
)
def prepare_data_component(
    input_data: Input(type="uri_folder"),
    output_data: Output(type="uri_folder"),
):
    df = drop_high_cardinality_features(os.path.join(training_data, "processed.csv"))
    
    df = drop_high_cardinality_features(df=raw_data, max_unique_threshold=args.max_unique_threshold)
    df = replace_missing_values(df, ms_threshold=args.ms_threshold)
    df = drop_highly_correlated_features(df, corr_threshold=args.corr_threshold, plot_heatmaps=args.plot_heatmaps)
    
    os.path.join(training_data, "mnist_train.csv")
    
    os.makedirs(args.output_data, exist_ok=True)
    processed_data_path = os.path.join(args.output_data, 'processed_df.csv')
    df.to_csv(processed_data_path, engine='pyarrow')
    
    
def process_data():
    df = drop_high_cardinality_features(df=raw_data, max_unique_threshold=args.max_unique_threshold)
    df = replace_missing_values(df, ms_threshold=args.ms_threshold)
    df = drop_highly_correlated_features(df, corr_threshold=args.corr_threshold, plot_heatmaps=args.plot_heatmaps)
    return df
    

NameError: name '__file__' is not defined

In [33]:
#%%writefile {modules_dir}/data_preparation.py
import os
import sys

import argparse
from pathlib import Path
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import mlflow

from data_prep import *
from data_ingestion import *
from pathlib import Path
import traceback

def get_file(f):

    f = Path(f)
    if f.is_file():
        return f
    else:
        files = list(f.iterdir())
        if len(files) == 1:
            return files[0]
        else:
            raise Exception("********This path contains more than one file*******")


def parse_args():
    # setup argparse
    parser = argparse.ArgumentParser()

    # add arguments
    parser.add_argument(
        "--input_data", type=str, help="path containing data for scoring"
    )
    parser.add_argument(
        '--ms_threshold', dest='ms_threshold', type=int, default=10, help='Threshold to switch between interpolation and iterative imputer'
    )
    parser.add_argument(
        '--corr_threshold', dest='corr_threshold', type=float, default=0.8,  help='The threshold for correlation above which features will be dropped'
    )
    parser.add_argument(
        '--plot_heatmaps', dest='plot_heatmaps', type=bool, default=True,  help='Whether to plot heatmaps before and after dropping (default is True)'
    )
    parser.add_argument(
        '--max_unique_threshold', dest='max_unique_threshold', type=float, default= 0.9, help='The maximum allowed fraction of unique values in a column (default is 0.9)'
    )
    parser.add_argument(
        '--output_data', dest='output_data', type=str
    )
    # parse args
    args = parser.parse_args()

    # return args
    return args


def clean_data(input_data, output_data):

    test_file = get_file(input_data)
    data = pd.read_csv(test_file)
    
    df = drop_high_cardinality_features(data, max_unique_threshold=args.max_unique_threshold)
    df = replace_missing_values(df, ms_threshold=args.ms_threshold)
    df = drop_highly_correlated_features(df, corr_threshold=args.corr_threshold, plot_heatmaps=args.plot_heatmaps)

    # Output result
    np.savetxt(output_data + "/predict_result.csv", df, delimiter=",")

def main(args):
    score(args.input_data, args.ms_threshold, args.corr_threshold, args.plot_heatmaps, args.max_unique_threshold, args.output_data)


# run script
if __name__ == "__main__":
    # parse args
    args = parse_args()

    # call main function
    main(args)

usage: ipykernel_launcher.py [-h] [--input_data INPUT_DATA]
                             [--ms_threshold MS_THRESHOLD]
                             [--corr_threshold CORR_THRESHOLD]
                             [--plot_heatmaps PLOT_HEATMAPS]
                             [--max_unique_threshold MAX_UNIQUE_THRESHOLD]
                             [--output_data OUTPUT_DATA]
ipykernel_launcher.py: error: unrecognized arguments: -f /Users/ejenamvictor/Library/Jupyter/runtime/kernel-bbde93c4-7b2f-4cd3-9671-2e4c4af96dfc.json


SystemExit: 2

In [178]:
%%writefile {modules_dir}/data_preparation.py
import os
import sys

#current_dir = os.path.dirname(os.path.abspath(__file__))
#modules_dir = os.path.join(current_dir, '..', 'modules')
#sys.path.append(modules_dir)
# Get the directory of the current script (assuming it's in the same directory)
#script_dir = os.path.dirname(os.path.abspath(sys.argv[0]))

# Add the directory containing "data_prep.py" to the Python path
#sys.path.append(os.path.join(script_dir, 'modules'))

# Specify the directory containing the aml_config module
#aml_config_dir = os.path.abspath('./modules')  # Use an absolute path

# Check if the directory exists
#if not os.path.exists(aml_config_dir):
#    print(f"Directory '{aml_config_dir}' does not exist. Please check the directory path.")
#    sys.exit(1)

# Add the aml_config directory to sys.path
#sys.path.insert(0, aml_config_dir)


from data_prep import *
from data_ingestion import *
import argparse 
import mlflow
import pandas as pd
from pathlib import Path
import traceback

#def a custom argument type for a list of strings
def list_of_strings(arg):
    return arg.split(',')

#if 'ipykernel' in sys.modules:
    # Exclude IPython-specific arguments
#    sys.argv = [arg for arg in sys.argv if not arg.endswith('ipykernel_launcher.py')]
import aml_config as aml
ml_client = aml.create_ml_client()

def load_raw_data():
    machine_data = ml_client.data.get(name='machine-failure', version='1')
    data_path = machine_data.path  # Remove quotes around data_path variable
    data = pd.read_csv(data_path)  # Use the data_path variable
    return data  # Return the data frame

def path_exists(path):
    if os.path.exists(path):
        return path
    else:
        raise argparse.ArgumentTypeError(f"Path {path} does not exist.")
   
    
def main():
    # Check if running in IPython environment and exclude IPython-specific arguments
#    if 'get_ipython' in globals():
#        sys.argv = [arg for arg in sys.argv if not arg.startswith('-f')]

    #setup arg parser
        
    parser = argparse.ArgumentParser()
    parser.add_argument('--input_ds', dest='--input_ds', type=path_exists, help='Threshold to switch between interpolation and iterative imputer')
    parser.add_argument('--ms_threshold', dest='ms_threshold', type=int, help='Threshold to switch between interpolation and iterative imputer')
    parser.add_argument('--corr_threshold', dest='corr_threshold', type=float, help='The threshold for correlation above which features will be dropped')
    parser.add_argument('--plot_heatmaps', dest='plot_heatmaps', type=bool, help='Whether to plot heatmaps before and after dropping (default is True)')
    parser.add_argument('--max_unique_threshold', dest='max_unique_threshold', type=float, help='The maximum allowed fraction of unique values in a column (default is 0.9)')
    parser.add_argument('--output_data', dest='output_data', type=str)
        
        # parse args
    args = parser.parse_args()
        # Start Logging
        
    mlflow.start_run()

    #print(" ".join(f"{k}={v}" for k, v in vars(args).items()))

    #print("input data:", args.data_path)

    #raw_data = pd.read_csv(args.data_path, header=1, index_col=0)
    #raw_data = pd.read_csv((Path(args.input_ds)))
    raw_data = pd.read_csv(args.input_ds)
        # Load in the data from synapse DataLake Storage
        #raw_df = load_data(args.subscription_id, args.resource_group, args.workspace_name, args.data_name)
        # Direct working directory to Artefact location
        #set_cwd_path(args.plot_save_dir)
        
        #df = pd.read_csv(args.raw_data)
        #if not os.path.exists(artifact_save_dir):
        #os.makedirs(artifact_save_dir)
        #if raw_data is None:
        #    mlflow.log_info('Failed to load data.')  # Log an info message
        #    mlflow.end_run()  # End the run
        #    return 
            
    raw_data = check_missing_values(raw_data)
        # Direct working directory to Artefact location
        #set_cwd_path(args.plot_save_dir)
        
            
        #df = drop_high_cardinality_features(df=raw_data, max_unique_threshold=args.max_unique_threshold)
        #df = replace_missing_values(df, ms_threshold=args.ms_threshold)
        #df = drop_highly_correlated_features(df, corr_threshold=args.corr_threshold, plot_heatmaps=args.plot_heatmaps)
    
        # Reset directory back to initial working directory
        #set_cwd_path('..')
    
    mlflow.log_metric('Sample Size', raw_data.shape[0])
    os.makedirs(args.output_data, exist_ok=True)
    processed_data_path = os.path.join(args.output_data, 'processed_df.csv')
    df.to_csv(processed_data_path, engine='pyarrow')
    
        #df.to_csv(os.path.join(args.output_data, 'processed_df.csv'), engine='pyarrow')
    
        # End Logging
    #except Exception as e:
    # Log information about the exception
    #mlflow.log_param("exception_type", type(e).__name__)
    #mlflow.log_param("exception_message", str(e))
    #mlflow.log_param("exception_traceback", traceback.format_exc())
 
    mlflow.end_run()
    
if __name__ =='__main__':
    main()

Overwriting ./modules/data_preparation.py


In [99]:
from azure.ai.ml import load_component
import os

# Get the current working directory
current_directory = os.getcwd()

# Specify the relative paths to your directories and files
aml_config_relative_path = 'modules'  # Adjust this to your aml_config directory
components_relative_path = 'src'  # Adjust this to your components directory
data_prep_yaml_file = 'data_prep.yaml'  # Adjust this to your data_prep.yaml file

# Construct the absolute paths
aml_config_dir = os.path.abspath(os.path.join(current_directory, aml_config_relative_path))
component_directory = os.path.abspath(os.path.join(current_directory, components_relative_path))
data_prep_yaml_path = os.path.abspath(os.path.join(component_directory, data_prep_yaml_file))

# Specify the directory containing the aml_config module
aml_config_dir = os.path.abspath('./modules')  # Use an absolute path

# Check if the directory exists
if not os.path.exists(aml_config_dir):
    print(f"Directory '{aml_config_dir}' does not exist. Please check the directory path.")
    sys.exit(1)

print(aml_config_dir)

/Users/ejenamvictor/Desktop/project_CAS/modules


In [76]:
%%writefile {modules_dir}/data_component_registration.py

from azure.ai.ml import load_component
import os
import sys

# Get the current working directory
current_directory = os.getcwd()

# Specify the relative paths to your directories and files
aml_config_relative_path = 'modules'  # Adjust this to your aml_config directory
#components_relative_path = 'src'  # Adjust this to your components directory
data_prep_yaml_file = 'data_prep.yaml'  # Adjust this to your data_prep.yaml file

data_prep_yaml_path = os.path.abspath(os.path.join(current_directory, data_prep_yaml_file))

current_dir = os.path.dirname(os.path.abspath(__file__))
src_dir = os.path.join(current_dir, '..', 'modules')
sys.path.append(src_dir)

import aml_config as aml

#parent_directory = 'modules/'  # Adjust this to your components directory

#loaded_component_prep = load_component(source=current_dir + '/' + 'data_prep.yaml')

# Loading the component from the yaml file
loaded_component_prep = load_component(source=data_prep_yaml_path)

ml_client = aml.create_ml_client()

# Now we register the component to the workspace
data_prep_component = ml_client.create_or_update(loaded_component_prep)

# Create (register) the component in your workspace
print(
    f"Component {data_prep_component.name} with Version {data_prep_component.version} is registered"
)

Overwriting ./modules/data_component_registration.py


In [30]:
import os

# Get the current working directory
current_directory = os.getcwd()

# Specify the relative paths to your directories and files
aml_config_relative_path = 'modules'  # Adjust this to your aml_config directory
data_prep_relative_path = 'src'  # Adjust this to your components directory
data_prep_yaml_path = 'data_prep.yaml'  # Adjust this to your data_prep.yaml file

# Construct the absolute paths
aml_config_dir = os.path.abspath(os.path.join(current_directory, aml_config_relative_path))
component_directory = os.path.abspath(os.path.join(current_directory, data_prep_relative_path))
data_prep_yaml_path = os.path.abspath(os.path.join(component_directory, data_prep_yaml_path))

print(data_prep_yaml_path)

/Users/ejenamvictor/Desktop/project_CAS/src/data_prep.yaml


In [31]:
# the dsl decorator tells the sdk that we are defining an Azure Machine Learning pipeline
from azure.ai.ml import dsl, Input, Output, load_component
import os
import mlflow
import sys
import pandas as pd
import logging
# Configure logging
logging.basicConfig(level=logging.INFO)
log = logging.getLogger(__name__)

In [32]:
import os
pipeline_dir = "./pipeline"
os.makedirs(pipeline_dir, exist_ok=True)

In [173]:
%%writefile {modules_dir}/pipeline.py
# the dsl decorator tells the sdk that we are defining an Azure Machine Learning pipeline
from azure.ai.ml import dsl, Input, Output, load_component
import os
import mlflow
import sys
import pandas as pd
import logging
# Configure logging
logging.basicConfig(level=logging.INFO)
log = logging.getLogger(__name__)

#aml_config_dir = os.path.abspath(os.path.join(current_directory, aml_config_relative_path))

#current_directory = os.getcwd()
current_dir = os.path.dirname(os.path.abspath(__file__))
src_dir = os.path.join(current_dir, '..', 'modules')
sys.path.append(src_dir)

from aml_config import *

ml_client = create_ml_client()


cpu_compute_target, cpu_cluster = get_compute(ml_client, compute_name="cpu-cluster", vm_size="STANDARD_E16S_V3", min_instance=0, max_instances=4)

parent_directory = '../modules/'  # Adjust this to your components directory

data_prep = load_component(source=parent_directory + 'data_prep.yaml')

@dsl.pipeline(
    compute=cpu_compute_target
    if (cpu_cluster)
    else "serverless",  # "serverless" value runs pipeline on serverless compute
    description="first pipeline",
)
def classification_pipeline(
    input_ds,
    ms_threshold,
    corr_threshold,
    plot_heatmaps,
    max_unique_threshold,
    output_data,
):
    # using data_prep_function like a python call with its own inputs
    data_prep_job = data_prep(
        input_ds = input_ds,
        ms_threshold = ms_threshold,
        corr_threshold = corr_threshold,
        plot_heatmaps = plot_heatmaps,
        max_unique_threshold = max_unique_threshold
    )
    
    #data_prep_job.outputs.output_data = Output(type='uri_folder', path=output_data, mode='rw_mount')
    # keys will code for the pipeline output identifier
    return {
        "pipeline_job_train_data": data_prep_job.outputs.output_data,
    }


Overwriting ./modules/pipeline.py


In [175]:
%%writefile {modules_dir}/pipeline_job_submission.py
from azure.ai.ml import load_component
from azure.ai.ml import dsl, Input, Output
import os
import sys


current_dir = os.path.dirname(os.path.abspath(__file__))
src_dir = os.path.join(current_dir, '..', 'modules')
sys.path.append(src_dir)

#import pipeline as pi

# Get the current working directory
##current_directory = os.getcwd()

# Specify the relative paths to your directories and files
#aml_config_relative_path = 'modules'  # Adjust this to your aml_config directory
#components_relative_path = 'src'  # Adjust this to your components directory
#data_preparation_py_path = 'data_preparation.py'  # Adjust this to your data_prep.yaml file

# Construct the absolute paths
#aml_config_dir = os.path.abspath(os.path.join(current_directory, aml_config_relative_path))
#component_directory = os.path.abspath(os.path.join(current_directory, components_relative_path))
#data_prep_python_path = os.path.abspath(os.path.join(component_directory, data_preparation_py_path))
from aml_config import *
from pipeline import *

ml_client = create_ml_client()

#machine_data = ml_client.data.get(name='machine-failure', version='1')
#print(f'Data asset URI:{machine_data.path}')

# Define input data and parameters

# Create a pipeline
pipeline = classification_pipeline(
    input_ds=Input(type="uri_file", path='/Users/ejenamvictor/Desktop/project_CAS/ai4i2020.csv'),
    ms_threshold = 10,
    corr_threshold = 0.8,
    plot_heatmaps = True,
    max_unique_threshold = 0.9,
    output_data = 'processed_data'
)



#current_dir = os.path.dirname(os.path.abspath(__file__))
#src_dir = os.path.join(current_dir, '.', 'modules')
#sys.path.append(src_dir)


pipeline_job = ml_client.jobs.create_or_update(
    pipeline,
    # Project's name
    experiment_name="data_prep_component",
)
ml_client.jobs.stream(pipeline_job.name)

Overwriting ./modules/pipeline_job_submission.py


In [266]:
# submit the pipeline job
# Specify the directory containing the aml_config module

current_dir = os.path.dirname(os.path.abspath(__file__))
src_dir = os.path.join(current_dir, '.', 'modules')
sys.path.append(src_dir)

#aml_config_dir = os.path.abspath('./modules')  # Use an absolute path

# Check if the directory exists
#if not os.path.exists(aml_config_dir):
#    print(f"Directory '{aml_config_dir}' does not exist. Please check the directory path.")
#    sys.exit(1)

## Add the aml_config directory to sys.path
#sys.path.insert(0, aml_config_dir)

import aml_config as aml 

ml_client = aml.create_ml_client()

pipeline_job = ml_client.jobs.create_or_update(
    pipeline,
    # Project's name
    experiment_name="data_prep_component",
)
ml_client.jobs.stream(pipeline_job.name)

INFO:azureml.core.workspace:Found the config file in: /Users/ejenamvictor/Desktop/project_CAS/config.json
Found the config file in: ../config.json


An error occurred while creating the AML client: 'AzureCliCredential' object has no attribute '_get_service_client'
Creating a new configuration file...


INFO:azure.identity._internal.decorators:AzureCliCredential.get_token succeeded
INFO:azure.identity._internal.decorators:AzureCliCredential.get_token succeeded
INFO:azure.identity._internal.decorators:AzureCliCredential.get_token succeeded
INFO:azure.identity._internal.decorators:AzureCliCredential.get_token succeeded


RunId: plucky_coat_jzptvvnsnt
Web View: https://ml.azure.com/runs/plucky_coat_jzptvvnsnt?wsid=/subscriptions/1ebe1808-a398-4ab0-b17c-1e3649ea39d5/resourcegroups/practice_resource/workspaces/practice_workspace


INFO:azure.identity._internal.decorators:AzureCliCredential.get_token succeeded



Streaming logs/azureml/executionlogs.txt

[2023-09-30 13:33:20Z] Submitting 1 runs, first five are: 8f1602fb:f40e393c-9a1a-481f-a28c-48571b0f888b
[2023-09-30 13:37:18Z] Execution of experiment failed, update experiment status and cancel running nodes.

Execution Summary
RunId: plucky_coat_jzptvvnsnt
Web View: https://ml.azure.com/runs/plucky_coat_jzptvvnsnt?wsid=/subscriptions/1ebe1808-a398-4ab0-b17c-1e3649ea39d5/resourcegroups/practice_resource/workspaces/practice_workspace


JobException: Exception : 
 {
    "error": {
        "code": "UserError",
        "message": "Pipeline has failed child jobs. Failed nodes: /data_prep_job. For more details and logs, please go to the job detail page and check the child jobs.",
        "message_format": "Pipeline has failed child jobs. {0}",
        "message_parameters": {},
        "reference_code": "PipelineHasStepJobFailed",
        "details": []
    },
    "environment": "ukwest",
    "location": "ukwest",
    "time": "2023-09-30T13:37:18.037605Z",
    "component_name": ""
} 

#%%writefile {components_dir}/data_prep.yaml

#$schema: https://azuremlschemas.azureedge.net/latest/commandComponent.schema.json

from azure.ai.ml import load_component
import os

# Get the current working directory
current_directory = os.getcwd()

# Specify the relative paths to your directories and files
aml_config_relative_path = 'modules'  # Adjust this to your aml_config directory
components_relative_path = 'components_registration'  # Adjust this to your components directory
data_prep_python_file = 'data_prep.py'  # Adjust this to your data_prep.yaml file

# Construct the absolute paths
aml_config_dir = os.path.abspath(os.path.join(current_directory, aml_config_relative_path))
component_directory = os.path.abspath(os.path.join(current_directory, components_relative_path))
data_prep_python_path = os.path.abspath(os.path.join(component_directory, data_prep_python_file))


        
# <component>
name: data_prep
display_name: data_preparation
# version: 1 # Not specifying a version will automatically update the version
type: command
inputs:
  subscription_id:
    type: string
  resource_group:
    type: string
  workspace_name:
    type: string
  data_name:
    type: string
  plot_save_dir:
    type: string
  ms_threshold:
    type: number
  corr_threshold:
    type: number
  plot_heatmaps:
    type: boolean
  max_unique_threshold:
    type: number
outputs:
  output_data:
    type: uri_folder
code: ..
environment:
  # for this step, we'll use an AzureML curate environment
  azureml:AzureML-sklearn-1.0-ubuntu20.04-py38-cpu:1
command: >-
  python ${{inputs.data_prep_python_path}} 
  --subscription_id ${{inputs.subscription_id}} 
  --resource_group ${{inputs.resource_group}} 
  --workspace_name ${{inputs.workspace_name}}
  --data_name ${{inputs.data_name}}
  --plot_save_dir ${{inputs.plot_save_dir}}
  --ms_threshold ${{inputs.ms_threshold}}
  --corr_threshold ${{inputs.corr_threshold}}
  --plot_heatmaps ${{inputs.plot_heatmaps}}
  --max_unique_threshold ${{inputs.max_unique_threshold}}
  --output_data ${{outputs.output_data}}
# </component>


In [None]:
import os
component_register_dir = "./component_registration"
os.makedirs(component_register_dir, exist_ok=True)

In [None]:
%%writefile {component_register_dir}/data_prep.py
# importing the Component Package
from azure.ai.ml import load_component
import os

# Get the current working directory
current_directory = os.getcwd()

# Specify the relative paths to your directories and files
aml_config_relative_path = 'modules'  # Adjust this to your aml_config directory
components_relative_path = 'components'  # Adjust this to your components directory
data_prep_yaml_file = 'data_prep.yaml'  # Adjust this to your data_prep.yaml file

# Construct the absolute paths
aml_config_dir = os.path.abspath(os.path.join(current_directory, aml_config_relative_path))
component_directory = os.path.abspath(os.path.join(current_directory, components_relative_path))
data_prep_yaml_path = os.path.abspath(os.path.join(component_directory, data_prep_yaml_file))



# Specify the directory containing the aml_config module
aml_config_dir = os.path.abspath('./modules')  # Use an absolute path

# Check if the directory exists
if not os.path.exists(aml_config_dir):
    print(f"Directory '{aml_config_dir}' does not exist. Please check the directory path.")
    sys.exit(1)

# Add the aml_config directory to sys.path
sys.path.insert(0, aml_config_dir)

import aml_config as aml

# Specify the directory containing the components
#component_directory = os.path.abspath('./components')  # Use an absolute path

# Check if the directory exists
#if not os.path.exists(component_directory):
#    print(f"Directory '{component_directory}' does not exist. Please check the directory path.")
#    sys.exit(1)

# Add the components directory to sys.path
#sys.path.insert(0, component_directory)


# parent_directory = '../components/'
# Loading the component from the yml file
#loaded_component_prep = load_component(source=os.path.join(component_directory, "data_prep.yaml"))

loaded_component_prep = load_component(source=data_prep_yaml_path)

ml_client = aml.create_ml_client()

# Now we register the component to the workspace
data_prep = ml_client.create_or_update(loaded_component_prep)

# Create (register) the component in your workspace
print(
    f"Component {data_prep.name} with Version {data_prep.version} is registered"
)

In [None]:
import os

# Get the current working directory
current_directory = os.getcwd()

# Specify the relative paths to your directories and files
aml_config_relative_path = 'modules'  # Adjust this to your aml_config directory
components_relative_path = 'components'  # Adjust this to your components directory
data_prep_yaml_file = 'data_prep.yaml'  # Adjust this to your data_prep.yaml file

# Construct the absolute paths
aml_config_dir = os.path.abspath(os.path.join(current_directory, aml_config_relative_path))
component_directory = os.path.abspath(os.path.join(current_directory, components_relative_path))
data_prep_yaml_path = os.path.abspath(os.path.join(component_directory, data_prep_yaml_file))

# Check if the directories and files exist
if not os.path.exists(aml_config_dir):
    print(f"Directory '{aml_config_dir}' does not exist. Please check the directory path.")
    sys.exit(1)

if not os.path.exists(component_directory):
    print(f"Directory '{component_directory}' does not exist. Please check the directory path.")
    sys.exit(1)

if not os.path.exists(data_prep_yaml_path):
    print(f"File '{data_prep_yaml_path}' does not exist. Please check the file path.")
    sys.exit(1)

# Now, you have the absolute paths
print("Absolute path to aml_config directory:", aml_config_dir)
print("Absolute path to components directory:", component_directory)
print("Absolute path to data_prep.yaml file:", data_prep_yaml_path)

In [None]:
# the dsl decorator tells the sdk that we are defining an Azure Machine Learning pipeline
from azure.ai.ml import dsl, Input, Output, load_component
import os
import mlflow
import sys
import pandas as pd
import logging
# Configure logging
logging.basicConfig(level=logging.INFO)
log = logging.getLogger(__name__)



import os

# Get the current working directory
current_directory = os.getcwd()

# Specify the relative paths to your directories and files
aml_config_relative_path = 'modules'  # Adjust this to your aml_config directory
components_relative_path = 'components'  # Adjust this to your components directory
data_preparation_py_path = 'data_preparation.py'  # Adjust this to your data_prep.yaml file

# Construct the absolute paths
aml_config_dir = os.path.abspath(os.path.join(current_directory, aml_config_relative_path))
component_directory = os.path.abspath(os.path.join(current_directory, components_relative_path))
data_preparation_py_path = os.path.abspath(os.path.join(component_directory, data_preparation_py_path))




# Specify the directory containing the aml_config module
aml_config_dir = os.path.abspath('./modules')  # Use an absolute path

# Check if the directory exists
if not os.path.exists(aml_config_dir):
    print(f"Directory '{aml_config_dir}' does not exist. Please check the directory path.")
    sys.exit(1)

# Add the aml_config directory to sys.path
sys.path.insert(0, aml_config_dir)

import aml_config as aml
ml_client = aml.create_ml_client()

# Specify the directory containing the components
#component_directory = os.path.abspath('./components')  # Use an absolute path

# Check if the directory exists
#if not os.path.exists(component_directory):
#    print(f"Directory '{component_directory}' does not exist. Please check the directory path.")
#    sys.exit(1)

# Add the components directory to sys.path
#sys.path.insert(0, component_directory)

# Loading the component from the yml file
data_prep = load_component(source=data_prep_yaml_path)

# Add the aml_config directory to sys.path
#sys.path.insert(0, component_directory)


cpu_compute_target, cpu_cluster = aml.get_compute(ml_client, compute_name="cpu-cluster", vm_size="STANDARD_E16S_V3", min_instance=0, max_instances=4)

@dsl.pipeline(
    compute=cpu_compute_target
    if (cpu_cluster)
    else "serverless",  # "serverless" value runs pipeline on serverless compute
    description="first pipeline",
)
def classification_pipeline(
    subscription_id,
    resource_group,
    workspace_name,
    data_name,
    plot_save_dir,
    ms_threshold,
    corr_threshold,
    plot_heatmaps,
    #columns_to_drop,
    max_unique_threshold,
    data_preparation_py_path
):
    # using data_prep_function like a python call with its own inputs
    data_prep_job = data_prep(
        subscription_id = subscription_id,
        resource_group = resource_group,
        workspace_name = workspace_name,
        data_name = data_name,
        plot_save_dir = plot_save_dir,
        ms_threshold = ms_threshold,
        corr_threshold = corr_threshold,
        plot_heatmaps = plot_heatmaps,
        #columns_to_drop = columns_to_drop,
        max_unique_threshold = max_unique_threshold,
        data_preparation_py_path = data_preparation_py_path
        
    )
    # keys will code for the pipeline output identifier
    return {
        "pipeline_job_train_data": data_prep_job.outputs.output_data,
    }

In [None]:
import os

# Get the current working directory
current_directory = os.getcwd()

# Specify the relative paths to your directories and files
#aml_config_relative_path = 'modules'  # Adjust this to your aml_config directory
components_relative_path = 'src'  # Adjust this to your components directory
data_preparation_py_path = 'data_preparation.py'  # Adjust this to your data_prep.yaml file

# Construct the absolute paths
aml_config_dir = os.path.abspath(os.path.join(current_directory, aml_config_relative_path))
component_directory = os.path.abspath(os.path.join(current_directory, components_relative_path))
data_prep_python_path = os.path.abspath(os.path.join(component_directory, data_preparation_py_path))

# Now, you have the absolute paths
print("Absolute path to aml_config directory:", aml_config_dir)
print("Absolute path to components directory:", component_directory)
print("Absolute path to data_prep.yaml file:", data_prep_python_path)

In [None]:
from azure.ai.ml import load_component
import os

# Get the current working directory
current_directory = os.getcwd()

# Specify the relative paths to your directories and files
#aml_config_relative_path = 'modules'  # Adjust this to your aml_config directory
components_relative_path = 'src'  # Adjust this to your components directory
data_preparation_py_path = 'data_preparation.py'  # Adjust this to your data_prep.yaml file

# Construct the absolute paths
aml_config_dir = os.path.abspath(os.path.join(current_directory, aml_config_relative_path))
component_directory = os.path.abspath(os.path.join(current_directory, components_relative_path))
data_prep_python_path = os.path.abspath(os.path.join(component_directory, data_preparation_py_path))

# Let's instantiate the pipeline with the parameters of our choice
pipeline = classification_pipeline(
    subscription_id = "1ebe1808-a398-4ab0-b17c-1e3649ea39d5",
    resource_group = "practice_resource",
    workspace_name = "practice_workspace",
    data_name = 'ai4i2020.csv',
    plot_save_dir = 'plots',
    ms_threshold = 10,
    corr_threshold = 0.8,
    plot_heatmaps = True,
    #columns_to_drop = columns_to_drop,
    max_unique_threshold = 0.9, 
    data_preparation_py_path = data_preparation_py_path
)

In [None]:
# submit the pipeline job
# Specify the directory containing the aml_config module
aml_config_dir = os.path.abspath('./modules')  # Use an absolute path

# Check if the directory exists
if not os.path.exists(aml_config_dir):
    print(f"Directory '{aml_config_dir}' does not exist. Please check the directory path.")
    sys.exit(1)

# Add the aml_config directory to sys.path
sys.path.insert(0, aml_config_dir)

import aml_config as aml 

ml_client = aml.create_ml_client()

pipeline_job = ml_client.jobs.create_or_update(
    pipeline,
    # Project's name
    experiment_name="data_prep_component",
)
ml_client.jobs.stream(pipeline_job.name)

"subscription_id": "1ebe1808-a398-4ab0-b17c-1e3649ea39d5",
            "resource_group": "practice_resource",
            "workspace_name": "practice_workspace",

df = dp.drop_high_cardinality_features(df, max_unique_threshold=0.9)
df = dp.drop_highly_correlated_features(df, corr_threshold=0.8, plot_heatmaps=True)
df = dp.replace_missing_values(df, ms_threshold=10)

In [None]:
#%%writefile {scripts_dir}/model_training.py
import os
import sys

sys.path.insert(0, "/modules")
from data_prep import *
from data_ingestion import *
import argpase 
import mlflow

#def a custom argument type for a list of strings
def list_of_strings(arg):
    return arg.split(',')

def load_data(subscription_id:str, resource_group_name:str, workspace_name:str, data_path:str)
def main():
    #setup arg parser
    parser = argparse.ArgumentParser()
    
    #add arguments
    parser.add_argument('--subscription_id', dest='subscription_id',
                       type=str, help="Azure subscription id")
    parser.add_argument('--resource_group', dest='resource_group',
                       type=str, help="resource group name")
    parser.add_argument('--workspace_name', dest='workspace_name',
                       type=str, help="workspace_name")
    parser.add_argument('--data_name', dest='data_name',
                       type=str, help="data_path")
    parser.add_argument('--plot_save_dir', dest='plot_save_dir'
                       type=str, help='Name of Parent Directory to store pipeline artefacts')
    parser.add_argument('--ms_threshold', dest='ms_threshold',
                       type=int, help='Threshold to switch between interpolation and iterative imputer')
    parser.add_argument('--corr_threshold', dest='corr_threshold',
                       type=float, help='The threshold for correlation above which features will be dropped')
    parser.add_argument('--plot_heatmaps', dest='plot_heatmaps',
                       type=bool, help='Whether to plot heatmaps before and after dropping (default is True)')
    parser.add_argument('columns_to_drop', dest='columns_to_drop', 
                       type=list_of_strings, help='Single column name or a list of column names to be dropped')
    parser.add_argument('--max_unique_threshold', dest='max_unique_threshold',
                       type=float, help='The maximum allowed fraction of unique values in a column (default is 0.9)')
    parser.add_argument('--output_data', dest='output_data',
                       type=str)
    
    # parse args
    args = parser.parse_args()
    # Start Logging
    mlflow.start_run()
    
    # Load in the data from synapse DataLake Storage
    raw_df = load_data(args.subscription_id, args.resource_group_name, args.workspace_name, args.data_name)
    # Direct working directory to Artefact location
    set_cwd_path(args.plot_save_dir)
    
    if args.training_run == 'True':
        training_run = True
    elif args.training_run == 'False':
        training_run = False
    else:
        raise ValueError('Training Run Parameter must be "True" or "False"')
        
    df = check_missing_values(raw_df)
    df = drop_high_cardinality_features(df, max_unique_threshold=args.max_unique_threshold)
    df = replace_missing_values(df, ms_threshold=args.ms_threshold)
    df = drop_highly_correlated_features(df, corr_threshold=args.threshold, plot_heatmaps=True)
    
    # Reset directory back to initial working directory
    set_cwd_path('..')
    
    mlflow.log_metric('Sample Size', df.shape[0])
    
    df.to_parquet(os.path.join(args.output_data, 'processed_df.parquet'), engine='pyarrow')
    
    # End Logging
    mlflow.end_run()
    
if __name__ =='__main__':
    main()

In [None]:
X_train, X_test, y_train, y_test = sd.custom_train_test_split(df, target_column, test_size=0.2, random_state=None, time_series=False)

In [None]:
#%%writefile {components_dir}/data_prep.yml

In [None]:
# <component>
name: data_prep
display_name: data_preparation
# version: 1 # Not specifying a version will automatically update the version
type: command
inputs:
  subscription_id:
    type: string
  resource_group:
    type: string
  workspace_name:
    type: string
  data_name:
    type: string
  plot_save_dir:
    type: string
  ms_threshold:
    type: number
  corr_threshold:
    type: number
  plot_heatmaps:
    type: boolean
  columns_to_drop:
    type: string
  max_unique_threshold:
    type: number
outputs:
  output_data:
    type: uri_folder
code: .
environment:
  # for this step, we'll use an AzureML curate environment
  azureml:AzureML-sklearn-1.0-ubuntu20.04-py38-cpu:1
command: >-
  python src/data_preparation.py 
  --subscription_id ${{inputs.subscription_id}} 
  --resource_group ${{inputs.resource_group}} 
  --workspace_name ${{inputs.workspace_name}}
  --data_name ${{inputs.data_name}}
  --plot_save_dir ${{inputs.plot_save_dir}}
  --ms_threshold ${{inputs.ms_threshold}}
  --corr_threshold ${{inputs.corr_threshold}}
  --plot_heatmaps ${{inputs.plot_heatmaps}}
  --columns_to_drop ${{inputs.columns_to_drop}}
  --max_unique_threshold ${{inputs.max_unique_threshold}}
  --output_data ${{outputs.output_data}}
# </component>


In [None]:
#%%writefile {modules_dir}/aml_config.py
import os
import sys

from azure.identity import AzureCliCredential
from azureml.core.compute import AmlCompute
from azureml.core.compute_target import ComputeTargetException
from azureml.core import Workspace
import json

#def create_ml_client(subscription_id: str, resource_group: str, workspace_name: str, tenant_id: str = None):
def create_ml_client():
    
    """
    Create an Azure Machine Learning workspace client.

    This function attempts to create an Azure Machine Learning workspace client using the provided parameters. If it fails
    to create a client, it generates a new configuration file with the provided parameters and tries again.

    Parameters:
        subscription_id (str): Azure subscription ID.
        resource_group (str): Azure resource group name.
        workspace_name (str): Azure Machine Learning workspace name.
        tenant_id (str, optional): Azure Active Directory tenant ID. Default is None.

    Returns:
        azureml.core.Workspace: An Azure Machine Learning workspace client.
    """
    # Create an Azure CLI credential
    credentials = AzureCliCredential(tenant_id='6aa8da55-4c6f-496e-8fc1-de0f7819b03b')
    
    try:
        # Try to create the Azure Machine Learning workspace client using provided parameters
        ml_client = Workspace.from_config(auth=credentials)
    except Exception as ex:
        print("An error occurred while creating the AML client:", str(ex))
        print("Creating a new configuration file...")

        # Define the workspace configuration based on the provided parameters
        client_config = {
            "subscription_id": "1ebe1808-a398-4ab0-b17c-1e3649ea39d5",
            "resource_group_name": "victor_resource",
            "workspace_name": "victor_workspace",
        }

        # Write the configuration to a JSON file
        config_path = "../project_CAS/config.json"
        os.makedirs(os.path.dirname(config_path), exist_ok=True)
        with open(config_path, "w") as fo:
            json.dump(client_config, fo)
        
        # Try to create the Azure Machine Learning workspace client again
        ml_client = Workspace.from_config(path=config_path)
    
    return ml_client

def get_compute(ml_client, name:str, vm_size:str, min_instance:int, max_instances:int):
    # specify aml compute name.
    cpu_compute_target = name
    
    try:
        cpu_cluster = ml_client.compute_targets[cpu_compute_target]
        print(f'Using existing compute target: {cpu_compute_target}')
    except KeyError:
        print(f"Creating a new cpu compute target: {cpu_compute_target}...")
        compute_config = AmlCompute.provisioning_configuration(
            vm_size=vm_size,
            min_nodes=min_instance,
            max_nodes=max_instances
        )
        cpu_cluster = AmlCompute.create(ml_client, name=cpu_compute_target, provisioning_configuration=compute_config)
        cpu_cluster.wait_for_completion(show_output=True)
        
    return cpu_cluster

In [None]:
sys.path.insert(0, "./modules")
import aml_config as ac

In [None]:
ml_client = ac.create_ml_client(subscription_id="1ebe1808-a398-4ab0-b17c-1e3649ea39d5", resource_group="victor_resource", workspace_name="victor_workspace", tenant_id='6aa8da55-4c6f-496e-8fc1-de0f7819b03b')

In [None]:
ac.get_compute(ml_client, name="cpu-cluster", vm_size="STANDARD_E16S_V3", min_instance=0, max_instances=4)


In [None]:
import json as j

In [None]:
print(j.__version__) 

In [None]:
name: model-env
channels:
  - conda-forge
dependencies:
  - python=3.8.*
  - pip=23.2.*
  - pip:
    - numpy==1.22.*
    - mlflow== 2.4.1
    - azureml-core==1.53.*
    - azureml-defaults==1.53.*
    - mlflow==2.6.*
    - scikit-learn==1.3.*
    - azure-ai-ml==1.9.0
    - requests==2.31.*
    - azure-identity==1.14.0
    - scipy==1.7.1
    - pandas==1.4.4
    - shap==0.42.1
    - joblib==1.3.2
    - seaborn==0.11.2
    - matplotlib==3.4.*
    - shapely==2.0.*
    - scikit-optimize==0.9.*