MIT License

Copyright (c) 2022 onepoint and contributors

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

# Common methods and constants

## Imports

In [15]:
import pandas as pd
import numpy as np
import ast
import io
import smote_variants as sv
import logging
from enum import Enum

from sklearn.base import BaseEstimator, ClassifierMixin, RegressorMixin
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import mutual_info_classif, mutual_info_regression
from sklearn.model_selection import cross_validate
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import make_pipeline

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import AdaBoostRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import KNeighborsRegressor

In [16]:
# Global configuration
# Show floats with three decimals whenever pandas renders them.
pd.set_option('display.float_format', lambda x: '%.3f' % x)
# None removes the limit on the number of columns pandas displays.
pd.options.display.max_columns = None
#pd.options.display.max_rows = 999
# Raise the level of the logger named after the smote_variants module to
# WARNING so that lower-severity messages are not emitted.
logging.getLogger(sv.__name__).setLevel(logging.WARNING)

## Data constants and methods

In [17]:
# Column names applied to labels.csv by load_labels (the file's own header
# row is skipped there and replaced by these names).
LABELS_COLUMNS = ['projectname', 'packageandclass', 'path',
                  'readable very high', 'readable high', 'readable low', 'readable very low',
                  'understandable very high', 'understandable high', 'understandable low', 'understandable very low',
                  'complex very high', 'complex high', 'complex low', 'complex very low',
                  'not modular very high', 'not modular high', 'not modular low', 'not modular very low',
                  'maintainable very high', 'maintainable high', 'maintainable low', 'maintainable very low']

# The four "maintainable ..." columns of LABELS_COLUMNS.
MAINTAINABILITY_COLUMNS = ['maintainable very high', 'maintainable high', 'maintainable low', 'maintainable very low']

# Metric subset returned by get_metrics_columns for MetricType.LOC.
METRIC_COLUMNS_LOC = ["nos"]

# Metric subset returned by get_metrics_columns for MetricType.CGC.
METRIC_COLUMNS_CGC = ["cgc"]

# Full metric set (MetricType.ALL); these are also the feature columns
# ranked by the sort_metrics* functions.
METRIC_COLUMNS = ["nos", "non", "non_m", "nos_m",
                  "non_am", "nos_am", "non_lm", "nos_lm",
                  "mll", "nos_deeper_4",
                  "cbo", "cbod", "cboi", "cgc", "cyc", "dac", "dit", "lcom4", "lpc",
                  "mnd", "mpc", "nlam", "nle", "noa", "noc", "nod", "noi", "noi_ic",
                  "nolm", "nom", "nop", "rfc", "rfc_ic"]

# Name parts of the metrics files: load_metrics reads
# dataset/metrics/metrics_<name>.csv for every entry.
METRICS_FILES = ["aoi", "argouml", "diarymanagement-1", "diarymanagement-2",
                 "diarymanagement-3", "jsweet-1", "jsweet-2", "jsweet-3",
                 "jsweet-4", "junit4"]

# Random seeds; the processing functions repeat their work once per seed.
SEEDS = [0, 13, 17, 33, 42, 89, 15837, 40325, 47137, 98035, 98225, 147904, 170800,
         336695, 359004, 402483, 694957, 1606706, 2956982, 3180998, 109493891, 200929116,
         721672933, 963307431, 1085070426, 1718898832, 3256195516, 3866174187, 4210241333, 4294967295]

In [18]:
def load_metrics():
    """Read every per-project metrics file and merge them into one DataFrame.

    For each name in ``METRICS_FILES`` the file
    ``dataset/metrics/metrics_<name>.csv`` is parsed with ``;`` as delimiter
    and its third column as index. The frames are concatenated and rows that
    share an index value are collapsed with ``groupby(...).first()``.
    """
    frames = [
        pd.read_csv("dataset/metrics/metrics_" + name + ".csv", delimiter=";", index_col=2)
        for name in METRICS_FILES
    ]
    merged = pd.concat(frames)
    return merged.groupby(merged.index).first()

def load_labels():
    """Load the expert labels as a DataFrame indexed by ``packageandclass``.

    The raw text of ``dataset/maintainability_experts_labels/labels.csv`` has
    every ``"{`` and ``}"`` sequence removed before it is parsed. The first
    line of the file is skipped and ``LABELS_COLUMNS`` is used as the header.
    """
    with open("dataset/maintainability_experts_labels/labels.csv") as handle:
        raw_text = handle.read()

    # Drop the quoted braces so the values they wrap parse as plain CSV fields.
    cleaned_text = raw_text.replace("\"{", "").replace("}\"", "")

    frame = pd.read_csv(io.StringIO(cleaned_text), header=None, skiprows=1, names=LABELS_COLUMNS)
    return frame.set_index("packageandclass")

def get_data():
    """Return the labels inner-joined with the metrics on their index.

    Only rows whose index is present in both ``load_labels()`` and
    ``load_metrics()`` are kept.
    """
    return load_labels().join(load_metrics(), how="inner")

## Custom estimator

In [19]:
 def get_estimator(name, parameters, seed):
     """Instantiate the estimator registered under `name`.

     `parameters` is forwarded to the estimator: unpacked as keyword arguments
     for the scikit-learn estimators, passed as a single dict (together with
     `seed`) to the composite classifiers of this file, and ignored by the
     baseline estimators. `seed` becomes ``random_state`` for the scikit-learn
     estimators that receive one here; the nearest-neighbour estimators are
     built without it.

     Raises ValueError when `name` is not one of the supported names.
     """

     # Each entry is a zero-argument factory, so an estimator class is only
     # looked up when its name is actually requested.
     factories = {
         # scikit-learn estimators seeded through random_state
         'ExtraTreesClassifier': lambda: ExtraTreesClassifier(**parameters, random_state=seed),
         'ExtraTreesRegressor': lambda: ExtraTreesRegressor(**parameters, random_state=seed),
         'GradientBoostingClassifier': lambda: GradientBoostingClassifier(**parameters, random_state=seed),
         'GradientBoostingRegressor': lambda: GradientBoostingRegressor(**parameters, random_state=seed),
         'RandomForestClassifier': lambda: RandomForestClassifier(**parameters, random_state=seed),
         'RandomForestRegressor': lambda: RandomForestRegressor(**parameters, random_state=seed),
         'AdaBoostClassifier': lambda: AdaBoostClassifier(**parameters, random_state=seed),
         'AdaBoostRegressor': lambda: AdaBoostRegressor(**parameters, random_state=seed),
         'LogisticRegression': lambda: LogisticRegression(**parameters, random_state=seed),
         # scikit-learn estimators built without a seed
         'KNeighborsClassifier': lambda: KNeighborsClassifier(**parameters),
         'KNeighborsRegressor': lambda: KNeighborsRegressor(**parameters),
         # composite estimators taking the parameter dict and the seed
         'BinaryDecompositionClassifier': lambda: BinaryDecompositionClassifier(parameters, seed),
         'ChainedBinaryClassifier': lambda: ChainedBinaryClassifier(parameters, seed),
         'ChainedBinaryInverseClassifier': lambda: ChainedBinaryInverseClassifier(parameters, seed),
         'ChainedBinaryMedianClassifier': lambda: ChainedBinaryMedianClassifier(parameters, seed),
         'IndividualProbabilitiesClassifier': lambda: IndividualProbabilitiesClassifier(parameters, seed),
         'BinaryProbabilitiesClassifier': lambda: BinaryProbabilitiesClassifier(parameters, seed),
         'RoundedRegressorClassifier': lambda: RoundedRegressorClassifier(parameters, seed),
         # baselines built without any argument
         'AlwaysAClassifier': lambda: AlwaysAClassifier(),
         'ClassBaselocClassifier': lambda: ClassBaselocClassifier(),
         'AlwaysTrueClassifier': lambda: AlwaysTrueClassifier(),
         'BinaryBaselocClassifier': lambda: BinaryBaselocClassifier(),
         'AlwaysMeanRegressor': lambda: AlwaysMeanRegressor(),
         'ContinuousBaselocRegressor': lambda: ContinuousBaselocRegressor(),
     }

     factory = factories.get(name)
     if factory is None:
         raise ValueError(f"{name} is not a supported estimator.")

     return factory()

In [20]:
class OversamplingClassifier(sv.classifiers.OversamplingClassifier):
    """smote_variants OversamplingClassifier with extra forwarding members.

    ``decision_function`` and the ``classes_`` attribute are delegated to the
    wrapped ``self.classifier_obj``.
    """

    def decision_function(self, X):
        wrapped = self.classifier_obj
        return wrapped.decision_function(X)

    @property
    def classes_(self):
        return self.classifier_obj.classes_

    @classes_.setter
    def classes_(self, value):
        self.classifier_obj.classes_ = value

    @classes_.deleter
    def classes_(self):
        del self.classifier_obj.classes_

In [21]:
# Here, by convention:
# - maintainable very high -> 0
# - maintainable high      -> 1
# - maintainable low       -> 2
# - maintainable very low  -> 3

In [22]:
class BinaryDecompositionClassifier(BaseEstimator, ClassifierMixin):
    """Four-class classifier assembled from three binary threshold classifiers.

    One sub-classifier is trained per threshold of the 0..3 class encoding
    (``y > 0``, ``y > 1``, ``y > 2``). Their positive-class probabilities are
    turned into one score per class and the highest score wins.

    Parameters
    ----------
    subclassifier : dict
        Description of the sub-classifier. The ``'estimator'`` entry is the
        name given to ``get_estimator``; all other entries are forwarded to
        it as the estimator parameters.
    seed : int or None
        Seed handed to ``get_estimator`` for each sub-classifier.
    """

    def __init__(self, subclassifier=None, seed=None):
        self.subclassifier = subclassifier
        self.seed = seed

    def fit(self, X, y):
        """Fit the three threshold classifiers and return ``self``."""
        name = self.subclassifier['estimator']
        parameters = {k: v for k, v in self.subclassifier.items() if k != 'estimator'}
        self.wtA_ = get_estimator(name, parameters, self.seed)
        self.wtB_ = get_estimator(name, parameters, self.seed)
        self.wtC_ = get_estimator(name, parameters, self.seed)
        self.wtA_.fit(X, y > 0)
        self.wtB_.fit(X, y > 1)
        self.wtC_.fit(X, y > 2)
        # scikit-learn's estimator contract: fit returns the estimator itself.
        return self

    def predict(self, X):
        """Return, for each row of `X`, the class (0..3) with the highest score."""
        # Column 1 is taken as the probability of the "above threshold" class;
        # this assumes every sub-classifier saw both outcomes during fit.
        wtA = self.wtA_.predict_proba(X)[:, 1]
        wtB = self.wtB_.predict_proba(X)[:, 1]
        wtC = self.wtC_.predict_proba(X)[:, 1]
        # Differences of consecutive threshold probabilities give per-class scores.
        pA = 1 - wtA
        pB = wtA - wtB
        pC = wtB - wtC
        pD = wtC
        proba = np.column_stack((pA, pB, pC, pD))
        return proba.argmax(axis=1)

In [23]:
class ChainedBinaryClassifier(BaseEstimator, ClassifierMixin):
    """Four-class classifier chaining three binary threshold classifiers.

    One sub-classifier is trained per threshold of the 0..3 class encoding
    (``y > 0``, ``y > 1``, ``y > 2``). Prediction walks the thresholds from
    the lowest to the highest and stops at the first one that is not exceeded.

    Parameters
    ----------
    subclassifier : dict
        Description of the sub-classifier. The ``'estimator'`` entry is the
        name given to ``get_estimator``; all other entries are forwarded to
        it as the estimator parameters.
    seed : int or None
        Seed handed to ``get_estimator`` for each sub-classifier.
    """

    def __init__(self, subclassifier=None, seed=None):
        self.subclassifier = subclassifier
        self.seed = seed

    def fit(self, X, y):
        """Fit the three threshold classifiers and return ``self``."""
        name = self.subclassifier['estimator']
        parameters = {k: v for k, v in self.subclassifier.items() if k != 'estimator'}
        self.wtA_ = get_estimator(name, parameters, self.seed)
        self.wtB_ = get_estimator(name, parameters, self.seed)
        self.wtC_ = get_estimator(name, parameters, self.seed)
        self.wtA_.fit(X, y > 0)
        self.wtB_.fit(X, y > 1)
        self.wtC_.fit(X, y > 2)
        # scikit-learn's estimator contract: fit returns the estimator itself.
        return self

    def predict(self, X):
        """Return the predicted class (0..3) for each row of `X`."""
        wtA = self.wtA_.predict(X)
        wtB = self.wtB_.predict(X)
        wtC = self.wtC_.predict(X)
        # 0 unless y > 0 is predicted, then 1 unless y > 1, then 2 unless y > 2.
        return np.where(wtA, np.where(wtB, np.where(wtC, 3, 2), 1), 0)

In [24]:
class ChainedBinaryInverseClassifier(BaseEstimator, ClassifierMixin):
    """Four-class classifier chaining three binary classifiers, highest first.

    One sub-classifier is trained per threshold of the 0..3 class encoding
    (``y > 0``, ``y > 1``, ``y > 2``). Prediction walks the thresholds from
    the highest to the lowest and stops at the first one that is exceeded.

    Parameters
    ----------
    subclassifier : dict
        Description of the sub-classifier. The ``'estimator'`` entry is the
        name given to ``get_estimator``; all other entries are forwarded to
        it as the estimator parameters.
    seed : int or None
        Seed handed to ``get_estimator`` for each sub-classifier.
    """

    def __init__(self, subclassifier=None, seed=None):
        self.subclassifier = subclassifier
        self.seed = seed

    def fit(self, X, y):
        """Fit the three threshold classifiers and return ``self``."""
        name = self.subclassifier['estimator']
        parameters = {k: v for k, v in self.subclassifier.items() if k != 'estimator'}
        self.wtA_ = get_estimator(name, parameters, self.seed)
        self.wtB_ = get_estimator(name, parameters, self.seed)
        self.wtC_ = get_estimator(name, parameters, self.seed)
        self.wtA_.fit(X, y > 0)
        self.wtB_.fit(X, y > 1)
        self.wtC_.fit(X, y > 2)
        # scikit-learn's estimator contract: fit returns the estimator itself.
        return self

    def predict(self, X):
        """Return the predicted class (0..3) for each row of `X`."""
        wtA = self.wtA_.predict(X)
        wtB = self.wtB_.predict(X)
        wtC = self.wtC_.predict(X)
        # 3 if y > 2 is predicted, else 2 if y > 1, else 1 if y > 0, else 0.
        return np.where(wtC, 3, np.where(wtB, 2, np.where(wtA, 1, 0)))

In [25]:
class ChainedBinaryMedianClassifier(BaseEstimator, ClassifierMixin):
    """Four-class classifier chaining three binary classifiers, middle first.

    One sub-classifier is trained per threshold of the 0..3 class encoding
    (``y > 0``, ``y > 1``, ``y > 2``). Prediction first consults the middle
    threshold, then the upper or the lower one depending on its answer.

    Parameters
    ----------
    subclassifier : dict
        Description of the sub-classifier. The ``'estimator'`` entry is the
        name given to ``get_estimator``; all other entries are forwarded to
        it as the estimator parameters.
    seed : int or None
        Seed handed to ``get_estimator`` for each sub-classifier.
    """

    def __init__(self, subclassifier=None, seed=None):
        self.subclassifier = subclassifier
        self.seed = seed

    def fit(self, X, y):
        """Fit the three threshold classifiers and return ``self``."""
        name = self.subclassifier['estimator']
        parameters = {k: v for k, v in self.subclassifier.items() if k != 'estimator'}
        self.wtA_ = get_estimator(name, parameters, self.seed)
        self.wtB_ = get_estimator(name, parameters, self.seed)
        self.wtC_ = get_estimator(name, parameters, self.seed)
        self.wtA_.fit(X, y > 0)
        self.wtB_.fit(X, y > 1)
        self.wtC_.fit(X, y > 2)
        # scikit-learn's estimator contract: fit returns the estimator itself.
        return self

    def predict(self, X):
        """Return the predicted class (0..3) for each row of `X`."""
        wtA = self.wtA_.predict(X)
        wtB = self.wtB_.predict(X)
        wtC = self.wtC_.predict(X)
        # y > 1 predicted: choose between 3 and 2; otherwise between 1 and 0.
        return np.where(wtB, np.where(wtC, 3, 2), np.where(wtA, 1, 0))

In [26]:
class IndividualProbabilitiesClassifier(BaseEstimator, ClassifierMixin):
    """Classifier built from one regressor per class score.

    Every target in `y` is a sequence whose first four entries are the scores
    of classes 0..3. One sub-regressor is trained per entry, and the class
    whose regressor predicts the highest value is returned.

    Parameters
    ----------
    subregressor : dict
        Description of the sub-regressor. The ``'estimator'`` entry is the
        name given to ``get_estimator``; all other entries are forwarded to
        it as the estimator parameters.
    seed : int or None
        Seed handed to ``get_estimator`` for each sub-regressor.
    """

    def __init__(self, subregressor=None, seed=None):
        self.subregressor = subregressor
        self.seed = seed

    def fit(self, X, y):
        """Fit one regressor per target column and return ``self``."""
        name = self.subregressor['estimator']
        parameters = {k: v for k, v in self.subregressor.items() if k != 'estimator'}
        self.pA_ = get_estimator(name, parameters, self.seed)
        self.pB_ = get_estimator(name, parameters, self.seed)
        self.pC_ = get_estimator(name, parameters, self.seed)
        self.pD_ = get_estimator(name, parameters, self.seed)
        # y holds one sequence per sample; turn it into a 2-D array so the
        # per-class columns can be sliced out.
        y_ = np.array(y.tolist())
        self.pA_.fit(X, y_[:, 0])
        self.pB_.fit(X, y_[:, 1])
        self.pC_.fit(X, y_[:, 2])
        self.pD_.fit(X, y_[:, 3])
        # scikit-learn's estimator contract: fit returns the estimator itself.
        return self

    def predict(self, X):
        """Return, for each row of `X`, the index (0..3) of the highest predicted score."""
        pA = self.pA_.predict(X)
        pB = self.pB_.predict(X)
        pC = self.pC_.predict(X)
        pD = self.pD_.predict(X)
        proba = np.column_stack((pA, pB, pC, pD))
        return proba.argmax(axis=1)

In [27]:
class BinaryProbabilitiesClassifier(BaseEstimator, ClassifierMixin):
    """Classifier built from three regressors on cumulative class scores.

    Every target in `y` is a sequence whose first four entries are the scores
    of classes 0..3. Three sub-regressors learn the summed score of the
    classes above each threshold (classes 1-3, classes 2-3, class 3). The
    predictions are converted back to one score per class and the highest
    score wins.

    Parameters
    ----------
    subregressor : dict
        Description of the sub-regressor. The ``'estimator'`` entry is the
        name given to ``get_estimator``; all other entries are forwarded to
        it as the estimator parameters.
    seed : int or None
        Seed handed to ``get_estimator`` for each sub-regressor.
    """

    def __init__(self, subregressor=None, seed=None):
        self.subregressor = subregressor
        self.seed = seed

    def fit(self, X, y):
        """Fit the three cumulative-score regressors and return ``self``."""
        name = self.subregressor['estimator']
        parameters = {k: v for k, v in self.subregressor.items() if k != 'estimator'}
        self.wtA_ = get_estimator(name, parameters, self.seed)
        self.wtB_ = get_estimator(name, parameters, self.seed)
        self.wtC_ = get_estimator(name, parameters, self.seed)
        # y holds one sequence per sample; turn it into a 2-D array so the
        # per-class columns can be combined.
        y_ = np.array(y.tolist())
        self.wtA_.fit(X, y_[:, 1] + y_[:, 2] + y_[:, 3])
        self.wtB_.fit(X, y_[:, 2] + y_[:, 3])
        self.wtC_.fit(X, y_[:, 3])
        # scikit-learn's estimator contract: fit returns the estimator itself.
        return self

    def predict(self, X):
        """Return, for each row of `X`, the class (0..3) with the highest score."""
        wtA = self.wtA_.predict(X)
        wtB = self.wtB_.predict(X)
        wtC = self.wtC_.predict(X)
        # Differences of consecutive cumulative scores give per-class scores.
        pA = 1 - wtA
        pB = wtA - wtB
        pC = wtB - wtC
        pD = wtC
        proba = np.column_stack((pA, pB, pC, pD))
        return proba.argmax(axis=1)

In [28]:
class RoundedRegressorClassifier(BaseEstimator, ClassifierMixin):
    """Classifier that rounds the output of a single regressor.

    Parameters
    ----------
    subregressor : dict
        Description of the regressor. The ``'estimator'`` entry is the name
        given to ``get_estimator``; all other entries are forwarded to it as
        the estimator parameters.
    seed : int or None
        Seed handed to ``get_estimator``.
    """

    def __init__(self, subregressor=None, seed=None):
        self.subregressor = subregressor
        self.seed = seed

    def fit(self, X, y):
        """Fit the underlying regressor on `y` and return ``self``."""
        name = self.subregressor['estimator']
        parameters = {k: v for k, v in self.subregressor.items() if k != 'estimator'}
        self.regressor_ = get_estimator(name, parameters, self.seed)
        self.regressor_.fit(X, y)
        # scikit-learn's estimator contract: fit returns the estimator itself.
        return self

    def predict(self, X):
        """Return the regressor's predictions rounded to zero decimals."""
        y = self.regressor_.predict(X)
        # The rounded values are not clipped to any class range.
        return np.around(y, 0)

In [29]:
class AlwaysTrueClassifier(BaseEstimator, ClassifierMixin):
    """Baseline binary classifier that predicts ``True`` for every sample."""

    def __init__(self):
        pass

    def fit(self, X, y):
        """Record the two boolean classes and return ``self``; nothing is learned."""
        self.classes_ = np.array([False, True])
        # scikit-learn's estimator contract: fit returns the estimator itself.
        return self

    def predict(self, X):
        """Return a list holding ``True`` once per row of `X`."""
        return len(X) * [True]

    def predict_proba(self, X):
        """Return an ``(len(X), 2)`` array giving probability 1 to ``True``."""
        # np.tile keeps the two-column shape even when X is empty.
        return np.tile(np.array([0, 1]), (len(X), 1))

In [30]:
class BinaryBaselocClassifier(BaseEstimator, ClassifierMixin):
    """Baseline binary classifier thresholding the first feature column.

    This baseline classifier should only receive the nos metric as input.

    Parameters
    ----------
    cutoff : number, default=275
        A sample is predicted ``True`` when its first feature is strictly
        below this value.
    """

    def __init__(self, cutoff=275):
        self.cutoff = cutoff

    def fit(self, X, y):
        """Record the two boolean classes and return ``self``; nothing is learned."""
        self.classes_ = np.array([False, True])
        # scikit-learn's estimator contract: fit returns the estimator itself.
        return self

    def predict(self, X):
        """Return a boolean array: ``True`` where ``X[:, 0] < cutoff``."""
        return X[:, 0] < self.cutoff

    def predict_proba(self, X):
        """Return hard 0/1 probabilities, column 1 being the ``True`` class."""
        p = np.where(self.predict(X), 1, 0)
        return np.column_stack((1 - p, p))

In [31]:
class AlwaysAClassifier(BaseEstimator, ClassifierMixin):
    """Baseline classifier that predicts class 0 for every sample."""

    def __init__(self):
        pass

    def fit(self, X, y):
        """Learn nothing and return ``self``."""
        # scikit-learn's estimator contract: fit returns the estimator itself.
        return self

    def predict(self, X):
        """Return a list holding ``0`` once per row of `X`."""
        return len(X) * [0]

In [32]:
class ClassBaselocClassifier(BaseEstimator, ClassifierMixin):
    """Baseline four-class classifier bucketing the first feature column.

    This baseline classifier should only receive the nos metric as input.

    Parameters
    ----------
    cutoff_a : number, default=80
        Values strictly above it are at least class 1.
    cutoff_b : number, default=275
        Values strictly above it are at least class 2.
    cutoff_c : number, default=500
        Values strictly above it are class 3.
    """

    def __init__(self, cutoff_a=80, cutoff_b=275, cutoff_c=500):
        self.cutoff_a = cutoff_a
        self.cutoff_b = cutoff_b
        self.cutoff_c = cutoff_c

    def fit(self, X, y):
        """Learn nothing and return ``self``."""
        # scikit-learn's estimator contract: fit returns the estimator itself.
        return self

    def predict(self, X):
        """Return the class (0..3) of each row from the cutoffs on ``X[:, 0]``."""
        wtA = X[:, 0] > self.cutoff_a
        wtB = X[:, 0] > self.cutoff_b
        wtC = X[:, 0] > self.cutoff_c
        return np.where(wtA, np.where(wtB, np.where(wtC, 3, 2), 1), 0)

In [33]:
class AlwaysMeanRegressor(BaseEstimator, RegressorMixin):
    """Baseline regressor that predicts the same constant for every sample.

    Parameters
    ----------
    value : float, default=0.75
        The constant returned for each sample. It is not computed from the
        training data (presumably the mean target of the dataset — confirm).
    """

    def __init__(self, value=0.75):
        self.value = value

    def fit(self, X, y):
        """Learn nothing and return ``self``."""
        # scikit-learn's estimator contract: fit returns the estimator itself.
        return self

    def predict(self, X):
        """Return a list holding ``value`` once per row of `X`."""
        return len(X) * [self.value]

In [34]:
class ContinuousBaselocRegressor(BaseEstimator, RegressorMixin):
    """Baseline regressor mapping buckets of the first feature to fixed values.

    This baseline regressor should only receive the nos metric as input.

    Parameters
    ----------
    cutoff_a : number, default=80
        Upper bound (inclusive) of the first bucket.
    cutoff_b : number, default=275
        Upper bound (inclusive) of the second bucket.
    cutoff_c : number, default=500
        Upper bound (inclusive) of the third bucket.
    """

    def __init__(self, cutoff_a=80, cutoff_b=275, cutoff_c=500):
        self.cutoff_a = cutoff_a
        self.cutoff_b = cutoff_b
        self.cutoff_c = cutoff_c

    def fit(self, X, y):
        """Learn nothing and return ``self``."""
        # scikit-learn's estimator contract: fit returns the estimator itself.
        return self

    def predict(self, X):
        """Return 0.20, 0.99, 1.97 or 2.63 per row, by bucket of ``X[:, 0]``."""
        wtA = X[:, 0] > self.cutoff_a
        wtB = X[:, 0] > self.cutoff_b
        wtC = X[:, 0] > self.cutoff_c
        # NOTE(review): the four outputs are hard-coded; presumably the mean
        # target of each bucket — confirm against the dataset.
        return np.where(wtA, np.where(wtB, np.where(wtC, 2.63, 1.97), 0.99), 0.20)

## Processing methods

In [35]:
class MetricType(Enum):
    """Metric subsets that get_metrics_columns can select."""
    ALL = "all"  # every column of METRIC_COLUMNS
    LOC = "loc"  # METRIC_COLUMNS_LOC only
    CGC = "cgc"  # METRIC_COLUMNS_CGC only

class FoldType(Enum):
    """Cross-validation strategies understood by get_cv_iterator."""
    K_FOLD = "k-fold"  # StratifiedKFold with 5 shuffled splits
    PROJECT_WISE = "project-wise"  # LeaveOneGroupOut

In [36]:
def get_metrics_columns(metric_type):
    """Return the list of metric column names selected by `metric_type`.

    ``MetricType.ALL`` maps to ``METRIC_COLUMNS``, ``MetricType.LOC`` to
    ``METRIC_COLUMNS_LOC`` and ``MetricType.CGC`` to ``METRIC_COLUMNS_CGC``.

    Raises ValueError for any other value (it used to surface as an
    UnboundLocalError on the return statement).
    """
    if metric_type == MetricType.ALL:
        return METRIC_COLUMNS
    if metric_type == MetricType.LOC:
        return METRIC_COLUMNS_LOC
    if metric_type == MetricType.CGC:
        return METRIC_COLUMNS_CGC

    raise ValueError(f"{metric_type} is not a supported metric type.")

In [37]:
def sort_metrics(df, metric_type=MetricType.ALL):
    """Rank the metrics by mutual information with the class target.

    The mutual information between every column of ``METRIC_COLUMNS`` and
    ``df['target']`` is estimated once per seed of ``SEEDS`` with
    ``mutual_info_classif``. The result is a DataFrame indexed by metric name
    with the per-metric mean (``mi``) and standard deviation (``std``) over
    the seeds, sorted by descending ``mi`` and restricted to the columns
    selected by `metric_type`.
    """
    # Every metric is treated as discrete except these two.
    continuous_metrics = ('nos_am', 'non_am')
    discrete_mask = [name not in continuous_metrics for name in METRIC_COLUMNS]

    per_seed = [
        mutual_info_classif(df[METRIC_COLUMNS], df['target'], random_state=seed,
                            discrete_features=discrete_mask)
        for seed in SEEDS
    ]

    ranking = pd.DataFrame(
        {'mi': np.mean(per_seed, axis=0), 'std': np.std(per_seed, axis=0)},
        index=METRIC_COLUMNS,
    ).sort_values(by='mi', ascending=False)

    selected = get_metrics_columns(metric_type)
    return ranking[ranking.index.isin(selected)]

In [38]:
def sort_metrics_probabilities(df, metric_type=MetricType.ALL):
    """Rank the metrics by mutual information with the four target components.

    Every entry of ``df['target']`` is a sequence; its first four components
    are each used as a regression target. For each seed of ``SEEDS`` and each
    component, ``mutual_info_regression`` is estimated against every column
    of ``METRIC_COLUMNS``. The result is a DataFrame indexed by metric name
    with the mean (``mi``) and standard deviation (``std``) over all
    seed/component estimates, sorted by descending ``mi`` and restricted to
    the columns selected by `metric_type`.
    """
    # Every metric is treated as discrete except these two.
    continuous_metrics = ('nos_am', 'non_am')
    discrete_mask = [name not in continuous_metrics for name in METRIC_COLUMNS]

    components = np.array(df['target'].tolist())

    estimates = []
    for seed in SEEDS:
        for column in range(4):
            estimates.append(
                mutual_info_regression(df[METRIC_COLUMNS], components[:, column], random_state=seed,
                                       discrete_features=discrete_mask)
            )

    ranking = pd.DataFrame(
        {'mi': np.mean(estimates, axis=0), 'std': np.std(estimates, axis=0)},
        index=METRIC_COLUMNS,
    ).sort_values(by='mi', ascending=False)

    selected = get_metrics_columns(metric_type)
    return ranking[ranking.index.isin(selected)]

In [39]:
def sort_metrics_continuous(df, metric_type=MetricType.ALL):
    """Rank the metrics by mutual information with a continuous target.

    The mutual information between every column of ``METRIC_COLUMNS`` and
    ``df['target']`` is estimated once per seed of ``SEEDS`` with
    ``mutual_info_regression``. The result is a DataFrame indexed by metric
    name with the per-metric mean (``mi``) and standard deviation (``std``)
    over the seeds, sorted by descending ``mi`` and restricted to the columns
    selected by `metric_type`.
    """
    # Every metric is treated as discrete except these two.
    continuous_metrics = ('nos_am', 'non_am')
    discrete_mask = [name not in continuous_metrics for name in METRIC_COLUMNS]

    per_seed = [
        mutual_info_regression(df[METRIC_COLUMNS], df['target'], random_state=seed,
                               discrete_features=discrete_mask)
        for seed in SEEDS
    ]

    ranking = pd.DataFrame(
        {'mi': np.mean(per_seed, axis=0), 'std': np.std(per_seed, axis=0)},
        index=METRIC_COLUMNS,
    ).sort_values(by='mi', ascending=False)

    selected = get_metrics_columns(metric_type)
    return ranking[ranking.index.isin(selected)]

In [40]:
def as_dict(parameters):
    """Parse `parameters` as the literal of a Python dictionary.

    The text is evaluated with ``ast.literal_eval``, so only Python literals
    are accepted and no code is executed.

    Returns the parsed dictionary.

    Raises ValueError when the text is not a valid Python literal (the
    original parsing error is attached as ``__cause__``) or when the literal
    is not a dictionary.
    """
    try:
        parsed = ast.literal_eval(parameters)
    # Only the failures literal_eval documents are translated; things such as
    # KeyboardInterrupt are left to propagate untouched.
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError) as err:
        raise ValueError("Parameters should be valid Python code representing a dictionary") from err

    if not isinstance(parsed, dict):
        raise ValueError("Parameters should be a Python dictionary")

    return parsed

In [41]:
def get_pipeline(name, parameters, preprocessing, oversampling, seed):
    """Build the scikit-learn pipeline for one estimator configuration.

    `parameters` is the textual dictionary parsed by ``as_dict`` and handed
    to ``get_estimator`` together with `name` and `seed`. When `oversampling`
    is truthy the estimator is wrapped in an ``OversamplingClassifier`` using
    the ``kmeans_SMOTE`` multiclass oversampler. `preprocessing` selects an
    optional first step: ``'Standardization'`` adds a ``StandardScaler``,
    ``'Normalization'`` a ``MinMaxScaler``, anything else adds none.
    """
    estimator = get_estimator(name, as_dict(parameters), seed)

    if oversampling:
        estimator_class = type(estimator)
        estimator = OversamplingClassifier(
            ('smote_variants', 'MulticlassOversampling',
             {'oversampler': 'kmeans_SMOTE', 'oversampler_params': {'n_clusters': 2, 'random_state': seed}}),
            # The wrapper receives the estimator as (module, class name, params).
            (estimator_class.__module__, estimator_class.__name__, estimator.get_params()),
        )

    steps = []
    if preprocessing == 'Standardization':
        steps.append(StandardScaler())
    elif preprocessing == 'Normalization':
        steps.append(MinMaxScaler())
    steps.append(estimator)

    return make_pipeline(*steps)

In [42]:
def get_cv_iterator(fold_type, seed):
    """Return the cross-validation splitter matching `fold_type`.

    ``FoldType.K_FOLD`` gives a shuffled 5-split ``StratifiedKFold`` seeded
    with `seed`; ``FoldType.PROJECT_WISE`` gives a ``LeaveOneGroupOut``.

    Raises ValueError for any other value.
    """
    if fold_type == FoldType.K_FOLD:
        return StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)

    if fold_type == FoldType.PROJECT_WISE:
        return LeaveOneGroupOut()

    raise ValueError("No fold type chosen.")

In [49]:
def process(row, df, sorted_metrics, scoring, fold_type=FoldType.PROJECT_WISE):
    """Cross-validate the estimator described by `row` once per seed.

    The features are the first ``row.metrics`` metrics of `sorted_metrics`
    (or the single ``nos`` column when the estimator name contains
    "Baseloc"). For every seed of ``SEEDS`` a fresh pipeline is evaluated
    with ``cross_validate`` on the folds chosen by `fold_type`.

    Returns a dict with one key per entry of `scoring`, each holding the test
    scores of all folds of all seeds.
    """

    debug_log("called!")

    collected = {metric: [] for metric in scoring.keys()}

    feature_names = sorted_metrics.index[:row.metrics]
    X = df[feature_names].values
    y = df['target'].values
    groups = df['projectname'].values
    y_fold = df['fold'].values

    # The Baseloc baselines only look at the nos metric.
    if "Baseloc" in row.estimator:
        X = df[['nos']].values

    for seed in SEEDS:
        pipeline = get_pipeline(row.estimator, row.parameters, row.preprocessing, row.oversampling, seed)
        splitter = get_cv_iterator(fold_type, seed)
        # Folds are derived from the 'fold' column and the project groups,
        # while the model itself is scored against 'target'.
        fold_scores = cross_validate(pipeline, X, y, cv=splitter.split(X, y_fold, groups), scoring=scoring)
        for metric, values in collected.items():
            values.extend(fold_scores.get("test_" + metric))

    return collected

In [44]:
def debug_log(message):
    """Append `message` followed by a newline to ``debug_log.txt``.

    The file is opened in append mode in the current working directory.
    """
    with open("debug_log.txt", mode="a") as log_file:
        log_file.write(message + "\n")

# Use debug_log("Your message") instead of print("Your message") inside process_with_probs_and_save
# NOTE(review): module-level call; each execution appends a "test" line to
# debug_log.txt in the current working directory.
debug_log("test")

In [45]:
# Appends a "test" line to debug_log.txt (looks like a leftover manual check
# of debug_log — confirm whether it is still needed).
debug_log("test")

In [46]:
import pandas as pd
import os

def process_with_probs_and_save(row, df, sorted_metrics, fold_type, output_dir='output', scoring=None):
    """Cross-validate the estimator described by `row` and save its probabilities.

    For every seed of ``SEEDS`` a fresh pipeline is built and
    ``cross_val_predict(..., method='predict_proba')`` is run on the folds
    chosen by `fold_type`. The column-1 probabilities of all seeds are
    written, one after the other, to a CSV file with the single column
    ``PredictionProbability``.

    Parameters
    ----------
    row : mapping-like
        Provides ``'estimator'``, ``'parameters'``, ``'preprocessing'``,
        ``'oversampling'`` and ``'metrics'`` (number of top metrics to use).
    df : DataFrame
        Must contain the metric columns plus ``'target'``, ``'fold'`` and
        ``'projectname'``.
    sorted_metrics : DataFrame
        Metrics ranking whose index gives the feature order.
    fold_type : FoldType
        Cross-validation strategy, see ``get_cv_iterator``.
    output_dir : str
        Directory of the CSV file; created when missing.
    scoring : dict or None
        Only its keys are used, to shape the returned ``scores`` dict.

    Returns
    -------
    (scores, output_csv)
        ``scores`` maps every scoring key to an empty list (no score is
        computed by this function); ``output_csv`` is the path of the file
        that was written.
    """
    # Imported locally: the file-level imports do not bring this name into scope.
    from sklearn.model_selection import cross_val_predict

    debug_log("called!")

    # Ensure scoring is defined here if not passed as a parameter
    if scoring is None:
        scoring = {'f-score': 'f1', 'auc': 'roc_auc', 'recall': 'recall', 'precision': 'precision'}

    # Kept for the return shape only; the lists are never filled here.
    scores = {key: [] for key in scoring.keys()}
    probabilities = []

    X = df[sorted_metrics.index[:row['metrics']]].values
    y = df['target'].values
    groups = df['projectname'].values
    y_fold = df['fold'].values

    # The Baseloc baselines only look at the nos metric.
    if "Baseloc" in row['estimator']:
        X = df[['nos']].values

    # Prepare unique filename for the output CSV
    model_identifier = f"{row['estimator']}_params_{row['parameters']}".replace(" ", "_").replace(",", "").replace(":", "_")
    output_csv = f"{output_dir}/{model_identifier}.csv"

    for seed in SEEDS:
        clf = get_pipeline(row['estimator'], row['parameters'], row['preprocessing'], row['oversampling'], seed)
        cv = get_cv_iterator(fold_type, seed)

        # Out-of-fold probabilities: every sample is predicted by the model
        # that did not see it during training.
        y_probas = cross_val_predict(clf, X, y, cv=cv.split(X, y_fold, groups), method='predict_proba')

        probabilities.extend(y_probas[:, 1])  # Assuming the positive class is at index 1

    # to_csv does not create missing directories, so make sure it exists.
    os.makedirs(output_dir, exist_ok=True)

    # After collecting all probabilities, save them to a CSV file
    probabilities_df = pd.DataFrame(probabilities, columns=['PredictionProbability'])
    probabilities_df.to_csv(output_csv, index=False)

    # to_csv raises when the file cannot be written, so reaching this line
    # means the save succeeded.
    print(f"File '{output_csv}' saved successfully.")

    return scores, output_csv


In [47]:
import os
# Print the current working directory; the relative dataset/ and output
# paths used by the functions of this file are resolved against it.
print(os.getcwd())


/Users/mrksbrg/Documents/Code/maintainability-dataset-analysis


## Output methods

In [48]:
def mean(values):
    """Return the arithmetic mean of `values`, rounded to 4 decimals."""
    average = np.mean(values)
    return round(average, 4)
    
def std(values):
    """Return the population standard deviation of `values`, rounded to 4 decimals."""
    deviation = np.std(values)
    return round(deviation, 4)