# Continuation of Group_F_Case4_Code2_DataPrep notebook
In the previous notebook these steps were done:
- Data Understanding/Exploration
- Data Pre-Processing

In this notebook the following steps were made:
- Extract paths
- Frequency-based encoding
- Creating dataframes for each path
- Scaler (standard or minmaxscaler, depending on the model and the gridsearch)
- Train-test split
- Modeling
- Deployment

# Initial setup and modules/packages loading

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from math import ceil

from sklearn.preprocessing import LabelEncoder

%matplotlib inline

from datetime import datetime

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from imblearn.over_sampling import SMOTE
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
import xgboost as xgb
from sklearn.linear_model import LogisticRegression

from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report

from sklearn.ensemble import RandomForestClassifier

## Data Loading

In [2]:
all_info = pd.read_csv(r"C:\Users\emili\Downloads\all_info.csv")
all_info

Unnamed: 0,Request Identifier,Task execution end date,Actvity ID,Task Type,idBPMApplicationAction,Sex,Is Manager,IsOutSourcer,Year,Month,Age,Time Difference,Target
0,1,2022-04-11 16:02:13.820,100,0,270,1,1,0,2022,4,57,0,Request finished
1,1,2022-04-12 10:02:54.687,102,1,273,0,0,1,2022,4,57,0,Request finished
2,1,2022-04-12 10:17:16.050,103,1,273,0,1,0,2022,4,49,0,Request finished
3,1,2022-05-10 13:43:32.203,104,1,282,1,1,0,2022,4,57,28,Request finished
4,1,2022-05-10 13:48:44.353,107,2,299,1,1,0,2022,5,57,0,Request finished
...,...,...,...,...,...,...,...,...,...,...,...,...,...
184540,45772,2024-05-03 16:42:52.420,102,1,273,0,1,0,2024,5,49,0,Request finished
184541,45772,2024-05-03 17:44:40.083,103,1,300,0,1,0,2024,5,49,0,Request finished
184542,45772,2024-05-06 09:11:24.433,104,1,290,0,1,0,2024,5,25,2,Request finished
184543,45772,2024-05-07 08:31:28.600,104,1,282,0,1,0,2024,5,48,0,Request finished


In [3]:
# Check duplicate values
all_info.duplicated().sum()

0

# Exctract paths by lenght

In [4]:
# Extract activities for each Request Identifier
def extract_activities(all_info):
    activities_dict = {}

    for case_id, group in all_info.groupby('Request Identifier'):
        activities = (group['Actvity ID'].astype(str) + '-' + group['Task Type'].astype(str)).tolist()
        
        # Store activities in the dictionary
        activities_dict[case_id] = activities
    
    return activities_dict

# Extract activities from the DataFrame
activities_dict = extract_activities(all_info)

# Create a list to hold the data for the new DataFrame
data_for_df = []

# Add activities to the list as separate columns
for case_id, activities in activities_dict.items():
    row = {'Request Identifier': case_id}
    for i, activity in enumerate(activities):
        row[f'Activity_{i}'] = activity
    data_for_df.append(row)

# Convert the list to a DataFrame
paths_df = pd.DataFrame(data_for_df)

# Display the DataFrame with separate activity columns
paths_df

Unnamed: 0,Request Identifier,Activity_0,Activity_1,Activity_2,Activity_3,Activity_4,Activity_5,Activity_6,Activity_7,Activity_8,...,Activity_15,Activity_16,Activity_17,Activity_18,Activity_19,Activity_20,Activity_21,Activity_22,Activity_23,Activity_24
0,1,100-0,102-1,103-1,104-1,107-2,,,,,...,,,,,,,,,,
1,2,100-0,102-1,103-1,104-1,107-2,,,,,...,,,,,,,,,,
2,5,100-0,102-1,102-1,103-1,104-1,,,,,...,,,,,,,,,,
3,6,100-0,102-1,102-1,103-1,104-1,,,,,...,,,,,,,,,,
4,7,100-0,102-1,102-1,103-1,104-1,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44088,45768,100-0,102-1,102-1,103-1,104-1,,,,,...,,,,,,,,,,
44089,45769,100-0,102-1,103-1,104-1,104-1,,,,,...,,,,,,,,,,
44090,45770,100-0,102-1,102-1,104-1,107-2,,,,,...,,,,,,,,,,
44091,45771,100-0,102-1,104-1,107-2,,,,,,...,,,,,,,,,,


# Frequency-based Encoding on activities

This method basically replaces each category, in this case each activity, with the count of how often it appears in the dataset.

In [5]:
# Frequency encoding function
def frequency_encoding(paths_df, columns):
    paths_df = paths_df.copy()  # Create a copy of the DataFrame to avoid SettingWithCopyWarning
    for column in columns:
        freq_encoding = paths_df[column].value_counts().to_dict()
        paths_df[column] = paths_df[column].map(freq_encoding)
    return paths_df

# Get all columns that start with "Activity_"
activity_columns = [col for col in paths_df.columns if col.startswith('Activity_')]

# Apply frequency encoding to the activity columns
paths_df_encoded = frequency_encoding(paths_df, activity_columns)
paths_df_encoded

Unnamed: 0,Request Identifier,Activity_0,Activity_1,Activity_2,Activity_3,Activity_4,Activity_5,Activity_6,Activity_7,Activity_8,...,Activity_15,Activity_16,Activity_17,Activity_18,Activity_19,Activity_20,Activity_21,Activity_22,Activity_23,Activity_24
0,1,43493,39338.0,23072.0,24190.0,5405.0,,,,,...,,,,,,,,,,
1,2,43493,39338.0,23072.0,24190.0,5405.0,,,,,...,,,,,,,,,,
2,5,43493,39338.0,3533.0,1768.0,2749.0,,,,,...,,,,,,,,,,
3,6,43493,39338.0,3533.0,1768.0,2749.0,,,,,...,,,,,,,,,,
4,7,43493,39338.0,3533.0,1768.0,2749.0,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44088,45768,43493,39338.0,3533.0,1768.0,2749.0,,,,,...,,,,,,,,,,
44089,45769,43493,39338.0,23072.0,24190.0,2749.0,,,,,...,,,,,,,,,,
44090,45770,43493,39338.0,3533.0,24190.0,5405.0,,,,,...,,,,,,,,,,
44091,45771,43493,39338.0,13897.0,3460.0,,,,,,...,,,,,,,,,,


# Merge paths with 'Request Identifier' and 'Target'

In [6]:
# Merge paths_df with all_info based on the 'Request Identifier' column
all_info_merged = pd.merge(all_info[['Request Identifier', 'Target']], paths_df_encoded, on='Request Identifier', how='left')
all_info_merged

Unnamed: 0,Request Identifier,Target,Activity_0,Activity_1,Activity_2,Activity_3,Activity_4,Activity_5,Activity_6,Activity_7,...,Activity_15,Activity_16,Activity_17,Activity_18,Activity_19,Activity_20,Activity_21,Activity_22,Activity_23,Activity_24
0,1,Request finished,43493,39338.0,23072.0,24190.0,5405.0,,,,...,,,,,,,,,,
1,1,Request finished,43493,39338.0,23072.0,24190.0,5405.0,,,,...,,,,,,,,,,
2,1,Request finished,43493,39338.0,23072.0,24190.0,5405.0,,,,...,,,,,,,,,,
3,1,Request finished,43493,39338.0,23072.0,24190.0,5405.0,,,,...,,,,,,,,,,
4,1,Request finished,43493,39338.0,23072.0,24190.0,5405.0,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
184540,45772,Request finished,43493,39338.0,23072.0,24190.0,2749.0,773.0,,,...,,,,,,,,,,
184541,45772,Request finished,43493,39338.0,23072.0,24190.0,2749.0,773.0,,,...,,,,,,,,,,
184542,45772,Request finished,43493,39338.0,23072.0,24190.0,2749.0,773.0,,,...,,,,,,,,,,
184543,45772,Request finished,43493,39338.0,23072.0,24190.0,2749.0,773.0,,,...,,,,,,,,,,


## Check the dataframe obtained

In [7]:
all_info_merged = all_info_merged.drop_duplicates()

In [8]:
all_info_merged

Unnamed: 0,Request Identifier,Target,Activity_0,Activity_1,Activity_2,Activity_3,Activity_4,Activity_5,Activity_6,Activity_7,...,Activity_15,Activity_16,Activity_17,Activity_18,Activity_19,Activity_20,Activity_21,Activity_22,Activity_23,Activity_24
0,1,Request finished,43493,39338.0,23072.0,24190.0,5405.0,,,,...,,,,,,,,,,
5,2,Request finished,43493,39338.0,23072.0,24190.0,5405.0,,,,...,,,,,,,,,,
10,5,Closed administratively/Requester rejects acco...,43493,39338.0,3533.0,1768.0,2749.0,,,,...,,,,,,,,,,
15,6,Closed administratively/Requester rejects acco...,43493,39338.0,3533.0,1768.0,2749.0,,,,...,,,,,,,,,,
20,7,Closed administratively/Requester rejects acco...,43493,39338.0,3533.0,1768.0,2749.0,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
184520,45768,Closed administratively/Requester rejects acco...,43493,39338.0,3533.0,1768.0,2749.0,,,,...,,,,,,,,,,
184525,45769,Closed administratively/Requester rejects acco...,43493,39338.0,23072.0,24190.0,2749.0,,,,...,,,,,,,,,,
184530,45770,Request finished,43493,39338.0,3533.0,24190.0,5405.0,,,,...,,,,,,,,,,
184535,45771,Request finished,43493,39338.0,13897.0,3460.0,,,,,...,,,,,,,,,,


In [9]:
all_info_merged.info()

<class 'pandas.core.frame.DataFrame'>
Index: 44093 entries, 0 to 184539
Data columns (total 27 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Request Identifier  44093 non-null  int64  
 1   Target              44093 non-null  object 
 2   Activity_0          44093 non-null  int64  
 3   Activity_1          44076 non-null  float64
 4   Activity_2          43241 non-null  float64
 5   Activity_3          32458 non-null  float64
 6   Activity_4          11186 non-null  float64
 7   Activity_5          4086 non-null   float64
 8   Activity_6          2333 non-null   float64
 9   Activity_7          1273 non-null   float64
 10  Activity_8          726 non-null    float64
 11  Activity_9          416 non-null    float64
 12  Activity_10         251 non-null    float64
 13  Activity_11         152 non-null    float64
 14  Activity_12         91 non-null     float64
 15  Activity_13         59 non-null     float64
 16  Activity

In [10]:
# make copy of the dataframe 
df = all_info_merged.copy()

## Encode the target

In [11]:
le = LabelEncoder()
df['Target'] = le.fit_transform(df['Target']) 

In [12]:
df['Target'].value_counts()

Target
1    32372
3     8992
0     1858
2      871
Name: count, dtype: int64

This shows that the dataset is imbalanced. We should use stratified k fold for the models.

In [13]:
df

Unnamed: 0,Request Identifier,Target,Activity_0,Activity_1,Activity_2,Activity_3,Activity_4,Activity_5,Activity_6,Activity_7,...,Activity_15,Activity_16,Activity_17,Activity_18,Activity_19,Activity_20,Activity_21,Activity_22,Activity_23,Activity_24
0,1,3,43493,39338.0,23072.0,24190.0,5405.0,,,,...,,,,,,,,,,
5,2,3,43493,39338.0,23072.0,24190.0,5405.0,,,,...,,,,,,,,,,
10,5,1,43493,39338.0,3533.0,1768.0,2749.0,,,,...,,,,,,,,,,
15,6,1,43493,39338.0,3533.0,1768.0,2749.0,,,,...,,,,,,,,,,
20,7,1,43493,39338.0,3533.0,1768.0,2749.0,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
184520,45768,1,43493,39338.0,3533.0,1768.0,2749.0,,,,...,,,,,,,,,,
184525,45769,1,43493,39338.0,23072.0,24190.0,2749.0,,,,...,,,,,,,,,,
184530,45770,3,43493,39338.0,3533.0,24190.0,5405.0,,,,...,,,,,,,,,,
184535,45771,3,43493,39338.0,13897.0,3460.0,,,,,...,,,,,,,,,,


# Create dataframes, one for each lenght

In [14]:
activity_columns = [col for col in df.columns if col.startswith('Activity_')]
df['num_activities'] = df[activity_columns].notna().sum(axis=1)

# Group the DataFrame based on the number of activities
grouped_dfs = {name: group for name, group in df.groupby('num_activities')}

In [15]:
for num_activities, group in grouped_dfs.items():
    globals()[f'df_{num_activities}'] = group

In [16]:
df_3

Unnamed: 0,Request Identifier,Target,Activity_0,Activity_1,Activity_2,Activity_3,Activity_4,Activity_5,Activity_6,Activity_7,...,Activity_16,Activity_17,Activity_18,Activity_19,Activity_20,Activity_21,Activity_22,Activity_23,Activity_24,num_activities
79,20,1,43493,3137.0,13897.0,,,,,,...,,,,,,,,,,3
87,23,1,43493,3137.0,13897.0,,,,,,...,,,,,,,,,,3
262,63,2,43493,39338.0,1726.0,,,,,,...,,,,,,,,,,3
337,80,1,43493,3137.0,13897.0,,,,,,...,,,,,,,,,,3
366,88,1,43493,3137.0,13897.0,,,,,,...,,,,,,,,,,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
184440,45750,1,43493,39338.0,13897.0,,,,,,...,,,,,,,,,,3
184449,45752,2,43493,39338.0,1726.0,,,,,,...,,,,,,,,,,3
184456,45754,1,43493,39338.0,13897.0,,,,,,...,,,,,,,,,,3
184459,45755,1,43493,39338.0,13897.0,,,,,,...,,,,,,,,,,3


The following dataframes will not be considered:
- df_1 e df_2: only have 1/2 columns with activities
- from df_16 until df_25 have less than 10 rows

Consider: df_3 until df_15 (included)

# Modeling

### Code for models development

In the following code, we applied these steps:
- Clean NaN values
- Drop target variable from X and include in y
- Scaler (standard scaler)
- SMOTE
- Models (Logistic Regression, SVM, kNN, Naive-Bayes, Random Forest)

Inside each model (each one with its different and specific parameters), the main steps were:
- Train-test split, using Stratified k-Fold
- GridSearch 
- Evaluation: accuracy, precision, recall and F1 Score

In [17]:
def gridsearch_logReg(model, skf, X_train_res, y_train_res):
    """ 
    Grid search for Logistic Regression
    """
    param_grid = {
        'C': [0.1, 1, 10, 100],
        'solver': ['newton-cg', 'lbfgs', 'liblinear'],
        'max_iter': [100, 200, 300]
    }
    print(f"Performing Grid Search for {model} model....")
    # Initialize GridSearchCV
    start = datetime.now()
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=skf, scoring='accuracy', n_jobs=-1)

    # Fit GridSearchCV
    grid_search.fit(X_train_res, y_train_res)
    end = datetime.now()
    print(f"It took {end-start} to run.")
    return grid_search.best_estimator_

In [18]:
def gridsearch_svm(model, skf, X_train_res, y_train_res):
    """ 
    Grid search for SVM
    """
    param_grid = {
        'C': [0.1, 1, 10],
        'kernel': ['rbf'],
        'gamma': ['scale', 'auto'],
        'class_weight': ['balanced']
    }
    print(f"Performing Grid Search for {model} model....")
    # Initialize GridSearchCV
    start = datetime.now()
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=skf, scoring='accuracy', n_jobs=-1)

    # Fit GridSearchCV
    grid_search.fit(X_train_res, y_train_res)
    end = datetime.now()
    print(f"It took {end-start} to run.")
    return grid_search.best_estimator_

In [19]:
def gridsearch_knn(model, skf, X_train_res, y_train_res):
    """ 
    Grid search for KNN
    """
    param_grid = {
            'n_neighbors': [3, 5, 7, 9],
            'weights': ['uniform', 'distance'],
            'metric': ['euclidean', 'manhattan', 'minkowski']
        }
    print(f"Performing Grid Search for {model} model....")
    start = datetime.now()
    grid_search = GridSearchCV(model, param_grid, cv=skf, scoring='accuracy')
    
    # Train the model with grid search
    grid_search.fit(X_train_res, y_train_res)
    end = datetime.now()
    print(f"It took {end-start} to run.")
    return grid_search.best_estimator_

In [20]:
def gridsearch_nb(model, skf, X_train_res, y_train_res):
    """ 
    Grid search for Naive-Bayes
    """
    # Define the parameter grid for Grid Search
    param_grid = {
        'model__var_smoothing': [1e-9, 1e-8, 1e-7]
    }

    print(f"Performing Grid Search for {model} model....")
    # Initialize GridSearchCV
    start = datetime.now()
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=skf, scoring='accuracy', n_jobs=-1)

    # Fit GridSearchCV to the resampled training data
    grid_search.fit(X_train_res, y_train_res)
    end = datetime.now()
    print(f"It took {end-start} to run.")
    return grid_search.best_estimator_

In [21]:
def gridsearch_rf(model, skf, X_train_res, y_train_res):
    """ 
    Grid search for Random Forest
    """
    # Define the parameter grid for Grid Search
    param_grid = {
        'n_estimators': [50, 100, 150],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    }
    print(f"Performing Grid Search for {model} model....")
    start = datetime.now()
    grid_search = GridSearchCV(model, param_grid, cv=skf, scoring='accuracy')
    grid_search.fit(X_train_res, y_train_res)
    end = datetime.now()
    print(f"It took {end-start} to run.")
    return grid_search.best_estimator_

In [22]:
def models_function(df : pd.DataFrame, apply_model, n_splits=3):
    """ 
    Given a dataframe, classification will be done with given model.
    
    Parameters
    ----------
    
        df : pandas dataframe
        apply_model: model to be used for classification.
            It can be 'logReg' for logistic regression, 'svm' for Support Vector Machine, 
            'knn' for KNN, 'nb' for Naive-Bayes, 'xgboost' for XGBoost and 'rf' for Random Forest
    
    Returns
    ----------
        Target predictions for each request identifier    
    """
    df_cleaned = df.dropna(axis=1, how='all')
    # remove unnecessary columns
    df_cleaned = df_cleaned.drop(['Request Identifier', 'num_activities'], axis=1)
    X = df_cleaned.drop(['Target'], axis=1) 
    y = df_cleaned['Target'] 
    
    # Initialise the scaler to be used in the models
    scaler = StandardScaler()
    # Initialize SMOTE
    smote = SMOTE()
    print(f"The model to be used is {apply_model}")
    print('....')
    if apply_model == 'logReg':
        #region Logistic Regression
        # Initialize StratifiedKFold
        skf = StratifiedKFold(n_splits=n_splits)

        # Initialize a list to store metrics for each fold
        metrics = []

        predictions = np.zeros(len(df), dtype=int)
        
        # Iterate through the splits
        for train_index, test_index in skf.split(X, y):
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]

            # Check if SMOTE can be applied
            if len(np.unique(y_train)) == 2 and min(np.bincount(y_train)) >= 6:
                X_train_res, y_train_res = smote.fit_resample(X_train, y_train)
            else:
                X_train_res, y_train_res = X_train, y_train

            # Scale the training data
            X_train_res = scaler.fit_transform(X_train_res)
            X_test = scaler.transform(X_test)

            # Initialize Logistic Regression
            model = LogisticRegression(class_weight='balanced')
            
            # Extract the best model
            best_model = gridsearch_logReg(model, skf, X_train_res, y_train_res)
            
            # Train the best model
            best_model.fit(X_train_res, y_train_res)

            # Predict on the test set
            y_pred = best_model.predict(X_test)

            predictions[test_index] = y_pred

            # Evaluate the model
            accuracy = accuracy_score(y_test, y_pred)
            precision = precision_score(y_test, y_pred, average='macro')
            recall = recall_score(y_test, y_pred, average='macro')
            f1 = f1_score(y_test, y_pred, average='macro')

            # Store the metrics for this fold
            metrics.append({
                'accuracy': accuracy,
                'precision': precision,
                'recall': recall,
                'f1_score': f1
            })

        # Convert the metrics to a DataFrame for easy viewing
        metrics_df = pd.DataFrame(metrics)

        # Print the average metrics across all folds
        print("Average metrics across all folds:")
        print(metrics_df.mean())
        #endregion

    if apply_model == 'svm':
        #region SVM
        # Initialize StratifiedKFold
        skf = StratifiedKFold(n_splits=n_splits)

        # Initialize a list to store metrics for each fold
        metrics = []
        predictions = np.zeros(len(df), dtype=int)

        # Iterate through the splits
        for train_index, test_index in skf.split(X, y):
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]

            # Check if SMOTE can be applied
            if len(np.unique(y_train)) == 2 and min(np.bincount(y_train)) >= 6:
                X_train_res, y_train_res = smote.fit_resample(X_train, y_train)
            else:
                X_train_res, y_train_res = X_train, y_train

            # Scale the training data
            X_train_res = scaler.fit_transform(X_train_res)
            X_test = scaler.transform(X_test)
            
            model = SVC(random_state=42)
            
            # Extract the best model
            best_model = gridsearch_svm(model, skf, X_train_res, y_train_res)

            # Train the best model
            best_model.fit(X_train_res, y_train_res)

            # Predict on the test set
            y_pred = best_model.predict(X_test)
            predictions[test_index] = y_pred

            # Evaluate the model
            accuracy = accuracy_score(y_test, y_pred)
            precision = precision_score(y_test, y_pred, average='macro')
            recall = recall_score(y_test, y_pred, average='macro')
            f1 = f1_score(y_test, y_pred, average='macro')

            # Store the metrics for this fold
            metrics.append({
                'accuracy': accuracy,
                'precision': precision,
                'recall': recall,
                'f1_score': f1
            })

        # Convert the metrics to a DataFrame for easy viewing
        metrics_df = pd.DataFrame(metrics)

        # Print the average metrics across all folds
        print("Average metrics across all folds:")
        print(metrics_df.mean())
        #endregion
    
    if apply_model == 'knn':
        #region KNN
        # Initialise StratifiedKFold
        skf = StratifiedKFold(n_splits=n_splits)

        # Initialize a list to store metrics for each fold
        metrics = []
        predictions = np.zeros(len(df), dtype=int)

        # # Define the parameter grid for grid search
        # param_grid = {
        #     'n_neighbors': [3, 5, 7, 9],
        #     'weights': ['uniform', 'distance'],
        #     'metric': ['euclidean', 'manhattan', 'minkowski']
        # }

        # Iterate through the splits
        for train_index, test_index in skf.split(X, y):
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]

            # Check if SMOTE can be applied
            if len(np.unique(y_train)) == 2 and min(np.bincount(y_train)) >= 6:
                X_train_res, y_train_res = smote.fit_resample(X_train, y_train)
            else:
                X_train_res, y_train_res = X_train, y_train

            # Scale the training data
            X_train_res = scaler.fit_transform(X_train_res)
            X_test = scaler.transform(X_test)

            # Initialize k-NN with GridSearchCV
            knn = KNeighborsClassifier()

            # Get the best model
            best_model = gridsearch_knn(knn, skf, X_train_res, y_train_res)

            # Predict on the test set
            y_pred = best_model.predict(X_test)
            predictions[test_index] = y_pred

            # Evaluate the model
            accuracy = accuracy_score(y_test, y_pred)
            precision = precision_score(y_test, y_pred, average='macro')
            recall = recall_score(y_test, y_pred, average='macro')
            f1 = f1_score(y_test, y_pred, average='macro')

            # Store the metrics for this fold
            metrics.append({
                'accuracy': accuracy,
                'precision': precision,
                'recall': recall,
                'f1_score': f1
            })
            

        # Convert the metrics to a DataFrame for easy viewing
        metrics_df = pd.DataFrame(metrics)

        # Print the evaluation metrics for each fold
        print(metrics_df)

        # Print the average metrics across all folds
        print("Average metrics across all folds:")
        print(metrics_df.mean())
        #endregion
    
    if apply_model == 'nb':
        #region Naive-Bayes

        # Initialise StratifiedKFold
        skf = StratifiedKFold(n_splits=n_splits)

        # Initialise a list to store metrics for each fold
        metrics = []
        predictions = np.zeros(len(df), dtype=int)

        

        # Iterate through the splits
        for train_index, test_index in skf.split(X, y):
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]

            # Check if SMOTE can be applied
            if len(np.unique(y_train)) == 2 and min(np.bincount(y_train)) >= 6:
                X_train_res, y_train_res = smote.fit_resample(X_train, y_train)
            else:
                X_train_res, y_train_res = X_train, y_train

            # Create a pipeline
            pipe = Pipeline([
                ('scaler', StandardScaler()),
                ('model', GaussianNB())
            ])
            # Get the best model from GridSearchCV
            best_model = gridsearch_nb(pipe, skf, X_train_res, y_train_res)

            # Predict on the test set
            y_pred = best_model.predict(X_test)
            predictions[test_index] = y_pred

            # Evaluate the model
            accuracy = accuracy_score(y_test, y_pred)
            precision = precision_score(y_test, y_pred, average='macro')
            recall = recall_score(y_test, y_pred, average='macro')
            f1 = f1_score(y_test, y_pred, average='macro')

            # Store the metrics for this fold
            metrics.append({
                'accuracy': accuracy,
                'precision': precision,
                'recall': recall,
                'f1_score': f1
            })

        # Convert the metrics to a DataFrame for easy viewing
        metrics_df = pd.DataFrame(metrics)

        # Print the average metrics across all folds
        print("Average metrics across all folds:")
        print(metrics_df.mean())
        #endregion

    if apply_model == 'xgboost':
        #region XGBoost
        # Initialise StratifiedKFold
        skf = StratifiedKFold(n_splits=n_splits)

        # Initialise a list to store metrics for each fold
        metrics = []
        predictions = np.zeros(len(df), dtype=int)
        
        # Iterate through the splits
        for train_index, test_index in skf.split(X, y):
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]

            # Check if SMOTE can be applied
            if len(np.unique(y_train)) == 2 and min(np.bincount(y_train)) >= 6:
                X_train_res, y_train_res = smote.fit_resample(X_train, y_train)
            else:
                X_train_res, y_train_res = X_train, y_train

            # Scale the training data
            X_train_res = scaler.fit_transform(X_train_res)
            X_test = scaler.transform(X_test)

            # Initialize the model
            model = xgb.XGBClassifier(objective='binary:logistic', random_state=42)

            # Get the best model from GridSearchCV
            best_model = gridsearch_xgb(model, skf, X_train_res, y_train_res)

            # Predictions and evaluations
            y_pred = best_model.predict(X_test)
            predictions[test_index] = y_pred

            # Evaluate the model
            accuracy = accuracy_score(y_test, y_pred)
            precision = precision_score(y_test, y_pred, average='macro')
            recall = recall_score(y_test, y_pred, average='macro')
            f1 = f1_score(y_test, y_pred, average='macro')

            # Store the metrics for this fold
            metrics.append({
                'accuracy': accuracy,
                'precision': precision,
                'recall': recall,
                'f1_score': f1
            })

        # Convert the metrics to a DataFrame for easy viewing
        metrics_df = pd.DataFrame(metrics)

        # Print the average metrics across all folds
        print("Average metrics across all folds:")
        print(metrics_df.mean())
        #endregion

    if apply_model == 'rf':
        #region RandomForest

        # Initialize StratifiedKFold
        skf = StratifiedKFold(n_splits=n_splits)

        # Initialize a list to store metrics for each fold
        metrics = []
        predictions = np.zeros(len(df), dtype=int)

        # Iterate through the splits
        for train_index, test_index in skf.split(X, y):
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]

            # Check if SMOTE can be applied
            if len(np.unique(y_train)) == 2 and min(np.bincount(y_train)) >= 6:
                X_train_res, y_train_res = smote.fit_resample(X_train, y_train)
            else:
                X_train_res, y_train_res = X_train, y_train

            # Scale the training data
            X_train_res = scaler.fit_transform(X_train_res)
            X_test = scaler.transform(X_test)

            # Initialize RandomForestClassifier
            model = RandomForestClassifier(random_state=42)

            # Get the best model from GridSearchCV
            best_model = gridsearch_rf(model, skf, X_train_res, y_train_res)

            # Predictions and evaluations
            y_pred = best_model.predict(X_test)
            predictions[test_index] = y_pred

            # Evaluate the model
            accuracy = accuracy_score(y_test, y_pred)
            precision = precision_score(y_test, y_pred, average='macro')
            recall = recall_score(y_test, y_pred, average='macro')
            f1 = f1_score(y_test, y_pred, average='macro')

            # Store the metrics for this fold
            metrics.append({
                'accuracy': accuracy,
                'precision': precision,
                'recall': recall,
                'f1_score': f1
            })

        # Convert the metrics to a DataFrame for easy viewing
        metrics_df = pd.DataFrame(metrics)

        # Print the average metrics across all folds
        print("Average metrics across all folds:")
        print(metrics_df.mean())
        #endregion
    
    
    print("Predictions made!") 
    print('///////////')   
    return predictions

## Logistic Regression

In [23]:
for num_activities, group_df in grouped_dfs.items():
    if (num_activities <= 2) | (num_activities >= 15) :
        print(f'The dataframes with {num_activities} activities were not analysed')
    else:
        print('Predicting for dataframe: ', num_activities )
        predictions = big_function(group_df, 'logReg')
        globals()[f'df_{num_activities}'] = group_df.copy()
        globals()[f'df_{num_activities}'].reset_index(drop=True, inplace=True)
        globals()[f'df_{num_activities}']['Predictions'] = predictions

The dataframes with 1 activities were not analysed
The dataframes with 2 activities were not analysed
Predicting for dataframe:  3
The model to be used is logReg
....
Performing Grid Search for LogisticRegression(class_weight='balanced') model....
It took 0:00:10.099615 to run.
Performing Grid Search for LogisticRegression(class_weight='balanced') model....


  _warn_prf(average, modifier, msg_start, len(result))


It took 0:00:03.344160 to run.
Performing Grid Search for LogisticRegression(class_weight='balanced') model....


  _warn_prf(average, modifier, msg_start, len(result))
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


It took 0:00:03.226891 to run.
Average metrics across all folds:
accuracy     0.979690
precision    0.706948
recall       0.751995
f1_score     0.683816
dtype: float64
Predictions made!
///////////
Predicting for dataframe:  4
The model to be used is logReg
....
Performing Grid Search for LogisticRegression(class_weight='balanced') model....
It took 0:00:06.120843 to run.
Performing Grid Search for LogisticRegression(class_weight='balanced') model....
It took 0:00:06.107696 to run.
Performing Grid Search for LogisticRegression(class_weight='balanced') model....
It took 0:00:06.260940 to run.
Average metrics across all folds:
accuracy     0.965354
precision    0.838481
recall       0.772817
f1_score     0.734910
dtype: float64
Predictions made!
///////////
Predicting for dataframe:  5
The model to be used is logReg
....
Performing Grid Search for LogisticRegression(class_weight='balanced') model....
It took 0:00:01.623669 to run.
Performing Grid Search for LogisticRegression(class_weigh

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


It took 0:00:01.453378 to run.
Average metrics across all folds:
accuracy     0.860704
precision    0.777725
recall       0.747272
f1_score     0.710785
dtype: float64
Predictions made!
///////////
Predicting for dataframe:  6
The model to be used is logReg
....
Performing Grid Search for LogisticRegression(class_weight='balanced') model....
It took 0:00:00.561971 to run.
Performing Grid Search for LogisticRegression(class_weight='balanced') model....
It took 0:00:00.555617 to run.
Performing Grid Search for LogisticRegression(class_weight='balanced') model....


  _warn_prf(average, modifier, msg_start, len(result))


It took 0:00:00.565078 to run.
Average metrics across all folds:
accuracy     0.924135
precision    0.720333
recall       0.744156
f1_score     0.716350
dtype: float64
Predictions made!
///////////
Predicting for dataframe:  7
The model to be used is logReg
....
Performing Grid Search for LogisticRegression(class_weight='balanced') model....
It took 0:00:00.427894 to run.
Performing Grid Search for LogisticRegression(class_weight='balanced') model....
It took 0:00:00.458338 to run.
Performing Grid Search for LogisticRegression(class_weight='balanced') model....
It took 0:00:00.438890 to run.
Average metrics across all folds:
accuracy     0.921707
precision    0.723400
recall       0.751355
f1_score     0.715023
dtype: float64
Predictions made!
///////////
Predicting for dataframe:  8
The model to be used is logReg
....
Performing Grid Search for LogisticRegression(class_weight='balanced') model....
It took 0:00:00.355061 to run.
Performing Grid Search for LogisticRegression(class_weigh

  _warn_prf(average, modifier, msg_start, len(result))


It took 0:00:00.369168 to run.
Average metrics across all folds:
accuracy     0.919584
precision    0.655971
recall       0.741385
f1_score     0.691884
dtype: float64
Predictions made!
///////////
Predicting for dataframe:  9
The model to be used is logReg
....
Performing Grid Search for LogisticRegression(class_weight='balanced') model....


  _warn_prf(average, modifier, msg_start, len(result))


It took 0:00:00.357232 to run.
Performing Grid Search for LogisticRegression(class_weight='balanced') model....


  _warn_prf(average, modifier, msg_start, len(result))


It took 0:00:00.352542 to run.
Performing Grid Search for LogisticRegression(class_weight='balanced') model....
It took 0:00:00.340054 to run.
Average metrics across all folds:
accuracy     0.858072
precision    0.657660
recall       0.723870
f1_score     0.681214
dtype: float64
Predictions made!
///////////
Predicting for dataframe:  10
The model to be used is logReg
....
Performing Grid Search for LogisticRegression(class_weight='balanced') model....




It took 0:00:00.292998 to run.
Performing Grid Search for LogisticRegression(class_weight='balanced') model....




It took 0:00:00.310191 to run.
Performing Grid Search for LogisticRegression(class_weight='balanced') model....


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


It took 0:00:00.269845 to run.
Average metrics across all folds:
accuracy     0.896970
precision    0.544192
recall       0.562037
f1_score     0.551780
dtype: float64
Predictions made!
///////////
Predicting for dataframe:  11
The model to be used is logReg
....
Performing Grid Search for LogisticRegression(class_weight='balanced') model....


  _warn_prf(average, modifier, msg_start, len(result))


It took 0:00:00.289270 to run.
Performing Grid Search for LogisticRegression(class_weight='balanced') model....




It took 0:00:00.253585 to run.
Performing Grid Search for LogisticRegression(class_weight='balanced') model....


  _warn_prf(average, modifier, msg_start, len(result))


It took 0:00:00.253583 to run.
Average metrics across all folds:
accuracy     0.787879
precision    0.637643
recall       0.644383
f1_score     0.639896
dtype: float64
Predictions made!
///////////
Predicting for dataframe:  12
The model to be used is logReg
....
Performing Grid Search for LogisticRegression(class_weight='balanced') model....
It took 0:00:00.193978 to run.


  _warn_prf(average, modifier, msg_start, len(result))


Performing Grid Search for LogisticRegression(class_weight='balanced') model....
It took 0:00:00.203518 to run.




Performing Grid Search for LogisticRegression(class_weight='balanced') model....




It took 0:00:00.212296 to run.
Average metrics across all folds:
accuracy     0.819841
precision    0.578836
recall       0.537902
f1_score     0.556268
dtype: float64
Predictions made!
///////////
Predicting for dataframe:  13
The model to be used is logReg
....
Performing Grid Search for LogisticRegression(class_weight='balanced') model....




It took 0:00:00.192000 to run.
Performing Grid Search for LogisticRegression(class_weight='balanced') model....
It took 0:00:00.176946 to run.


  _warn_prf(average, modifier, msg_start, len(result))


Performing Grid Search for LogisticRegression(class_weight='balanced') model....
It took 0:00:00.217303 to run.
Average metrics across all folds:
accuracy     0.593939
precision    0.488889
recall       0.461111
f1_score     0.451852
dtype: float64
Predictions made!
///////////
Predicting for dataframe:  14
The model to be used is logReg
....
Performing Grid Search for LogisticRegression(class_weight='balanced') model....
It took 0:00:00.182966 to run.
Performing Grid Search for LogisticRegression(class_weight='balanced') model....




It took 0:00:00.190330 to run.
Performing Grid Search for LogisticRegression(class_weight='balanced') model....
It took 0:00:00.194902 to run.
Average metrics across all folds:
accuracy     0.678571
precision    0.601852
recall       0.611111
f1_score     0.573016
dtype: float64
Predictions made!
///////////
The dataframes with 15 activities were not analysed
The dataframes with 16 activities were not analysed
The dataframes with 17 activities were not analysed
The dataframes with 18 activities were not analysed
The dataframes with 19 activities were not analysed
The dataframes with 20 activities were not analysed
The dataframes with 21 activities were not analysed
The dataframes with 25 activities were not analysed


  _warn_prf(average, modifier, msg_start, len(result))


## Support Vector Machines

In [24]:
for num_activities, group_df in grouped_dfs.items():
    if (num_activities <= 2) | (num_activities >= 15) :
        print(f'The dataframes with {num_activities} activities were not analysed')
    else:
        print('Predicting for dataframe: ', num_activities )
        predictions = big_function(group_df, 'svm')
        globals()[f'df_{num_activities}'] = group_df.copy()
        globals()[f'df_{num_activities}'].reset_index(drop=True, inplace=True)
        globals()[f'df_{num_activities}']['Predictions'] = predictions

The dataframes with 1 activities were not analysed
The dataframes with 2 activities were not analysed
Predicting for dataframe:  3
The model to be used is svm
....
Performing Grid Search for SVC(random_state=42) model....
It took 0:00:00.541615 to run.
Performing Grid Search for SVC(random_state=42) model....
It took 0:00:00.650827 to run.
Performing Grid Search for SVC(random_state=42) model....
It took 0:00:00.653659 to run.
Average metrics across all folds:
accuracy     0.982009
precision    0.800313
recall       0.788942
f1_score     0.755069
dtype: float64
Predictions made!
///////////
Predicting for dataframe:  4
The model to be used is svm
....
Performing Grid Search for SVC(random_state=42) model....
It took 0:00:06.178961 to run.
Performing Grid Search for SVC(random_state=42) model....
It took 0:00:05.180611 to run.
Performing Grid Search for SVC(random_state=42) model....
It took 0:00:05.433379 to run.
Average metrics across all folds:
accuracy     0.965307
precision    0.83

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


It took 0:00:00.065360 to run.
Performing Grid Search for SVC(random_state=42) model....
It took 0:00:00.029682 to run.
Average metrics across all folds:
accuracy     0.716828
precision    0.628900
recall       0.668878
f1_score     0.628739
dtype: float64
Predictions made!
///////////
Predicting for dataframe:  10
The model to be used is svm
....
Performing Grid Search for SVC(random_state=42) model....
It took 0:00:00.031886 to run.
Performing Grid Search for SVC(random_state=42) model....
It took 0:00:00.046869 to run.
Performing Grid Search for SVC(random_state=42) model....


  _warn_prf(average, modifier, msg_start, len(result))


It took 0:00:00.041685 to run.
Average metrics across all folds:
accuracy     0.860606
precision    0.572842
recall       0.585847
f1_score     0.570417
dtype: float64
Predictions made!
///////////
Predicting for dataframe:  11
The model to be used is svm
....
Performing Grid Search for SVC(random_state=42) model....
It took 0:00:00.108078 to run.
Performing Grid Search for SVC(random_state=42) model....
It took 0:00:00.033058 to run.
Performing Grid Search for SVC(random_state=42) model....


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


It took 0:00:00.049291 to run.
Average metrics across all folds:
accuracy     0.747475
precision    0.650847
recall       0.652381
f1_score     0.632215
dtype: float64
Predictions made!
///////////
Predicting for dataframe:  12
The model to be used is svm
....
Performing Grid Search for SVC(random_state=42) model....
It took 0:00:00.032524 to run.
Performing Grid Search for SVC(random_state=42) model....
It took 0:00:00.032892 to run.
Performing Grid Search for SVC(random_state=42) model....
It took 0:00:00.050400 to run.
Average metrics across all folds:
accuracy     0.654762
precision    0.512698
recall       0.430301
f1_score     0.455086
dtype: float64
Predictions made!
///////////
Predicting for dataframe:  13
The model to be used is svm
....
Performing Grid Search for SVC(random_state=42) model....
It took 0:00:00.030068 to run.




Performing Grid Search for SVC(random_state=42) model....
It took 0:00:00.130642 to run.
Performing Grid Search for SVC(random_state=42) model....
It took 0:00:00.046875 to run.
Average metrics across all folds:
accuracy     0.593939
precision    0.418056
recall       0.438889
f1_score     0.400529
dtype: float64
Predictions made!
///////////
Predicting for dataframe:  14
The model to be used is svm
....


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Performing Grid Search for SVC(random_state=42) model....
It took 0:00:00.046889 to run.
Performing Grid Search for SVC(random_state=42) model....
It took 0:00:00.046873 to run.
Performing Grid Search for SVC(random_state=42) model....
It took 0:00:00.039643 to run.
Average metrics across all folds:
accuracy     0.517857
precision    0.347222
recall       0.481481
f1_score     0.370723
dtype: float64
Predictions made!
///////////
The dataframes with 15 activities were not analysed
The dataframes with 16 activities were not analysed
The dataframes with 17 activities were not analysed
The dataframes with 18 activities were not analysed
The dataframes with 19 activities were not analysed
The dataframes with 20 activities were not analysed
The dataframes with 21 activities were not analysed
The dataframes with 25 activities were not analysed


  _warn_prf(average, modifier, msg_start, len(result))


## kNN

In [25]:
for num_activities, group_df in grouped_dfs.items():
    if (num_activities <= 2) | (num_activities >= 15) :
        print(f'The dataframes with {num_activities} activities were not analysed')
    else:
        print('Predicting for dataframe: ', num_activities )
        predictions = big_function(group_df, 'knn')
        globals()[f'df_{num_activities}'] = group_df.copy()
        globals()[f'df_{num_activities}'].reset_index(drop=True, inplace=True)
        globals()[f'df_{num_activities}']['Predictions'] = predictions

The dataframes with 1 activities were not analysed
The dataframes with 2 activities were not analysed
Predicting for dataframe:  3
The model to be used is knn
....
Performing Grid Search for KNeighborsClassifier() model....
It took 0:00:09.715191 to run.
Performing Grid Search for KNeighborsClassifier() model....


  _warn_prf(average, modifier, msg_start, len(result))


It took 0:00:06.700608 to run.


  _warn_prf(average, modifier, msg_start, len(result))


Performing Grid Search for KNeighborsClassifier() model....
It took 0:00:07.020016 to run.
   accuracy  precision    recall  f1_score
0  0.974131   0.600000  0.750000  0.642857
1  0.983027   0.650974  0.750000  0.688259
2  0.975515   0.826594  0.759409  0.685108
Average metrics across all folds:
accuracy     0.977558
precision    0.692523
recall       0.753136
f1_score     0.672075
dtype: float64
Predictions made!
///////////
Predicting for dataframe:  4
The model to be used is knn
....
Performing Grid Search for KNeighborsClassifier() model....
It took 0:00:30.853873 to run.


  _warn_prf(average, modifier, msg_start, len(result))


Performing Grid Search for KNeighborsClassifier() model....
It took 0:00:30.972355 to run.


  _warn_prf(average, modifier, msg_start, len(result))


Performing Grid Search for KNeighborsClassifier() model....
It took 0:00:30.890015 to run.
   accuracy  precision    recall  f1_score
0  0.847130   0.494887  0.543269  0.503735
1  0.979693   0.717800  0.750000  0.732792
2  0.979549   0.780633  0.741620  0.731258
Average metrics across all folds:
accuracy     0.935457
precision    0.664440
recall       0.678296
f1_score     0.655928
dtype: float64
Predictions made!
///////////
Predicting for dataframe:  5
The model to be used is knn
....
Performing Grid Search for KNeighborsClassifier() model....
It took 0:00:04.677644 to run.
Performing Grid Search for KNeighborsClassifier() model....
It took 0:00:04.757929 to run.
Performing Grid Search for KNeighborsClassifier() model....
It took 0:00:04.407835 to run.
   accuracy  precision    recall  f1_score
0  0.367554   0.420350  0.719697  0.477311
1  0.854668   0.816757  0.747999  0.723703
2  0.865173   0.817479  0.752551  0.731777
Average metrics across all folds:
accuracy     0.695799
precisi

  _warn_prf(average, modifier, msg_start, len(result))


It took 0:00:00.312571 to run.
Performing Grid Search for KNeighborsClassifier() model....


  _warn_prf(average, modifier, msg_start, len(result))


It took 0:00:00.302173 to run.
   accuracy  precision    recall  f1_score
0  0.885246   0.505479  0.512255  0.508221
1  0.917582   0.454315  0.500000  0.475108
2  0.923077   0.707123  0.583333  0.601977
Average metrics across all folds:
accuracy     0.908635
precision    0.555639
recall       0.531863
f1_score     0.528435
dtype: float64
Predictions made!
///////////
Predicting for dataframe:  9
The model to be used is knn
....
Performing Grid Search for KNeighborsClassifier() model....


  _warn_prf(average, modifier, msg_start, len(result))


It took 0:00:00.254062 to run.
Performing Grid Search for KNeighborsClassifier() model....
It took 0:00:00.236432 to run.
Performing Grid Search for KNeighborsClassifier() model....
It took 0:00:00.250297 to run.
   accuracy  precision    recall  f1_score
0  0.865385   0.689095  0.649830  0.659442
1  0.864078   0.755357  0.747789  0.746283
2  0.864078   0.529274  0.509694  0.502501
Average metrics across all folds:
accuracy     0.864513
precision    0.657909
recall       0.635771
f1_score     0.636076
dtype: float64
Predictions made!
///////////
Predicting for dataframe:  10
The model to be used is knn
....
Performing Grid Search for KNeighborsClassifier() model....


  _warn_prf(average, modifier, msg_start, len(result))


It took 0:00:00.188044 to run.
Performing Grid Search for KNeighborsClassifier() model....


  _warn_prf(average, modifier, msg_start, len(result))


It took 0:00:00.209279 to run.
Performing Grid Search for KNeighborsClassifier() model....


  _warn_prf(average, modifier, msg_start, len(result))


It took 0:00:00.205831 to run.
   accuracy  precision    recall  f1_score
0  0.945455   0.625000  0.666667  0.644444
1  0.909091   0.602564  0.655556  0.625556
2  0.890909   0.467391  0.479762  0.473035
Average metrics across all folds:
accuracy     0.915152
precision    0.564985
recall       0.600661
f1_score     0.581012
dtype: float64
Predictions made!
///////////
Predicting for dataframe:  11
The model to be used is knn
....
Performing Grid Search for KNeighborsClassifier() model....
It took 0:00:00.172351 to run.
Performing Grid Search for KNeighborsClassifier() model....


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


It took 0:00:00.187963 to run.
Performing Grid Search for KNeighborsClassifier() model....


  _warn_prf(average, modifier, msg_start, len(result))


It took 0:00:00.187845 to run.
   accuracy  precision    recall  f1_score
0  0.848485   0.427778  0.500000  0.460129
1  0.666667   0.446429  0.443269  0.440627
2  0.878788   0.447368  0.500000  0.470588
Average metrics across all folds:
accuracy     0.797980
precision    0.440525
recall       0.481090
f1_score     0.457115
dtype: float64
Predictions made!
///////////
Predicting for dataframe:  12
The model to be used is knn
....
Performing Grid Search for KNeighborsClassifier() model....


  _warn_prf(average, modifier, msg_start, len(result))


It took 0:00:00.251192 to run.
Performing Grid Search for KNeighborsClassifier() model....
It took 0:00:00.192859 to run.


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Performing Grid Search for KNeighborsClassifier() model....
It took 0:00:00.170957 to run.
   accuracy  precision    recall  f1_score
0  0.857143   0.464286  0.480769  0.472222
1  0.900000   0.597222  0.638889  0.616667
2  0.850000   0.619048  0.571429  0.585470
Average metrics across all folds:
accuracy     0.869048
precision    0.560185
recall       0.563696
f1_score     0.558120
dtype: float64
Predictions made!
///////////
Predicting for dataframe:  13
The model to be used is knn
....
Performing Grid Search for KNeighborsClassifier() model....




It took 0:00:01.426008 to run.
Performing Grid Search for KNeighborsClassifier() model....
It took 0:00:01.084601 to run.
Performing Grid Search for KNeighborsClassifier() model....


  _warn_prf(average, modifier, msg_start, len(result))


It took 0:00:01.076229 to run.
   accuracy  precision  recall  f1_score
0  0.545455   0.416667    0.45  0.431818
1  0.727273   0.416667    0.45  0.431818
2  0.500000   0.481481    0.40  0.316239
Average metrics across all folds:
accuracy     0.590909
precision    0.438272
recall       0.433333
f1_score     0.393292
dtype: float64
Predictions made!
///////////
Predicting for dataframe:  14
The model to be used is knn
....
Performing Grid Search for KNeighborsClassifier() model....


  _warn_prf(average, modifier, msg_start, len(result))


It took 0:00:01.696433 to run.
Performing Grid Search for KNeighborsClassifier() model....


  _warn_prf(average, modifier, msg_start, len(result))


It took 0:00:01.543327 to run.
Performing Grid Search for KNeighborsClassifier() model....




It took 0:00:01.613638 to run.
   accuracy  precision    recall  f1_score
0  0.750000   0.500000  0.583333  0.535714
1  0.428571   0.200000  0.333333  0.250000
2  0.571429   0.366667  0.444444  0.383333
Average metrics across all folds:
accuracy     0.583333
precision    0.355556
recall       0.453704
f1_score     0.389683
dtype: float64
Predictions made!
///////////
The dataframes with 15 activities were not analysed
The dataframes with 16 activities were not analysed
The dataframes with 17 activities were not analysed
The dataframes with 18 activities were not analysed
The dataframes with 19 activities were not analysed
The dataframes with 20 activities were not analysed
The dataframes with 21 activities were not analysed
The dataframes with 25 activities were not analysed


  _warn_prf(average, modifier, msg_start, len(result))


## Naive-Bayes

In [26]:
for num_activities, group_df in grouped_dfs.items():
    if (num_activities <= 2) | (num_activities >= 15) :
        print(f'The dataframes with {num_activities} activities were not analysed')
    else:
        print('Predicting for dataframe: ', num_activities )
        predictions = big_function(group_df, 'nb')
        globals()[f'df_{num_activities}'] = group_df.copy()
        globals()[f'df_{num_activities}'].reset_index(drop=True, inplace=True)
        globals()[f'df_{num_activities}']['Predictions'] = predictions

The dataframes with 1 activities were not analysed
The dataframes with 2 activities were not analysed
Predicting for dataframe:  3
The model to be used is nb
....
Performing Grid Search for Pipeline(steps=[('scaler', StandardScaler()), ('model', GaussianNB())]) model....
It took 0:00:00.046897 to run.
Performing Grid Search for Pipeline(steps=[('scaler', StandardScaler()), ('model', GaussianNB())]) model....
It took 0:00:00.062497 to run.
Performing Grid Search for Pipeline(steps=[('scaler', StandardScaler()), ('model', GaussianNB())]) model....
It took 0:00:00.062497 to run.
Average metrics across all folds:
accuracy     0.979783
precision    0.794465
recall       0.767436
f1_score     0.714817
dtype: float64
Predictions made!
///////////
Predicting for dataframe:  4
The model to be used is nb
....
Performing Grid Search for Pipeline(steps=[('scaler', StandardScaler()), ('model', GaussianNB())]) model....
It took 0:00:00.071618 to run.
Performing Grid Search for Pipeline(steps=[('scal

  _warn_prf(average, modifier, msg_start, len(result))


Performing Grid Search for Pipeline(steps=[('scaler', StandardScaler()), ('model', GaussianNB())]) model....
It took 0:00:00.078130 to run.
Average metrics across all folds:
accuracy     0.821397
precision    0.780225
recall       0.746823
f1_score     0.737766
dtype: float64
Predictions made!
///////////
Predicting for dataframe:  6
The model to be used is nb
....
Performing Grid Search for Pipeline(steps=[('scaler', StandardScaler()), ('model', GaussianNB())]) model....
It took 0:00:00.046867 to run.
Performing Grid Search for Pipeline(steps=[('scaler', StandardScaler()), ('model', GaussianNB())]) model....
It took 0:00:00.046869 to run.
Performing Grid Search for Pipeline(steps=[('scaler', StandardScaler()), ('model', GaussianNB())]) model....
It took 0:00:00.031259 to run.
Average metrics across all folds:
accuracy     0.918430
precision    0.780017
recall       0.752780
f1_score     0.753316
dtype: float64
Predictions made!
///////////
Predicting for dataframe:  7
The model to be 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


It took 0:00:00.031260 to run.
Performing Grid Search for Pipeline(steps=[('scaler', StandardScaler()), ('model', GaussianNB())]) model....
It took 0:00:00.031229 to run.
Average metrics across all folds:
accuracy     0.752365
precision    0.675399
recall       0.714629
f1_score     0.668808
dtype: float64
Predictions made!
///////////
Predicting for dataframe:  10
The model to be used is nb
....
Performing Grid Search for Pipeline(steps=[('scaler', StandardScaler()), ('model', GaussianNB())]) model....
It took 0:00:00.031222 to run.
Performing Grid Search for Pipeline(steps=[('scaler', StandardScaler()), ('model', GaussianNB())]) model....
It took 0:00:00.050705 to run.
Performing Grid Search for Pipeline(steps=[('scaler', StandardScaler()), ('model', GaussianNB())]) model....
It took 0:00:00.042800 to run.
Average metrics across all folds:
accuracy     0.630303
precision    0.505454
recall       0.515873
f1_score     0.410635
dtype: float64
Predictions made!
///////////
Predicting fo

  _warn_prf(average, modifier, msg_start, len(result))


It took 0:00:00.031233 to run.
Performing Grid Search for Pipeline(steps=[('scaler', StandardScaler()), ('model', GaussianNB())]) model....
It took 0:00:00.031263 to run.
Performing Grid Search for Pipeline(steps=[('scaler', StandardScaler()), ('model', GaussianNB())]) model....
It took 0:00:00.031226 to run.
Average metrics across all folds:
accuracy     0.747475
precision    0.618655
recall       0.622222
f1_score     0.616171
dtype: float64
Predictions made!
///////////
Predicting for dataframe:  12
The model to be used is nb
....
Performing Grid Search for Pipeline(steps=[('scaler', StandardScaler()), ('model', GaussianNB())]) model....
It took 0:00:00.031479 to run.
Performing Grid Search for Pipeline(steps=[('scaler', StandardScaler()), ('model', GaussianNB())]) model....
It took 0:00:00.031023 to run.
Performing Grid Search for Pipeline(steps=[('scaler', StandardScaler()), ('model', GaussianNB())]) model....


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


It took 0:00:00.030867 to run.
Average metrics across all folds:
accuracy     0.869048
precision    0.643651
recall       0.678775
f1_score     0.650893
dtype: float64
Predictions made!
///////////
Predicting for dataframe:  13
The model to be used is nb
....
Performing Grid Search for Pipeline(steps=[('scaler', StandardScaler()), ('model', GaussianNB())]) model....
It took 0:00:00.031256 to run.
Performing Grid Search for Pipeline(steps=[('scaler', StandardScaler()), ('model', GaussianNB())]) model....
It took 0:00:00.031258 to run.
Performing Grid Search for Pipeline(steps=[('scaler', StandardScaler()), ('model', GaussianNB())]) model....
It took 0:00:00.031257 to run.
Average metrics across all folds:
accuracy     0.684848
precision    0.527513
recall       0.588889
f1_score     0.531481
dtype: float64
Predictions made!
///////////
Predicting for dataframe:  14
The model to be used is nb
....
Performing Grid Search for Pipeline(steps=[('scaler', StandardScaler()), ('model', Gaussian

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


It took 0:00:00.031241 to run.
Average metrics across all folds:
accuracy     0.690476
precision    0.505556
recall       0.629630
f1_score     0.526720
dtype: float64
Predictions made!
///////////
The dataframes with 15 activities were not analysed
The dataframes with 16 activities were not analysed
The dataframes with 17 activities were not analysed
The dataframes with 18 activities were not analysed
The dataframes with 19 activities were not analysed
The dataframes with 20 activities were not analysed
The dataframes with 21 activities were not analysed
The dataframes with 25 activities were not analysed


  _warn_prf(average, modifier, msg_start, len(result))


## Random Forest

In [27]:
for num_activities, group_df in grouped_dfs.items():
    if (num_activities <= 2) | (num_activities >= 15) :
        print(f'The dataframes with {num_activities} activities were not analysed')
    else:
        print('Predicting for dataframe: ', num_activities )
        predictions = big_function(group_df, 'rf')
        globals()[f'df_{num_activities}'] = group_df.copy()
        globals()[f'df_{num_activities}'].reset_index(drop=True, inplace=True)
        globals()[f'df_{num_activities}']['Predictions'] = predictions

The dataframes with 1 activities were not analysed
The dataframes with 2 activities were not analysed
Predicting for dataframe:  3
The model to be used is rf
....
Performing Grid Search for RandomForestClassifier(random_state=42) model....
It took 0:01:06.745059 to run.
Performing Grid Search for RandomForestClassifier(random_state=42) model....


  _warn_prf(average, modifier, msg_start, len(result))


It took 0:01:04.352243 to run.
Performing Grid Search for RandomForestClassifier(random_state=42) model....


  _warn_prf(average, modifier, msg_start, len(result))


It took 0:01:06.903339 to run.
Average metrics across all folds:
accuracy     0.982751
precision    0.649681
recall       0.749104
f1_score     0.687117
dtype: float64
Predictions made!
///////////
Predicting for dataframe:  4
The model to be used is rf
....
Performing Grid Search for RandomForestClassifier(random_state=42) model....


  _warn_prf(average, modifier, msg_start, len(result))


It took 0:01:29.797395 to run.
Performing Grid Search for RandomForestClassifier(random_state=42) model....


  _warn_prf(average, modifier, msg_start, len(result))


It took 0:01:25.387993 to run.
Performing Grid Search for RandomForestClassifier(random_state=42) model....


  _warn_prf(average, modifier, msg_start, len(result))


It took 0:01:22.983796 to run.
Average metrics across all folds:
accuracy     0.979598
precision    0.745898
recall       0.750241
f1_score     0.733974
dtype: float64
Predictions made!
///////////
Predicting for dataframe:  5
The model to be used is rf
....
Performing Grid Search for RandomForestClassifier(random_state=42) model....
It took 0:00:57.318255 to run.
Performing Grid Search for RandomForestClassifier(random_state=42) model....


  _warn_prf(average, modifier, msg_start, len(result))


It took 0:00:58.255436 to run.
Performing Grid Search for RandomForestClassifier(random_state=42) model....


  _warn_prf(average, modifier, msg_start, len(result))


It took 0:00:54.128282 to run.
Average metrics across all folds:
accuracy     0.866197
precision    0.732489
recall       0.750440
f1_score     0.726496
dtype: float64
Predictions made!
///////////
Predicting for dataframe:  6
The model to be used is rf
....
Performing Grid Search for RandomForestClassifier(random_state=42) model....
It took 0:00:44.468159 to run.
Performing Grid Search for RandomForestClassifier(random_state=42) model....


  _warn_prf(average, modifier, msg_start, len(result))


It took 0:00:48.145648 to run.
Performing Grid Search for RandomForestClassifier(random_state=42) model....


  _warn_prf(average, modifier, msg_start, len(result))


It took 0:00:51.975371 to run.
Average metrics across all folds:
accuracy     0.929835
precision    0.728769
recall       0.750848
f1_score     0.726805
dtype: float64
Predictions made!
///////////
Predicting for dataframe:  7
The model to be used is rf
....
Performing Grid Search for RandomForestClassifier(random_state=42) model....
It took 0:00:50.722532 to run.
Performing Grid Search for RandomForestClassifier(random_state=42) model....


  _warn_prf(average, modifier, msg_start, len(result))


It took 0:00:46.167837 to run.
Performing Grid Search for RandomForestClassifier(random_state=42) model....
It took 0:00:51.853160 to run.
Average metrics across all folds:
accuracy     0.928306
precision    0.676815
recall       0.747339
f1_score     0.705847
dtype: float64
Predictions made!
///////////
Predicting for dataframe:  8
The model to be used is rf
....
Performing Grid Search for RandomForestClassifier(random_state=42) model....


  _warn_prf(average, modifier, msg_start, len(result))


It took 0:00:45.954573 to run.
Performing Grid Search for RandomForestClassifier(random_state=42) model....
It took 0:00:43.677319 to run.
Performing Grid Search for RandomForestClassifier(random_state=42) model....


  _warn_prf(average, modifier, msg_start, len(result))


It took 0:00:42.910787 to run.
Average metrics across all folds:
accuracy     0.926870
precision    0.767186
recall       0.717284
f1_score     0.721515
dtype: float64
Predictions made!
///////////
Predicting for dataframe:  9
The model to be used is rf
....
Performing Grid Search for RandomForestClassifier(random_state=42) model....
It took 0:00:46.046891 to run.
Performing Grid Search for RandomForestClassifier(random_state=42) model....
It took 0:00:46.092940 to run.
Performing Grid Search for RandomForestClassifier(random_state=42) model....
It took 0:00:45.295516 to run.
Average metrics across all folds:
accuracy     0.880601
precision    0.800042
recall       0.751325
f1_score     0.743874
dtype: float64
Predictions made!
///////////
Predicting for dataframe:  10
The model to be used is rf
....
Performing Grid Search for RandomForestClassifier(random_state=42) model....




It took 0:00:47.051532 to run.
Performing Grid Search for RandomForestClassifier(random_state=42) model....


  _warn_prf(average, modifier, msg_start, len(result))


It took 0:00:46.404614 to run.
Performing Grid Search for RandomForestClassifier(random_state=42) model....


  _warn_prf(average, modifier, msg_start, len(result))


It took 0:00:44.338462 to run.
Average metrics across all folds:
accuracy     0.933333
precision    0.566340
recall       0.611111
f1_score     0.587120
dtype: float64
Predictions made!
///////////
Predicting for dataframe:  11
The model to be used is rf
....
Performing Grid Search for RandomForestClassifier(random_state=42) model....


  _warn_prf(average, modifier, msg_start, len(result))


It took 0:00:43.058289 to run.
Performing Grid Search for RandomForestClassifier(random_state=42) model....


  _warn_prf(average, modifier, msg_start, len(result))


It took 0:00:44.990137 to run.
Performing Grid Search for RandomForestClassifier(random_state=42) model....


  _warn_prf(average, modifier, msg_start, len(result))


It took 0:00:41.768369 to run.
Average metrics across all folds:
accuracy     0.848485
precision    0.548796
recall       0.587103
f1_score     0.565400
dtype: float64
Predictions made!
///////////
Predicting for dataframe:  12
The model to be used is rf
....
Performing Grid Search for RandomForestClassifier(random_state=42) model....


  _warn_prf(average, modifier, msg_start, len(result))


It took 0:00:42.482092 to run.
Performing Grid Search for RandomForestClassifier(random_state=42) model....


  _warn_prf(average, modifier, msg_start, len(result))


It took 0:00:43.301453 to run.
Performing Grid Search for RandomForestClassifier(random_state=42) model....




It took 0:00:55.550221 to run.
Average metrics across all folds:
accuracy     0.902381
precision    0.568305
recall       0.587963
f1_score     0.577566
dtype: float64
Predictions made!
///////////
Predicting for dataframe:  13
The model to be used is rf
....
Performing Grid Search for RandomForestClassifier(random_state=42) model....


  _warn_prf(average, modifier, msg_start, len(result))


It took 0:00:51.943883 to run.
Performing Grid Search for RandomForestClassifier(random_state=42) model....
It took 0:00:49.021616 to run.
Performing Grid Search for RandomForestClassifier(random_state=42) model....


  _warn_prf(average, modifier, msg_start, len(result))


It took 0:00:52.634616 to run.
Average metrics across all folds:
accuracy     0.745455
precision    0.565079
recall       0.577778
f1_score     0.545899
dtype: float64
Predictions made!
///////////
Predicting for dataframe:  14
The model to be used is rf
....
Performing Grid Search for RandomForestClassifier(random_state=42) model....


  _warn_prf(average, modifier, msg_start, len(result))


It took 0:00:49.664770 to run.
Performing Grid Search for RandomForestClassifier(random_state=42) model....




It took 0:00:48.473441 to run.
Performing Grid Search for RandomForestClassifier(random_state=42) model....


  _warn_prf(average, modifier, msg_start, len(result))


It took 0:00:46.989787 to run.
Average metrics across all folds:
accuracy     0.821429
precision    0.583333
recall       0.638889
f1_score     0.607143
dtype: float64
Predictions made!
///////////
The dataframes with 15 activities were not analysed
The dataframes with 16 activities were not analysed
The dataframes with 17 activities were not analysed
The dataframes with 18 activities were not analysed
The dataframes with 19 activities were not analysed
The dataframes with 20 activities were not analysed
The dataframes with 21 activities were not analysed
The dataframes with 25 activities were not analysed


  _warn_prf(average, modifier, msg_start, len(result))


# Deployment

Comparing the evaluation metrics of each model by data frame, the best one was Random Forest for all data frames. 

In [28]:
for num_activities, group_df in grouped_dfs.items():
    if (num_activities <= 2) | (num_activities >= 15):
        print(f'The dataframes with {num_activities} activities were not analysed')
    else:
        print('Predicting for dataframe: ', num_activities)
        predictions = big_function(group_df, 'rf')
        
        # Create a new DataFrame to store the results
        result_df = group_df.copy()
        result_df.reset_index(drop=True, inplace=True)
        
        # Add the predictions as a new column
        result_df['Predictions'] = predictions
        
        # Exclude columns with NaN values
        columns_to_exclude = result_df.columns[result_df.isnull().any()].tolist()
        result_df = result_df.drop(columns=columns_to_exclude)
        
        # Store the result DataFrame in a global variable for further use if needed
        globals()[f'df_{num_activities}'] = result_df
        
        # Print the result DataFrame with the target, predictions, and activity columns
        print(f'Result DataFrame for {num_activities} activities:')
        activity_columns = [col for col in result_df.columns if 'activity' in col.lower()]  # Assuming activity columns contain 'activity' in their name
        columns_to_display = ['Target', 'Predictions'] + activity_columns
        print(result_df[columns_to_display])

The dataframes with 1 activities were not analysed
The dataframes with 2 activities were not analysed
Predicting for dataframe:  3
The model to be used is rf
....
Performing Grid Search for RandomForestClassifier(random_state=42) model....
It took 0:01:07.828247 to run.
Performing Grid Search for RandomForestClassifier(random_state=42) model....


  _warn_prf(average, modifier, msg_start, len(result))


It took 0:01:09.124623 to run.
Performing Grid Search for RandomForestClassifier(random_state=42) model....


  _warn_prf(average, modifier, msg_start, len(result))


It took 0:01:12.973311 to run.
Average metrics across all folds:
accuracy     0.982751
precision    0.649681
recall       0.749104
f1_score     0.687117
dtype: float64
Predictions made!
///////////
Result DataFrame for 3 activities:
       Target  Predictions  Activity_0  Activity_1  Activity_2
0           1            1       43493      3137.0     13897.0
1           1            1       43493      3137.0     13897.0
2           2            2       43493     39338.0      1726.0
3           1            1       43493      3137.0     13897.0
4           1            1       43493      3137.0     13897.0
...       ...          ...         ...         ...         ...
10778       1            1       43493     39338.0     13897.0
10779       2            2       43493     39338.0      1726.0
10780       1            1       43493     39338.0     13897.0
10781       1            1       43493     39338.0     13897.0
10782       1            1       43493     39338.0     13897.0

[10783 row

  _warn_prf(average, modifier, msg_start, len(result))


It took 0:01:29.173699 to run.
Performing Grid Search for RandomForestClassifier(random_state=42) model....


  _warn_prf(average, modifier, msg_start, len(result))


It took 0:01:33.096333 to run.
Performing Grid Search for RandomForestClassifier(random_state=42) model....


  _warn_prf(average, modifier, msg_start, len(result))


It took 0:01:31.103249 to run.
Average metrics across all folds:
accuracy     0.979598
precision    0.745898
recall       0.750241
f1_score     0.733974
dtype: float64
Predictions made!
///////////
Result DataFrame for 4 activities:
       Target  Predictions  Activity_0  Activity_1  Activity_2  Activity_3
0           1            1       43493     39338.0     23072.0     24190.0
1           1            1       43493     39338.0     23072.0     24190.0
2           1            1       43493     39338.0     23072.0     24190.0
3           1            1       43493     39338.0     23072.0     24190.0
4           1            1       43493     39338.0     23072.0     24190.0
...       ...          ...         ...         ...         ...         ...
21267       1            1       43493     39338.0     23072.0     24190.0
21268       1            1       43493     39338.0     23072.0     24190.0
21269       3            3       43493     39338.0     13897.0      3460.0
21270       1    

  _warn_prf(average, modifier, msg_start, len(result))


It took 0:01:04.602003 to run.
Performing Grid Search for RandomForestClassifier(random_state=42) model....


  _warn_prf(average, modifier, msg_start, len(result))


It took 0:01:01.828336 to run.
Average metrics across all folds:
accuracy     0.866197
precision    0.732489
recall       0.750440
f1_score     0.726496
dtype: float64
Predictions made!
///////////
Result DataFrame for 5 activities:
      Target  Predictions  Activity_0  Activity_1  Activity_2  Activity_3  \
0          3            3       43493     39338.0     23072.0     24190.0   
1          3            3       43493     39338.0     23072.0     24190.0   
2          1            1       43493     39338.0      3533.0      1768.0   
3          1            1       43493     39338.0      3533.0      1768.0   
4          1            1       43493     39338.0      3533.0      1768.0   
...      ...          ...         ...         ...         ...         ...   
7095       3            3       43493     39338.0     23072.0     24190.0   
7096       3            3       43493     39338.0      3533.0     24190.0   
7097       1            1       43493     39338.0      3533.0      1768.0 

  _warn_prf(average, modifier, msg_start, len(result))


It took 0:00:46.207839 to run.
Performing Grid Search for RandomForestClassifier(random_state=42) model....


  _warn_prf(average, modifier, msg_start, len(result))


It took 0:00:47.188868 to run.
Average metrics across all folds:
accuracy     0.929835
precision    0.728769
recall       0.750848
f1_score     0.726805
dtype: float64
Predictions made!
///////////
Result DataFrame for 6 activities:
      Target  Predictions  Activity_0  Activity_1  Activity_2  Activity_3  \
0          3            3       43493     39338.0      3533.0      1768.0   
1          1            1       43493     39338.0      1726.0      2591.0   
2          1            1       43493     39338.0      1726.0      2591.0   
3          1            1       43493     39338.0      1726.0      2591.0   
4          1            1       43493     39338.0     23072.0      2591.0   
...      ...          ...         ...         ...         ...         ...   
1748       3            3       43493     39338.0      3533.0      1768.0   
1749       1            1       43493     39338.0      1726.0       259.0   
1750       3            3       43493     39338.0      3533.0      1768.0 

  _warn_prf(average, modifier, msg_start, len(result))


It took 0:00:45.960628 to run.
Performing Grid Search for RandomForestClassifier(random_state=42) model....
It took 0:00:45.167404 to run.
Average metrics across all folds:
accuracy     0.928306
precision    0.676815
recall       0.747339
f1_score     0.705847
dtype: float64
Predictions made!
///////////
Result DataFrame for 7 activities:
      Target  Predictions  Activity_0  Activity_1  Activity_2  Activity_3  \
0          1            1       43493     39338.0      1726.0      2591.0   
1          1            1       43493        90.0      3533.0      1768.0   
2          3            3       43493     39338.0      1726.0      2591.0   
3          3            3       43493     39338.0      1726.0      2591.0   
4          3            3       43493     39338.0     23072.0      2591.0   
...      ...          ...         ...         ...         ...         ...   
1055       1            1       43493     39338.0     23072.0     24190.0   
1056       1            1       43493     3

  _warn_prf(average, modifier, msg_start, len(result))


It took 0:00:44.972542 to run.
Performing Grid Search for RandomForestClassifier(random_state=42) model....
It took 0:00:44.552524 to run.
Performing Grid Search for RandomForestClassifier(random_state=42) model....


  _warn_prf(average, modifier, msg_start, len(result))


It took 0:00:42.036088 to run.
Average metrics across all folds:
accuracy     0.926870
precision    0.767186
recall       0.717284
f1_score     0.721515
dtype: float64
Predictions made!
///////////
Result DataFrame for 8 activities:
     Target  Predictions  Activity_0  Activity_1  Activity_2  Activity_3  \
0         3            3       43493     39338.0      1726.0      2591.0   
1         1            1       43493     39338.0     23072.0     24190.0   
2         1            1       43493     39338.0      1726.0      2591.0   
3         1            1       43493     39338.0      1726.0      2591.0   
4         1            1       43493     39338.0     23072.0     24190.0   
..      ...          ...         ...         ...         ...         ...   
542       3            3       43493     39338.0      1726.0      2591.0   
543       1            1       43493     39338.0      3533.0     24190.0   
544       3            3       43493     39338.0      1726.0      2591.0   
545    



It took 0:00:43.937100 to run.
Performing Grid Search for RandomForestClassifier(random_state=42) model....


  _warn_prf(average, modifier, msg_start, len(result))


It took 0:00:43.390108 to run.
Performing Grid Search for RandomForestClassifier(random_state=42) model....


  _warn_prf(average, modifier, msg_start, len(result))


It took 0:00:43.681262 to run.
Average metrics across all folds:
accuracy     0.933333
precision    0.566340
recall       0.611111
f1_score     0.587120
dtype: float64
Predictions made!
///////////
Result DataFrame for 10 activities:
     Target  Predictions  Activity_0  Activity_1  Activity_2  Activity_3  \
0         1            1       43493     39338.0      1726.0      2591.0   
1         1            1       43493     39338.0     23072.0     24190.0   
2         1            1       43493     39338.0     23072.0     24190.0   
3         1            1       43493     39338.0      1726.0      2591.0   
4         1            1       43493     39338.0      1726.0      2591.0   
..      ...          ...         ...         ...         ...         ...   
160       3            3       43493     39338.0      3533.0       259.0   
161       3            3       43493     39338.0      3533.0      2591.0   
162       3            3       43493     39338.0     23072.0     24190.0   
163   

  _warn_prf(average, modifier, msg_start, len(result))


It took 0:00:43.871162 to run.
Performing Grid Search for RandomForestClassifier(random_state=42) model....


  _warn_prf(average, modifier, msg_start, len(result))


It took 0:00:44.021894 to run.
Performing Grid Search for RandomForestClassifier(random_state=42) model....


  _warn_prf(average, modifier, msg_start, len(result))


It took 0:00:42.031170 to run.
Average metrics across all folds:
accuracy     0.848485
precision    0.548796
recall       0.587103
f1_score     0.565400
dtype: float64
Predictions made!
///////////
Result DataFrame for 11 activities:
    Target  Predictions  Activity_0  Activity_1  Activity_2  Activity_3  \
0        0            3       43493     39338.0      1726.0      2591.0   
1        0            3       43493     39338.0      1726.0      2591.0   
2        1            1       43493     39338.0      1726.0      2591.0   
3        0            3       43493     39338.0      1726.0      2591.0   
4        1            1       43493     39338.0      3533.0      1768.0   
..     ...          ...         ...         ...         ...         ...   
94       3            3       43493     39338.0      3533.0      2591.0   
95       3            3       43493     39338.0     13897.0      2591.0   
96       3            3       43493     39338.0      1726.0      2591.0   
97       3      

  _warn_prf(average, modifier, msg_start, len(result))


It took 0:00:45.021444 to run.
Performing Grid Search for RandomForestClassifier(random_state=42) model....


  _warn_prf(average, modifier, msg_start, len(result))


It took 0:00:42.017377 to run.
Performing Grid Search for RandomForestClassifier(random_state=42) model....




It took 0:00:42.902311 to run.
Average metrics across all folds:
accuracy     0.902381
precision    0.568305
recall       0.587963
f1_score     0.577566
dtype: float64
Predictions made!
///////////
Result DataFrame for 12 activities:
    Target  Predictions  Activity_0  Activity_1  Activity_2  Activity_3  \
0        3            3       43493     39338.0      1726.0      2591.0   
1        3            3       43493     39338.0     23072.0     24190.0   
2        1            1       43493     39338.0      1726.0      2591.0   
3        1            1       43493     39338.0       465.0      1768.0   
4        3            3       43493     39338.0     23072.0     24190.0   
..     ...          ...         ...         ...         ...         ...   
56       3            3       43493     39338.0      3533.0      2591.0   
57       3            3       43493     39338.0     23072.0     24190.0   
58       3            3       43493     39338.0      1726.0      2591.0   
59       3      

  _warn_prf(average, modifier, msg_start, len(result))


It took 0:00:42.117545 to run.
Performing Grid Search for RandomForestClassifier(random_state=42) model....
It took 0:00:42.377310 to run.
Performing Grid Search for RandomForestClassifier(random_state=42) model....


  _warn_prf(average, modifier, msg_start, len(result))


It took 0:00:41.615258 to run.
Average metrics across all folds:
accuracy     0.745455
precision    0.565079
recall       0.577778
f1_score     0.545899
dtype: float64
Predictions made!
///////////
Result DataFrame for 13 activities:
    Target  Predictions  Activity_0  Activity_1  Activity_2  Activity_3  \
0        3            3       43493     39338.0     23072.0     24190.0   
1        1            1       43493     39338.0     23072.0     24190.0   
2        1            1       43493     39338.0     23072.0     24190.0   
3        1            1       43493     39338.0      3533.0        90.0   
4        1            1       43493     39338.0      1726.0      2591.0   
5        0            0       43493     39338.0     23072.0     24190.0   
6        0            3       43493     39338.0      1726.0      2591.0   
7        1            1       43493     39338.0     13897.0      2591.0   
8        1            1       43493     39338.0      3533.0      1768.0   
9        3      

  _warn_prf(average, modifier, msg_start, len(result))


It took 0:00:42.078048 to run.
Performing Grid Search for RandomForestClassifier(random_state=42) model....




It took 0:00:40.387675 to run.
Performing Grid Search for RandomForestClassifier(random_state=42) model....


  _warn_prf(average, modifier, msg_start, len(result))


It took 0:00:41.674590 to run.
Average metrics across all folds:
accuracy     0.821429
precision    0.583333
recall       0.638889
f1_score     0.607143
dtype: float64
Predictions made!
///////////
Result DataFrame for 14 activities:
    Target  Predictions  Activity_0  Activity_1  Activity_2  Activity_3  \
0        1            1       43493     39338.0      1726.0      2591.0   
1        3            0       43493     39338.0      1726.0      2591.0   
2        0            3       43493      3137.0     13897.0      3460.0   
3        3            3       43493     39338.0     23072.0     24190.0   
4        1            1       43493     39338.0     23072.0     24190.0   
5        0            3       43493       475.0      1726.0        94.0   
6        3            3       43493     39338.0     23072.0     24190.0   
7        3            3       43493     39338.0     23072.0     24190.0   
8        3            3       43493     39338.0      1726.0      2591.0   
9        1      

  _warn_prf(average, modifier, msg_start, len(result))


### Check the data frames with the predictions

In [29]:
#df_3 until df_14
df_3

Unnamed: 0,Request Identifier,Target,Activity_0,Activity_1,Activity_2,num_activities,Predictions
0,20,1,43493,3137.0,13897.0,3,1
1,23,1,43493,3137.0,13897.0,3,1
2,63,2,43493,39338.0,1726.0,3,2
3,80,1,43493,3137.0,13897.0,3,1
4,88,1,43493,3137.0,13897.0,3,1
...,...,...,...,...,...,...,...
10778,45750,1,43493,39338.0,13897.0,3,1
10779,45752,2,43493,39338.0,1726.0,3,2
10780,45754,1,43493,39338.0,13897.0,3,1
10781,45755,1,43493,39338.0,13897.0,3,1


### Concatenate the data frames returned

In [30]:
# List to store all the dataframes
dfs_to_concat = []

for num_activities, _ in grouped_dfs.items():
    if (num_activities <= 2) | (num_activities >= 15):
        continue
    else:
        # Access the dataframe for each num_activities
        df_name = f'df_{num_activities}'
        # Check if the dataframe exists in the global scope
        if df_name in globals():
            dfs_to_concat.append(globals()[df_name])

# Concatenate all the dataframes
final_df = pd.concat(dfs_to_concat, ignore_index=True)

# Print the concatenated dataframe
print("Concatenated DataFrame:")
final_df

Concatenated DataFrame:


Unnamed: 0,Request Identifier,Target,Activity_0,Activity_1,Activity_2,num_activities,Predictions,Activity_3,Activity_4,Activity_5,Activity_6,Activity_7,Activity_8,Activity_9,Activity_10,Activity_11,Activity_12,Activity_13
0,20,1,43493,3137.0,13897.0,3,1,,,,,,,,,,,
1,23,1,43493,3137.0,13897.0,3,1,,,,,,,,,,,
2,63,2,43493,39338.0,1726.0,3,2,,,,,,,,,,,
3,80,1,43493,3137.0,13897.0,3,1,,,,,,,,,,,
4,88,1,43493,3137.0,13897.0,3,1,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43199,43020,3,43493,39338.0,3533.0,14,3,2591.0,1500.0,91.0,396.0,262.0,145.0,85.0,97.0,49.0,21.0,13.0
43200,43052,1,43493,39338.0,3533.0,14,1,2591.0,1500.0,874.0,396.0,262.0,21.0,85.0,55.0,35.0,23.0,27.0
43201,43141,3,43493,39338.0,3533.0,14,3,2591.0,1500.0,737.0,947.0,262.0,145.0,85.0,33.0,49.0,21.0,13.0
43202,43942,1,43493,39338.0,23072.0,14,1,24190.0,1500.0,874.0,396.0,180.0,237.0,85.0,55.0,35.0,23.0,27.0


In [31]:
# Rearrange the columns
new_column_order = ['Request Identifier', 'Target', 'Predictions', 'num_activities', 'Activity_0', 'Activity_1', 'Activity_2', 'Activity_3', 'Activity_4', 'Activity_5', 'Activity_6', 'Activity_7', 'Activity_8', 'Activity_9', 'Activity_10', 'Activity_11', 'Activity_12', 'Activity_13']
final_df_rearranged = final_df.reindex(columns=new_column_order)
print("DataFrame with rearranged columns:")
final_df_rearranged

DataFrame with rearranged columns:


Unnamed: 0,Request Identifier,Target,Predictions,num_activities,Activity_0,Activity_1,Activity_2,Activity_3,Activity_4,Activity_5,Activity_6,Activity_7,Activity_8,Activity_9,Activity_10,Activity_11,Activity_12,Activity_13
0,20,1,1,3,43493,3137.0,13897.0,,,,,,,,,,,
1,23,1,1,3,43493,3137.0,13897.0,,,,,,,,,,,
2,63,2,2,3,43493,39338.0,1726.0,,,,,,,,,,,
3,80,1,1,3,43493,3137.0,13897.0,,,,,,,,,,,
4,88,1,1,3,43493,3137.0,13897.0,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43199,43020,3,3,14,43493,39338.0,3533.0,2591.0,1500.0,91.0,396.0,262.0,145.0,85.0,97.0,49.0,21.0,13.0
43200,43052,1,1,14,43493,39338.0,3533.0,2591.0,1500.0,874.0,396.0,262.0,21.0,85.0,55.0,35.0,23.0,27.0
43201,43141,3,3,14,43493,39338.0,3533.0,2591.0,1500.0,737.0,947.0,262.0,145.0,85.0,33.0,49.0,21.0,13.0
43202,43942,1,1,14,43493,39338.0,23072.0,24190.0,1500.0,874.0,396.0,180.0,237.0,85.0,55.0,35.0,23.0,27.0


## Merge with paths_df - to check actual activities without encoding

In [33]:
paths_merged = pd.merge(paths_df, final_df_rearranged[['Request Identifier', 'Predictions']],
                        on='Request Identifier', how='inner')

In [35]:
paths_merged

Unnamed: 0,Request Identifier,Activity_0,Activity_1,Activity_2,Activity_3,Activity_4,Activity_5,Activity_6,Activity_7,Activity_8,...,Activity_16,Activity_17,Activity_18,Activity_19,Activity_20,Activity_21,Activity_22,Activity_23,Activity_24,Predictions
0,1,100-0,102-1,103-1,104-1,107-2,,,,,...,,,,,,,,,,3
1,2,100-0,102-1,103-1,104-1,107-2,,,,,...,,,,,,,,,,3
2,5,100-0,102-1,102-1,103-1,104-1,,,,,...,,,,,,,,,,1
3,6,100-0,102-1,102-1,103-1,104-1,,,,,...,,,,,,,,,,1
4,7,100-0,102-1,102-1,103-1,104-1,,,,,...,,,,,,,,,,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43199,45768,100-0,102-1,102-1,103-1,104-1,,,,,...,,,,,,,,,,1
43200,45769,100-0,102-1,103-1,104-1,104-1,,,,,...,,,,,,,,,,1
43201,45770,100-0,102-1,102-1,104-1,107-2,,,,,...,,,,,,,,,,3
43202,45771,100-0,102-1,104-1,107-2,,,,,,...,,,,,,,,,,3


In [58]:
# reverse target encoding
paths_merged['Predictions_Name'] = le.inverse_transform(paths_merged['Predictions'])
paths_merged

Unnamed: 0,Request Identifier,Activity_0,Activity_1,Activity_2,Activity_3,Activity_4,Activity_5,Activity_6,Activity_7,Activity_8,...,Activity_18,Activity_19,Activity_20,Activity_21,Activity_22,Activity_23,Activity_24,Predictions,Original_Target,Predictions_Name
0,1,100-0,102-1,103-1,104-1,107-2,,,,,...,,,,,,,,3,Request finished,Request finished
1,2,100-0,102-1,103-1,104-1,107-2,,,,,...,,,,,,,,3,Request finished,Request finished
2,5,100-0,102-1,102-1,103-1,104-1,,,,,...,,,,,,,,1,Closed administratively/Requester rejects acco...,Closed administratively/Requester rejects acco...
3,6,100-0,102-1,102-1,103-1,104-1,,,,,...,,,,,,,,1,Closed administratively/Requester rejects acco...,Closed administratively/Requester rejects acco...
4,7,100-0,102-1,102-1,103-1,104-1,,,,,...,,,,,,,,1,Closed administratively/Requester rejects acco...,Closed administratively/Requester rejects acco...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43199,45768,100-0,102-1,102-1,103-1,104-1,,,,,...,,,,,,,,1,Closed administratively/Requester rejects acco...,Closed administratively/Requester rejects acco...
43200,45769,100-0,102-1,103-1,104-1,104-1,,,,,...,,,,,,,,1,Closed administratively/Requester rejects acco...,Closed administratively/Requester rejects acco...
43201,45770,100-0,102-1,102-1,104-1,107-2,,,,,...,,,,,,,,3,Request finished,Request finished
43202,45771,100-0,102-1,104-1,107-2,,,,,,...,,,,,,,,3,Request finished,Request finished


In [59]:
# Calculate the frequency of each target value
target_counts_merged = paths_merged['Predictions_Name'].value_counts(normalize=True)
target_counts_merged

Predictions_Name
Closed administratively/Requester rejects accounting impact    0.730002
Request finished                                               0.249444
Request canceled                                               0.019628
Closed administratively                                        0.000926
Name: proportion, dtype: float64

##  Business Process Conclusion Prediction - based on the paths that go/do not go through 101

In [44]:
# Identify activity columns
activity_columns = [col for col in paths_merged.columns if 'Activity' in col]

# Create a boolean mask for rows where any activity column contains '101'
mask = paths_merged[activity_columns].apply(lambda row: row.astype(str).str.contains('101').any(), axis=1)

# Apply the mask to filter the DataFrame
df_101 = paths_merged[mask]

# Display the filtered DataFrame
print("Filtered DataFrame (rows containing '101'):")
df_101

Filtered DataFrame (rows containing '101'):


Unnamed: 0,Request Identifier,Activity_0,Activity_1,Activity_2,Activity_3,Activity_4,Activity_5,Activity_6,Activity_7,Activity_8,...,Activity_17,Activity_18,Activity_19,Activity_20,Activity_21,Activity_22,Activity_23,Activity_24,Predictions,Original_Target
51,56,100-0,102-1,101-3,102-1,103-1,104-1,,,,...,,,,,,,,,1,Closed administratively/Requester rejects acco...
58,63,100-0,102-1,101-3,,,,,,,...,,,,,,,,,2,Request canceled
67,72,100-0,102-1,101-3,102-1,102-1,103-1,104-1,,,...,,,,,,,,,1,Closed administratively/Requester rejects acco...
70,75,100-0,102-1,101-3,102-1,103-1,104-1,,,,...,,,,,,,,,1,Closed administratively/Requester rejects acco...
107,113,100-0,101-3,102-1,103-1,102-1,103-1,104-1,,,...,,,,,,,,,1,Closed administratively/Requester rejects acco...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43139,45703,100-0,102-1,101-3,102-1,102-1,102-1,104-1,107-2,,...,,,,,,,,,3,Request finished
43151,45716,100-0,102-1,101-3,101-3,102-1,104-1,,,,...,,,,,,,,,1,Closed administratively/Requester rejects acco...
43157,45722,100-0,102-1,102-1,101-3,,,,,,...,,,,,,,,,2,Request canceled
43180,45747,100-0,102-1,101-3,102-1,104-1,,,,,...,,,,,,,,,1,Closed administratively/Requester rejects acco...


In [48]:
# Identify activity columns
activity_columns = [col for col in paths_merged.columns if 'Activity' in col]

# Create a boolean mask for rows where any activity column contains '101'
mask = paths_merged[activity_columns].apply(lambda row: row.astype(str).str.contains('101').any(), axis=1)

# Apply the mask to filter the DataFrame
df_101 = paths_merged[mask]
df_not_101 = paths_merged[~mask]

# Calculate the number of rows that have '101' and those that do not
count_101 = df_101.shape[0]
count_not_101 = df_not_101.shape[0]
total_rows = paths_merged.shape[0]

# Calculate the percentage of rows
percentage_101 = (count_101 / total_rows) * 100
percentage_not_101 = (count_not_101 / total_rows) * 100

# Display the results
print(f"Percentage of rows containing '101': {percentage_101:.2f}%")
print(f"Percentage of rows not containing '101': {percentage_not_101:.2f}%")

Percentage of rows containing '101': 4.81%
Percentage of rows not containing '101': 95.19%


In [50]:
# Calculate the frequency of each target value
target_counts = df_101['Predictions_Name'].value_counts(normalize=True)
target_counts

Original_Target
Request canceled                                               0.405964
Closed administratively/Requester rejects accounting impact    0.378066
Request finished                                               0.205387
Closed administratively                                        0.010582
Name: proportion, dtype: float64

95% of the tasks did not go through 101 and, consequently, about 5% did go through it. From this 5%, 40% of the requests were canceled and 20% were finished, being the rest of the tasks closed administratively. 

##  Business Process Conclusion Prediction - based on the paths that go through 102 or 105

In [51]:
# Identify activity columns
activity_columns = [col for col in paths_merged.columns if 'Activity' in col]

# Create a boolean mask for rows where any activity column contains '102' or '105'
mask_102_105 = paths_merged[activity_columns].apply(lambda row: row.astype(str).str.contains('102|105').any(), axis=1)

# Apply the mask to filter the DataFrame
df_102_105 = paths_merged[mask_102_105]

# Display the filtered DataFrame
print("Filtered DataFrame (rows containing '102' or '105'):")
df_102_105

Filtered DataFrame (rows containing '102' or '105'):


Unnamed: 0,Request Identifier,Activity_0,Activity_1,Activity_2,Activity_3,Activity_4,Activity_5,Activity_6,Activity_7,Activity_8,...,Activity_17,Activity_18,Activity_19,Activity_20,Activity_21,Activity_22,Activity_23,Activity_24,Predictions,Original_Target
0,1,100-0,102-1,103-1,104-1,107-2,,,,,...,,,,,,,,,3,Request finished
1,2,100-0,102-1,103-1,104-1,107-2,,,,,...,,,,,,,,,3,Request finished
2,5,100-0,102-1,102-1,103-1,104-1,,,,,...,,,,,,,,,1,Closed administratively/Requester rejects acco...
3,6,100-0,102-1,102-1,103-1,104-1,,,,,...,,,,,,,,,1,Closed administratively/Requester rejects acco...
4,7,100-0,102-1,102-1,103-1,104-1,,,,,...,,,,,,,,,1,Closed administratively/Requester rejects acco...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43199,45768,100-0,102-1,102-1,103-1,104-1,,,,,...,,,,,,,,,1,Closed administratively/Requester rejects acco...
43200,45769,100-0,102-1,103-1,104-1,104-1,,,,,...,,,,,,,,,1,Closed administratively/Requester rejects acco...
43201,45770,100-0,102-1,102-1,104-1,107-2,,,,,...,,,,,,,,,3,Request finished
43202,45771,100-0,102-1,104-1,107-2,,,,,,...,,,,,,,,,3,Request finished


In [56]:
# Identify activity columns
activity_columns = [col for col in paths_merged.columns if 'Activity' in col]

# Create boolean masks for rows containing '102' and '105'
mask_102 = paths_merged[activity_columns].apply(lambda row: row.astype(str).str.contains('102').any(), axis=1)
mask_105 = paths_merged[activity_columns].apply(lambda row: row.astype(str).str.contains('105').any(), axis=1)

# Count total number of rows
total_rows = len(paths_merged)

# Count rows that contain '102'
count_102 = mask_102.sum()

# Count rows that contain '105'
count_105 = mask_105.sum()

# Count rows that contain both '102' and '105'
count_both = (mask_102 & mask_105).sum()

# Calculate percentages
percentage_102 = (count_102 / total_rows) * 100
percentage_105 = (count_105 / total_rows) * 100
percentage_both = (count_both / total_rows) * 100

# Display the percentages
print(f"Percentage of rows containing '102': {percentage_102:.2f}%")
print(f"Percentage of rows containing '105': {percentage_105:.2f}%")
print(f"Percentage of rows containing both '102' and '105': {percentage_both:.2f}%")

Percentage of rows containing '102': 92.03%
Percentage of rows containing '105': 1.11%
Percentage of rows containing both '102' and '105': 0.02%


In [53]:
# Calculate the frequency of each target value
target_counts = df_102_105['Predictions_Name'].value_counts(normalize=True)
target_counts

Original_Target
Closed administratively/Requester rejects accounting impact    0.727020
Request finished                                               0.250982
Request canceled                                               0.021004
Closed administratively                                        0.000994
Name: proportion, dtype: float64

#### Rows that go or do not go through 101 (have 102 or 105)

In [55]:
# Identify activity columns
activity_columns = [col for col in df_102_105.columns if 'Activity' in col]

# Create a boolean mask for rows where any activity column contains '101'
mask_101 = df_102_105[activity_columns].apply(lambda row: row.astype(str).str.contains('101').any(), axis=1)

# Calculate the percentage of rows with '101'
percentage_with_101 = mask_101.mean() * 100

# Calculate the percentage of rows without '101'
percentage_without_101 = (1 - mask_101.mean()) * 100

# Print the results
print(f"Percentage of rows with '101': {percentage_with_101:.2f}%")
print(f"Percentage of rows without '101': {percentage_without_101:.2f}%")

Percentage of rows with '101': 5.07%
Percentage of rows without '101': 94.93%


94.93% of the tasks that went through 102 and 105, did not went through 101, and, consequently, 5.07% of the tasks that went through 102 and 105, did went through 101. Having in consideration that about 93% of the tasks had either 102 or 105. 

To sum up, the grand majority of the tasks did not go through 101, so they were not cancelled or rejected. From the ones that went through 102 or 105, also the majority was not cancelled or rejected.