# Notebook 02 — Model Training (SubFraudGMM)

This notebook runs the SubFraudGMM algorithm across all four equipment datasets
and six anomaly thresholds. It uses the pre-built module `subfraudgmm.py`.

**Prerequisites:**
- Run notebook 01 (or use the pre-computed CSVs in `../data/`)
- Ensure `../results/intermediary/` can be created (outputs ~576 files)

In [None]:
import osimport timeimport pandas as pdimport concurrent.futuresfrom subfraudgmm import (    FEATURE_COLUMNS,    DEFAULT_THRESHOLDS,    process_main_task,)

In [None]:
# Load the four equipment datasets.
# Each CSV is read and, in the same expression, tagged with a constant
# 'Produto' column holding the equipment label for every row.
df_motoniveladora = pd.read_csv('../data/motoniveladora_final.csv').assign(Produto='motoniveladora')
df_trator = pd.read_csv('../data/trator_esteira_final.csv').assign(Produto='trator')
df_escavadeira = pd.read_csv('../data/escavadeira_final.csv').assign(Produto='escavadeira')
df_compactador = pd.read_csv('../data/rolo_compactador_final.csv').assign(Produto='compactador')

## Exploratory Data Analysis (optional)

In [None]:
# Stack the four equipment frames into a single table with a fresh
# 0..n-1 index, then show how many rows carry each 'Produto' label.
df_concatenado = pd.concat(
    objs=(df_motoniveladora, df_trator, df_escavadeira, df_compactador),
    ignore_index=True,
)
df_concatenado["Produto"].value_counts()

In [None]:
# Show floats with exactly three decimals in every DataFrame rendered from
# here on (this is a global pandas display option, not local to this cell).
pd.set_option('display.float_format', lambda value: f"{value:.3f}")

# Summary statistics for the columns named in FEATURE_COLUMNS
# (presumably the model's input features — confirm in subfraudgmm.py).
df_concatenado[FEATURE_COLUMNS].describe()

## Task Construction and Parallel Execution

In [None]:
# Ensure the intermediary output directory exists
os.makedirs('../results/intermediary', exist_ok=True)

# Map dataset names to DataFrames
dataframes = {
    'rolo-compactador': df_compactador,
    'escavadeira':      df_escavadeira,
    'trator-esteira':   df_trator,
    'motoniveladora':   df_motoniveladora,
}

# Build task list: (df_name, df, threshold, fraud_id)
# fraud_id=None → full-data run; otherwise → leave-one-out (LOO) fold in which
# the row(s) with that ID are relabelled as non-fraud (fraude = 0).
tasks = []
for df_name, df in dataframes.items():
    # The LOO frames depend only on the dataset, not on the threshold, so they
    # are built once here and shared by every threshold below (the full-data
    # `df` is shared across thresholds in the same way). This avoids one full
    # DataFrame copy per (threshold, fraud) pair.
    fraud_ids = df.loc[df['fraude'] == 1, 'ID']
    loo_frames = []
    for fraud_id in fraud_ids:
        df_temp = df.copy()
        df_temp.loc[df_temp['ID'] == fraud_id, 'fraude'] = 0
        loo_frames.append((fraud_id, df_temp))

    # Task order is unchanged: per threshold, the full-data run comes first,
    # followed by one LOO task per fraud in `fraud_ids` order.
    for threshold in DEFAULT_THRESHOLDS:
        tasks.append((df_name, df, threshold, None))
        tasks.extend(
            (df_name, loo_df, threshold, loo_id) for loo_id, loo_df in loo_frames
        )

print(f"Total tasks: {len(tasks)}")

In [None]:
# Fan the tasks out over a pool of worker processes and time the whole batch.
overall_start_time = time.perf_counter()
with concurrent.futures.ProcessPoolExecutor() as pool:
    # Collect every task's return value into a list, in task order.
    results = [outcome for outcome in pool.map(process_main_task, tasks)]
# Measured after the `with` block exits, so pool shutdown is included.
overall_elapsed = time.perf_counter() - overall_start_time
print(f"Total processing time: {overall_elapsed:.2f} seconds")

In [None]:
# Timing summary: wall-clock time of the parallel run versus the sum of the
# per-task durations.
print(f"Total wall-clock time: {overall_elapsed:.2f} seconds")

# One row per task result.
# NOTE(review): 'elapsed_seconds' is presumably the per-task duration reported
# by process_main_task — confirm what it actually measures.
df_results = pd.DataFrame(results)
total_worker_seconds = df_results['elapsed_seconds'].sum()
print(f"Cumulative CPU time across all workers: {total_worker_seconds:.0f} seconds")

df_results.head(10)

In [None]:
import matplotlib.pyplot as plt

# Processing time comparison: before and after optimization.
# NOTE(review): both totals are hard-coded and are not derived from
# `overall_elapsed` — presumably recorded from earlier runs; confirm.
times_seconds = {
    "Before optimisation": 238660.76,
    "After optimisation":  19170.53,
}
# Convert seconds to hours for a more readable axis.
times_hours = {label: seconds / 3600 for label, seconds in times_seconds.items()}

fig, ax = plt.subplots(figsize=(10, 4))
bars = ax.barh(
    list(times_hours.keys()),
    list(times_hours.values()),
    color=["#d62728", "#2ca02c"],
)

# Annotate each bar with its value, placed just past the bar's right edge.
for rect in bars:
    hours = rect.get_width()
    ax.text(
        hours * 1.01,
        rect.get_y() + rect.get_height() / 2,
        f"{hours:,.2f} h",
        va='center',
        fontsize=10,
    )

ax.set_xlabel("Processing Time (hours)")
ax.set_title("Total Processing Time: Before vs After Optimisation")
ax.grid(axis='x', which='both', linestyle='--', linewidth=0.5)
fig.tight_layout()
plt.show()