# Notebook 03 — Analysis (SubFraudGMM Results)

This notebook loads the intermediary result files produced by notebook 02, aggregates them into a Risk Indicator score and ranking, and produces visualizations for each equipment category.

**Prerequisites:** Intermediary CSVs must be present in `../results/intermediary/` (generated by notebook 02 or provided separately).

In [None]:
import pandas as pd
import os
import numpy as np
import re
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.patches as mpatches

# Path to the directory containing intermediary result files
diretorio = '../results/intermediary'

# Empty DataFrame to accumulate all data (replaced by the load at the end of this cell)
df_total = pd.DataFrame()


def calculate_f1_score(precision, recall):
    """Return the element-wise F1 score of *precision* and *recall*.

    The score is ``2 * (precision * recall) / (precision + recall)`` wherever
    ``precision + recall`` is positive and ``0`` everywhere else (including
    positions where the sum is NaN).

    Args:
        precision: Scalar or array-like of numbers.
        recall: Scalar or array-like of numbers, broadcastable with *precision*.

    Returns:
        A NumPy float array holding the score for each position.
    """
    precision = np.asarray(precision, dtype=float)
    recall = np.asarray(recall, dtype=float)
    denominator = precision + recall
    # np.where would evaluate the division for every element and emit
    # divide-by-zero warnings; np.divide with `where=` skips those positions
    # and leaves the pre-filled zeros in place.
    return np.divide(
        2 * (precision * recall),
        denominator,
        out=np.zeros_like(denominator),
        where=denominator > 0,
    )


def processar_arquivo(nome_arquivo):
    """Load one intermediary CSV from ``diretorio`` and add metadata columns.

    The file name (without its ``.csv`` extension) is split on ``_``: the
    first token becomes ``Produto`` (hyphens turned into spaces, capitalised),
    the second becomes the numeric ``Threshold`` and the third, when present,
    becomes ``FraudCV`` (empty string otherwise).

    Args:
        nome_arquivo: File name relative to ``diretorio``.

    Returns:
        The loaded DataFrame with the added columns ``Arquivo``, ``Produto``,
        ``Threshold``, ``FraudCV``, ``num_features`` and ``Tamanho_Cluster``,
        and with ``fraude`` recoded to ``'Sim'`` / ``'Não'``.

    Raises:
        IndexError: If the file name contains no ``_`` separator.
    """
    df = pd.read_csv(os.path.join(diretorio, nome_arquivo), delimiter=',', encoding='utf-8')

    # Keep the original file name for traceability
    df['Arquivo'] = nome_arquivo

    # Strip only a trailing extension; str.replace would also remove a
    # '.csv' that happens to appear in the middle of the name.
    nome_base = nome_arquivo[:-len('.csv')] if nome_arquivo.endswith('.csv') else nome_arquivo
    parametros = nome_base.split('_')

    produto = parametros[0].replace('-', ' ')
    df['Produto'] = produto.capitalize()
    # A non-numeric token becomes NaN instead of raising
    df['Threshold'] = pd.to_numeric(parametros[1], errors='coerce')
    df['FraudCV'] = parametros[2] if len(parametros) > 2 else ''

    df['fraude'] = df['fraude'].apply(lambda x: 'Sim' if x == 1 else 'Não')
    # Number of comma-separated entries in 'features' (0 when missing)
    df['num_features'] = df['features'].apply(lambda x: x.count(',') + 1 if pd.notna(x) else 0)

    # Cluster size: number of rows in this file sharing the same 'features' value
    df['Tamanho_Cluster'] = df.groupby('features')['features'].transform('count')

    return df


def normalize_column(column):
    """Min-max normalise *column* to the range [0, 1].

    Args:
        column: Numeric pandas Series.

    Returns:
        ``(column - min) / (max - min)``. When every value is identical the
        range is zero and the plain formula would yield NaN for the whole
        column; in that case zeros are returned instead (missing values stay
        missing), so downstream scores remain comparable.
    """
    minimo = column.min()
    amplitude = column.max() - minimo
    if amplitude == 0:
        # Constant column: avoid 0/0 and return zeros
        return (column - minimo).astype(float)
    return (column - minimo) / amplitude


def carregar_arquivos(padrao):
    """Load every file in ``diretorio`` matching *padrao* and compute the Risk Indicator.

    Args:
        padrao: Compiled regular expression applied with ``match`` to each
            file name in ``diretorio``.

    Returns:
        One DataFrame with the rows of all matching files plus the derived
        columns ``Subespaço``, ``Subespaço_ID``, ``Métrica_Desempenho``,
        ``f1_score``, ``Normalized Distance``, ``Normalized Subsets``,
        ``Normalized LOO`` and ``Risk Indicator``.

    Raises:
        ValueError: If no file name matches *padrao*.
    """
    # Sort so the row order does not depend on the file-system listing order
    arquivos_csv = sorted(arquivo for arquivo in os.listdir(diretorio) if padrao.match(arquivo))
    if not arquivos_csv:
        raise ValueError(
            f"No files in {diretorio!r} match the pattern {padrao.pattern!r}"
        )

    # ignore_index avoids repeated index labels coming from the individual files
    dados = pd.concat(
        [processar_arquivo(arquivo) for arquivo in arquivos_csv],
        ignore_index=True,
    )

    # A subspace is identified by its feature set, product and threshold
    dados['Subespaço'] = (
        dados['features'].astype(str) + '|' +
        dados['Produto'].astype(str) + '|' +
        dados['Threshold'].astype(str)
    )
    dados['Subespaço_ID'] = dados['Subespaço'].astype('category').cat.codes + 1

    dados['Métrica_Desempenho'] = dados['num_fraud_reclustered'] / dados['Tamanho_Cluster']
    # 'Métrica_Desempenho' is passed as precision and 'loo' as recall
    dados['f1_score'] = calculate_f1_score(dados['Métrica_Desempenho'], dados['loo'])

    dados['Normalized Distance'] = normalize_column(dados['euclidean_distance_to_fraud'])
    dados['Normalized Subsets'] = normalize_column(dados['ocorrencias'])
    dados['Normalized LOO'] = normalize_column(dados['loo'])

    # Risk Indicator: higher is more suspicious. Distance is inverted so that
    # a smaller distance contributes a larger score.
    dados['Risk Indicator'] = (
        (1 - dados['Normalized Distance']) +
        dados['Normalized Subsets'] +
        dados['Normalized LOO']
    ) / 3

    return dados


# Load threshold-80 files (change regex to load other thresholds)
padrao = re.compile(r'^.+_80\.csv$')
df_total = carregar_arquivos(padrao)

In [None]:
from collections import Counter

# Aggregate by record ID: mean/max metrics, derive Risk Indicator and Rank
df_mean = df_total.groupby([
    'ID', 'Threshold', 'FraudCV', 'Produto', 'numeroEdital',
    'Ente', 'ID ProcedimentoLictatorio', 'fraude'
]).agg({
    'Normalized Distance': 'mean',
    'Normalized Subsets':  'max',
    'Normalized LOO':      'mean',
    'win':        'mean',
    'unique':     'mean',
    'period':     'mean',
    'num_partic': 'mean',
    'num':        'mean',
    'unit_price': 'mean',
    'duration':   'mean',
    'met':        'mean',
    # Keep every distinct 'features' value seen for the record, comma-joined
    'features':   lambda x: ', '.join(x.astype(str).unique())
}).reset_index()

# Recompute the Risk Indicator from the aggregated components
df_mean['Risk Indicator'] = (
    (1 - df_mean['Normalized Distance']) +
    df_mean['Normalized Subsets'] +
    df_mean['Normalized LOO']
) / 3

# Rank 1 = highest Risk Indicator within each product / fold / threshold group
df_mean["Rank"] = df_mean.groupby(["Produto", "FraudCV", "Threshold"])["Risk Indicator"].rank(
    method="min", ascending=False
)


def _split_features(feature_str):
    """Split a comma-separated feature string into stripped feature names."""
    return [name.strip() for name in feature_str.split(',')]


# Extract unique feature names for frequency analysis
all_features = set()
for feature_str in df_mean['features']:
    all_features.update(_split_features(feature_str))

# Fixed (sorted) order so that position i of every frequency vector always
# refers to the same feature; iterating the set directly gives an arbitrary order.
feature_order = sorted(all_features)


def count_feature_frequencies(feature_str):
    """Count how often each known feature occurs in *feature_str*.

    Args:
        feature_str: Comma-separated feature names.

    Returns:
        A list of counts, one per entry of ``feature_order`` (the sorted
        unique feature names), in that order.
    """
    counts = Counter(_split_features(feature_str))
    return [counts[name] for name in feature_order]


df_mean['Feature Frequencies'] = df_mean['features'].apply(count_feature_frequencies)

# "RT" when the record ID equals the FraudCV value, "Demais" otherwise.
# NOTE(review): FraudCV is taken from the file name as a string; if ID is
# numeric this comparison is never true — confirm the intended types.
df_mean["Grupo"] = [
    "RT" if record_id == fraud_cv else "Demais"
    for record_id, fraud_cv in zip(df_mean["ID"], df_mean["FraudCV"])
]

In [None]:
# Top-10 records by Risk Indicator
top_10_risk = df_mean.sort_values(by='Risk Indicator', ascending=False).head(10)

# Display all ten rows; a bare .head() would show only the first five
top_10_risk

In [None]:
from collections import Counter


def extract_top_features(row):
    """Return the top-3 most frequently appearing features for a record.

    The record's ``features`` string is split on commas and the occurrences
    of each distinct name are counted directly from that list, so every name
    is paired with its own count. Ties keep the order of first appearance.

    Args:
        row: A row of ``df_mean`` (pandas Series) with at least the entries
            ``features``, ``ID``, ``Produto``, ``Risk Indicator`` and ``fraude``.

    Returns:
        A pandas Series with the record identification, the comma-joined
        names of up to three most frequent features and, for each of them,
        the value of the same-named entry in *row* (``None`` when the row has
        no such entry or fewer than three distinct features exist).
    """
    feature_names = [name.strip() for name in row['features'].split(',')]
    ranked = Counter(feature_names).most_common(3)

    top_features = [name for name, _count in ranked]
    top_values = [row.get(name, None) for name in top_features]
    # Pad so that records with fewer than three features do not raise
    top_values += [None] * (3 - len(top_values))

    return pd.Series({
        "ID":              row["ID"],
        "Produto":         row["Produto"],
        "Risk Indicator":  row["Risk Indicator"],
        "Fraud":           row["fraude"],
        "Top 3 Features":  ", ".join(top_features),
        "Feature 1 value": top_values[0],
        "Feature 2 value": top_values[1],
        "Feature 3 value": top_values[2],
    })


# For every product, enrich its ten highest-risk records and print them
final_results = []
for produto, group in df_mean.groupby("Produto"):
    top10 = group.sort_values(by="Risk Indicator", ascending=False).head(10)
    enriched = top10.apply(extract_top_features, axis=1)
    print(produto)
    print(enriched)
    final_results.append(enriched)

# pd.concat raises on an empty list, so fall back to an empty frame
final_df = pd.concat(final_results, ignore_index=True) if final_results else pd.DataFrame()

In [None]:
# Records ranked in the top-10 across any product/threshold/fold combination
# (Rank is computed per Produto / FraudCV / Threshold group, so several
# records can share the same rank value.)
within_top_10 = df_mean['Rank'].lt(11)
df_mean.loc[within_top_10]

In [None]:
# Export SubFraudGMM results (threshold = 80) to results/
# Only the identification, group, score and rank columns are written.
export_columns = ['ID', 'Produto', 'Grupo', 'Risk Indicator', 'Rank']
is_threshold_80 = df_mean['Threshold'] == 80
subfraud_export = df_mean[is_threshold_80][export_columns]
subfraud_export.to_csv("../results/SubFraudGMM.csv", index=False)

In [None]:
# Simplified aggregation without threshold/LOO dimension (for baseline comparison)
# Each feature column is averaged per product / record / fraud label.
feature_columns = ['win', 'unique', 'period', 'num_partic', 'num', 'unit_price', 'duration', 'met']
df_mean_simplificado = (
    df_total
    .groupby(['Produto', 'ID', 'fraude'])
    .agg({column: 'mean' for column in feature_columns})
    .reset_index()
)

# risk_score is the mean of eight terms: three features enter as-is and
# five enter as (1 - value). The terms are added left to right in this order.
score_terms = [df_mean_simplificado[column] for column in ('win', 'met', 'unit_price')]
score_terms += [
    1 - df_mean_simplificado[column]
    for column in ('unique', 'period', 'num_partic', 'num', 'duration')
]
score_total = score_terms[0]
for term in score_terms[1:]:
    score_total = score_total + term
df_mean_simplificado['risk_score'] = score_total / 8

# Rank 1 = highest risk_score within each product
scores_by_product = df_mean_simplificado.groupby(["Produto"])["risk_score"]
df_mean_simplificado["Rank"] = scores_by_product.rank(method="min", ascending=False)

df_mean_simplificado.head()

In [None]:
# Build Grupo column and export simple ranking baseline
# Records labelled 'Sim' in 'fraude' go to group 'RT'; all others to 'Demais'.
is_fraud = df_mean_simplificado['fraude'].eq('Sim')
df_mean_simplificado['Grupo'] = is_fraud.map({True: 'RT', False: 'Demais'})

# Expose the score under the column name used by the exported file
df_mean_simplificado['Risk Indicator'] = df_mean_simplificado['risk_score']

baseline_columns = ['ID', 'Produto', 'Grupo', 'Risk Indicator', 'Rank']
df_mean_simplificado[baseline_columns].to_csv("../results/RankingSimples.csv", index=False)

In [None]:
# Risk Score distribution by product and fraud status
# One box per product and fraud label, drawn on an explicit Axes object.
fig, ax = plt.subplots(figsize=(12, 6))
palette = {'Não': 'tab:blue', 'Sim': 'tab:orange'}
box = sns.boxplot(
    data=df_mean_simplificado,
    x='Produto',
    y='risk_score',
    hue='fraude',
    palette=palette,
    ax=ax,
)

# Rebuild the legend so it carries an English title
handles, labels = box.get_legend_handles_labels()
ax.legend(handles, labels, title='Fraud')

ax.set_title('Risk Score Distribution by Product and Fraud Status')
ax.set_xlabel('Product')
ax.set_ylabel('Risk Score')
# Rotate the product names so long labels do not overlap
plt.setp(ax.get_xticklabels(), rotation=45)
fig.tight_layout()
plt.show()

In [None]:
# Rank distribution by product and fraud status
# One box per product and fraud label, drawn on an explicit Axes object.
fig, ax = plt.subplots(figsize=(12, 6))
palette = {'Não': 'tab:blue', 'Sim': 'tab:orange'}
box = sns.boxplot(
    data=df_mean_simplificado,
    x='Produto',
    y='Rank',
    hue='fraude',
    palette=palette,
    ax=ax,
)

# Rebuild the legend so it carries an English title
handles, labels = box.get_legend_handles_labels()
ax.legend(handles, labels, title='Fraud')

ax.set_title('Rank Distribution by Product and Fraud Status')
ax.set_xlabel('Product')
ax.set_ylabel('Rank')
# Rotate the product names so long labels do not overlap
plt.setp(ax.get_xticklabels(), rotation=45)
fig.tight_layout()
plt.show()