In [1]:
%load_ext autoreload
%autoreload 2

import sys
import os
sys.path.insert(0,"/Users/erichsato/Documents/WFP/01_programming/callclarity")
sys.path.insert(0,"/Users/erichsato/Documents/WFP/01_programming/nlp_cfm")
os.environ["PROJECT_ABSPATH"] = "/Users/erichsato/Documents/WFP/01_programming/nlp_cfm"

import numpy as np
import pandas as pd

from callclarity.error_report.detectors.exact_incoherence import ExactIncoherenceDetector
from callclarity.error_report.detectors.approx_incoherence import ApproximateIncoherenceDetector
from callclarity.error_report.detectors.model_incoherence import ModelIncoherenceDetector
from callclarity.error_report.embeddings.embedder import TextEmbedder
from callclarity.error_report.sorters.semantic_sorter import SemanticSorter
from callclarity.error_report.inspector import IncoherenceInspector
from callclarity.error_report.reporter import IncoherenceReporter
from ftm.io import dio

dio.set_run_id("data",run_id = "2023-11-10_corrected_ukr")
dio.set_run_id("model",run_id = "2023-11-10_corrected_ukr")

In [8]:
# df = dio.read_data("domain","crm",path_args = {"country":"ukr"})

df = (
    dio.read_data("raw","crm_ukr_07")
    .merge(
        dio.read_data("raw","crm_ukr_07_translations")[["Case ID (Number)","description_translated"]],
        on = ["Case ID (Number)"],
        how = "left"
    )
    
)
df["id"] = df["Case ID (Number)"]
df["raw_descrip""tion"] = df["description_translated"]
df["raw_mcat_cat_category"] = df["Macro Category"] + " - " + df["Category"]
df = df[df["raw_mcat_cat_category"].notnull()]

2024-01-02 10:46:57.224 | INFO     | ftm.io:_read_excel:412 - Reading excel from '/Users/erichsato/Documents/WFP/01_programming/nlp_cfm/data/raw/crm/ukr/NLP Request_September 2023.xlsx'
2024-01-02 10:47:02.193 | INFO     | ftm.io:_read_excel:412 - Reading excel from '/Users/erichsato/Documents/WFP/01_programming/nlp_cfm/data/raw/crm/ukr/NLP Request_September 2023_translations.xlsx'


# Exact Incoherence

In [3]:
eid = ExactIncoherenceDetector(df["raw_description"],df["raw_mcat_cat_category"])

In [4]:
error_flags, error_recs, error_metadata = eid.get_errors()

2023-12-26 10:41:18.615 | INFO     | callclarity.error_report.detectors.exact_incoherence:get_errors:30 - Calculating 'exact_incoherence' conflicts
2023-12-26 10:41:18.631 | INFO     | callclarity.error_report.detectors.exact_incoherence:get_errors:42 -  └ Filtering conflicted cases
2023-12-26 10:41:18.632 | INFO     | callclarity.error_report.detectors.exact_incoherence:get_errors:46 -  └ Getting cluster indices
2023-12-26 10:41:18.632 | INFO     | callclarity.error_report.detectors.exact_incoherence:get_errors:50 -  └ Getting error recommendations
2023-12-26 10:41:18.634 | INFO     | callclarity.error_report.detectors.exact_incoherence:get_errors:57 -  └ Getting error flags


In [5]:
np.sum(error_flags)

7390

In [6]:
df.shape[0] - np.sum(error_flags)

2950

# Approx Incoherence 

In [7]:
from sklearn.neighbors import radius_neighbors_graph
from scipy.sparse.csgraph import connected_components
from scipy.sparse import csr_matrix

In [8]:
embedder = TextEmbedder(df["raw_description"])
embeddings = embedder.embed()

2023-12-26 10:41:18.876 | INFO     | callclarity.utils.device:get_torch_device:26 - Using Apple Silicon as the torch device


Batches:   0%|          | 0/81 [00:00<?, ?it/s]

In [10]:
len(embeddings)

10340

In [11]:
len(df["raw_mcat_cat_category"])

10340

In [13]:
aid = ApproximateIncoherenceDetector(embeddings = embeddings.tolist(), labels = df["raw_mcat_cat_category"])

In [14]:
error_flags, error_recs, error_metadata = aid.get_errors()

2023-12-26 10:43:28.930 | INFO     | callclarity.error_report.detectors.approx_incoherence:get_errors:57 -  └ Calculating clusters
2023-12-26 10:43:32.417 | INFO     | callclarity.error_report.detectors.approx_incoherence:get_errors:61 -    └ 3,905 clusters were found.
2023-12-26 10:43:32.417 | INFO     | callclarity.error_report.detectors.approx_incoherence:get_errors:64 - Calculating 'approx_incoherence' conflicts
2023-12-26 10:43:32.927 | INFO     | callclarity.error_report.detectors.approx_incoherence:get_errors:76 -  └ Filtering conflicted cases
2023-12-26 10:43:32.928 | INFO     | callclarity.error_report.detectors.approx_incoherence:get_errors:80 -  └ Getting cluster indices
2023-12-26 10:43:32.928 | INFO     | callclarity.error_report.detectors.approx_incoherence:get_errors:84 -  └ Getting error recommendations
2023-12-26 10:43:32.929 | INFO     | callclarity.error_report.detectors.approx_incoherence:get_errors:91 -  └ Getting error flags


In [None]:
for nrows in [100,500,1000,5000,10000,50000,100000]:
    
    print(f"{nrows} rows")
    %time nn_graph = radius_neighbors_graph(embeddings[:nrows],radius = 0.1, mode = "distance", metric = "cosine", n_jobs = -1)
    print()

# Model Incoherence 

In [16]:
mid = ModelIncoherenceDetector(embeddings = embeddings.tolist(), labels = df["raw_mcat_cat_category"])

In [17]:
error_flags, error_recs, error_metadata = mid.get_errors()

2023-12-26 10:43:57.531 | INFO     | callclarity.error_report.detectors.model_incoherence:get_errors:58 -  └ Calculating clusters


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

2023-12-26 10:45:12.049 | INFO     | callclarity.error_report.detectors.model_incoherence:get_errors:62 -    └ 1,457 clean learning issues were found.
2023-12-26 10:45:12.050 | INFO     | callclarity.error_report.detectors.model_incoherence:get_errors:65 - Calculating 'model_incoherence' conflicts
2023-12-26 10:45:12.055 | INFO     | callclarity.error_report.detectors.model_incoherence:get_errors:82 -  └ Getting error flags


In [137]:
len(error_flags)

10340

In [138]:
len(error_recs)

10340

In [321]:
np.sum(error_flags)

1461

# Semantic sorter 

In [632]:
semantic_sorter = SemanticSorter(embeddings, embedding_distance_step=0.01)

2023-12-20 18:47:59.698 | INFO     | callclarity.error_report.sorters.semantic_sorter:_init_nn_graph:24 - Instantiating radius neighbors graph


In [631]:
sorted_indices = semantic_sorter.get_sort_indices()

100%|███████████████████████████████████████████| 80/80 [00:02<00:00, 27.59it/s]


In [629]:
n = 6000
df.iloc[sorted_indices].iloc[n:n+10]

Unnamed: 0,country,id,date_created,raw_description,raw_macro_category,raw_category,raw_subcategory,raw_mcat_cat_category,macro_category,category,mcat_cat_category,grouped_categories_list,subcategory
9310,ukr,612303,2023-09-11 06:43:00,products,Request for assistance,Food,,Request for assistance - Food,Grouped others,Grouped others,Grouped others - Grouped others,"[Other - Other (please explain), Allegations of misconduct - Fraud, Request for assistance - Cash, Allegations of misconduct - Disrespectful treatment, Request for assistance - Food, Allegations of misconduct - Diversion of assistance, Allegations of misconduct - Discrimination, Request for assistance - Other (please explain), Allegations of misconduct - Corruption, Request for specialised services (non-WFP) - Other (please explain), Allegations of misconduct - Other (please explain), Request for specialised services (non-WFP) - Medical, Request for specialised services (non-WFP) - Legal]",
9633,ukr,613281,2023-09-11 14:44:00,products,Request for assistance,Food,,Request for assistance - Food,Grouped others,Grouped others,Grouped others - Grouped others,"[Other - Other (please explain), Allegations of misconduct - Fraud, Request for assistance - Cash, Allegations of misconduct - Disrespectful treatment, Request for assistance - Food, Allegations of misconduct - Diversion of assistance, Allegations of misconduct - Discrimination, Request for assistance - Other (please explain), Allegations of misconduct - Corruption, Request for specialised services (non-WFP) - Other (please explain), Allegations of misconduct - Other (please explain), Request for specialised services (non-WFP) - Medical, Request for specialised services (non-WFP) - Legal]",
8157,ukr,596131,2023-08-25 06:12:00,Outgoing call\ndoes not meet the,Request for information,Targeting criteria,,Request for information - Targeting criteria,Request for information,Targeting criteria,Request for information - Targeting criteria,[],
8165,ukr,596191,2023-08-25 06:46:00,Outgoing call\ndoes not meet the,Request for information,Targeting criteria,,Request for information - Targeting criteria,Request for information,Targeting criteria,Request for information - Targeting criteria,[],
8178,ukr,596480,2023-08-25 07:37:00,Outgoing call\ndoes not meet the,Request for information,Targeting criteria,,Request for information - Targeting criteria,Request for information,Targeting criteria,Request for information - Targeting criteria,[],
8223,ukr,598297,2023-08-25 10:18:00,Outgoing call\ndoes not meet the,Request for information,Targeting criteria,,Request for information - Targeting criteria,Request for information,Targeting criteria,Request for information - Targeting criteria,[],
8232,ukr,598350,2023-08-25 10:35:00,Outgoing call\ndoes not meet the,Request for information,Targeting criteria,,Request for information - Targeting criteria,Request for information,Targeting criteria,Request for information - Targeting criteria,[],
8235,ukr,598386,2023-08-25 10:45:00,Outgoing call\nDoes not meet the,Request for information,Targeting criteria,,Request for information - Targeting criteria,Request for information,Targeting criteria,Request for information - Targeting criteria,[],
8241,ukr,598463,2023-08-25 11:03:00,Outgoing call\ndoes not meet the,Request for information,Targeting criteria,,Request for information - Targeting criteria,Request for information,Targeting criteria,Request for information - Targeting criteria,[],
8245,ukr,598497,2023-08-25 11:15:00,Outgoing call\ndoes not meet the,Request for information,Targeting criteria,,Request for information - Targeting criteria,Request for information,Targeting criteria,Request for information - Targeting criteria,[],


# Inspector 

In [3]:
from callclarity.error_report.inspector import IncoherenceInspector

In [106]:
embedder = TextEmbedder(df["raw_description"])
embeddings = embedder.embed().tolist()

2023-12-27 15:50:33.000 | INFO     | callclarity.utils.device:get_torch_device:26 - Using Apple Silicon as the torch device


Batches:   0%|          | 0/81 [00:00<?, ?it/s]

In [127]:
inspector = IncoherenceInspector(
    ids = df["id"],
    texts = df["raw_description"],
    labels = df["raw_mcat_cat_category"],
    embeddings = embeddings,
)
df_inspected = inspector.inspect()

2023-12-27 16:41:50.834 | INFO     | callclarity.error_report.detectors.exact_incoherence:get_errors:58 - Calculating 'exact_incoherence' conflicts
2023-12-27 16:41:51.422 | INFO     | callclarity.error_report.detectors.exact_incoherence:get_errors:70 -  └ Filtering conflicted cases
2023-12-27 16:41:51.423 | INFO     | callclarity.error_report.detectors.exact_incoherence:get_errors:74 -  └ Getting cluster indices
2023-12-27 16:41:51.424 | INFO     | callclarity.error_report.detectors.exact_incoherence:get_errors:78 -  └ Getting error recommendations
2023-12-27 16:41:51.425 | INFO     | callclarity.error_report.detectors.exact_incoherence:get_errors:82 -  └ Getting incoherence top frequency
2023-12-27 16:41:51.426 | INFO     | callclarity.error_report.detectors.exact_incoherence:get_errors:92 -  └ Getting error flags
2023-12-27 16:41:51.504 | INFO     | callclarity.error_report.detectors.approx_incoherence:get_errors:90 -  └ Calculating clusters
2023-12-27 16:41:54.430 | INFO     | call

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

2023-12-27 16:42:28.741 | INFO     | callclarity.error_report.detectors.model_incoherence:get_errors:66 -    └ 1,468 clean learning issues were found.
2023-12-27 16:42:28.746 | INFO     | callclarity.error_report.detectors.model_incoherence:get_errors:69 - Calculating 'model_incoherence' conflicts
2023-12-27 16:42:29.108 | INFO     | callclarity.error_report.detectors.model_incoherence:get_errors:93 -  └ Getting error flags
2023-12-27 16:42:29.299 | INFO     | callclarity.error_report.sorters.semantic_sorter:_init_nn_graph:24 - Instantiating radius neighbors graph
100%|█████████████████████████████████████████████| 8/8 [00:00<00:00, 26.64it/s]


In [11]:
inspector = IncoherenceInspector(
    ids = df["id"],
    texts = df["raw_description"],
    labels = df["raw_mcat_cat_category"],
)
df_inspected = inspector.inspect()

2024-01-02 10:48:48.260 | INFO     | callclarity.error_report.inspector:_init_embeddings:46 - Embedding texts
2024-01-02 10:48:48.261 | INFO     | callclarity.utils.device:get_torch_device:26 - Using Apple Silicon as the torch device


Batches:   0%|          | 0/728 [00:00<?, ?it/s]

2024-01-02 10:49:27.543 | INFO     | callclarity.error_report.detectors.exact_incoherence:get_errors:58 - Calculating 'exact_incoherence' conflicts
2024-01-02 10:49:32.225 | INFO     | callclarity.error_report.detectors.exact_incoherence:get_errors:70 -  └ Filtering conflicted cases
2024-01-02 10:49:32.227 | INFO     | callclarity.error_report.detectors.exact_incoherence:get_errors:74 -  └ Getting cluster indices
2024-01-02 10:49:32.228 | INFO     | callclarity.error_report.detectors.exact_incoherence:get_errors:78 -  └ Getting error recommendations
2024-01-02 10:49:32.229 | INFO     | callclarity.error_report.detectors.exact_incoherence:get_errors:82 -  └ Getting incoherence top frequency
2024-01-02 10:49:32.230 | INFO     | callclarity.error_report.detectors.exact_incoherence:get_errors:92 -  └ Getting error flags
2024-01-02 10:49:32.860 | INFO     | callclarity.error_report.detectors.approx_incoherence:get_errors:90 -  └ Calculating clusters
2024-01-02 10:51:35.957 | INFO     | call

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

2024-01-02 11:00:21.909 | INFO     | callclarity.error_report.detectors.model_incoherence:get_errors:66 -    └ 20,352 clean learning issues were found.
2024-01-02 11:00:21.910 | INFO     | callclarity.error_report.detectors.model_incoherence:get_errors:69 - Calculating 'model_incoherence' conflicts
2024-01-02 11:00:23.679 | INFO     | callclarity.error_report.detectors.model_incoherence:get_errors:93 -  └ Getting error flags
2024-01-02 11:00:24.677 | INFO     | callclarity.error_report.sorters.semantic_sorter:_init_nn_graph:24 - Instantiating radius neighbors graph
100%|█████████████████████████████████████████████| 8/8 [00:26<00:00,  3.29s/it]


In [13]:
df_inspected.shape

(93082, 28)

In [18]:
df_inspected["priority"].fillna(-1).value_counts(normalize = True).sort_index()

-1.0    0.561913
 0.0    0.208676
 1.0    0.155540
 2.0    0.023409
 3.0    0.050461
Name: priority, dtype: float64

# Generate the report 

In [128]:
reporter = IncoherenceReporter(
    df = df_inspected,
    path = "/Users/erichsato/Documents/WFP/01_programming/nlp_cfm/data/temp/report_tests/report_v4.xlsx",
    text_col="text",
    label_col="label",
)
reporter.report()

In [116]:
df_inspected df_inspected["exact_incoherence_metadata"]

SyntaxError: invalid syntax (3621103058.py, line 1)

In [22]:
df_inspected[(df_inspected["priority"].notnull() | df_inspected["cluster_priority"].notnull())]

Unnamed: 0,id,priority,cluster_priority,text,label,label_recommended,exact_incoherence_recommendation,approx_incoherence_recommendation,model_incoherence_recommendation,is_incoherent,is_exact_incoherence,is_approx_incoherence,is_model_incoherence,exact_incoherence_metadata,approx_incoherence_metadata,model_incoherence_metadata,cluster_incoherence_count,cluster_total_count
4567,559637,0.0,0.0,The MTCN code did not arrive. More than 50 day...,Request for information - Assistance timing or...,Request for information - Targeting criteria,Request for information - Targeting criteria,Complaint - Assistance / Delivery Mechanisms /...,Complaint - Assistance / Delivery Mechanisms /...,1,1,1,1,"{'exact_incoherence_cluster_index': 165.0, 'ex...","{'approx_incoherence_cluster_index': 0.0, 'app...",{'model_incoherence_severity': 0.6872487681548...,576.0,1393.0
6736,585890,0.0,0.0,The MTCN code did not arrive. Received an SMS ...,Request for information - Assistance duration,Request for information - Targeting criteria,Request for information - Targeting criteria,Complaint - Assistance / Delivery Mechanisms /...,Complaint - Assistance / Delivery Mechanisms /...,1,1,1,1,"{'exact_incoherence_cluster_index': 68.0, 'exa...","{'approx_incoherence_cluster_index': 0.0, 'app...",{'model_incoherence_severity': 0.9989767111168...,576.0,1393.0
8342,600348,0.0,0.0,The MTCN code did not arrive. Received an SMS ...,Request for information - Other (please explain),Request for information - Targeting criteria,Request for information - Targeting criteria,Complaint - Assistance / Delivery Mechanisms /...,Complaint - Assistance / Delivery Mechanisms /...,1,1,1,1,"{'exact_incoherence_cluster_index': 68.0, 'exa...","{'approx_incoherence_cluster_index': 0.0, 'app...",{'model_incoherence_severity': 0.9954472010089...,576.0,1393.0
4952,563819,1.0,0.0,The MTCN code did not arrive. More than 50 day...,Request for information - Other (please explain),Complaint - Assistance / Delivery Mechanisms /...,,Complaint - Assistance / Delivery Mechanisms /...,Complaint - Assistance / Delivery Mechanisms /...,1,0,1,1,"{'exact_incoherence_cluster_index': nan, 'exac...","{'approx_incoherence_cluster_index': 0.0, 'app...",{'model_incoherence_severity': 0.9883657994091...,576.0,1393.0
6935,540414,1.0,0.0,The MTCN code did not arrive. More than 50 day...,Complaint - Incorrect records,Complaint - Assistance / Delivery Mechanisms /...,,Complaint - Assistance / Delivery Mechanisms /...,Complaint - Assistance / Delivery Mechanisms /...,1,0,1,1,"{'exact_incoherence_cluster_index': nan, 'exac...","{'approx_incoherence_cluster_index': 0.0, 'app...",{'model_incoherence_severity': 0.9675125838860...,576.0,1393.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10222,496454,3.0,,did not receive 3 help\nthe burned person's pa...,Complaint - Incorrect records,Complaint - Assistance / Delivery Mechanisms /...,,,Complaint - Assistance / Delivery Mechanisms /...,1,0,0,1,"{'exact_incoherence_cluster_index': nan, 'exac...","{'approx_incoherence_cluster_index': nan, 'app...",{'model_incoherence_severity': 0.8422582085989...,,
10245,530047,3.0,,did not receive any payment due to the fact th...,Technical requests - Withdrawal of personally ...,Complaint - Assistance / Delivery Mechanisms /...,,,Complaint - Assistance / Delivery Mechanisms /...,1,0,0,1,"{'exact_incoherence_cluster_index': nan, 'exac...","{'approx_incoherence_cluster_index': nan, 'app...",{'model_incoherence_severity': 0.8987482142804...,,
10267,527001,3.0,,"On the website, the status ""Your application h...",Technical requests - Withdrawal of personally ...,Complaint - Assistance / Delivery Mechanisms /...,,,Complaint - Assistance / Delivery Mechanisms /...,1,0,0,1,"{'exact_incoherence_cluster_index': nan, 'exac...","{'approx_incoherence_cluster_index': nan, 'app...",{'model_incoherence_severity': 0.7726238277221...,,
10287,535840,3.0,,According to the instructions of ticket 217777...,Complaint - Physical access challenges,Complaint - Assistance / Delivery Mechanisms /...,,,Complaint - Assistance / Delivery Mechanisms /...,1,0,0,1,"{'exact_incoherence_cluster_index': nan, 'exac...","{'approx_incoherence_cluster_index': nan, 'app...",{'model_incoherence_severity': 0.877596948300946},,


In [10]:
df_inspected

Unnamed: 0,id,priority,cluster_priority,text,label,label_recommended,exact_incoherence_recommendation,approx_incoherence_recommendation,model_incoherence_recommendation,is_incoherent,is_exact_incoherence,is_approx_incoherence,is_model_incoherence,exact_incoherence_metadata,approx_incoherence_metadata,model_incoherence_metadata,cluster_incoherence_count,cluster_total_count
4567,559637,0.0,0.0,The MTCN code did not arrive. More than 50 day...,Request for information - Assistance timing or...,Request for information - Targeting criteria,Request for information - Targeting criteria,Complaint - Assistance / Delivery Mechanisms /...,Complaint - Assistance / Delivery Mechanisms /...,1,1,1,1,"{'exact_incoherence_cluster_index': 165.0, 'ex...","{'approx_incoherence_cluster_index': 0.0, 'app...",{'model_incoherence_severity': 0.7069928319065...,576.0,1393.0
6736,585890,0.0,0.0,The MTCN code did not arrive. Received an SMS ...,Request for information - Assistance duration,Request for information - Targeting criteria,Request for information - Targeting criteria,Complaint - Assistance / Delivery Mechanisms /...,Complaint - Assistance / Delivery Mechanisms /...,1,1,1,1,"{'exact_incoherence_cluster_index': 68.0, 'exa...","{'approx_incoherence_cluster_index': 0.0, 'app...",{'model_incoherence_severity': 0.9987864743619...,576.0,1393.0
8342,600348,0.0,0.0,The MTCN code did not arrive. Received an SMS ...,Request for information - Other (please explain),Request for information - Targeting criteria,Request for information - Targeting criteria,Complaint - Assistance / Delivery Mechanisms /...,Complaint - Assistance / Delivery Mechanisms /...,1,1,1,1,"{'exact_incoherence_cluster_index': 68.0, 'exa...","{'approx_incoherence_cluster_index': 0.0, 'app...",{'model_incoherence_severity': 0.9947342570185...,576.0,1393.0
4952,563819,1.0,0.0,The MTCN code did not arrive. More than 50 day...,Request for information - Other (please explain),Complaint - Assistance / Delivery Mechanisms /...,,Complaint - Assistance / Delivery Mechanisms /...,Complaint - Assistance / Delivery Mechanisms /...,1,0,1,1,"{'exact_incoherence_cluster_index': nan, 'exac...","{'approx_incoherence_cluster_index': 0.0, 'app...",{'model_incoherence_severity': 0.9890000441534...,576.0,1393.0
6935,540414,1.0,0.0,The MTCN code did not arrive. More than 50 day...,Complaint - Incorrect records,Complaint - Assistance / Delivery Mechanisms /...,,Complaint - Assistance / Delivery Mechanisms /...,Complaint - Assistance / Delivery Mechanisms /...,1,0,1,1,"{'exact_incoherence_cluster_index': nan, 'exac...","{'approx_incoherence_cluster_index': 0.0, 'app...",{'model_incoherence_severity': 0.9702969878089...,576.0,1393.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10325,561546,,,"Under number 380962139303, there is no informa...",Complaint - Incorrect records,,,,,0,0,0,0,"{'exact_incoherence_cluster_index': nan, 'exac...","{'approx_incoherence_cluster_index': nan, 'app...",{'model_incoherence_severity': nan},,
10326,538519,,,According to case 179681\n\nThe person has pro...,Complaint - Physical access challenges,,,,,0,0,0,0,"{'exact_incoherence_cluster_index': nan, 'exac...","{'approx_incoherence_cluster_index': nan, 'app...",{'model_incoherence_severity': nan},,
10327,552764,,,A person with reduced mobility,Complaint - Physical access challenges,,,,,0,0,0,0,"{'exact_incoherence_cluster_index': nan, 'exac...","{'approx_incoherence_cluster_index': nan, 'app...",{'model_incoherence_severity': nan},,
10328,537334,,,a person with a disability cannot receive assi...,Complaint - Physical access challenges,,,,,0,0,0,0,"{'exact_incoherence_cluster_index': nan, 'exac...","{'approx_incoherence_cluster_index': nan, 'app...",{'model_incoherence_severity': nan},,


In [56]:
df_inspected

Unnamed: 0,id,text,label,embedding,is_exact_incoherence,exact_incoherence_recommendation,exact_incoherence_metadata,is_approx_incoherence,approx_incoherence_recommendation,approx_incoherence_metadata,is_model_incoherence,model_incoherence_recommendation,model_incoherence_metadata,priority,cluster_priority,is_incoherent,cluster_incoherence_count
4567,559637,The MTCN code did not arrive. More than 50 day...,Request for information - Assistance timing or...,"[-0.014842264354228973, -0.03156145662069321, ...",1,Request for information - Targeting criteria,"{'exact_incoherence_cluster_index': 165.0, 'ex...",1,Complaint - Assistance / Delivery Mechanisms /...,"{'approx_incoherence_cluster_index': 0.0, 'app...",0,,{'model_incoherence_severity': nan},0.0,0.0,1,576.0
6736,585890,The MTCN code did not arrive. Received an SMS ...,Request for information - Assistance duration,"[-0.010520915500819683, -0.023126713931560516,...",1,Request for information - Targeting criteria,"{'exact_incoherence_cluster_index': 68.0, 'exa...",1,Complaint - Assistance / Delivery Mechanisms /...,"{'approx_incoherence_cluster_index': 0.0, 'app...",1,Complaint - Assistance / Delivery Mechanisms /...,{'model_incoherence_severity': 0.9985255345577...,0.0,0.0,1,576.0
8342,600348,The MTCN code did not arrive. Received an SMS ...,Request for information - Other (please explain),"[-0.010520915500819683, -0.023126713931560516,...",1,Request for information - Targeting criteria,"{'exact_incoherence_cluster_index': 68.0, 'exa...",1,Complaint - Assistance / Delivery Mechanisms /...,"{'approx_incoherence_cluster_index': 0.0, 'app...",1,Complaint - Assistance / Delivery Mechanisms /...,{'model_incoherence_severity': 0.9967618459160...,0.0,0.0,1,576.0
4952,563819,The MTCN code did not arrive. More than 50 day...,Request for information - Other (please explain),"[-0.012392878532409668, -0.01640527881681919, ...",0,,"{'exact_incoherence_cluster_index': nan, 'exac...",1,Complaint - Assistance / Delivery Mechanisms /...,"{'approx_incoherence_cluster_index': 0.0, 'app...",1,Complaint - Assistance / Delivery Mechanisms /...,{'model_incoherence_severity': 0.99046117826451},1.0,0.0,1,576.0
6935,540414,The MTCN code did not arrive. More than 50 day...,Complaint - Incorrect records,"[-0.021083777770400047, -0.03303220495581627, ...",0,,"{'exact_incoherence_cluster_index': nan, 'exac...",1,Complaint - Assistance / Delivery Mechanisms /...,"{'approx_incoherence_cluster_index': 0.0, 'app...",1,Complaint - Assistance / Delivery Mechanisms /...,{'model_incoherence_severity': 0.9688556062587...,1.0,0.0,1,576.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10325,561546,"Under number 380962139303, there is no informa...",Complaint - Incorrect records,"[0.002346019260585308, -0.049770016223192215, ...",0,,"{'exact_incoherence_cluster_index': nan, 'exac...",0,,"{'approx_incoherence_cluster_index': nan, 'app...",0,,{'model_incoherence_severity': nan},,,0,
10326,538519,According to case 179681\n\nThe person has pro...,Complaint - Physical access challenges,"[0.021950630471110344, 0.006401213817298412, -...",0,,"{'exact_incoherence_cluster_index': nan, 'exac...",0,,"{'approx_incoherence_cluster_index': nan, 'app...",0,,{'model_incoherence_severity': nan},,,0,
10327,552764,A person with reduced mobility,Complaint - Physical access challenges,"[-0.028081171214580536, -0.008712985552847385,...",0,,"{'exact_incoherence_cluster_index': nan, 'exac...",0,,"{'approx_incoherence_cluster_index': nan, 'app...",0,,{'model_incoherence_severity': nan},,,0,
10328,537334,a person with a disability cannot receive assi...,Complaint - Physical access challenges,"[-0.0012608394026756287, 0.008399887941777706,...",0,,"{'exact_incoherence_cluster_index': nan, 'exac...",0,,"{'approx_incoherence_cluster_index': nan, 'app...",0,,{'model_incoherence_severity': nan},,,0,


In [39]:
df_inspected[(df_inspected["priority"] == 3) & (df_inspected["cluster_priority"].isnull())]

Unnamed: 0,id,text,label,embedding,is_exact_incoherence,exact_incoherence_recommendation,exact_incoherence_metadata,is_approx_incoherence,approx_incoherence_recommendation,approx_incoherence_metadata,is_model_incoherence,model_incoherence_recommendation,model_incoherence_metadata,priority,cluster_priority,is_incoherent,cluster_incoherence_count
7509,589484,How to get food assistance,Request for assistance - Food,"[0.010919742286205292, -0.056632790714502335, ...",0,,"{'exact_incoherence_cluster_index': nan, 'exac...",0,,"{'approx_incoherence_cluster_index': nan, 'app...",1,Request for information - Assistance type and ...,{'model_incoherence_severity': 0.8470128352107...,3.0,,1,
8910,606559,How to get food assistance,Request for assistance - Food,"[0.010919742286205292, -0.056632790714502335, ...",0,,"{'exact_incoherence_cluster_index': nan, 'exac...",0,,"{'approx_incoherence_cluster_index': nan, 'app...",1,Request for information - Assistance type and ...,{'model_incoherence_severity': 0.8470128352107...,3.0,,1,
8216,598233,Outgoing call Not answered\n,Request for information - Other (please explain),"[0.016490323469042778, 0.012342012487351894, 0...",0,,"{'exact_incoherence_cluster_index': nan, 'exac...",0,,"{'approx_incoherence_cluster_index': nan, 'app...",1,Request for information - Targeting criteria,{'model_incoherence_severity': 0.7159209437969...,3.0,,1,
8237,598397,Outgoing call Not answered\n,Request for information - Other (please explain),"[0.016490265727043152, 0.012341977097094059, 0...",0,,"{'exact_incoherence_cluster_index': nan, 'exac...",0,,"{'approx_incoherence_cluster_index': nan, 'app...",1,Request for information - Targeting criteria,{'model_incoherence_severity': 0.751842926790409},3.0,,1,
8239,598437,Outgoing call Not answered\n,Request for information - Other (please explain),"[0.016490265727043152, 0.012341977097094059, 0...",0,,"{'exact_incoherence_cluster_index': nan, 'exac...",0,,"{'approx_incoherence_cluster_index': nan, 'app...",1,Request for information - Targeting criteria,{'model_incoherence_severity': 0.7159210687108...,3.0,,1,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10245,530047,did not receive any payment due to the fact th...,Technical requests - Withdrawal of personally ...,"[0.011644075624644756, 0.011705193668603897, -...",0,,"{'exact_incoherence_cluster_index': nan, 'exac...",0,,"{'approx_incoherence_cluster_index': nan, 'app...",1,Complaint - Assistance / Delivery Mechanisms /...,{'model_incoherence_severity': 0.8836796410364...,3.0,,1,
10267,527001,"On the website, the status ""Your application h...",Technical requests - Withdrawal of personally ...,"[0.0037415430415421724, 0.0019116394687443972,...",0,,"{'exact_incoherence_cluster_index': nan, 'exac...",0,,"{'approx_incoherence_cluster_index': nan, 'app...",1,Complaint - Assistance / Delivery Mechanisms /...,{'model_incoherence_severity': 0.8068970599945...,3.0,,1,
10282,535674,265661 repeatedly\n\nA person with reduced mob...,Complaint - Physical access challenges,"[-0.03614500164985657, -0.00664135068655014, -...",0,,"{'exact_incoherence_cluster_index': nan, 'exac...",0,,"{'approx_incoherence_cluster_index': nan, 'app...",1,Complaint - Assistance / Delivery Mechanisms /...,{'model_incoherence_severity': 0.8528693147466...,3.0,,1,
10287,535840,According to the instructions of ticket 217777...,Complaint - Physical access challenges,"[0.035441432148218155, -0.027964206412434578, ...",0,,"{'exact_incoherence_cluster_index': nan, 'exac...",0,,"{'approx_incoherence_cluster_index': nan, 'app...",1,Complaint - Assistance / Delivery Mechanisms /...,{'model_incoherence_severity': 0.8931862779376...,3.0,,1,


In [62]:
", ".join([f'"{c}"' for c in df_inspected.columns.tolist()])

'"id", "text", "label", "embedding", "is_exact_incoherence", "exact_incoherence_recommendation", "exact_incoherence_metadata", "is_approx_incoherence", "approx_incoherence_recommendation", "approx_incoherence_metadata", "is_model_incoherence", "model_incoherence_recommendation", "model_incoherence_metadata", "priority", "cluster_priority", "is_incoherent", "cluster_incoherence_count"'

In [67]:
df_inspected[
    [
        "id", 
        "priority", "cluster_priority",
        "text", "label",
        "exact_incoherence_recommendation", 
        "approx_incoherence_recommendation", 
        "model_incoherence_recommendation", 
        "cluster_incoherence_count",
        "is_incoherent", 
        "is_exact_incoherence","is_approx_incoherence", "is_model_incoherence", 
        "exact_incoherence_metadata", "approx_incoherence_metadata","model_incoherence_metadata", ]
].T

Unnamed: 0,4567,6736,8342,4952,6935,9478,2765,2829,7582,10187,...,10317,10318,10320,10322,10323,10325,10326,10327,10328,10338
id,559637,585890,600348,563819,540414,514132,534995,522778,503722,554460,...,538556,537795,567952,567885,537750,561546,538519,552764,537334,545421
priority,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,,,,,,,,,,
cluster_priority,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
text,The MTCN code did not arrive. More than 50 day...,The MTCN code did not arrive. Received an SMS ...,The MTCN code did not arrive. Received an SMS ...,The MTCN code did not arrive. More than 50 day...,The MTCN code did not arrive. More than 50 day...,The MTCN code did not arrive. More than 50 day...,The MTCN code did not arrive. More than 50 day...,The MTCN code did not arrive. More than 50 day...,The MTCN code did not arrive. More than 50 day...,The MTCN code did not arrive. More than 50 day...,...,According to case 269686\nCan't get to the ban...,Repeat request\nA person with reduced mobility...,"A person with limited mobility, a transfer of ...","The recipient of the aid was injured, cannot m...","A person with reduced mobility, does not have ...","Under number 380962139303, there is no informa...",According to case 179681\n\nThe person has pro...,A person with reduced mobility,a person with a disability cannot receive assi...,Changing the phone number\nMs. Lyubov lives in...
label,Request for information - Assistance timing or...,Request for information - Assistance duration,Request for information - Other (please explain),Request for information - Other (please explain),Complaint - Incorrect records,Complaint - Incorrect records,Complaint - Incorrect records,Complaint - Incorrect records,Complaint - Incorrect records,Complaint - Incorrect records,...,Complaint - Physical access challenges,Complaint - Physical access challenges,Complaint - Physical access challenges,Complaint - Physical access challenges,Complaint - Physical access challenges,Complaint - Incorrect records,Complaint - Physical access challenges,Complaint - Physical access challenges,Complaint - Physical access challenges,Complaint - Incorrect records
exact_incoherence_recommendation,Request for information - Targeting criteria,Request for information - Targeting criteria,Request for information - Targeting criteria,,,,,,,,...,,,,,,,,,,
approx_incoherence_recommendation,Complaint - Assistance / Delivery Mechanisms /...,Complaint - Assistance / Delivery Mechanisms /...,Complaint - Assistance / Delivery Mechanisms /...,Complaint - Assistance / Delivery Mechanisms /...,Complaint - Assistance / Delivery Mechanisms /...,Complaint - Assistance / Delivery Mechanisms /...,Complaint - Assistance / Delivery Mechanisms /...,Complaint - Assistance / Delivery Mechanisms /...,Complaint - Assistance / Delivery Mechanisms /...,Complaint - Assistance / Delivery Mechanisms /...,...,,,,,,,,,,
model_incoherence_recommendation,,Complaint - Assistance / Delivery Mechanisms /...,Complaint - Assistance / Delivery Mechanisms /...,Complaint - Assistance / Delivery Mechanisms /...,Complaint - Assistance / Delivery Mechanisms /...,Complaint - Assistance / Delivery Mechanisms /...,,,,,...,,,,,,,,,,
cluster_incoherence_count,576.0,576.0,576.0,576.0,576.0,576.0,576.0,576.0,576.0,576.0,...,,,,,,,,,,
is_incoherent,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0


In [None]:
inspe

In [643]:
inspector.inspect()

2023-12-20 19:04:57.740 | INFO     | callclarity.error_report.detectors.exact_incoherence:get_errors:30 - Calculating 'exact_incoherence' conflicts
2023-12-20 19:04:58.226 | INFO     | callclarity.error_report.detectors.exact_incoherence:get_errors:42 -  └ Filtering conflicted cases
2023-12-20 19:04:58.227 | INFO     | callclarity.error_report.detectors.exact_incoherence:get_errors:46 -  └ Getting cluster indices
2023-12-20 19:04:58.228 | INFO     | callclarity.error_report.detectors.exact_incoherence:get_errors:50 -  └ Getting error recommendations
2023-12-20 19:04:58.229 | INFO     | callclarity.error_report.detectors.exact_incoherence:get_errors:57 -  └ Getting error flags
2023-12-20 19:04:58.246 | INFO     | callclarity.error_report.detectors.approx_incoherence:get_errors:57 -  └ Calculating clusters
2023-12-20 19:05:01.104 | INFO     | callclarity.error_report.detectors.approx_incoherence:get_errors:61 -    └ 3,905 clusters were found.
2023-12-20 19:05:01.105 | INFO     | callclar

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

2023-12-20 19:05:29.918 | INFO     | callclarity.error_report.detectors.model_incoherence:get_errors:62 -    └ 1,428 clean learning issues were found.
2023-12-20 19:05:29.919 | INFO     | callclarity.error_report.detectors.model_incoherence:get_errors:65 - Calculating 'model_incoherence' conflicts
2023-12-20 19:05:29.923 | INFO     | callclarity.error_report.detectors.model_incoherence:get_errors:82 -  └ Getting error flags


Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "/Users/erichsato/miniconda3/envs/nlp_cfm/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3508, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/var/folders/mx/rrg5pwtj0lg09l5rvnz5y6zr0000gn/T/ipykernel_22306/3375630170.py", line 1, in <module>
    inspector.inspect()
  File "/Users/erichsato/Documents/WFP/01_programming/callclarity/callclarity/error_report/inspector.py", line 180, in inspect
    self.assign_priority()
  File "/Users/erichsato/Documents/WFP/01_programming/callclarity/callclarity/error_report/inspector.py", line 122, in assign_priority
    df["priority"] = df.apply(
  File "/Users/erichsato/miniconda3/envs/nlp_cfm/lib/python3.10/site-packages/pandas/core/frame.py", line 9568, in apply
    return op.apply().__finalize__(self, method="apply")
  File "/Users/erichsato/miniconda3/envs/nlp_cfm/lib/python3.10/site-packages/pandas/core/apply.py", line 764, in apply
    return self.apply_st

In [636]:
_ = inspector.get_errors()

2023-12-20 18:56:11.003 | INFO     | callclarity.error_report.detectors.exact_incoherence:get_errors:30 - Calculating 'exact_incoherence' conflicts
2023-12-20 18:56:11.546 | INFO     | callclarity.error_report.detectors.exact_incoherence:get_errors:42 -  └ Filtering conflicted cases
2023-12-20 18:56:11.548 | INFO     | callclarity.error_report.detectors.exact_incoherence:get_errors:46 -  └ Getting cluster indices
2023-12-20 18:56:11.549 | INFO     | callclarity.error_report.detectors.exact_incoherence:get_errors:50 -  └ Getting error recommendations
2023-12-20 18:56:11.550 | INFO     | callclarity.error_report.detectors.exact_incoherence:get_errors:57 -  └ Getting error flags
2023-12-20 18:56:11.572 | INFO     | callclarity.error_report.detectors.approx_incoherence:get_errors:57 -  └ Calculating clusters
2023-12-20 18:56:14.378 | INFO     | callclarity.error_report.detectors.approx_incoherence:get_errors:61 -    └ 3,905 clusters were found.
2023-12-20 18:56:14.379 | INFO     | callclar

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

2023-12-20 18:57:15.107 | INFO     | callclarity.error_report.detectors.model_incoherence:get_errors:62 -    └ 1,424 clean learning issues were found.
2023-12-20 18:57:15.108 | INFO     | callclarity.error_report.detectors.model_incoherence:get_errors:65 - Calculating 'model_incoherence' conflicts
2023-12-20 18:57:15.113 | INFO     | callclarity.error_report.detectors.model_incoherence:get_errors:82 -  └ Getting error flags


In [640]:
df_inspect = inspector.output_df.copy()
df_inspect["_approx_incoherence_cluster_index"] = df_inspect["approx_incoherence_metadata"].str["approx_incoherence_cluster_index"]

In [641]:
df_inspect.head()

Unnamed: 0,id,text,label,embedding,is_exact_incoherence,exact_incoherence_recommendation,exact_incoherence_metadata,is_approx_incoherence,approx_incoherence_recommendation,approx_incoherence_metadata,is_model_incoherence,model_incoherence_recommendation,model_incoherence_metadata,_approx_incoherence_cluster_index
0,482666,Clarification of the status of the application for eHelp.\n,Request for information - Other (please explain),"[0.02063043601810932, -0.014633174985647202, 0.0018570597749203444, 0.03578193113207817, 0.026378776878118515, 0.010056345723569393, -0.0037040424067527056, 0.015377209521830082, -0.0061657060869038105, 0.00020253939146641642, -0.007710890844464302, -0.019451137632131577, 0.018473321571946144, -0.0076651484705507755, -0.030224254354834557, -0.0013894927687942982, 0.03242503106594086, -0.039075255393981934, -0.016736697405576706, 0.03644832968711853, 0.026604976505041122, -0.029906943440437317, -0.03138010576367378, -0.03566896170377731, -0.005968381650745869, 0.007034401874989271, -0.02914425916969776, -0.03436632081866264, 0.042137566953897476, -0.03530101105570793, 0.025876149535179138, 0.002100934274494648, 0.028253639116883278, -0.018664270639419556, -0.05479830130934715, -0.024769887328147888, 0.05613785237073898, 0.0020175911486148834, 0.0023002440575510263, -0.007379493210464716, 0.08874393254518509, -0.01469328161329031, 0.025371013209223747, 0.015361685305833817, -0.00976340938359499, 0.004464337602257729, 0.044284045696258545, 0.0337451808154583, 0.012728153727948666, 0.0142359659075737, 0.036281414330005646, 0.03620985150337219, 0.014675529673695564, 0.02287684753537178, 0.02511700987815857, -0.007940569892525673, -0.03099764510989189, 0.023507051169872284, -0.020254170522093773, -0.0014974616933614016, 0.028966784477233887, -0.04963718727231026, -0.01459343358874321, 0.02146773971617222, 0.018954424187541008, -0.046064842492341995, 0.03481094539165497, 0.03793967142701149, -0.05907777324318886, -0.0018728224094957113, 0.06972736120223999, 0.08613918721675873, -0.020117398351430893, -0.032251615077257156, 0.004387949127703905, -0.020590726286172867, -0.03717455640435219, -0.048438120633363724, -0.06928769499063492, 0.02950315922498703, -0.01594865694642067, 0.007081903517246246, 0.03608689457178116, 0.04165271297097206, 0.004951501730829477, 0.04289400950074196, -0.03444371744990349, -0.012347990646958351, 0.03336329758167267, -0.005095571745187044, 0.007351362146437168, -0.08307232707738876, 0.021938584744930267, 0.03161279112100601, 0.05435875803232193, -0.013639518059790134, -0.07568836957216263, 0.003482514526695013, -0.020995959639549255, -0.023884855210781097, ...]",0,,"{'exact_incoherence_cluster_index': nan, 'exact_incoherence_cluster_size': nan, 'exact_incoherence_cluster_candidates': nan}",0,,"{'approx_incoherence_cluster_index': 1.0, 'approx_incoherence_cluster_size': 1323.0, 'approx_incoherence_cluster_candidates': 60.0}",0,,{'model_incoherence_severity': nan},1.0
1,482736,The MTCN code did not arrive. Less than 50 days have passed since the last SMS with the code. It is recommended to wait.\n,Request for information - Assistance timing or location,"[-0.008439027704298496, -0.02939574420452118, 0.010878953151404858, 0.011462884955108166, -0.017010273411870003, -0.02308790199458599, -0.02591068483889103, 0.017643386498093605, -0.021967783570289612, -0.007430138532072306, -0.04495438560843468, 0.02255563996732235, 0.003848706604912877, -0.08500702679157257, -0.03630180284380913, -0.06978219747543335, 0.02096705511212349, 0.03162963688373566, 0.012953582219779491, 0.07118736207485199, 0.0015393586363643408, -0.04586445167660713, -0.06180022656917572, -0.0033178257290273905, 0.009632723405957222, 0.010354195721447468, -0.07657653093338013, -0.020427223294973373, 0.05291944369673729, -0.02524198591709137, -0.014135745353996754, 0.00884307362139225, -0.008973411284387112, -0.035615332424640656, 0.03457825258374214, -0.009590371511876583, -0.006010080222040415, -0.005752519704401493, 0.0016937210457399487, -0.008935168385505676, 0.056827664375305176, 0.06074042618274689, 0.05838621035218239, -0.00927925668656826, 0.010676869191229343, -0.030442070215940475, 0.034310828894376755, -0.056385818868875504, 0.007307080086320639, -0.03522370010614395, 0.008918720297515392, -0.011266268789768219, 0.0263383649289608, 0.01136823371052742, 0.007708424236625433, -0.010483023710548878, 0.0012370087206363678, -0.012115121819078922, 0.055211421102285385, -0.018655875697731972, -0.00550741096958518, -0.01096710842102766, 0.03556482493877411, -0.004530636593699455, 0.006918497383594513, 0.013359615579247475, -0.012607481330633163, 0.026921728625893593, -0.009666836820542812, -0.004200617782771587, 0.0023695367854088545, 0.045048538595438004, -0.02501119114458561, 0.000690472312271595, -0.015935523435473442, -0.03634878247976303, -0.052980296313762665, -0.03252899646759033, -0.0010813188273459673, -0.00042353165918029845, 0.0009461082518100739, -0.027907855808734894, -0.027391448616981506, 0.028043108060956, -0.026190055534243584, -0.023360667750239372, -0.047376617789268494, -0.04517459124326706, 0.043313831090927124, -0.004136286210268736, -0.008708756417036057, -0.007035595830529928, -0.026466004550457, 0.05397871509194374, -0.031498029828071594, -0.08701244741678238, -0.050210729241371155, 0.06004083529114723, -0.0018230975838378072, 0.0032960374373942614, ...]",0,,"{'exact_incoherence_cluster_index': nan, 'exact_incoherence_cluster_size': nan, 'exact_incoherence_cluster_candidates': nan}",1,Complaint - Assistance / Delivery Mechanisms / Payment Instruments,"{'approx_incoherence_cluster_index': 0.0, 'approx_incoherence_cluster_size': 1393.0, 'approx_incoherence_cluster_candidates': 576.0}",0,,{'model_incoherence_severity': nan},0.0
2,482786,The MTCN code did not arrive. Less than 50 days have passed since the last SMS with the code. It is recommended to wait.\n,Request for information - Assistance timing or location,"[-0.008439027704298496, -0.02939574420452118, 0.010878953151404858, 0.011462884955108166, -0.017010273411870003, -0.02308790199458599, -0.02591068483889103, 0.017643386498093605, -0.021967783570289612, -0.007430138532072306, -0.04495438560843468, 0.02255563996732235, 0.003848706604912877, -0.08500702679157257, -0.03630180284380913, -0.06978219747543335, 0.02096705511212349, 0.03162963688373566, 0.012953582219779491, 0.07118736207485199, 0.0015393586363643408, -0.04586445167660713, -0.06180022656917572, -0.0033178257290273905, 0.009632723405957222, 0.010354195721447468, -0.07657653093338013, -0.020427223294973373, 0.05291944369673729, -0.02524198591709137, -0.014135745353996754, 0.00884307362139225, -0.008973411284387112, -0.035615332424640656, 0.03457825258374214, -0.009590371511876583, -0.006010080222040415, -0.005752519704401493, 0.0016937210457399487, -0.008935168385505676, 0.056827664375305176, 0.06074042618274689, 0.05838621035218239, -0.00927925668656826, 0.010676869191229343, -0.030442070215940475, 0.034310828894376755, -0.056385818868875504, 0.007307080086320639, -0.03522370010614395, 0.008918720297515392, -0.011266268789768219, 0.0263383649289608, 0.01136823371052742, 0.007708424236625433, -0.010483023710548878, 0.0012370087206363678, -0.012115121819078922, 0.055211421102285385, -0.018655875697731972, -0.00550741096958518, -0.01096710842102766, 0.03556482493877411, -0.004530636593699455, 0.006918497383594513, 0.013359615579247475, -0.012607481330633163, 0.026921728625893593, -0.009666836820542812, -0.004200617782771587, 0.0023695367854088545, 0.045048538595438004, -0.02501119114458561, 0.000690472312271595, -0.015935523435473442, -0.03634878247976303, -0.052980296313762665, -0.03252899646759033, -0.0010813188273459673, -0.00042353165918029845, 0.0009461082518100739, -0.027907855808734894, -0.027391448616981506, 0.028043108060956, -0.026190055534243584, -0.023360667750239372, -0.047376617789268494, -0.04517459124326706, 0.043313831090927124, -0.004136286210268736, -0.008708756417036057, -0.007035595830529928, -0.026466004550457, 0.05397871509194374, -0.031498029828071594, -0.08701244741678238, -0.050210729241371155, 0.06004083529114723, -0.0018230975838378072, 0.0032960374373942614, ...]",0,,"{'exact_incoherence_cluster_index': nan, 'exact_incoherence_cluster_size': nan, 'exact_incoherence_cluster_candidates': nan}",1,Complaint - Assistance / Delivery Mechanisms / Payment Instruments,"{'approx_incoherence_cluster_index': 0.0, 'approx_incoherence_cluster_size': 1393.0, 'approx_incoherence_cluster_candidates': 576.0}",0,,{'model_incoherence_severity': nan},0.0
3,482821,Clarification of the status of the application for eHelp.\n,Request for information - Other (please explain),"[0.0206304918974638, -0.014633155427873135, 0.0018570049433037639, 0.03578194975852966, 0.026378758251667023, 0.010056320577859879, -0.003704037284478545, 0.015377200208604336, -0.006165683269500732, 0.00020249250519555062, -0.007710909005254507, -0.01945115253329277, 0.018473342061042786, -0.007665088865906, -0.030224239453673363, -0.0013895094161853194, 0.03242502361536026, -0.03907524794340134, -0.016736652702093124, 0.03644828870892525, 0.02660493366420269, -0.029906921088695526, -0.03138013556599617, -0.03566891327500343, -0.0059683965519070625, 0.007034383248537779, -0.029144296422600746, -0.03436626121401787, 0.04213761165738106, -0.03530100733041763, 0.025876114144921303, 0.002100969897583127, 0.028253618627786636, -0.018664240837097168, -0.054798323661088943, -0.02476983144879341, 0.056137848645448685, 0.002017607679590583, 0.002300248946994543, -0.007379472255706787, 0.08874401450157166, -0.014693284407258034, 0.025371069088578224, 0.015361655503511429, -0.009763403795659542, 0.00446433387696743, 0.04428408294916153, 0.03374515473842621, 0.01272819098085165, 0.014235940761864185, 0.03628145530819893, 0.036209836602211, 0.014675476588308811, 0.022876884788274765, 0.02511703409254551, -0.007940561510622501, -0.0309976264834404, 0.02350703626871109, -0.02025420404970646, -0.0014974798541516066, 0.02896677330136299, -0.04963718727231026, -0.014593459665775299, 0.021467743441462517, 0.018954409286379814, -0.04606486111879349, 0.03481093421578407, 0.037939634174108505, -0.05907781049609184, -0.0018728870199993253, 0.06972740590572357, 0.08613918721675873, -0.020117400214076042, -0.03225155547261238, 0.004387957975268364, -0.020590735599398613, -0.03717450797557831, -0.048438120633363724, -0.06928765028715134, 0.029503189027309418, -0.015948591753840446, 0.007081924006342888, 0.036086954176425934, 0.041652727872133255, 0.004951440263539553, 0.04289396107196808, -0.034443680197000504, -0.012348054908216, 0.033363327383995056, -0.005095556378364563, 0.007351348176598549, -0.08307235687971115, 0.02193855307996273, 0.03161276504397392, 0.05435871332883835, -0.013639594428241253, -0.07568837702274323, 0.003482484957203269, -0.0209959764033556, -0.023884901776909828, ...]",0,,"{'exact_incoherence_cluster_index': nan, 'exact_incoherence_cluster_size': nan, 'exact_incoherence_cluster_candidates': nan}",0,,"{'approx_incoherence_cluster_index': 1.0, 'approx_incoherence_cluster_size': 1323.0, 'approx_incoherence_cluster_candidates': 60.0}",0,,{'model_incoherence_severity': nan},1.0
4,482829,questions about vouchers,Request for information - Assistance timing or location,"[0.0015718897338956594, -0.0364912785589695, 0.013456335291266441, 0.015434589236974716, -0.05154842138290405, -0.035081133246421814, 0.010278778150677681, 0.04880379140377045, 0.021257774904370308, -0.018499882891774178, 0.024207238107919693, -0.034679513424634933, 0.05853838473558426, -0.08898232877254486, 0.04525475203990936, -0.11515336483716965, -0.015365703031420708, 0.03068643808364868, -0.018867459148168564, 0.0002457223308738321, -0.014810502529144287, -0.012170711532235146, 0.06562542170286179, -0.0012525812489911914, -0.03257249668240547, -0.01907171867787838, 0.008979164063930511, 0.007202895358204842, 0.06276728957891464, -0.09456547349691391, -0.05860545486211777, -0.005924633238464594, 0.053647346794605255, 0.011028011329472065, -0.09935255348682404, -0.0035393524449318647, -0.018180202692747116, 0.004956546239554882, -0.04078147932887077, -0.005359138827770948, 0.04488660767674446, 0.024224206805229187, 0.11065588891506195, -0.02031642198562622, -0.028536023572087288, 0.014889538288116455, -0.05043899640440941, 0.012467332184314728, -0.011393032968044281, -0.06819599121809006, 0.007066535763442516, -0.02074790932238102, -0.02381168119609356, 0.04239662364125252, 0.026345502585172653, -0.01380956545472145, -0.03609498590230942, 0.00394153269007802, -0.0623030960559845, 0.012293435633182526, 0.017124582082033157, 0.025552859529852867, -0.01448134332895279, 0.01869543083012104, 0.026078935712575912, -0.0008449280867353082, -0.007083900738507509, 0.04345174878835678, -0.05506202206015587, -0.03927164152264595, 0.022372562438249588, 0.05477389693260193, -0.01767372153699398, 0.009603593498468399, -0.04474073275923729, -0.09675130993127823, 0.00880727730691433, -0.0410396046936512, -0.056049447506666183, -0.031706225126981735, 0.008362864144146442, 0.032127805054187775, -0.032279837876558304, 0.10499772429466248, 0.019234543666243553, -0.02674178220331669, 0.002519388683140278, -0.025060787796974182, -0.009056868962943554, -0.03869660943746567, 0.00024308246793225408, -0.017037395387887955, -0.004866653122007847, -0.0252920463681221, -0.02065029740333557, 0.0007463456713594496, -0.015005734749138355, -0.09467515349388123, 0.010208695195615292, 0.00863694865256548, ...]",0,,"{'exact_incoherence_cluster_index': nan, 'exact_incoherence_cluster_size': nan, 'exact_incoherence_cluster_candidates': nan}",1,Request for information - Targeting criteria,"{'approx_incoherence_cluster_index': 28.0, 'approx_incoherence_cluster_size': 17.0, 'approx_incoherence_cluster_candidates': 1.0}",1,Request for information - Targeting criteria,{'model_incoherence_severity': 0.8949588715650808},28.0


In [None]:
inspector.output_df.head()

In [637]:
_ = inspector.get_priority()

AttributeError: 'IncoherenceInspector' object has no attribute 'get_priority'

In [482]:
inspector.output_df["priority"].value_counts().sort_index()

0.0    364
1.0    837
2.0    667
3.0    344
Name: priority, dtype: int64

In [412]:
inspector.detectors[2].model

## Semantic ordering

In [559]:
nn_graph = inspector.detectors[1].nn_graph
coo_graph = nn_graph.tocoo()
non_zero_cond = (coo_graph.data > 0)

max_embedding_distance = inspector.detectors[1].max_embedding_distance
step = 0.005
sort_steps = np.arange(step, max_embedding_distance + step, step)

sort_matrix = np.zeros(shape = (nn_graph.shape[0],len(sort_steps)))
for index,sort_step in enumerate(sort_steps):
    
    # filter only connections that matter
    #   1. it has to be connected (non zero)
    #   2. it has to be sorted
    mask = non_zero_cond & (coo_graph.data <= sort_step)
    filtered_nn_graph = csr_matrix((coo_graph.data[mask], (coo_graph.row[mask], coo_graph.col[mask])), shape=nn_graph.shape)
    
    # calculate calculated components
    n_components, labels = connected_components(filtered_nn_graph)
    
    # sort and reindex, assign smaller indices for larger components
    #   1. create count dictionary
    #   2. reindex components according to size
    #   3. reassign components indices
    element_counts = dict(zip(*np.unique(labels, return_counts=True)))
    element_counts = dict(sorted(element_counts.items(), key = lambda item: item[1], reverse = True))
    index_map = {
        old_index:new_index
        for new_index, old_index in enumerate(element_counts.keys())
    }
    labels = np.array([index_map[old_index] for old_index in labels])
    
    # store indices in sort matrix
    sort_matrix[:,index] = labels

# create indices sorted by multiple columns, hierarchically. 
sorted_indices = np.lexsort(sort_matrix.T, axis = 0)

In [568]:
df_test.iloc[sorted_indices]

Unnamed: 0,id,text,label,is_exact_incoherence,exact_incoherence_recommendation,exact_incoherence_metadata,is_approx_incoherence,approx_incoherence_recommendation,approx_incoherence_metadata,is_model_incoherence,model_incoherence_recommendation,model_incoherence_metadata,priority
450,495778,The MTCN code did not arrive. More than 50 days since the last SMS with the code. MTCN resubmission required.\n,Complaint - Assistance / Delivery Mechanisms / Payment Instruments,0,,"{'exact_incoherence_cluster_index': nan, 'exact_incoherence_cluster_size': nan, 'exact_incoherence_cluster_candidates': nan}",0,,"{'approx_incoherence_cluster_index': 0.0, 'approx_incoherence_cluster_size': 1393.0, 'approx_incoherence_cluster_candidates': 576.0}",0,,{'model_incoherence_severity': nan},
524,497983,The MTCN code did not arrive. More than 50 days since the last SMS with the code. MTCN resubmission required.\n,Complaint - Assistance / Delivery Mechanisms / Payment Instruments,0,,"{'exact_incoherence_cluster_index': nan, 'exact_incoherence_cluster_size': nan, 'exact_incoherence_cluster_candidates': nan}",0,,"{'approx_incoherence_cluster_index': 0.0, 'approx_incoherence_cluster_size': 1393.0, 'approx_incoherence_cluster_candidates': 576.0}",0,,{'model_incoherence_severity': nan},
533,498099,The MTCN code did not arrive. More than 50 days since the last SMS with the code. MTCN resubmission required.\n,Complaint - Assistance / Delivery Mechanisms / Payment Instruments,0,,"{'exact_incoherence_cluster_index': nan, 'exact_incoherence_cluster_size': nan, 'exact_incoherence_cluster_candidates': nan}",0,,"{'approx_incoherence_cluster_index': 0.0, 'approx_incoherence_cluster_size': 1393.0, 'approx_incoherence_cluster_candidates': 576.0}",0,,{'model_incoherence_severity': nan},
662,502173,The MTCN code did not arrive. More than 50 days since the last SMS with the code. MTCN resubmission required.,Complaint - Assistance / Delivery Mechanisms / Payment Instruments,0,,"{'exact_incoherence_cluster_index': nan, 'exact_incoherence_cluster_size': nan, 'exact_incoherence_cluster_candidates': nan}",0,,"{'approx_incoherence_cluster_index': 0.0, 'approx_incoherence_cluster_size': 1393.0, 'approx_incoherence_cluster_candidates': 576.0}",0,,{'model_incoherence_severity': nan},
696,503377,The MTCN code did not arrive. More than 50 days since the last SMS with the code. MTCN resubmission required.,Complaint - Assistance / Delivery Mechanisms / Payment Instruments,0,,"{'exact_incoherence_cluster_index': nan, 'exact_incoherence_cluster_size': nan, 'exact_incoherence_cluster_candidates': nan}",0,,"{'approx_incoherence_cluster_index': 0.0, 'approx_incoherence_cluster_size': 1393.0, 'approx_incoherence_cluster_candidates': 576.0}",0,,{'model_incoherence_severity': nan},
...,...,...,...,...,...,...,...,...,...,...,...,...,...
10327,552764,A person with reduced mobility,Complaint - Physical access challenges,0,,"{'exact_incoherence_cluster_index': nan, 'exact_incoherence_cluster_size': nan, 'exact_incoherence_cluster_candidates': nan}",0,,"{'approx_incoherence_cluster_index': nan, 'approx_incoherence_cluster_size': nan, 'approx_incoherence_cluster_candidates': nan}",0,,{'model_incoherence_severity': nan},
10328,537334,a person with a disability cannot receive assistance,Complaint - Physical access challenges,0,,"{'exact_incoherence_cluster_index': nan, 'exact_incoherence_cluster_size': nan, 'exact_incoherence_cluster_candidates': nan}",0,,"{'approx_incoherence_cluster_index': nan, 'approx_incoherence_cluster_size': nan, 'approx_incoherence_cluster_candidates': nan}",0,,{'model_incoherence_severity': nan},
10329,538015,"Text of case number 229107\nUnable to receive transfer. Man is not walking. The card cannot be received - it is refused.\nPlease consider the issue and forward the MTSN code that arrived on November 17\nmade a bank card. received 1 payment. the code is expired for 2 previous payments\nmtsn code 1850910856\nmtsn code 5100780607\nSend MTCN code expired.\nThe outgoing call on 06/14/2023 is not answered\nWeekend 20.06 they cannot provide the TIN, they will call again. CHECK TIN AND...\n\n20.06 I called back, the TIN was checked",Complaint - Physical access challenges,0,,"{'exact_incoherence_cluster_index': nan, 'exact_incoherence_cluster_size': nan, 'exact_incoherence_cluster_candidates': nan}",0,,"{'approx_incoherence_cluster_index': nan, 'approx_incoherence_cluster_size': nan, 'approx_incoherence_cluster_candidates': nan}",1,Complaint - Assistance / Delivery Mechanisms / Payment Instruments,{'model_incoherence_severity': 0.9232283750769543},2.0
10334,535505,She received assistance once in August 2022,Complaint - Assistance / Delivery Mechanisms / Payment Instruments,0,,"{'exact_incoherence_cluster_index': nan, 'exact_incoherence_cluster_size': nan, 'exact_incoherence_cluster_candidates': nan}",0,,"{'approx_incoherence_cluster_index': nan, 'approx_incoherence_cluster_size': nan, 'approx_incoherence_cluster_candidates': nan}",1,Request for information - Targeting criteria,{'model_incoherence_severity': 0.939734313862668},2.0


In [529]:
df_test = inspector.output_df.drop(columns = ["embedding"]).copy()

In [561]:
df_test.iloc[[-1,0]]

Unnamed: 0,id,text,label,is_exact_incoherence,exact_incoherence_recommendation,exact_incoherence_metadata,is_approx_incoherence,approx_incoherence_recommendation,approx_incoherence_metadata,is_model_incoherence,model_incoherence_recommendation,model_incoherence_metadata,priority
10339,558435,The MTCN code did not arrive. More than 50 days since the last SMS with the code. MTCN resubmission required.\n,Complaint - Assistance / Delivery Mechanisms / Payment Instruments,0,,"{'exact_incoherence_cluster_index': nan, 'exact_incoherence_cluster_size': nan, 'exact_incoherence_cluster_candidates': nan}",0,,"{'approx_incoherence_cluster_index': 0.0, 'approx_incoherence_cluster_size': 1393.0, 'approx_incoherence_cluster_candidates': 576.0}",0,,{'model_incoherence_severity': nan},
0,482666,Clarification of the status of the application for eHelp.\n,Request for information - Other (please explain),0,,"{'exact_incoherence_cluster_index': nan, 'exact_incoherence_cluster_size': nan, 'exact_incoherence_cluster_candidates': nan}",0,,"{'approx_incoherence_cluster_index': 1.0, 'approx_incoherence_cluster_size': 1323.0, 'approx_incoherence_cluster_candidates': 60.0}",0,,{'model_incoherence_severity': nan},


In [531]:
df_test.sort_values(sort_matrix)

KeyError: array([[0.000e+00, 0.000e+00, 0.000e+00, ..., 1.000e+00, 1.000e+00,
        1.000e+00],
       [3.000e+00, 3.000e+00, 3.000e+00, ..., 0.000e+00, 0.000e+00,
        0.000e+00],
       [3.000e+00, 3.000e+00, 3.000e+00, ..., 0.000e+00, 0.000e+00,
        0.000e+00],
       ...,
       [5.612e+03, 5.454e+03, 5.326e+03, ..., 9.000e+00, 1.000e+01,
        1.000e+01],
       [5.613e+03, 5.455e+03, 5.327e+03, ..., 4.314e+03, 4.093e+03,
        3.904e+03],
       [1.000e+00, 1.000e+00, 1.000e+00, ..., 0.000e+00, 0.000e+00,
        0.000e+00]])

In [550]:
sorted_indices = np.lexsort(sort_matrix.T, axis = 0)

In [551]:
df_test.iloc[sorted_indices]

Unnamed: 0,id,text,label,is_exact_incoherence,exact_incoherence_recommendation,exact_incoherence_metadata,is_approx_incoherence,approx_incoherence_recommendation,approx_incoherence_metadata,is_model_incoherence,model_incoherence_recommendation,model_incoherence_metadata,priority
450,495778,The MTCN code did not arrive. More than 50 days since the last SMS with the code. MTCN resubmission required.\n,Complaint - Assistance / Delivery Mechanisms / Payment Instruments,0,,"{'exact_incoherence_cluster_index': nan, 'exact_incoherence_cluster_size': nan, 'exact_incoherence_cluster_candidates': nan}",0,,"{'approx_incoherence_cluster_index': 0.0, 'approx_incoherence_cluster_size': 1393.0, 'approx_incoherence_cluster_candidates': 576.0}",0,,{'model_incoherence_severity': nan},
524,497983,The MTCN code did not arrive. More than 50 days since the last SMS with the code. MTCN resubmission required.\n,Complaint - Assistance / Delivery Mechanisms / Payment Instruments,0,,"{'exact_incoherence_cluster_index': nan, 'exact_incoherence_cluster_size': nan, 'exact_incoherence_cluster_candidates': nan}",0,,"{'approx_incoherence_cluster_index': 0.0, 'approx_incoherence_cluster_size': 1393.0, 'approx_incoherence_cluster_candidates': 576.0}",0,,{'model_incoherence_severity': nan},
533,498099,The MTCN code did not arrive. More than 50 days since the last SMS with the code. MTCN resubmission required.\n,Complaint - Assistance / Delivery Mechanisms / Payment Instruments,0,,"{'exact_incoherence_cluster_index': nan, 'exact_incoherence_cluster_size': nan, 'exact_incoherence_cluster_candidates': nan}",0,,"{'approx_incoherence_cluster_index': 0.0, 'approx_incoherence_cluster_size': 1393.0, 'approx_incoherence_cluster_candidates': 576.0}",0,,{'model_incoherence_severity': nan},
662,502173,The MTCN code did not arrive. More than 50 days since the last SMS with the code. MTCN resubmission required.,Complaint - Assistance / Delivery Mechanisms / Payment Instruments,0,,"{'exact_incoherence_cluster_index': nan, 'exact_incoherence_cluster_size': nan, 'exact_incoherence_cluster_candidates': nan}",0,,"{'approx_incoherence_cluster_index': 0.0, 'approx_incoherence_cluster_size': 1393.0, 'approx_incoherence_cluster_candidates': 576.0}",0,,{'model_incoherence_severity': nan},
696,503377,The MTCN code did not arrive. More than 50 days since the last SMS with the code. MTCN resubmission required.,Complaint - Assistance / Delivery Mechanisms / Payment Instruments,0,,"{'exact_incoherence_cluster_index': nan, 'exact_incoherence_cluster_size': nan, 'exact_incoherence_cluster_candidates': nan}",0,,"{'approx_incoherence_cluster_index': 0.0, 'approx_incoherence_cluster_size': 1393.0, 'approx_incoherence_cluster_candidates': 576.0}",0,,{'model_incoherence_severity': nan},
...,...,...,...,...,...,...,...,...,...,...,...,...,...
10327,552764,A person with reduced mobility,Complaint - Physical access challenges,0,,"{'exact_incoherence_cluster_index': nan, 'exact_incoherence_cluster_size': nan, 'exact_incoherence_cluster_candidates': nan}",0,,"{'approx_incoherence_cluster_index': nan, 'approx_incoherence_cluster_size': nan, 'approx_incoherence_cluster_candidates': nan}",0,,{'model_incoherence_severity': nan},
10328,537334,a person with a disability cannot receive assistance,Complaint - Physical access challenges,0,,"{'exact_incoherence_cluster_index': nan, 'exact_incoherence_cluster_size': nan, 'exact_incoherence_cluster_candidates': nan}",0,,"{'approx_incoherence_cluster_index': nan, 'approx_incoherence_cluster_size': nan, 'approx_incoherence_cluster_candidates': nan}",0,,{'model_incoherence_severity': nan},
10329,538015,"Text of case number 229107\nUnable to receive transfer. Man is not walking. The card cannot be received - it is refused.\nPlease consider the issue and forward the MTSN code that arrived on November 17\nmade a bank card. received 1 payment. the code is expired for 2 previous payments\nmtsn code 1850910856\nmtsn code 5100780607\nSend MTCN code expired.\nThe outgoing call on 06/14/2023 is not answered\nWeekend 20.06 they cannot provide the TIN, they will call again. CHECK TIN AND...\n\n20.06 I called back, the TIN was checked",Complaint - Physical access challenges,0,,"{'exact_incoherence_cluster_index': nan, 'exact_incoherence_cluster_size': nan, 'exact_incoherence_cluster_candidates': nan}",0,,"{'approx_incoherence_cluster_index': nan, 'approx_incoherence_cluster_size': nan, 'approx_incoherence_cluster_candidates': nan}",1,Complaint - Assistance / Delivery Mechanisms / Payment Instruments,{'model_incoherence_severity': 0.9232283750769543},2.0
10334,535505,She received assistance once in August 2022,Complaint - Assistance / Delivery Mechanisms / Payment Instruments,0,,"{'exact_incoherence_cluster_index': nan, 'exact_incoherence_cluster_size': nan, 'exact_incoherence_cluster_candidates': nan}",0,,"{'approx_incoherence_cluster_index': nan, 'approx_incoherence_cluster_size': nan, 'approx_incoherence_cluster_candidates': nan}",1,Request for information - Targeting criteria,{'model_incoherence_severity': 0.939734313862668},2.0


# Finalizing the report

In [40]:
import cleanlab

In [45]:
cl = inspector.detectors[2].model

In [50]:
classes_problems = cleanlab.dataset.find_overlapping_classes(
    labels=inspector.detectors[2].labels,
    confident_joint=cl.confident_joint,  # cleanlab uses the confident_joint internally to quantify label noise (see cleanlab.count.compute_confident_joint)
    class_names=inspector.detectors[2].label_encoder.classes_,
)

In [54]:
classes_problems[classes_problems["Joint Probability"] > 0.001]

Unnamed: 0,Class Name A,Class Name B,Class Index A,Class Index B,Num Overlapping Examples,Joint Probability
0,Request for information - Other organization a...,Request for information - Targeting criteria,33,35,219,0.02118
1,Request for information - Other (please explain),Request for information - Targeting criteria,32,35,136,0.013153
2,Request for information - Assistance timing or...,Request for information - Targeting criteria,29,35,119,0.011509
3,Request for information - Other (please explain),Request for information - Other organization a...,32,33,83,0.008027
4,Request for information - Assistance timing or...,Request for information - Other organization a...,29,33,78,0.007544
5,Request for information - Assistance duration,Request for information - Targeting criteria,28,35,68,0.006576
6,Complaint - Assistance / Delivery Mechanisms /...,Request for information - Targeting criteria,6,35,67,0.00648
7,Complaint - Assistance / Delivery Mechanisms /...,Request for information - Other (please explain),6,32,56,0.005416
8,Interaction Issues - Interrupted interaction,Interaction Issues - Other (please explain),18,19,56,0.005416
9,Complaint - Assistance / Delivery Mechanisms /...,Complaint - Incorrect records,6,8,54,0.005222


In [55]:
health = cleanlab.dataset.overall_label_health_score(
    inspector.detectors[2].labels, confident_joint=cl.confident_joint
    # cleanlab uses the confident_joint internally to quantify label noise (see cleanlab.count.compute_confident_joint)
)

 * Overall, about 21% (2,202 of the 10,340) labels in your dataset have potential issues.
 ** The overall label health score for this dataset is: 0.79.
