In [1]:
import sys
sys.path.insert(0,"..")
from queuerious_detector.preprocessing import *
import pandas as pd

In [None]:
# Read the raw multilingual ticket export from the repo's data folder.
raw_data = pd.read_csv("../data/raw/aa_dataset-tickets-multi-lang-5-2-50-version.csv")

# Queue labels that earlier analysis decided to merge: each pair of original
# queues collapses into one combined class.
class_map = {
    **dict.fromkeys(
        ("Technical Support", "IT Support"),
        "Technical & IT Support",
    ),
    **dict.fromkeys(
        ("Customer Service", "Returns and Exchanges"),
        "Customer Service, Returns & Exchanges",
    ),
}

# Run the project's preprocessing helper on the subject/body text, mapping the
# `queue` target through `class_map` into `queue_grouped`.
# NOTE(review): the exact transformations live in
# queuerious_detector.preprocessing.preprocess_tickets - confirm there.
preprocess_data = preprocess_tickets(
    df=raw_data,
    text_fields=["subject", "body"],
    target_col="queue",
    new_target_col="queue_grouped",
    class_map=class_map,
    output_columns=["combined_text", "queue_grouped"],
)

In [None]:
# Show the first five preprocessed rows as a quick sanity check.
preprocess_data.head(n=5)

In [None]:

def redact_pii(text: Any, lang: str) -> str:
    """
    Redact PII from text using regex and Named Entity Recognition (NER).

    Regex (each match becomes ``[<KIND>_REDACTED]``, applied in this order):
      - Emails
      - Phone numbers
      - IP addresses
      - Credit card numbers
      - Street-style addresses

    NER (each match becomes ``<NAME>``):
      - Person names. Both the ``PERSON`` and ``PER`` labels are accepted:
        spaCy's English pipelines tag people as ``PERSON`` while its German
        pipelines use ``PER``, so checking only ``PERSON`` would leave names
        in German text unredacted.

    Args:
        text (Any): Input text to redact. Anything that is not a ``str``
            (e.g. NaN from pandas) yields an empty string.
        lang (str): Language code ('en' or 'de') for appropriate NER model.
            Any other value skips NER; the regex redaction is still applied.

    Returns:
        str: Text with PII replaced by placeholders.
    """
    if not isinstance(text, str):
        return ""

    patterns = {
        "email": r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+",
        "phone": (
            r"\b(\+?\d{1,3}[-.\s]?)?(\(?\d{3}\)?|\d{3})"
            r"[-.\s]?\d{3}[-.\s]?\d{4}\b"
        ),
        "ip": r"\b(?:\d{1,3}\.){3}\d{1,3}\b",
        "credit_card": r"\b(?:\d[ -]*?){13,16}\b",
        "address": r"\b\d{1,5}\s+\w+(?:\s\w+)?\s+(St|Street|Ave|Avenue|Rd|Road|Blvd|Boulevard|Dr|Drive|Ln|Lane)\b",
    }

    # Regex pass: dict order is the application order, so earlier patterns
    # get first claim on overlapping digit runs.
    redacted = text
    for key, pattern in patterns.items():
        redacted = re.sub(pattern, f"[{key.upper()}_REDACTED]", redacted)

    # nlp_en / nlp_de are module-level NER pipelines defined outside this
    # function (presumably loaded spaCy models - confirm in the package).
    nlp = nlp_en if lang == "en" else nlp_de if lang == "de" else None
    if nlp is None:
        return redacted

    # NER-based redaction
    doc = nlp(redacted)
    person_labels = {"PERSON", "PER"}
    # Collect unique, non-blank entity strings; a blank string passed to
    # str.replace would insert the placeholder between every character.
    names = {
        ent.text
        for ent in doc.ents
        if ent.label_ in person_labels and ent.text.strip()
    }
    # Replace longest names first so "Ann Lee" is handled before "Ann";
    # otherwise the shorter match would leave "Lee" behind. The secondary
    # sort key keeps the order deterministic for equal-length names.
    for name in sorted(names, key=lambda value: (-len(value), value)):
        redacted = redacted.replace(name, "<NAME>")

    return redacted

In [None]:
# Read the raw ticket CSV again under its own name for the PII experiments.
raw_multi_df = pd.read_csv(
    "../data/raw/aa_dataset-tickets-multi-lang-5-2-50-version.csv"
)
# Peek at one random record; no seed is set, so the row changes on every run.
raw_multi_df.sample(1)

In [None]:
# Keep only English and German tickets (the two languages redact_pii has an
# NER branch for); rows in any other language are dropped. The copy detaches
# the result from raw_multi_df so later column assignments do not touch it.
en_de_only_df = raw_multi_df.loc[raw_multi_df["language"].isin(["en", "de"])].copy()

In [None]:
# Merge the subject and body fields via the project helper.
# NOTE(review): later cells read a "Combined_Text_Col" column from this frame,
# so the helper presumably adds it - confirm in queuerious_detector.preprocessing.
df_combo_text = combine_text_columns(
    en_de_only_df,
    ["subject", "body"],
)
df_combo_text.sample(1)

In [None]:
# Detect PII per ticket: pass each row's combined text and language code to
# find_pii_patterns and store whatever it returns in a new column.
df_combo_text["PII_found"] = df_combo_text[["Combined_Text_Col", "language"]].apply(
    lambda ticket: find_pii_patterns(ticket["Combined_Text_Col"], ticket["language"]),
    axis=1,
)

In [None]:
# Redact PII per ticket: run redact_pii on each row's combined text, using the
# row's language code to choose the NER model, and keep the result alongside
# the original text for comparison.
df_combo_text["Redacted_Text"] = df_combo_text[["Combined_Text_Col", "language"]].apply(
    lambda ticket: redact_pii(ticket["Combined_Text_Col"], ticket["language"]),
    axis=1,
)

In [None]:
# Spot-check: re-run this cell to compare the original text, the detected PII
# and the redacted text for one random ticket at a time.
# Caution: the rendered output shows unredacted text - clear it before sharing.
df_combo_text.loc[:, ["Combined_Text_Col", "PII_found", "Redacted_Text"]].sample(1)

In [None]:
# Number of tickets per language code in the filtered frame.
df_combo_text.language.value_counts()

In [None]:
# Translate the combined text field to English. Disabled: I let this run for about 4 hours and it still had not completed.
#df_combo_text["translated_text"] = df_combo_text["Combined_Text_Col"].apply(translate_to_english)

### Lessons Learned:
1. Looking at the "Answer" column, I can see that the data was previously redacted; however, "body" and "subject" do not appear to have been redacted.

2. The redaction function will be added to the pipeline to help reduce PII in new (production) data.

3. I experimented with the spaCy small models but saw too many misclassifications; the large models seem to perform better.

4. One thing that increases misclassifications is a lack of context: for example, the "subject" column alone may be too short for NER to be effective. I therefore decided to combine the "subject" and "body" columns.

5. spaCy NER sometimes misclassifies a product name as a person entity.

6. One enhancement I would make is translation; the translation function currently takes hours to run because we have 12k German records.