## **Feature:** Pattern Matching

**Names:** Tanat

### **What it does**
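Uses an LLM to classify the patterns found in each text (`object`) column of a DataFrame (numbers stored as strings, ranges, units, dates, categories, ...) and then to generate pandas cleaning code for the user's request, which is executed against the DataFrame.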

### **Helper Functions**
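- `unique_non_numeric(series)`: distinct non-null values of a series that are not numbers
- `llm_classify_patterns(df)`: asks the LLM to classify the patterns in every object column and suggest treatments
- `clean_numeric_plain(val)` / `handle_numeric(series)`: convert numbers stored as text (currency, percentages, EU/US formats) to floats
- `convert_range(val)` / `handle_range(series)`: replace a range such as `17-20` with its midpoint
- `handle_units(series)`: convert every value to the first unit found in the series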

In [149]:
# Get API Key
# load_dotenv() runs before the lookup below so that values from a local
# .env file (if one exists) are already in the process environment.
from dotenv import load_dotenv
load_dotenv()
import os
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
# A missing or empty key is only reported here; execution continues.
# NOTE(review): the key is not passed to ChatOpenAI explicitly in this file;
# presumably the client reads it from the environment - confirm.
if not OPENAI_API_KEY:
    print("OpenAI API Key not found")

# Import libraries
import pandas as pd
pd.set_option('future.no_silent_downcasting', False)
import numpy as np
import math
import re
import datetime
from pint import UnitRegistry
from pint.errors import UndefinedUnitError
ureg = UnitRegistry()

# Langchain imports
from langchain.chat_models import ChatOpenAI  
from langchain.schema import HumanMessage, SystemMessage

In [152]:
def unique_non_numeric(series: pd.Series):
    """Return the distinct non-null entries of ``series`` that are not numbers.

    An entry counts as numeric when ``pd.api.types.is_number`` accepts it, so
    digit strings such as ``"12"`` are reported as non-numeric.
    """
    numeric_mask = series.apply(pd.api.types.is_number)
    leftovers = series[~numeric_mask]
    return leftovers.dropna().unique()

In [153]:
def llm_classify_patterns(df):
    """
    Ask the LLM to classify the patterns in every object column of ``df``.

    For each object-dtype column a summary is built (name, first five unique
    values, number of unique values, dtype and the non-numeric unique values)
    and sent to the model together with the DataFrame shape.

    Returns the model's reply as a stripped string of suggestions.
    """
    system_prompt = """
    You are a data cleaning agent.

    In data cleaning Pattern matching allows you to find specific 
    sequences of characters, which can then be used to standardize formats, 
    remove punctuation and symbols, correct misspellings, and extract key 
    information, thereby preparing your data for accurate analysis.

    For each column analyse the patterns in the column and classify it ACCORDING
    TO THESE PRIORITIES, you may classify multiple patterns per column.
    - columns to drop: 
        ids (can't analyse), 
        constant columns (nuniques=1), 
        plain texts (long texts: comments, descriptions etc. hard to analyse)
    Numeric Columns:
        - NUMERIC WITH MISSING VALUES (e.g. "missing", "unknown", "not sure")
        - RANGES (columns with 2 numbers and some separator)
        small ranges (handle with midpoints), big ranges (one hot encoding))
        (17-20, 18->20 years, between 7 - 10 days)
        - Units
        (10kgs, 30 hours)
        - Numeric (currency, percentage, numeric stored as string)
        Currencies: 3.213,00 USD -> 3213.00
        Percentage: 10.3% -> 0.103
        Strings: twenty-two
    - Datetime (timestamp, date, time)
    - Multiselect (consider one hot encoding)
    - Categories 
        Low Cardinality: <10 unique values - one hot encoding
        Moderate Cardinality: 10-30 unique values - feature engineering/drop
        High Cardinality: 30+ unique values - feature engineering/drop
        - ordinal (label) encoding: natural ranking exists
    Also suggest treatment options to handle these patterns
    FLAG ANY Inconsistent formatting in same columns
    """

    # One summary dict per object column; key order matters because the list
    # is rendered verbatim into the prompt below.
    col_info = []
    for col in df.select_dtypes(include='object'):
        column = df[col]
        col_info.append({
            'col_name': col,
            'uniques': column.unique()[:5],
            'nuniques': column.nunique(),
            'type': column.dtype,
            'non-numerics': unique_non_numeric(column),
        })

    dataset = f"""
    Dataset info: Shape: {df.shape}, 
    Object columns: {col_info}
    """

    # System instructions first, then the dataset description.
    messages = [
        SystemMessage(content=system_prompt),
        HumanMessage(content=dataset),
    ]

    llm = ChatOpenAI(temperature=0, model_name="gpt-4o-mini")
    reply = llm.invoke(messages)
    return reply.content.strip()

In [None]:
# Patterns used by clean_numeric_plain, compiled once instead of per call.
_WHITESPACE_RE = re.compile(r'\s+')
_CURRENCY_SYMBOL_RE = re.compile(r'[$€£¥₹¢₽₦₴₪₩]')
_CURRENCY_CODE_RE = re.compile(
    r'(USD|EUR|AUD|GBP|INR|JPY|CAD|CHF|SEK|NOK|DKK|CNY|KRW|RUB|BRL|MXN)',
    flags=re.IGNORECASE,
)
_EU_GROUPED_RE = re.compile(r'^\d{1,3}(\.\d{3})+,\d+$')      # 1.234,56
_US_GROUPED_RE = re.compile(r'^\d{1,3}(,\d{3})+(\.\d+)?$')   # 1,234.56
_DECIMAL_COMMA_RE = re.compile(r'^\d+,\d+$')                 # 12,5


def clean_numeric_plain(val):
    """
    Convert a number stored as text into a float.

    Handles any spaces, symbols, currency codes, percentages, EU and US decimal
    formats etc. A leading ``+``/``-`` sign is set aside before the format is
    detected, so ``"-1.234,56"`` becomes ``-1234.56``.

    Args:
        val: Value to clean; it is converted with ``str()`` before parsing.

    Returns:
        The parsed float (percentages are divided by 100). Missing values
        (``None``/NaN) are returned unchanged. A value that cannot be parsed
        is reported with ``print`` and returned unchanged.
    """
    # Keep missing values missing instead of turning them into "None"/"nan".
    if pd.api.types.is_scalar(val) and pd.isna(val):
        return val

    s = str(val).strip()
    # Remove spaces, symbols, currency codes, %
    s = _WHITESPACE_RE.sub('', s)
    s = _CURRENCY_SYMBOL_RE.sub('', s)
    s = _CURRENCY_CODE_RE.sub('', s)
    is_percent = '%' in s
    if is_percent:
        s = s.replace('%', '')

    # The format patterns are anchored on a digit, so a leading sign would
    # make them all miss and the decimal comma would be dropped as "stray".
    sign = ''
    if s.startswith(('+', '-')):
        sign, s = s[0], s[1:]

    # Handle different number formats: 1.234,56 or 1,234.56
    if _EU_GROUPED_RE.match(s):
        s = s.replace('.', '').replace(',', '.')
    elif _US_GROUPED_RE.match(s):
        s = s.replace(',', '')
    # Handle any decimal commas
    elif _DECIMAL_COMMA_RE.match(s):
        s = s.replace(',', '.')
    # Drop any stray commas that are left, then restore the sign
    s = sign + s.replace(',', '')

    # Convert to float
    try:
        num = float(s)
    except ValueError:
        # Hand back the caller's value, not the half-cleaned string.
        print(f"numeric vals: Failed to clean '{val}'")
        return val
    return num / 100 if is_percent else num
    
def handle_numeric(series):
    """Run ``clean_numeric_plain`` over every element of ``series``."""
    cleaned = series.apply(clean_numeric_plain)
    return cleaned

In [None]:
def handle_range(series):
    """
    Run ``convert_range`` over every element of ``series``.
    """
    converted = series.apply(convert_range)
    return converted

def convert_range(val):
    """
    Return the midpoint of the first range found in ``val``.

    A range is two numeric blocks with no more than 4 characters between
    them, e.g. ``"17-20"``, ``"18->20 years"`` or ``"between 7 - 10 days"``.
    A numeric block is a run of digits that may contain ``.``/``,`` between
    digits, so a single number such as ``"12.34"`` or ``"1234"`` is one block
    and is never split into a fake range.

    Args:
        val: Value to inspect.

    Returns:
        The midpoint as a float when a range is found and both ends parse.
        Missing values and values that are already numbers are returned
        unchanged. Anything else is returned as its stripped string form
        (after a printed message if the two ends could not be parsed).
    """
    # Missing values and real numbers are not ranges; leave them untouched.
    if pd.api.types.is_scalar(val) and (pd.isna(val) or pd.api.types.is_number(val)):
        return val

    s = str(val).strip()

    # Tokenise into whole numeric blocks first, then look for the first pair
    # of neighbours whose separator is short enough. Working on whole blocks
    # is what prevents "1234" from being read as the range 123..4.
    blocks = list(re.finditer(r'\d+(?:[.,]\d+)*', s))
    pair = None
    for left, right in zip(blocks, blocks[1:]):
        if right.start() - left.end() <= 4:
            pair = (left.group(0), right.group(0))
            break
    if pair is None:
        return s

    n1 = clean_numeric_plain(pair[0])
    n2 = clean_numeric_plain(pair[1])
    # clean_numeric_plain hands back a non-float when it could not parse.
    if not (isinstance(n1, float) and isinstance(n2, float)):
        print(f"range vals: Failed to clean '{s}'")
        return s
    return (n1 + n2) / 2

In [147]:
def handle_units(series):
    """
    Normalize all units to the first unit found in the series.

    Each value is searched for a number followed by a run of letters. The
    first such run that the pint registry accepts as a unit becomes the
    target unit, and every value is converted to it.

    Args:
        series: pandas Series of values such as ``"10 kg"`` or ``"500g"``.

    Returns:
        The input series itself when no convertible unit is found (a message
        is printed). Otherwise a new series of converted magnitudes named
        ``"<name>_<unit>"`` (``"values_<unit>"`` when the series has no
        name). Missing values, values without a number+unit, and values whose
        conversion fails are left as they were.
    """
    # Optional sign, number, optional spaces, unit letters. The sign is only
    # accepted when it does not directly follow a digit, so the dash in
    # "10-20 kg" is not read as a minus.
    value_unit_re = re.compile(r'((?:(?<!\d)[-+])?\d+\.?\d*)\s*([a-zA-Z]+)')

    # Find first valid unit
    registry = None
    target_unit = None
    for text in series.dropna():
        match = value_unit_re.search(str(text))
        if not match:
            continue
        if registry is None:
            # Only build a registry once there is a candidate to check.
            registry = UnitRegistry()
        try:
            registry.Unit(match.group(2))  # Test if unit exists
        except Exception:
            # Not a unit pint knows; try the next value.
            continue
        target_unit = match.group(2)
        break

    if not target_unit:
        print(f"No convertible unit found in '{series.name}'")
        return series  # No units found, return as-is

    def convert_unit(text):
        """Convert one value to ``target_unit``; return it unchanged on failure."""
        if pd.isna(text):
            return text

        match = value_unit_re.search(str(text))
        if not match:
            return text

        value, unit = match.groups()
        try:
            quantity = registry.Quantity(float(value), unit)
            converted = quantity.to(target_unit)
        except Exception:
            # Unknown unit or incompatible dimension: keep the original text.
            print(f"unit vals: Failed to clean '{text}'")
            return text
        return converted.magnitude

    result = series.apply(convert_unit)
    # Compare against None/"" explicitly so that a valid name such as 0
    # (an integer column label) is kept.
    has_name = series.name is not None and series.name != ""
    result.name = f"{series.name}_{target_unit}" if has_name else f"values_{target_unit}"
    return result

In [None]:
def _strip_code_fences(code):
    """Return ``code`` without a surrounding Markdown code fence, if it has one."""
    fenced = re.match(r'^```[\w+-]*[ \t]*\r?\n(.*?)\r?\n?```$', code.strip(), flags=re.DOTALL)
    return fenced.group(1) if fenced else code


def pattern_matching(df, user_query):
    """
    Main function that gets called by the main router.
    MUST take (df, user_query) and return df

    The LLM first classifies the column patterns (``llm_classify_patterns``),
    then generates Python code for ``user_query``; that code is printed and
    executed with ``df`` bound to the given DataFrame.

    Args:
        df: DataFrame to clean. The generated code receives this same object,
            so in-place edits it makes are visible on the caller's DataFrame.
        user_query: The user's request, forwarded to the model.

    Returns:
        The DataFrame bound to ``df`` after the generated code has run (this
        also picks up code that rebinds ``df``). If execution fails, the error
        and the code are printed and a copy of ``df`` taken before execution
        is returned.
    """

    # TODO: Create helper docs (Reimplement with functions)
    helper_docs = """
    The following functions returns a cleaned series make sure to assign it df[col] = ...
    - handle_numeric(series): Handles any spaces, symbols, currency codes, percentages, EU and US decimal formats ONLY
    - handle_range(series): Matches first 2 numeric blocks cleans it and returns its midpoint 
    - handle_units(series): Picks first unit found and converts series to that unit
    """

    # Create message chain
    classification = llm_classify_patterns(df)
    messages = [
        SystemMessage(content=classification),
        SystemMessage(content=f"""
    You are a data cleaning agent trying to clean any patterns.
    
    Dataset info: Shape: {df.shape}, Sample: {df.head(3).to_string()}
    
    Helper functions available (USE PANDAS WHERE POSSIBLE):
    {helper_docs}

    Libraries available:
    - pd (pandas), np (numpy)
    - math, re, datetime

    Rules:
    - Return only executable Python code, no explanations, no markdown blocks
    - Use helper functions if needed
    - ASSUME DF IS ALREADY DEFINED
    - In order to generate a response/message to the user use print statements
    print("message")
    - Write a detailed print message to summarise actions taken and reasons
    - USE .REPLACE TO FIX SMALL INCONSISTENT FORMATTINGS
    """),
        HumanMessage(content=f"User request: {user_query}"),
    ]

    # Call LLM with message chain
    llm = ChatOpenAI(temperature=0, model_name="gpt-4o-mini")
    response = llm.invoke(messages)
    # Models sometimes wrap the answer in a Markdown fence despite the rules
    # above; exec would fail on that with a SyntaxError.
    generated_code = _strip_code_fences(response.content.strip())

    # Snapshot before anything runs so the fallback always exists.
    original_df = df.copy()

    # Run in one explicit namespace (a copy of the module globals plus `df`):
    # - a rebinding such as `df = df.dropna()` can be read back afterwards;
    #   assignments made by exec() inside a function never reach its locals;
    # - lambdas/comprehensions in the generated code resolve `df` to this
    #   DataFrame rather than to any module-level `df`.
    namespace = {**globals(), "df": df}

    # SECURITY NOTE(review): this executes model-generated code with full
    # interpreter privileges. Only use with trusted prompts/data, or sandbox.
    try:
        print(generated_code)
        exec(generated_code, namespace)
        result = namespace.get("df")
        if not isinstance(result, pd.DataFrame):
            raise TypeError("generated code did not leave a DataFrame in 'df'")
        return result
    except Exception as e:
        print(f"Error: {e}")
        print(f"Generated Code:{generated_code}")
        return original_df

In [182]:
# Load the first 5000 rows of the smoke sample data; test_df is a separate
# copy of that frame.
df = pd.read_csv("../sample_data/smoke.csv").iloc[:5000]
test_df = df.copy()