In [37]:
import pandas as pd
import re
import csv
import os
import ast

In [40]:
# Canonical unit name -> set of accepted spellings / abbreviations.
#
# Notes for maintainers:
#   * The converters in this file lower-case the extracted unit before looking
#     it up, so aliases should be lower-case (the mixed-case 'mcL' below is
#     kept for backward compatibility but is only ever hit through 'mcl').
#   * The "micro" prefix exists as two look-alike code points: MICRO SIGN
#     (U+00B5) and GREEK SMALL LETTER MU (U+03BC). They do not compare equal,
#     so both are registered for every micro-unit.
unit_map = {
    # --- length ---
    'centimetre': {'cm', 'centimetre', 'centimeter', 'centimeters', 'centimtr', 'cmt', 'cmtr', 'cmetr'},
    'foot': {'ft', 'foot', 'feet', 'ft.', 'foots', 'feets', 'fet', 'fut', 'feetz'},
    'inch': {'in', 'inch', 'inches', 'inc', 'inchs', 'incs', 'ins', 'inche', '\"'},
    'metre': {'m', 'metre', 'meter', 'meters', 'mtr', 'mtrs', 'mter', 'meetr', 'metrs'},
    'millimetre': {'mm', 'millimetre', 'millimeter', 'millimeters', 'milimeter', 'milimetre', 'mlmtr'},
    'yard': {'yd', 'yard', 'yards', 'yrd', 'yad', 'yards.', 'yrds', 'yrds.'},

    # --- weight ---
    'gram': {'g', 'gram', 'grams', 'gramm', 'grm', 'gramms', 'gms', 'grms'},
    'kilogram': {'kg', 'kilogram', 'kilograms', 'kilo', 'kilos', 'kilogramme', 'kilogramms', 'kgram'},
    'microgram': {'µg', '\u00b5g', '\u03bcg', 'microgram', 'micrograms', 'mcg', 'micrgram', 'ugm', 'microgramm', 'microgrm'},
    'milligram': {'mg', 'milligram', 'milligrams', 'miligram', 'miligramm', 'mgs', 'mgr', 'mgrm'},
    'ounce': {'oz', 'ounce', 'ounces', 'onz', 'ounze', 'ounc', 'onzs', 'ounzes'},
    'pound': {'lb', 'lbs', 'pound', 'pounds', 'pd', 'pond', 'lb.', 'lbs.', 'lbses', 'ib', 'ibs'},
    'ton': {'ton', 'tons', 'tonne', 'tonns', 'tonn', 'tn', 'tons.', 'tonnes'},

    # --- voltage ---
    'kilovolt': {'kv', 'kilovolt', 'kilovolts', 'kvlt', 'kv.', 'kvolt', 'kilovts'},
    'millivolt': {'mv', 'millivolt', 'millivolts', 'mvolt', 'mvolts', 'milivolt', 'mvlt'},
    'volt': {'v', 'volt', 'volts', 'voltage', 'volts.', 'vlt', 'volte', 'voltes'},

    # --- power ---
    'kilowatt': {'kw', 'kilowatt', 'kilowatts', 'kwatt', 'kilowatts.', 'kwatts', 'kilowtt'},
    'watt': {'w', 'watt', 'watts', 'wat', 'wats', 'wtts', 'wattz'},

    # --- volume ---
    'centilitre': {'cl', 'centilitre', 'centiliters', 'centiliter', 'cntltr', 'centilitrs', 'cntlitres'},
    'cubic foot': {'ft³', 'cubic foot', 'cubic feet', 'foot3', 'feet3', 'ft^3', 'cuft'},
    'cubic inch': {'in³', 'cubic inch', 'cubic inches', 'inch3', 'inches3', 'in^3', 'cuin'},
    'cup': {'cup', 'cups', 'cp', 'cps', 'cupp', 'cupps', 'cps.'},
    'decilitre': {'dl', 'decilitre', 'deciliter', 'dcl', 'dcltrs', 'decileter', 'decillitre'},
    'fluid ounce': {'floz', 'fluid ounce', 'fluid ounces', 'fl.oz', 'fluidonz', 'flounce', 'flozs'},
    'gallon': {'gal', 'gallon', 'gallons', 'gln', 'galn', 'gals', 'galln'},
    'imperial gallon': {'imp gal', 'imperial gallon', 'imperial gallons', 'igallon', 'imperial gal', 'imp.gal'},
    'litre': {'l', 'litre', 'liters', 'liter', 'litres', 'ltr', 'lit', 'litrs'},
    'microlitre': {'μl', '\u00b5l', '\u03bcl', 'microlitre', 'microliter', 'mcl', 'mcL', 'microlitres', 'microltrs'},
    'millilitre': {'ml', 'millilitre', 'milliliters', 'milliliter', 'mililitre', 'mililiter', 'mltr'},
    'pint': {'pt', 'pint', 'pints', 'pts', 'pnt', 'pnts', 'pintrs'},
    'quart': {'qt', 'quart', 'quarts', 'qts', 'qurt', 'qrt', 'quarts.'},
}


def convert_entity_text(text):
    """Normalise a free-text measurement such as "2,5 cm" to "2.5 centimetre".

    The text must start (after optional whitespace) with a number, followed by
    a unit spelling listed in the module-level ``unit_map``.

    Number handling (applied to the leading numeric token only, so dots or
    commas inside the unit such as "lbs." cannot confuse it):
      * both ',' and '.' present -> whichever occurs last is the decimal
        separator, the other one is grouping ("1,234.56" and "1.234,56" both
        become "1234.56");
      * only ',' present -> several 3-digit groups, or a single ",000" group,
        are treated as thousands grouping ("1,000" -> "1000"); any other
        single comma is a decimal comma ("2,5" -> "2.5", "1,234" -> "1.234").

    Unit handling: the longest candidate is tried first (two words, then the
    first whitespace-delimited token, then the leading run of word
    characters), so multi-word and dotted aliases are reachable.

    Args:
        text: Any object; it is converted with ``str()`` first.

    Returns:
        "<number> <canonical unit>" on success, otherwise an empty string.
    """
    text = str(text).strip()

    numeric = re.match(r"\d[\d.,]*", text)
    if not numeric:
        return ""
    # A trailing separator ("5." / "5,") belongs to the sentence, not the number.
    raw_number = numeric.group(0).rstrip(".,")
    remainder = text[len(raw_number):]

    if "," in raw_number and "." in raw_number:
        if raw_number.rfind(",") > raw_number.rfind("."):
            # European style: dot groups, comma is the decimal mark.
            number = raw_number.replace(".", "").replace(",", ".")
        else:
            # Western style: comma groups, dot is the decimal mark.
            number = raw_number.replace(",", "")
    elif "," in raw_number:
        is_grouped = re.fullmatch(r"\d{1,3}(?:,\d{3})+", raw_number) and (
            raw_number.count(",") > 1 or raw_number.endswith(",000")
        )
        if is_grouped:
            number = raw_number.replace(",", "")
        elif raw_number.count(",") == 1:
            number = raw_number.replace(",", ".")
        else:
            # Several commas that do not form valid groups: not a number.
            return ""
    else:
        number = raw_number

    if not re.fullmatch(r"\d+(\.\d+)?", number):
        return ""

    lowered = remainder.strip().lower()
    words = lowered.split()
    candidates = []
    if len(words) >= 2:
        candidates.append(" ".join(words[:2]))
    if words:
        candidates.append(words[0])
    # Same candidates without sentence punctuation glued to the end.
    candidates += [candidate.rstrip(".,;:") for candidate in candidates]
    word_run = re.match(r"[\w\’]+", lowered)
    if word_run:
        candidates.append(word_run.group(0))

    for unit in candidates:
        for key, values in unit_map.items():
            if unit in values:
                return f"{number} {key}"

    return ""

def convert_entity_text_2(text):
    """Normalise a free-text measurement such as '5"' or "1,234.5 kg".

    The text must start (after optional whitespace) with a number, followed by
    a unit spelling listed in the module-level ``unit_map``. A bare double
    quote is understood as inches.

    Number handling (applied to the leading numeric token only, so dots or
    commas inside the unit such as "fl.oz" cannot confuse it):
      * both ',' and '.' present -> whichever occurs last is the decimal
        separator, the other one is grouping;
      * only ',' present -> comma-separated groups ending in a 3-digit group
        ("1,234", "1,00,000") are thousands grouping; any other single comma
        is a decimal comma ("2,5" -> "2.5", "2,50" -> "2.50").

    Unit handling: the longest candidate is tried first (two words, then the
    first whitespace-delimited token, then the leading run of word/quote
    characters), so multi-word and dotted aliases are reachable.

    Args:
        text: Any object; it is converted with ``str()`` first.

    Returns:
        "<number> <canonical unit>" on success, otherwise an empty string.
    """
    text = str(text).strip()

    numeric = re.match(r"\d[\d.,]*", text)
    if not numeric:
        return ""
    # A trailing separator ("5." / "5,") belongs to the sentence, not the number.
    raw_number = numeric.group(0).rstrip(".,")
    remainder = text[len(raw_number):]

    if "," in raw_number and "." in raw_number:
        if raw_number.rfind(",") > raw_number.rfind("."):
            # European style: dot groups, comma is the decimal mark.
            number = raw_number.replace(".", "").replace(",", ".")
        else:
            # Western style: comma groups, dot is the decimal mark.
            number = raw_number.replace(",", "")
    elif "," in raw_number:
        # The final group must have three digits; otherwise "2,50" would be
        # mistaken for grouping instead of a decimal comma.
        if re.fullmatch(r"\d{1,3}(?:,\d{2,3})*,\d{3}", raw_number):
            number = raw_number.replace(",", "")
        elif raw_number.count(",") == 1:
            number = raw_number.replace(",", ".")
        else:
            # Several commas that do not form valid groups: not a number.
            return ""
    else:
        number = raw_number

    if not re.fullmatch(r"\d+(\.\d+)?", number):
        return ""

    lowered = remainder.strip().lower()
    words = lowered.split()
    candidates = []
    if len(words) >= 2:
        candidates.append(" ".join(words[:2]))
    if words:
        candidates.append(words[0])
    # Same candidates without sentence punctuation glued to the end.
    candidates += [candidate.rstrip(".,;:") for candidate in candidates]
    word_run = re.match(r"[\w\"\'\’]+", lowered)
    if word_run:
        candidates.append(word_run.group(0))

    for unit in candidates:
        if unit == '"':
            unit = 'inch'
        for key, values in unit_map.items():
            if unit in values:
                return f"{number} {key}"

    return ""

def get_entity_value(req_entity_name, entities):
    """Pick the normalised value that best answers ``req_entity_name``.

    Selection order:
      1. the first entity whose label equals the requested name
         (case-insensitive) - its converted value is returned even if the
         conversion produced an empty string;
      2. otherwise, if the requested name is a size (height/width/depth) or a
         weight entity, the most recent entity of the same family whose value
         converted successfully;
      3. otherwise the converted value of the first entity.

    Args:
        req_entity_name: Name of the entity being asked for.
        entities: Iterable of ``(label, value)`` pairs.

    Returns:
        The output of ``convert_entity_text_2`` for the chosen value, or an
        empty string when ``entities`` is empty.
    """
    size_entities = {"height", "width", "depth"}
    # NOTE(review): the original set only held the misspelled
    # "maximum_weight_reccomendation". The correct spelling is accepted as
    # well; confirm which one the data actually uses.
    weight_entities = {
        "item_weight",
        "maximum_weight_recommendation",
        "maximum_weight_reccomendation",
    }

    if not entities:
        return ""

    wanted = str(req_entity_name).strip().lower()
    if wanted in size_entities:
        family = size_entities
    elif wanted in weight_entities:
        family = weight_entities
    else:
        family = set()

    family_match = ""
    for label, value in entities:
        label = str(label).strip().lower()

        if label == wanted:
            return convert_entity_text_2(value)

        if label in family:
            converted = convert_entity_text_2(value)
            # An unparsable later entity must not erase a usable earlier one.
            if converted:
                family_match = converted

    if family_match:
        return family_match

    return convert_entity_text_2(entities[0][1])

# Turn the merged chunk file into "output.csv": one (index, prediction) row
# per input row.
merged_chunks_file = "chunks_80p_accuracy/merged_chunks.csv"
with open(merged_chunks_file, newline='') as file, open("output.csv", "w", newline='') as outfile:
    csvreader = csv.reader(file)
    csvwriter = csv.writer(outfile)

    # The input header row is read and discarded; the output gets its own.
    header = next(csvreader)
    csvwriter.writerow(["index", "prediction"])

    for row in csvreader:
        # Columns are addressed by position (0, 3, 6) - presumably row index,
        # requested entity name and serialized entity list; confirm against
        # the producer of the merged file.
        idx, req_entity_name, entities = row[0], row[3], row[6]

        # An empty cell or a cell that is not a valid Python literal both
        # count as "no entities".
        entity_list = []
        if entities:
            try:
                entity_list = ast.literal_eval(entities)
            except (ValueError, SyntaxError):
                pass

        if entity_list:
            prediction_value = get_entity_value(req_entity_name, entity_list)
            csvwriter.writerow([idx, prediction_value])
        else:
            csvwriter.writerow([idx, ""])

In [11]:
# Export the columns at positions 0 and 2 of the NER comparison file.
# NOTE(review): this writes to "output.csv", the same path the CSV loop in
# this notebook writes to - whichever cell runs last wins.
ner_output = pd.read_csv("ner_final_testdata_comparison.csv")

kept_positions = [0, 2]
output_df = ner_output.iloc[:, kept_positions]

output_df.to_csv("output.csv", index=False)

In [19]:
# Write the first 300 training rows of each size entity (height / width /
# depth) to its own CSV under hwd_tuning/.
train_df = pd.read_csv("train.csv")
entity_column = train_df["entity_name"]

height_df = train_df.loc[entity_column == "height"].head(300)
width_df = train_df.loc[entity_column == "width"].head(300)
depth_df = train_df.loc[entity_column == "depth"].head(300)

for sample_df, target_path in (
    (height_df, "hwd_tuning/height.csv"),
    (width_df, "hwd_tuning/width.csv"),
    (depth_df, "hwd_tuning/depth.csv"),
):
    sample_df.to_csv(target_path, index=False)

In [4]:
# Write the first 50 voltage rows followed by the first 50 wattage rows of
# the training data to a single CSV.
train_df = pd.read_csv("train.csv")
entity_names = train_df["entity_name"]

voltage_df = train_df.loc[entity_names == "voltage"].head(50)
wattage_df = train_df.loc[entity_names == "wattage"].head(50)
voltage_wattage_df = pd.concat([voltage_df, wattage_df])

# NOTE(review): unlike the other exports in this notebook, this one does not
# pass index=False, so the original row index is written as the first column -
# confirm that is intended.
voltage_wattage_df.to_csv("hwd_tuning/pre_ocr/voltwatt.csv")