# Scenario B - RFQ Similarity

Imports & Data Loading

In [1]:
import pandas as pd
import numpy as np
import re
from itertools import product

# Load data (adjust paths if needed)
rfq = pd.read_csv("rfq.csv")
ref = pd.read_csv("reference_properties.tsv", sep="\t")
print("RFQ:", rfq.shape, "REF:", ref.shape)
rfq.head()

RFQ: (1000, 25) REF: (175, 34)


Unnamed: 0,id,grade,grade_suffix,coating,finish,surface_type,surface_protection,form,thickness_min,thickness_max,...,weight_min,weight_max,inner_diameter_min,inner_diameter_max,outer_diameter_min,outer_diameter_max,yield_strength_min,yield_strength_max,tensile_strength_min,tensile_strength_max
0,8aff426d-b8c0-43aa-ad26-835ef4de6129,S700MC,,,Oiled,,,Coils,6.0,6.0,...,15000.0,25000.0,610.0,610.0,,,,,760.0,810.0
1,37e624be-b125-464f-85b6-1838530193ef,S250GD,,ZM310,Hot-dip zinc magnesium (+ZM),,,Slit Coils,1.5,1.5,...,,,,,,,,,,
2,b8257184-6307-46ab-b06e-d979336d1263,DX51D,,Z100,Hot-dip Galvanized (+Z/+GI),,Lightly Oiled (L),Coils,0.4,0.4,...,,,,,,,,,,
3,63140d1f-dda8-40fe-8931-bcaba65d5772,S235,,,,,,Round Tubes,1.5,1.5,...,53800.0,53800.0,,,60.3,,,,,
4,11cffc57-44be-4d79-bfd5-97482be566d3,S235,,,,,,Round Tubes,1.5,1.5,...,14500.0,14500.0,,,48.3,,,,,


Helpers

In [2]:

def norm_text(x):
    if pd.isna(x):
        return None
    s = str(x).strip()
    s = re.sub(r"\s+", " ", s)
    return s

def norm_grade(x):
    if pd.isna(x):
        return None
    s = norm_text(x)
    s = re.sub(r"\s*\+\s*", "+", s)
    s = re.sub(r"\s*-\s*", "-", s)
    return s.upper()

def parse_range(val):
    if pd.isna(val):
        return (np.nan, np.nan, np.nan)
    s = str(val).strip().replace(",", ".")
    # <= or >= forms
    if re.match(r"^[<≥>]=?\s*\d+(\.\d+)?$", s):
        num = float(re.findall(r"\d+(?:\.\d+)?", s)[0])
        if s.startswith(('<=','≤')):
            return (np.nan, num, num)
        if s.startswith(('>=','≥')):
            return (num, np.nan, num)
    if re.match(r"^\d+(\.\d+)?$", s):
        v = float(s); return (v, v, v)
    m = re.match(r"^(\d+(?:\.\d+)?)\s*[-–—]\s*(\d+(?:\.\d+)?)$", s)
    if m:
        a, b = float(m.group(1)), float(m.group(2))
        a, b = (min(a,b), max(a,b))
        return (a, b, (a+b)/2.0)
    m2 = re.search(r"(\d+(?:\.\d+)?)", s)
    if m2:
        v = float(m2.group(1))
        return (v, v, v)
    return (np.nan, np.nan, np.nan)

def interval_iou(a_min, a_max, b_min, b_max):
    if np.isnan([a_min, a_max, b_min, b_max]).any():
        return 0.0
    if a_min > a_max: a_min, a_max = a_max, a_min
    if b_min > b_max: b_min, b_max = b_max, b_min
    inter = max(0.0, min(a_max, b_max) - max(a_min, b_min))
    union = max(a_max, b_max) - min(a_min, b_min)
    if union <= 0: return 0.0
    return inter / union

def exact_match(a, b):
    if pd.isna(a) or pd.isna(b):
        return 0.0
    return 1.0 if str(a).strip().lower() == str(b).strip().lower() else 0.0

Join RFQs with reference

In [3]:
rfq["grade_norm"] = rfq["grade"].map(norm_grade)

# Try to find grade column in reference
cand_cols = [c for c in ref.columns if c.lower() in ["grade", "material", "grade/material"]]
ref_grade_col = cand_cols[0] if len(cand_cols) else ref.columns[0]
ref["grade_norm"] = ref[ref_grade_col].map(norm_grade)

enriched = rfq.merge(ref, how="left", on="grade_norm", suffixes=("", "_ref"))
print("Enriched shape:", enriched.shape)
enriched.head()

Enriched shape: (1005, 60)


Unnamed: 0,id,grade,grade_suffix,coating,finish,surface_type,surface_protection,form,thickness_min,thickness_max,...,Reduction of area (Z%),"Hardness (HB, HV, HRC)",Impact toughness (Charpy V-notch),Fatigue limit,Creep resistance,Source_Pages,Application,Category,Nb + V + Ti (Others),Coating
0,8aff426d-b8c0-43aa-ad26-835ef4de6129,S700MC,,,Oiled,,,Coils,6.0,6.0,...,,,,,,Standard Specifications,"Cold forming, automotive, high strength applic...",Microalloyed Steel,≤0.22,
1,37e624be-b125-464f-85b6-1838530193ef,S250GD,,ZM310,Hot-dip zinc magnesium (+ZM),,,Slit Coils,1.5,1.5,...,,,,,,Standard Specifications,Structural galvanized steel,Galvanized Steel,,Hot-dip galvanized
2,b8257184-6307-46ab-b06e-d979336d1263,DX51D,,Z100,Hot-dip Galvanized (+Z/+GI),,Lightly Oiled (L),Coils,0.4,0.4,...,,,,,,Standard Specifications,Galvanized steel for forming,Galvanized Steel,,Hot-dip galvanized
3,63140d1f-dda8-40fe-8931-bcaba65d5772,S235,,,,,,Round Tubes,1.5,1.5,...,,,,,,Standard Specifications,General structural steel,Structural Steel,,
4,11cffc57-44be-4d79-bfd5-97482be566d3,S235,,,,,,Round Tubes,1.5,1.5,...,,,,,,Standard Specifications,General structural steel,Structural Steel,,


Feature Engineering

In [4]:

# Intervals
interval_pairs = [
    ("thickness_min","thickness_max"),
    ("width_min","width_max"),
    ("length_min","length_max"),
    ("height_min","height_max"),
    ("weight_min","weight_max"),
    ("inner_diameter_min","inner_diameter_max"),
    ("outer_diameter_min","outer_diameter_max"),
    ("yield_strength_min","yield_strength_max"),
    ("tensile_strength_min","tensile_strength_max"),
]

rfq_fe = rfq.copy()
for mn, mx in interval_pairs:
    if mn in rfq_fe.columns and mx in rfq_fe.columns:
        a = rfq_fe[mn].fillna(rfq_fe[mx])
        b = rfq_fe[mx].fillna(rfq_fe[mn])
        rfq_fe[mn] = a
        rfq_fe[mx] = b

for col in ["coating","finish","form","surface_type"]:
    if col not in rfq_fe.columns:
        rfq_fe[col] = np.nan

rfq_fe.head()

Unnamed: 0,id,grade,grade_suffix,coating,finish,surface_type,surface_protection,form,thickness_min,thickness_max,...,weight_max,inner_diameter_min,inner_diameter_max,outer_diameter_min,outer_diameter_max,yield_strength_min,yield_strength_max,tensile_strength_min,tensile_strength_max,grade_norm
0,8aff426d-b8c0-43aa-ad26-835ef4de6129,S700MC,,,Oiled,,,Coils,6.0,6.0,...,25000.0,610.0,610.0,,,,,760.0,810.0,S700MC
1,37e624be-b125-464f-85b6-1838530193ef,S250GD,,ZM310,Hot-dip zinc magnesium (+ZM),,,Slit Coils,1.5,1.5,...,,,,,,,,,,S250GD
2,b8257184-6307-46ab-b06e-d979336d1263,DX51D,,Z100,Hot-dip Galvanized (+Z/+GI),,Lightly Oiled (L),Coils,0.4,0.4,...,,,,,,,,,,DX51D
3,63140d1f-dda8-40fe-8931-bcaba65d5772,S235,,,,,,Round Tubes,1.5,1.5,...,53800.0,,,60.3,60.3,,,,,S235
4,11cffc57-44be-4d79-bfd5-97482be566d3,S235,,,,,,Round Tubes,1.5,1.5,...,14500.0,,,48.3,48.3,,,,,S235


Similarity & Top3

In [5]:

def pair_similarity(a, b):
    # Dimensions IoU (weighted)
    dim_weights = {
        "thickness": 1.0,
        "width": 1.0,
        "yield_strength": 1.0,
        "tensile_strength": 1.0,
        "length": 0.5,
        "height": 0.5,
        "weight": 0.5,
        "inner_diameter": 0.5,
        "outer_diameter": 0.5,
    }
    dim_score = 0.0; dim_weight_total = 0.0
    for d,(mn,mx) in {
        "thickness": ("thickness_min","thickness_max"),
        "width": ("width_min","width_max"),
        "yield_strength": ("yield_strength_min","yield_strength_max"),
        "tensile_strength": ("tensile_strength_min","tensile_strength_max"),
        "length": ("length_min","length_max"),
        "height": ("height_min","height_max"),
        "weight": ("weight_min","weight_max"),
        "inner_diameter": ("inner_diameter_min","inner_diameter_max"),
        "outer_diameter": ("outer_diameter_min","outer_diameter_max"),
    }.items():
        w = dim_weights.get(d, 0.0)
        if w == 0:
            continue
        iou = interval_iou(a.get(mn, np.nan), a.get(mx, np.nan),
                           b.get(mn, np.nan), b.get(mx, np.nan))
        dim_score += w * iou
        dim_weight_total += w
    dim_score = dim_score / dim_weight_total if dim_weight_total > 0 else 0.0

    # Categorical
    cats = ["coating","finish","form","surface_type"]
    cat_score = np.mean([exact_match(a.get(c, np.nan), b.get(c, np.nan)) for c in cats])

    # Grade similarity (here already partly via yield/tensile intervals)
    grade_score = (
        interval_iou(a.get("yield_strength_min", np.nan), a.get("yield_strength_max", np.nan),
                     b.get("yield_strength_min", np.nan), b.get("yield_strength_max", np.nan)) +
        interval_iou(a.get("tensile_strength_min", np.nan), a.get("tensile_strength_max", np.nan),
                     b.get("tensile_strength_min", np.nan), b.get("tensile_strength_max", np.nan))
    ) / 2.0

    # Aggregate
    return 0.4*dim_score + 0.3*cat_score + 0.3*grade_score

rows = []
rfq_small = rfq_fe.reset_index(drop=True)

for i in range(len(rfq_small)):
    a = rfq_small.iloc[i].to_dict()
    sims = []
    for j in range(len(rfq_small)):
        if i == j: continue
        b = rfq_small.iloc[j].to_dict()
        sims.append((rfq_small.iloc[j]["id"], pair_similarity(a,b)))
    sims.sort(key=lambda x: x[1], reverse=True)
    for mid, score in sims[:3]:
        rows.append({"rfq_id": a["id"], "match_id": mid, "similarity_score": float(score)})

top3 = pd.DataFrame(rows)
top3.to_csv("top3.csv", index=False)
top3.head()

Unnamed: 0,rfq_id,match_id,similarity_score
0,8aff426d-b8c0-43aa-ad26-835ef4de6129,7d1ab305-7fc6-4ab0-bc2a-9ae1e038e67e,0.371077
1,8aff426d-b8c0-43aa-ad26-835ef4de6129,2624655e-ea07-468a-8da8-2e39c9d1e7f0,0.243414
2,8aff426d-b8c0-43aa-ad26-835ef4de6129,a462a4cb-bbaa-4417-b876-4b8606c6f8db,0.226923
3,37e624be-b125-464f-85b6-1838530193ef,4da333cf-6f2a-4b64-9212-2be665c5e1e3,0.15
4,37e624be-b125-464f-85b6-1838530193ef,a3f38767-02ae-4990-be18-35ca819684a7,0.15
