In [None]:
import pandas as pd
import itertools
import random

In [30]:
random.seed(42)  # make it reproducible, remove if you want true randomness

prefixes = [
    "AAF","Aegon","Aequim","AllianceBernstein","Amiral","Analog","Arrowpoint","Asia",
    "Aspen","Aspire","Assetmark","Athora","Banca","Bigelow","Birch","Black","Bontempo",
    # "Booster","Calton","Cutler","Daintree","Dana","Davidson","DWS","Eagle","Earnest",
    # "East","EBA","Elmwood","Giant","Gitterman","GKFO","Gladstone","IAMS","ICICI","IFG",
    # "IHT","Illinois","Janus","Jefferson","Jennison","LBMC","Lombard","Longview",
    # "MerQube","Merriman","Mesirow","Newman","Obrinvest","OCBC","Paulson","PAX","Petra",
    "Pin","Pine","Prudent","PUTNAM","Qtrade","Rockwood","Safeguard","Sittner","Tavira",
    "TCFG","TCI","Tradewinds","University","Usmart"
]

suffixes = [
    "Collective Investment",    "Collective Trust",    "Collective Services",    "Wealth Management",    "Wealth Advisors",
    "Wealth Partners",    "Wealth Group",    "Capital Management",    "Capital Group",    "Capital Partners",
    "Capital Advisors",    "Capital",    "Financial",    "Financial Advisors",    "Financial Partners",
    "Financial Trust",    "Investment Management",    "Investment Company",    "Investment Partners",
    "Advisory",    "Advisory Group",    "Advisory Services",    "Asset Management",
    "Asset Management Group",    "Asset Management Partners"
]

def build_name(prefix, suffix):
    return f"{prefix} {suffix}".strip()

rows = []

# 1. HARD NEGATIVES (different prefix, SAME suffix)
# score ~ [0.02, 0.08]
for sfx in suffixes:
    # all variants with this suffix
    names_for_suffix = [build_name(p, sfx) for p in prefixes]

    # choose all unique pairs of DIFFERENT prefixes but SAME suffix
    for i in range(len(prefixes)):
        for j in range(i + 1, len(prefixes)):
            a = build_name(prefixes[i], sfx)
            b = build_name(prefixes[j], sfx)
            sim_score = random.uniform(0.02, 0.08)
            rows.append({
                "sentence1": a,
                "sentence2": b,
                "similarity": round(sim_score, 3)
            })

random.shuffle(rows)
rows = rows[:400]

# 2. MEDIUM NEGATIVES (different prefix, DIFFERENT suffix)
# score ~ [0.18, 0.32]
# this can explode combinatorially, so we'll just sample some combos
all_generated = [(p, s, build_name(p, s)) for p in prefixes for s in suffixes]
all_pairs = list(itertools.combinations(all_generated, 2))
random.shuffle(all_pairs)

for (p1, s1, name1), (p2, s2, name2) in all_pairs[:200]:
    if p1 != p2 and s1 != s2:
        sim_score = random.uniform(0.12, 0.22)
        rows.append({
            "sentence1": name1,
            "sentence2": name2,
            "similarity": round(sim_score, 3)
        })


rows2 = []
# 3. RELATED BUT NOT IDENTICAL (same prefix, DIFFERENT suffix)
# keep this stable ~0.6 so the model learns this tier as a "close, but not same"
for p in prefixes:
    for sfx_a, sfx_b in itertools.combinations(suffixes, 2):
        a = build_name(p, sfx_a)
        b = build_name(p, sfx_b)
        rows2.append({
            "sentence1": a,
            "sentence2": b,
            "similarity": 0.6
        })
        
random.shuffle(rows2)
rows2 = rows2[:200]
rows += rows2

df = pd.DataFrame(rows, columns=["sentence1", "sentence2", "similarity"])

In [27]:
df

Unnamed: 0,sentence1,sentence2,similarity
0,Aspire Financial,PUTNAM Financial,0.059
1,Assetmark Financial Partners,Tradewinds Financial Partners,0.079
2,Bigelow Asset Management,TCI Asset Management,0.063
3,Pin Wealth Partners,Safeguard Wealth Partners,0.051
4,Qtrade Wealth Partners,Usmart Wealth Partners,0.073
...,...,...,...
782,University Financial Trust,University Asset Management Group,0.600
783,Safeguard Capital Management,Safeguard Advisory Group,0.600
784,Tavira Financial Trust,Tavira Advisory,0.600
785,Analog Financial Trust,Analog Investment Company,0.600


In [32]:
df.to_csv("training-data.csv", index=False, encoding="utf-8-sig")

In [28]:
rows2

[{'sentence1': 'Aspire Wealth Group',
  'sentence2': 'Aspire Investment Partners',
  'similarity': 0.6},
 {'sentence1': 'University Capital Management',
  'sentence2': 'University Financial Partners',
  'similarity': 0.6},
 {'sentence1': 'Pin Wealth Partners',
  'sentence2': 'Pin Advisory',
  'similarity': 0.6},
 {'sentence1': 'Rockwood Capital',
  'sentence2': 'Rockwood Financial Advisors',
  'similarity': 0.6},
 {'sentence1': 'Pine Capital Partners',
  'sentence2': 'Pine Capital Advisors',
  'similarity': 0.6},
 {'sentence1': 'Birch Investment Partners',
  'sentence2': 'Birch Advisory Services',
  'similarity': 0.6},
 {'sentence1': 'Tavira Collective Trust',
  'sentence2': 'Tavira Capital Partners',
  'similarity': 0.6},
 {'sentence1': 'Analog Collective Services',
  'sentence2': 'Analog Capital Group',
  'similarity': 0.6},
 {'sentence1': 'Pin Collective Investment',
  'sentence2': 'Pin Collective Services',
  'similarity': 0.6},
 {'sentence1': 'Tradewinds Capital Management',
  'se