# Unit14｜配方/材料的非監督探索：關聯規則 + 相似度搜尋

對應講義：`Part_3/Unit14_Formulation_Association_Similarity.md`

本 Notebook 用玩具版配方資料示範：
- 頻繁項集（support）
- 規則（confidence/lift）
- Jaccard 相似度搜尋
- 輸出候選清單（可交付）


In [25]:
# ===== Shared environment (repo-root + outputs/) =====
import os, sys
from pathlib import Path

IN_COLAB = 'google.colab' in sys.modules
if IN_COLAB:
    from google.colab import drive
    drive.mount('/content/drive', force_remount=False)

def find_repo_root(start=None):
    start = Path(start or os.getcwd()).resolve()
    for p in [start] + list(start.parents):
        if (p / 'Jupyter_Scripts').is_dir():
            return p
    if IN_COLAB:
        candidate = Path('/content/drive/MyDrive/ChemEng_AI_Course_Materials')
        if (candidate / 'Jupyter_Scripts').is_dir():
            return candidate
    return start

REPO_ROOT = find_repo_root()
OUTPUT_DIR = Path(__file__).parent if '__file__' in globals() else Path.cwd()
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
os.chdir(OUTPUT_DIR)
os.makedirs('Unit14_Results', exist_ok=True)
print('REPO_ROOT:', REPO_ROOT)
print('Working dir:', os.getcwd())


REPO_ROOT: G:\我的雲端硬碟\FCU\課程\ChemEng_AI_Course_Materials
Working dir: g:\我的雲端硬碟\FCU\課程\ChemEng_AI_Course_Materials\Part_3


In [26]:
# 匯入套件
import itertools
from collections import Counter

import numpy as np
import pandas as pd

print('Imports OK')


Imports OK


## 1. 建立玩具版配方交易資料

In [27]:
transactions = [
    {'Solvent:Toluene', 'CoSolvent:EtOAc', 'Additive:InhibitorA', 'Surfactant:S1'},
    {'Solvent:Toluene', 'CoSolvent:MEK',   'Additive:InhibitorA', 'Surfactant:S1'},
    {'Solvent:Toluene', 'CoSolvent:THF',   'Additive:InhibitorB', 'Surfactant:S1'},
    {'Solvent:Hexane',  'CoSolvent:EtOAc', 'Additive:InhibitorA', 'Surfactant:S2'},
    {'Solvent:Heptane', 'CoSolvent:EtOAc', 'Additive:InhibitorA', 'Surfactant:S2'},
    {'Solvent:Heptane', 'CoSolvent:IPA',   'Additive:InhibitorC', 'Surfactant:S2'},
    {'Solvent:EtOH',    'CoSolvent:Water', 'Additive:InhibitorC', 'Surfactant:S3'},
    {'Solvent:EtOH',    'CoSolvent:Water', 'Additive:InhibitorC', 'Surfactant:S3', 'Salt:NaCl'},
    {'Solvent:ACN',     'CoSolvent:Water', 'Additive:InhibitorB', 'Surfactant:S3'},
    {'Solvent:EtOAc',   'CoSolvent:IPA',   'Additive:InhibitorA', 'Surfactant:S1'},
    {'Solvent:MEK',     'CoSolvent:IPA',   'Additive:InhibitorA', 'Surfactant:S1'},
    {'Solvent:THF',     'CoSolvent:IPA',   'Additive:InhibitorB', 'Surfactant:S1'},
    {'Solvent:Hexane',  'CoSolvent:MEK',   'Additive:InhibitorA', 'Surfactant:S2'},
    {'Solvent:Heptane', 'CoSolvent:MEK',   'Additive:InhibitorA', 'Surfactant:S2'},
    {'Solvent:DCM',     'CoSolvent:EtOAc', 'Additive:InhibitorB', 'Surfactant:S1'},
    {'Solvent:DCM',     'CoSolvent:THF',   'Additive:InhibitorB', 'Surfactant:S1'},
    {'Solvent:Toluene', 'CoSolvent:EtOAc', 'Additive:InhibitorB', 'Surfactant:S1'},
    {'Solvent:Toluene', 'CoSolvent:EtOAc', 'Additive:InhibitorA', 'Surfactant:S1', 'Catalyst:C1'},
    {'Solvent:EtOH',    'CoSolvent:Water', 'Additive:InhibitorC', 'Surfactant:S3', 'Catalyst:C2'},
    {'Solvent:ACN',     'CoSolvent:Water', 'Additive:InhibitorB', 'Surfactant:S3', 'Catalyst:C2'},
]

df_tx = pd.DataFrame({
    'formulation_id': [f'F{i:02d}' for i in range(len(transactions))],
    'items': [sorted(list(t)) for t in transactions],
})

display(df_tx.head())
print('n_transactions:', len(df_tx))


Unnamed: 0,formulation_id,items
0,F00,"[Additive:InhibitorA, CoSolvent:EtOAc, Solvent..."
1,F01,"[Additive:InhibitorA, CoSolvent:MEK, Solvent:T..."
2,F02,"[Additive:InhibitorB, CoSolvent:THF, Solvent:T..."
3,F03,"[Additive:InhibitorA, CoSolvent:EtOAc, Solvent..."
4,F04,"[Additive:InhibitorA, CoSolvent:EtOAc, Solvent..."


n_transactions: 20


## 2. 頻繁項集與關聯規則（純 Python，避免外部套件）

In [28]:
def all_itemsets(items, max_size=3):
    items = list(items)
    for r in range(1, max_size+1):
        for comb in itertools.combinations(items, r):
            yield tuple(sorted(comb))

N = len(transactions)
count = Counter()
for t in transactions:
    for it in all_itemsets(t, max_size=3):
        count[it] += 1

rows = []
for it, c in count.items():
    rows.append({'itemset': it, 'size': len(it), 'count': c, 'support': c / N})

df_sup = pd.DataFrame(rows).sort_values(['size','support'], ascending=[True, False])
display(df_sup.head(12))


Unnamed: 0,itemset,size,count,support
3,"(Surfactant:S1,)",1,10,0.5
0,"(Additive:InhibitorA,)",1,9,0.45
21,"(Additive:InhibitorB,)",1,7,0.35
1,"(CoSolvent:EtOAc,)",1,6,0.3
2,"(Solvent:Toluene,)",1,5,0.25
33,"(Surfactant:S2,)",1,5,0.25
61,"(Surfactant:S3,)",1,5,0.25
62,"(CoSolvent:Water,)",1,5,0.25
50,"(Additive:InhibitorC,)",1,4,0.2
51,"(CoSolvent:IPA,)",1,4,0.2


In [29]:
min_support = 0.15
min_conf = 0.6

rules = []
for it, c_ab in count.items():
    if len(it) < 2:
        continue
    support_ab = c_ab / N
    if support_ab < min_support:
        continue

    items = list(it)
    for r in range(1, len(items)):
        for A in itertools.combinations(items, r):
            A = tuple(sorted(A))
            B = tuple(sorted(set(items) - set(A)))

            c_a = count.get(A, 0)
            c_b = count.get(B, 0)
            if c_a == 0 or c_b == 0:
                continue

            conf = c_ab / c_a
            lift = conf / (c_b / N)
            if conf >= min_conf:
                rules.append({
                    'A': A,
                    'B': B,
                    'support': support_ab,
                    'confidence': conf,
                    'lift': lift,
                    'count': c_ab,
                })

df_rules = pd.DataFrame(rules).sort_values(['lift','confidence','support'], ascending=False)
display(df_rules.head(15))

df_rules.to_csv('./Unit14_Results/rules_top.csv', index=False)
print('Saved: Unit14_Results/rules_top.csv')


Unnamed: 0,A,B,support,confidence,lift,count
37,"(Solvent:EtOH,)","(Additive:InhibitorC, Surfactant:S3)",0.15,1.0,6.666667,3
40,"(Additive:InhibitorC, Surfactant:S3)","(Solvent:EtOH,)",0.15,1.0,6.666667,3
44,"(Solvent:EtOH,)","(Additive:InhibitorC, CoSolvent:Water)",0.15,1.0,6.666667,3
45,"(Additive:InhibitorC, CoSolvent:Water)","(Solvent:EtOH,)",0.15,1.0,6.666667,3
23,"(Solvent:EtOH,)","(Additive:InhibitorC,)",0.15,1.0,5.0,3
41,"(Solvent:EtOH, Surfactant:S3)","(Additive:InhibitorC,)",0.15,1.0,5.0,3
47,"(CoSolvent:Water, Solvent:EtOH)","(Additive:InhibitorC,)",0.15,1.0,5.0,3
22,"(Additive:InhibitorC,)","(Solvent:EtOH,)",0.15,0.75,5.0,3
36,"(Additive:InhibitorC,)","(Solvent:EtOH, Surfactant:S3)",0.15,0.75,5.0,3
42,"(Additive:InhibitorC,)","(CoSolvent:Water, Solvent:EtOH)",0.15,0.75,5.0,3


Saved: Unit14_Results/rules_top.csv


## 2.1 Guardrails：用 holdout 驗證規則（避免假規則）

示範做法：把交易資料切成 train/holdout，只在 train 挖規則，再到 holdout 檢查 confidence 是否大幅下降。


In [30]:
# 2.1.1 train/holdout split（示範：最後 20% 當 holdout；實務可用時間/批次切分）

N = len(transactions)
split = int(N * 0.8)
train_tx = transactions[:split]
hold_tx = transactions[split:]

print('train:', len(train_tx), 'holdout:', len(hold_tx))


train: 16 holdout: 4


In [31]:
# 2.1.2 在 train 挖規則（沿用前面函數）

from collections import Counter

count_tr = Counter()
for t in train_tx:
    for it in all_itemsets(t, max_size=3):
        count_tr[it] += 1

Ntr = len(train_tx)

rules_tr = []
for it, c_ab in count_tr.items():
    if len(it) < 2:
        continue
    support_ab = c_ab / Ntr
    if support_ab < min_support:
        continue

    items = list(it)
    for r in range(1, len(items)):
        for A in itertools.combinations(items, r):
            A = tuple(sorted(A))
            B = tuple(sorted(set(items) - set(A)))
            c_a = count_tr.get(A, 0)
            c_b = count_tr.get(B, 0)
            if c_a == 0 or c_b == 0:
                continue
            conf = c_ab / c_a
            lift = conf / (c_b / Ntr)
            if conf >= min_conf:
                rules_tr.append({'A': A, 'B': B, 'support': support_ab, 'confidence': conf, 'lift': lift, 'count': c_ab})

rules_tr = pd.DataFrame(rules_tr).sort_values(['lift','confidence','support'], ascending=False)
print('rules in train:', len(rules_tr))
display(rules_tr.head(10))


rules in train: 10


Unnamed: 0,A,B,support,confidence,lift,count
7,"(CoSolvent:Water,)","(Surfactant:S3,)",0.1875,1.0,5.333333,3
8,"(Surfactant:S3,)","(CoSolvent:Water,)",0.1875,1.0,5.333333,3
5,"(Solvent:Heptane,)","(Surfactant:S2,)",0.1875,1.0,3.2,3
6,"(Surfactant:S2,)","(Solvent:Heptane,)",0.1875,0.6,3.2,3
1,"(Solvent:Toluene,)","(Surfactant:S1,)",0.1875,1.0,2.0,3
2,"(CoSolvent:MEK,)","(Additive:InhibitorA,)",0.1875,1.0,2.0,3
3,"(Additive:InhibitorB,)","(Surfactant:S1,)",0.25,0.8,1.6,4
4,"(Surfactant:S2,)","(Additive:InhibitorA,)",0.25,0.8,1.6,4
0,"(CoSolvent:EtOAc,)","(Additive:InhibitorA,)",0.1875,0.75,1.5,3
9,"(CoSolvent:IPA,)","(Surfactant:S1,)",0.1875,0.75,1.5,3


In [32]:
# 2.1.3 到 holdout 驗證 confidence（最小版）

def contains(tx, itemset):
    return set(itemset).issubset(tx)

rows = []
for _, r in rules_tr.head(20).iterrows():
    A = tuple(r['A']); B = tuple(r['B'])

    nA = sum(contains(t, A) for t in hold_tx)
    nAB = sum(contains(t, A + B) for t in hold_tx)

    conf_hold = (nAB / nA) if nA > 0 else np.nan
    rows.append({
        'A': A,
        'B': B,
        'conf_train': float(r['confidence']),
        'conf_hold': float(conf_hold) if conf_hold == conf_hold else np.nan,
        'support_train': float(r['support']),
        'lift_train': float(r['lift']),
        'nA_hold': int(nA),
        'nAB_hold': int(nAB),
    })

df_val = pd.DataFrame(rows)
df_val['conf_drop'] = df_val['conf_train'] - df_val['conf_hold']

display(df_val.sort_values('conf_drop', ascending=False).head(10))

# 你可以用這個規則：conf_hold 太低就淘汰


Unnamed: 0,A,B,conf_train,conf_hold,support_train,lift_train,nA_hold,nAB_hold,conf_drop
6,"(Additive:InhibitorB,)","(Surfactant:S1,)",0.8,0.5,0.25,1.6,2,1,0.3
8,"(CoSolvent:EtOAc,)","(Additive:InhibitorA,)",0.75,0.5,0.1875,1.5,2,1,0.25
0,"(CoSolvent:Water,)","(Surfactant:S3,)",1.0,1.0,0.1875,5.333333,2,2,0.0
1,"(Surfactant:S3,)","(CoSolvent:Water,)",1.0,1.0,0.1875,5.333333,2,2,0.0
4,"(Solvent:Toluene,)","(Surfactant:S1,)",1.0,1.0,0.1875,2.0,2,2,0.0
2,"(Solvent:Heptane,)","(Surfactant:S2,)",1.0,,0.1875,3.2,0,0,
3,"(Surfactant:S2,)","(Solvent:Heptane,)",0.6,,0.1875,3.2,0,0,
5,"(CoSolvent:MEK,)","(Additive:InhibitorA,)",1.0,,0.1875,2.0,0,0,
7,"(Surfactant:S2,)","(Additive:InhibitorA,)",0.8,,0.25,1.6,0,0,
9,"(CoSolvent:IPA,)","(Surfactant:S1,)",0.75,,0.1875,1.5,0,0,


## 3. 相似度搜尋：Jaccard（集合型配方）

In [33]:
def jaccard(a, b):
    a = set(a); b = set(b)
    return len(a & b) / max(1, len(a | b))

query_idx = 0
query = transactions[query_idx]

sims = []
for i, t in enumerate(transactions):
    sims.append({'formulation_id': f'F{i:02d}', 'jaccard': jaccard(query, t), 'items': sorted(list(t))})

df_sim = pd.DataFrame(sims).sort_values('jaccard', ascending=False)
display(df_sim.head(10))

df_sim.to_csv('./Unit14_Results/similarity_top.csv', index=False)
print('Saved: Unit14_Results/similarity_top.csv')


Unnamed: 0,formulation_id,jaccard,items
0,F00,1.0,"[Additive:InhibitorA, CoSolvent:EtOAc, Solvent..."
17,F17,0.8,"[Additive:InhibitorA, Catalyst:C1, CoSolvent:E..."
16,F16,0.6,"[Additive:InhibitorB, CoSolvent:EtOAc, Solvent..."
1,F01,0.6,"[Additive:InhibitorA, CoSolvent:MEK, Solvent:T..."
9,F09,0.333333,"[Additive:InhibitorA, CoSolvent:IPA, Solvent:E..."
14,F14,0.333333,"[Additive:InhibitorB, CoSolvent:EtOAc, Solvent..."
10,F10,0.333333,"[Additive:InhibitorA, CoSolvent:IPA, Solvent:M..."
4,F04,0.333333,"[Additive:InhibitorA, CoSolvent:EtOAc, Solvent..."
3,F03,0.333333,"[Additive:InhibitorA, CoSolvent:EtOAc, Solvent..."
2,F02,0.333333,"[Additive:InhibitorB, CoSolvent:THF, Solvent:T..."


Saved: Unit14_Results/similarity_top.csv


## 4. 候選清單示範：以危害溶劑替換為例

In [34]:
hazard = 'Solvent:Toluene'
idx_h = [i for i,t in enumerate(transactions) if hazard in t]

print(f'找到 {len(idx_h)} 個包含 {hazard} 的配方')

solvent_terms = []
for i in idx_h:
    for item in transactions[i]:
        if item.startswith('Solvent:') and item != hazard:
            solvent_terms.append(item)

print(f'找到 {len(solvent_terms)} 個替代溶劑實例')

alt_counts = Counter(solvent_terms)

if len(alt_counts) == 0:
    print(f'⚠️ 警告：在包含 {hazard} 的配方中未找到其他溶劑')
    alt = pd.DataFrame(columns=['candidate_solvent', 'count', 'support_in_hazard_set', 'ehs_score'])
else:
    alt = pd.DataFrame([
        {'candidate_solvent': k, 'count': v, 'support_in_hazard_set': v/len(idx_h)}
        for k,v in alt_counts.items()
    ]).sort_values('support_in_hazard_set', ascending=False)

    # 示範用 EHS 分數（實務請接你自己的資料表）
    ehs = {
        'Solvent:Toluene': 4,
        'Solvent:DCM': 5,
        'Solvent:Hexane': 3,
        'Solvent:Heptane': 3,
        'Solvent:EtOAc': 2,
        'Solvent:MEK': 2,
        'Solvent:THF': 3,
        'Solvent:EtOH': 1,
        'Solvent:ACN': 3,
    }
    alt['ehs_score'] = alt['candidate_solvent'].map(lambda x: ehs.get(x, np.nan))

    display(alt)

if len(alt) > 0:
    shortlist = alt.sort_values(['ehs_score','support_in_hazard_set'], ascending=[True, False]).head(5)
    shortlist.to_csv('./Unit14_Results/candidate_shortlist.csv', index=False)
    print('✓ Saved: Unit14_Results/candidate_shortlist.csv')
    display(shortlist)
else:
    print('⚠️ 無法生成候選清單（沒有替代溶劑）')


找到 5 個包含 Solvent:Toluene 的配方
找到 0 個替代溶劑實例
⚠️ 警告：在包含 Solvent:Toluene 的配方中未找到其他溶劑
⚠️ 無法生成候選清單（沒有替代溶劑）


## 5. 把規則 + 相似度 + 工程約束整合成可交付 shortlist（示範）

這裡示範一個最小版：
- 先用相似度找到 Top-N
- 再用 EHS/cost 這類工程約束過濾
- 最後列出候選清單（可交付）


In [35]:
# 5.1 以 query 配方的相似度 Top-N 當候選來源

TopN = 8
cand_sim = df_sim.head(TopN).copy()

# 建立配方的主要溶劑（示範）：從 items 抽第一個 Solvent:*
def main_solvent(items):
    for it in items:
        if it.startswith('Solvent:'):
            return it
    return None

cand_sim['main_solvent'] = cand_sim['items'].map(main_solvent)

# 加上示範用 EHS/cost（實務請接資料表）
ehscost = {
    'Solvent:Toluene': {'EHS': 4, 'Cost': 2},
    'Solvent:DCM': {'EHS': 5, 'Cost': 2},
    'Solvent:Hexane': {'EHS': 3, 'Cost': 2},
    'Solvent:Heptane': {'EHS': 3, 'Cost': 2},
    'Solvent:EtOAc': {'EHS': 2, 'Cost': 2},
    'Solvent:MEK': {'EHS': 2, 'Cost': 1},
    'Solvent:THF': {'EHS': 3, 'Cost': 3},
    'Solvent:EtOH': {'EHS': 1, 'Cost': 1},
    'Solvent:ACN': {'EHS': 3, 'Cost': 3},
}

cand_sim['EHS'] = cand_sim['main_solvent'].map(lambda s: ehscost.get(s, {}).get('EHS', np.nan))
cand_sim['Cost'] = cand_sim['main_solvent'].map(lambda s: ehscost.get(s, {}).get('Cost', np.nan))

# 工程約束：EHS <= 3
short = cand_sim[cand_sim['EHS'] <= 3].copy()

if len(short) > 0:
    # 綜合排序：相似度高 + EHS 低 + cost 低（示範）
    short['rank_score'] = (1 - short['jaccard']) + 0.3 * short['EHS'] + 0.1 * short['Cost']
    short = short.sort_values('rank_score')

    short.to_csv('./Unit14_Results/candidate_shortlist_v2.csv', index=False)
    print('✓ Saved: Unit14_Results/candidate_shortlist_v2.csv')

    display(short[['formulation_id','jaccard','main_solvent','EHS','Cost','rank_score','items']])
else:
    print('⚠️ 無符合 EHS <= 3 約束的候選配方')


✓ Saved: Unit14_Results/candidate_shortlist_v2.csv


Unnamed: 0,formulation_id,jaccard,main_solvent,EHS,Cost,rank_score,items
10,F10,0.333333,Solvent:MEK,2,1,1.366667,"[Additive:InhibitorA, CoSolvent:IPA, Solvent:M..."
9,F09,0.333333,Solvent:EtOAc,2,2,1.466667,"[Additive:InhibitorA, CoSolvent:IPA, Solvent:E..."
4,F04,0.333333,Solvent:Heptane,3,2,1.766667,"[Additive:InhibitorA, CoSolvent:EtOAc, Solvent..."
