## Insights into Taxonomy

In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import matplotlib.cm as cm
import plotly.colors as pc
import plotly.express as px
import plotly.graph_objects as go
import plotly.colors as pc
import plotly.io as pio
from plotly.subplots import make_subplots
import json
import os
import re
import h5py
pio.renderers.default = "vscode"

from matplotlib.colors import LinearSegmentedColormap
from matplotlib.patches import Rectangle
from matplotlib.patches import Patch
from matplotlib.lines import Line2D
from matplotlib.ticker import MultipleLocator
from matplotlib.ticker import AutoMinorLocator
from mpl_toolkits.axes_grid1.inset_locator import inset_axes, mark_inset
from collections import defaultdict
from collections import Counter
from pathlib import Path

# Apply the seaborn theme FIRST: set_theme() writes its own "font.family"
# (sans-serif by default) into rcParams, so calling it after the font lines
# below would silently undo the serif configuration.
sns.set_theme(style="white")

# Matplotlib typography: serif body text (Times New Roman) and the DejaVu
# Serif math font set.
plt.rcParams["font.family"] = "serif"
plt.rcParams["font.serif"] = ["Times New Roman"]
plt.rcParams["mathtext.fontset"] = "dejavuserif"

# Show DataFrames in full when they are displayed.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)         # Prevents wrapping
pd.set_option('display.max_colwidth', None)  # Shows full content in each cell

In [2]:
# sankey task formulation
# ==========================================
# Load the per-study task sheet and keep only the columns the Sankey uses.
taxonomy_task_df = pd.read_excel("./taxonomy.xlsx", sheet_name="STUDY_TASK")
df = taxonomy_task_df[['CitationKey', 'Classification', 'Generation']].copy()

# Normalise missing values column by column (never on the whole frame, so
# CitationKey is left untouched):
#   * a missing Classification becomes the residual 'No Classification'
#   * a missing Generation becomes the residual 'Classification Only'
# The textual placeholders 'None' / 'nan' are treated like real NaN so both
# columns agree with the study-count logic further down in this cell.
missing_markers = ['None', 'nan']
df['Classification'] = (
    df['Classification']
    .fillna('No Classification')
    .replace(missing_markers, 'No Classification')
)
df['Generation'] = (
    df['Generation']
    .fillna('Classification Only')
    # 'No Classification' is not a valid right-hand node; fold it into the
    # Generation residual as well.
    .replace(missing_markers + ['No Classification'], 'Classification Only')
)

# Explode lists: one row per (study, classification tag, generation tag)
for col in ['Classification', 'Generation']:
    df[col] = df[col].astype(str).str.split(',')
    df = df.explode(col)
    df[col] = df[col].str.strip()

# Drop empty tags left behind by stray commas.
df = df[df['Classification'] != '']
df = df[df['Generation'] != '']

# Calculate Weights
# Every study should contribute a total flow of exactly 1, split evenly over
# its exploded (Classification, Generation) rows.
#
# Counting rows per study for *both* columns (as 'count' does once the frame
# has been exploded twice) yields rows-per-study twice, so their product is
# the square of the row count and multi-label studies get under-weighted.
# Use the number of distinct tags for the informational columns and the
# actual row count of the study for the weight.
df['Class_Count'] = df.groupby('CitationKey')['Classification'].transform('nunique')
df['Gen_Count'] = df.groupby('CitationKey')['Generation'].transform('nunique')
df['Weight'] = 1 / df.groupby('CitationKey')['CitationKey'].transform('count')

# Create Edges: total weight for every (Classification, Generation) pair,
# exposed as Source / Target / Value.
pair_totals = df.groupby(['Classification', 'Generation'])['Weight'].sum()
edges = pair_totals.reset_index(name='Value').rename(
    columns={'Classification': 'Source', 'Generation': 'Target'}
)

# Define Node Properties: one node per distinct label, numbered in order of
# first appearance when scanning the edge list row by row.
all_labels = pd.unique(edges[['Source', 'Target']].to_numpy().ravel())
nodes_df = pd.DataFrame({'Label': all_labels})
nodes_df['ID'] = nodes_df.index
label_to_id = dict(zip(nodes_df['Label'], nodes_df['ID']))

# Attach the numeric node ids that the Sankey trace needs to each edge.
for endpoint in ('Source', 'Target'):
    edges[f'{endpoint}ID'] = edges[endpoint].map(label_to_id)

# ==========================================
# Residual ("nothing on this side") labels are drawn in gray; every other
# node takes a colour from the Pastel palette.
residual_labels = ['Classification Only', 'Generation Only', 'No Classification']
palette = pc.qualitative.Pastel


def _with_alpha(color, alpha=0.6):
    """Return *color* as an ``rgba(...)`` string carrying *alpha*.

    ``#rrggbb`` and ``rgb(r, g, b)`` strings are converted; any other format
    is returned unchanged.
    """
    if color.startswith('#'):
        h = color.lstrip('#')
        rgb = tuple(int(h[i:i+2], 16) for i in (0, 2, 4))
        return f'rgba({rgb[0]}, {rgb[1]}, {rgb[2]}, {alpha})'
    if color.startswith('rgb('):
        # Previously only hex colours were made translucent, so palette
        # entries written as rgb(...) produced fully opaque links.
        return f"rgba({color[4:].rstrip(')')}, {alpha})"
    return color


# Assign Node Colors
node_colors = [
    'rgba(200, 200, 200, 0.5)' if label in residual_labels
    else palette[node_id % len(palette)]
    for node_id, label in zip(nodes_df.index, nodes_df['Label'])
]

# Assign Link Colors: gray when either end is a residual node, otherwise a
# translucent version of the source node's colour.
link_colors = []
for source_label, target_label in zip(edges['Source'], edges['Target']):
    if source_label in residual_labels or target_label in residual_labels:
        link_colors.append('rgba(220, 220, 220, 0.5)')
    else:
        source_id = label_to_id[source_label]
        link_colors.append(_with_alpha(node_colors[source_id]))

# ==========================================
# counts: for every label, the set of studies (CitationKey) that carry it,
# taken from the raw sheet rather than the exploded frame.
def _labels_in_cell(cell, fallback):
    """Split a comma-separated cell into stripped tags, or return [fallback]
    when the cell's text form is 'None' or 'nan'."""
    text = str(cell)
    if text == 'None' or text == 'nan':
        return [fallback]
    return [tag.strip() for tag in text.split(',')]


label_to_studies = defaultdict(set)
for study_key, class_cell, gen_cell in zip(
    taxonomy_task_df['CitationKey'],
    taxonomy_task_df['Classification'],
    taxonomy_task_df['Generation'],
):
    study_labels = (
        _labels_in_cell(class_cell, 'No Classification')
        + _labels_in_cell(gen_cell, 'Classification Only')
    )
    for label in study_labels:
        label_to_studies[label].add(study_key)

# Number of distinct studies behind each node (0 for labels never seen).
nodes_df['StudyCount'] = [
    len(label_to_studies.get(label, set())) for label in nodes_df['Label']
]

# --- Taxonomy IDs ---
# Maps a node label to the taxonomy identifier shown next to it.
taxonomy_ids = {
    "Binary": "F1.1",
    "Multi-Class": "F1.2",
    "Multi-Label": "F1.3",
    "Vulnerability-Specific": "F1.1.1",
    "Description": "F2.1",
    "Reasoning": "F2.2",
    "Report": "F2.3"
}

# Label Formatter with () and {}
def format_label(row):
    """Build the text displayed on a Sankey node.

    `row` must provide 'Label' and 'StudyCount'. The result is
    "<label> (<taxonomy id>) {<count>}", without the parenthesised part when
    the label has no entry in `taxonomy_ids`. The 'No Classification' node
    gets an empty string.
    """
    name = row['Label']
    studies = row['StudyCount']

    if name == 'No Classification':
        return ""

    parts = [name]
    code = taxonomy_ids.get(name, "")
    if code:
        parts.append(f"({code})")
    parts.append("{" + f"{studies}" + "}")
    return " ".join(parts)

# Text shown on each node.
nodes_df['LabelDisplay'] = nodes_df.apply(format_label, axis=1)

# Plot: describe nodes and links separately, then assemble the Sankey trace.
task_nodes = dict(
    pad=20,
    thickness=20,
    line=dict(color="black", width=0.5),
    label=nodes_df['LabelDisplay'],
    color=node_colors,
    hovertemplate='<b>%{label}</b><br>Volume: %{value:.2f}<extra></extra>',
)
task_links = dict(
    source=edges['SourceID'],
    target=edges['TargetID'],
    value=edges['Value'],
    color=link_colors,
)
fig = go.Figure(data=[go.Sankey(arrangement="snap", node=task_nodes, link=task_links)])

# ==========================================
# Captions placed just below the plot area, one under each Sankey column
# (left column at x=0, right column at x=1, in paper coordinates).
task_column_captions = [
    dict(
        x=x_pos,
        y=-0.1,
        xref="paper",
        yref="paper",
        text=caption,  # Taxonomy in ()
        showarrow=False,
        font=dict(size=20, color="black"),
        align="center",
    )
    for x_pos, caption in ((0, "Classification (F1)"), (1, "Generation (F2)"))
]

fig.update_layout(
    font=dict(family="Times New Roman, serif", size=20, color="black"),
    width=1000,
    height=500,
    margin=dict(b=60, t=40),
    annotations=task_column_captions,
)

fig.show()

In [3]:
# sankey model & adaptation techniques
# ==========================================
# Three sheets of the same workbook: model metadata, study-to-model links and
# per-study technique annotations.
df_models = pd.read_excel("taxonomy.xlsx", sheet_name="MODELS_ESTIMATED")
df_study_model = pd.read_excel("taxonomy.xlsx", sheet_name="STUDY_MODEL")
df_techniques = pd.read_excel("taxonomy.xlsx", sheet_name="STUDY_TECHNIQUE")


# 'Adaptation' is a comma-separated list: one row per stripped entry.
adaptation_lists = df_study_model['Adaptation'].astype(str).str.split(',')
df_study_model = df_study_model.assign(Adaptation=adaptation_lists).explode('Adaptation')
df_study_model['Adaptation'] = df_study_model['Adaptation'].str.strip()

# Attach the model scale to each (study, model, adaptation) row ...
study_model_cols = df_study_model[['CitationKey', 'ModelKey', 'Adaptation']]
merged_models = study_model_cols.merge(
    df_models[['ModelKey', 'Scale']],
    on='ModelKey',
    how='left',
)

# ... and then the study-level prompting / training annotations.
full_df = merged_models.merge(
    df_techniques[['CitationKey', 'Prompt-Engineering', 'Training']],
    on='CitationKey',
    how='left',
)

# ==========================================
# Keyword lists used to decide which of a study's technique tags belong to a
# given adaptation code (case-insensitive substring match).
peft_keywords = ['Low-Rank Decomposition', 'LoRA Derivates', 'Adapter-Tuning', 'Selective', 'Additive-Other', 'Prompt-Tuning', 'Instruction-Tuning']
full_keywords = ['Full-Parameter Fine-Tuning', 'Instruction-Tuning']
prompt_keywords = ['CoT', 'Few-Shot', 'RAG', 'In-Context', 'Zero-Shot']
pre_keywords = ['Pre-Training']


def _is_blank(text):
    """True for the string forms of a missing cell ('nan', 'None', '')."""
    return text in ['nan', 'None', '']


def _tags_matching(tags, keywords):
    """Keep the tags that contain any keyword, ignoring case."""
    return [t for t in tags if any(k.lower() in t.lower() for k in keywords)]


def resolve_technique(row):
    """Return the list of specific technique names for one row.

    `row` must provide 'Adaptation', 'Prompt-Engineering' and 'Training'.
    The adaptation code is compared upper-cased and stripped:

    * FEATURE -> always ["Feature Extraction"].
    * PROMPT  -> the prompt tags matching `prompt_keywords`, or all prompt
      tags when none match, or ["Unspecified Prompting"] when the cell is
      blank.
    * PEFT / FULL / PRE -> the training tags matching the respective keyword
      list, falling back to "Other PEFT" / "Other Fine-Tuning" /
      "Pre-Training"; ["Unspecified Training"] when the cell is blank.
    * any other code -> ["Unspecified Training"] when the training cell is
      blank, otherwise an empty list.
    """
    adaptation = str(row['Adaptation']).upper().strip()

    # Feature extraction never looks at the training tags, so resolve it
    # before the blank-training guard. Otherwise a FEATURE row with an empty
    # 'Training' cell would be reported as "Unspecified Training".
    if adaptation == 'FEATURE':
        return ["Feature Extraction"]

    if adaptation == 'PROMPT':
        val = str(row['Prompt-Engineering'])
        if _is_blank(val):
            return ["Unspecified Prompting"]
        tags = [x.strip() for x in val.split(',')]
        return _tags_matching(tags, prompt_keywords) or tags

    train_val = str(row['Training'])
    if _is_blank(train_val):
        return ["Unspecified Training"]
    tags = [x.strip() for x in train_val.split(',')]

    if adaptation == 'PEFT':
        return _tags_matching(tags, peft_keywords) or ["Other PEFT"]
    if adaptation == 'FULL':
        return _tags_matching(tags, full_keywords) or ["Other Fine-Tuning"]
    if adaptation == 'PRE':
        return _tags_matching(tags, pre_keywords) or ["Pre-Training"]

    # Unknown adaptation code: no technique can be attributed.
    return []

# Resolve the technique list of every row, then flatten to one row per
# technique, discarding rows that end up with no technique (missing value
# from an empty list, or an empty string).
full_df['Specific_Techniques'] = full_df.apply(resolve_technique, axis=1)
sankey_df = (
    full_df.explode('Specific_Techniques')
    .dropna(subset=['Specific_Techniques'])
    .loc[lambda frame: frame['Specific_Techniques'] != ""]
)


# ==========================================
def get_method_category(code):
    """Translate an adaptation code into its category name.

    The code is converted to a string and upper-cased (it is not stripped)
    before the lookup; anything that is not one of the five known codes maps
    to "Other".
    """
    category_by_code = {
        'PROMPT': "Prompt Engineering",
        'FULL': "Fine-Tuning",
        'PEFT': "Parameter-Efficient Fine-Tuning",
        'PRE': "Pre-Training",
        'FEATURE': "Feature Extraction",
    }
    return category_by_code.get(str(code).upper(), "Other")

# Category of each row, derived from its adaptation code.
sankey_df['Method_Category'] = sankey_df['Adaptation'].map(get_method_category)

# Shorten one technique name for display and normalise the scale spelling
# (stripped, title-cased text).
replace_map = {'Full-Parameter Fine-Tuning': 'Full-Parameter'}
shortened_techniques = sankey_df['Specific_Techniques'].replace(replace_map)
sankey_df['Specific_Techniques'] = shortened_techniques
scale_text = sankey_df['Scale'].astype(str)
sankey_df['Scale'] = scale_text.str.strip().str.title()

# Weights: each study's rows share a total weight of 1.
rows_per_study = sankey_df.groupby('CitationKey')['CitationKey'].transform('count')
sankey_df['Study_Row_Count'] = rows_per_study
sankey_df['Weight'] = 1 / sankey_df['Study_Row_Count']

# Unique Counts (Only needed for Level 2 now based on requirements):
# number of distinct studies using each specific technique.
studies_by_technique = sankey_df.groupby('Specific_Techniques')['CitationKey']
unique_counts_lvl2 = studies_by_technique.nunique()


# ==========================================
# Node labels for the three Sankey columns. `raw_to_display` maps a raw name
# (scale, category or technique) to the label text shown on its node.
# NOTE(review): the map is shared by all three columns and filled in column
# order, so a technique whose raw name equals a scale or category name
# replaces that earlier entry.
raw_to_display = {}

# Column 0: model scales, shown as "Tiny (S1.3.1)".
scale_ids = {
    "Tiny": "S1.3.1",
    "Small": "S1.3.2",
    "Medium": "S1.3.3",
    "Large": "S1.3.4"
}
raw_lvl0 = ["Tiny", "Small", "Medium", "Large"]
lvl0_labels = []
present_scales = sankey_df['Scale'].unique()

for scale_name in raw_lvl0:
    if scale_name not in present_scales:
        continue
    scale_code = scale_ids.get(scale_name, "")
    shown = f"{scale_name} ({scale_code})" if scale_code else scale_name
    lvl0_labels.append(shown)
    raw_to_display[scale_name] = shown

# Column 1: method categories, shown as "Pre-Training (T2.2.1)".
cat_ids = {
    "Feature Extraction": "T1",
    "Pre-Training": "T2.2.1",
    "Prompt Engineering": "T2.1",
    "Fine-Tuning": "T2.2.2.1",
    "Parameter-Efficient Fine-Tuning": "T2.2.2.2"
}
# Categories whose displayed name differs from the raw name.
cat_display_names = {
    "Fine-Tuning": "Full Fine-Tuning"
}

raw_lvl1 = ["Fine-Tuning", "Parameter-Efficient Fine-Tuning", "Prompt Engineering", "Pre-Training", "Feature Extraction"]
lvl1_labels = []
existing_cats = sankey_df['Method_Category'].unique()

for cat_name in raw_lvl1:
    if cat_name not in existing_cats:
        continue
    cat_code = cat_ids.get(cat_name, "")
    shown_name = cat_display_names.get(cat_name, cat_name)
    shown = f"{shown_name} ({cat_code})" if cat_code else shown_name
    lvl1_labels.append(shown)
    raw_to_display[cat_name] = shown

# Column 2: specific techniques in alphabetical order, shown with the number
# of distinct studies in literal braces, e.g. "LoRA {25}".
raw_lvl2 = sorted(set(sankey_df['Specific_Techniques']))
lvl2_labels = [
    f"{technique} " + "{" + f"{unique_counts_lvl2.get(technique, 0)}" + "}"
    for technique in raw_lvl2
]
raw_to_display.update(zip(raw_lvl2, lvl2_labels))

# Combine all: a label's position in this list is its Sankey node index.
all_labels = lvl0_labels + lvl1_labels + lvl2_labels
label_map = dict(zip(all_labels, range(len(all_labels))))


# ==========================================
palette = pc.qualitative.Pastel
grey_color = 'lightgrey'
grey_link = 'rgba(200, 200, 200, 0.4)'
grey_cats = ['Pre-Training', 'Feature Extraction', 'Other']

# Walk the scales (A) and then the categories (B), giving each name that has
# a display label the next palette colour (wrapping around). Categories
# listed in `grey_cats` are grey and do not consume a palette slot.
color_map = {}
palette_idx = 0

for level_names, may_be_grey in ((raw_lvl0, False), (raw_lvl1, True)):
    for raw_name in level_names:
        if raw_name not in raw_to_display:
            continue
        if may_be_grey and raw_name in grey_cats:
            color_map[raw_name] = grey_color
            continue
        color_map[raw_name] = palette[palette_idx % len(palette)]
        palette_idx += 1

def hex_to_rgba(hex_code, opacity=0.4):
    """Return a colour as an ``rgba(r, g, b, a)`` string.

    Accepted inputs:
      * 'lightgrey'        -> the module-level `grey_link` colour
      * 'rgba(...)'        -> returned unchanged (it already carries an alpha)
      * 'rgb(r, g, b)'     -> same channels with `opacity` appended
      * '#rrggbb' / '#rgb' -> converted from hex, with `opacity` as alpha
    """
    if hex_code == 'lightgrey':
        return grey_link
    if hex_code.startswith('rgba'):
        # Rewriting 'rgb' -> 'rgba' on such a string would yield the invalid
        # 'rgbaa(...)' with two alpha components.
        return hex_code
    if hex_code.startswith('rgb'):
        return hex_code.replace(')', f', {opacity})').replace('rgb', 'rgba')
    h = hex_code.lstrip('#')
    if len(h) == 3:
        # Expand shorthand hex (#abc -> #aabbcc) before slicing byte pairs.
        h = ''.join(ch * 2 for ch in h)
    rgb = tuple(int(h[i:i+2], 16) for i in (0, 2, 4))
    return f"rgba({rgb[0]}, {rgb[1]}, {rgb[2]}, {opacity})"


# ==========================================
# Link lists for the Sankey trace (parallel lists, one entry per link).
source = []
target = []
value = []
colors = []

# Raw-name -> display-label tables, one per Sankey column.
# The shared `raw_to_display` map cannot be used for routing: a technique
# named like its category (resolve_technique can return "Pre-Training" and
# "Feature Extraction", which are also category names) overwrites the
# category entry there, which would send Scale -> Category links straight to
# the technique node and turn the Category -> Technique link into a
# self-loop. The per-column label lists were built in the same order as the
# filtered raw names, so zipping them restores an unambiguous mapping.
present_scales = sankey_df['Scale'].unique()
scale_display = dict(zip(
    [raw for raw in raw_lvl0 if raw in present_scales],
    lvl0_labels,
))
category_display = dict(zip(
    [raw for raw in raw_lvl1 if raw in existing_cats],
    lvl1_labels,
))
technique_display = dict(zip(raw_lvl2, lvl2_labels))


def _add_links(flow, src_col, src_display, tgt_col, tgt_display):
    """Append one link per row of `flow` whose two ends both have a node.

    Rows whose source or target has no display label (e.g. an unknown scale
    or the "Other" category) are skipped. Links take the translucent colour
    of their source node, or grey when the source has no colour.
    """
    for src_raw, tgt_raw, weight in zip(flow[src_col], flow[tgt_col], flow['Weight']):
        src = src_display.get(src_raw)
        tgt = tgt_display.get(tgt_raw)
        if src in label_map and tgt in label_map:
            source.append(label_map[src])
            target.append(label_map[tgt])
            value.append(weight)
            base_color = color_map.get(src_raw, grey_color)
            colors.append(hex_to_rgba(base_color))


# --- Flow 1: Scale -> Category ---
flow1 = sankey_df.groupby(['Scale', 'Method_Category'])['Weight'].sum().reset_index()
_add_links(flow1, 'Scale', scale_display, 'Method_Category', category_display)

# --- Flow 2: Category -> Specific ---
flow2 = sankey_df.groupby(['Method_Category', 'Specific_Techniques'])['Weight'].sum().reset_index()
_add_links(flow2, 'Method_Category', category_display, 'Specific_Techniques', technique_display)


# ==========================================
# Map specific technique to its parent category raw name (for a technique
# listed under several categories, the last row wins).
tech_to_cat = dict(zip(sankey_df['Specific_Techniques'], sankey_df['Method_Category']))

# Display label -> raw name, keeping the first raw name for each label.
display_to_raw = {}
for raw_name, shown in raw_to_display.items():
    display_to_raw.setdefault(shown, raw_name)


def _node_color(label):
    """Colour of one node: its own entry in `color_map` (scale or category),
    else the colour of its parent category (technique), else grey."""
    raw_key = display_to_raw.get(label)
    if not raw_key:
        return grey_color
    if raw_key in color_map:
        return color_map[raw_key]
    if raw_key in tech_to_cat:
        return color_map.get(tech_to_cat[raw_key], grey_color)
    return grey_color


# One colour per node, in node-index order.
node_colors = [_node_color(label) for label in all_labels]

# ==========================================
# Assemble the three-column Sankey from the node and link lists built above.
model_nodes = dict(
    pad=15,
    thickness=20,
    line=dict(color="black", width=0.5),
    label=all_labels,
    color=node_colors,
    hovertemplate='<b>%{label}</b><br>Weighted Volume: %{value:.2f}<extra></extra>',
)
model_links = dict(source=source, target=target, value=value, color=colors)

fig = go.Figure(data=[go.Sankey(arrangement="snap", node=model_nodes, link=model_links)])

# Captions placed just below the plot area: one under the left edge (x=0)
# and one under the right edge (x=1), in paper coordinates.
model_column_captions = [
    dict(
        x=x_pos,
        y=-0.1,
        xref="paper",
        yref="paper",
        text=caption,
        showarrow=False,
        font=dict(size=20, color="black"),
        align="center",
    )
    for x_pos, caption in ((0, "Model Scale (S1.3)"), (1, "Adaptation Technique (T2)"))
]

fig.update_layout(
    # Global Font Settings (mimics plt.rcParams["font.family"] = "serif")
    font=dict(family="Times New Roman, serif", size=17, color="black"),
    width=1000,
    height=600,
    margin=dict(b=60, t=40),
    annotations=model_column_captions,
)

fig.show()