In [6]:
# Library
import torch
import numpy as np
import pandas as pd
import seaborn as sns
from cycler import cycler
import matplotlib.pyplot as plt
from matplotlib.ticker import FormatStrFormatter
from sklearn.ensemble import RandomForestRegressor
from transformers import AutoTokenizer, AutoModelForCausalLM
import Automation.automation as automation

from Automation.sdscm import plot_dag_from_sample_space, sample_sequences, format_sequences_as_dataframe, sample_counterfactual_sequences

## Load and clean the data, then discretize the valid columns

In [7]:
# Load the raw Titanic passenger table.
df = pd.read_csv("data/Titanic-Dataset.csv")
row_count = len(df)

# Select the columns to discard: non-numeric columns with more than 10 distinct
# values, and columns whose number of distinct values equals the row count.
to_drop = []
for col in df.columns:
    column = df[col]
    nunique = column.nunique(dropna=True)
    unique_ratio = nunique / row_count
    high_cardinality_text = nunique > 10 and not pd.api.types.is_numeric_dtype(column)
    if high_cardinality_text or unique_ratio == 1.0:
        to_drop.append(col)

df_filtered = df.drop(columns=to_drop)

# Fill missing values: median for numeric columns, most frequent value for the
# others ("Unknown" when a column has no non-missing value to take a mode from).
for col in df_filtered.columns:
    column = df_filtered[col]
    if pd.api.types.is_numeric_dtype(column):
        fill_value = column.median()
    else:
        modes = column.mode(dropna=True)
        fill_value = "Unknown" if modes.empty else modes[0]
    df_filtered[col] = column.fillna(fill_value)

# Report what was removed and summarise the cleaned frame.
print(f"Deleted {len(to_drop)} variables:", to_drop)
print(f"DataFrame shape after filtering & filling: {df_filtered.shape}")
print("\nData summary after filling:")
print(df_filtered.describe(include='all'))

# NOTE(review): discrete_candidates is defined in Automation.automation;
# presumably it bins the continuous columns into discrete levels - confirm there.
new_df = automation.discrete_candidates(df_filtered)
print(new_df)

Deleted 4 variables: ['PassengerId', 'Name', 'Ticket', 'Cabin']
DataFrame shape after filtering & filling: (891, 8)

Data summary after filling:
          Survived      Pclass   Sex         Age       SibSp       Parch  \
count   891.000000  891.000000   891  891.000000  891.000000  891.000000   
unique         NaN         NaN     2         NaN         NaN         NaN   
top            NaN         NaN  male         NaN         NaN         NaN   
freq           NaN         NaN   577         NaN         NaN         NaN   
mean      0.383838    2.308642   NaN   29.361582    0.523008    0.381594   
std       0.486592    0.836071   NaN   13.019697    1.102743    0.806057   
min       0.000000    1.000000   NaN    0.420000    0.000000    0.000000   
25%       0.000000    2.000000   NaN   22.000000    0.000000    0.000000   
50%       0.000000    3.000000   NaN   28.000000    0.000000    0.000000   
75%       1.000000    3.000000   NaN   35.000000    1.000000    0.000000   
max       1.000000 

## Use Qwen3-8B to generate the first complete candidate set

In [8]:
# Qwen3-8B: the LLM used to write the first complete candidate set.
model_name = "Qwen/Qwen3-8B"

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

# "auto" leaves both the weight dtype and the device placement to transformers.
model = AutoModelForCausalLM.from_pretrained(model_name, dtype="auto", device_map="auto")

# Inference only: put the model in evaluation mode.
model.eval()

def call_Qwen(cur_prompt, stop=None, max_new_tokens=30):
    """Return Qwen's greedy reply to a single-turn user prompt.

    Reads the module-level ``tokenizer`` and ``model``; both must be bound to
    the Qwen objects at call time.

    Args:
        cur_prompt: Text sent as the only user message of the chat.
        stop: Optional string, or iterable of strings. When given, the reply is
            cut just before the earliest occurrence of any of them.
        max_new_tokens: Upper bound on the number of newly generated tokens.

    Returns:
        The generated reply only (the prompt is not included), with leading
        and trailing whitespace removed.
    """
    messages = [{"role": "user", "content": cur_prompt}]
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
        enable_thinking=False,
    )
    inputs = tokenizer(text, return_tensors="pt").to(model.device)

    # Greedy decoding. No temperature is passed: sampling flags have no effect
    # when do_sample=False and only trigger an "invalid generation flags" warning.
    outputs = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        do_sample=False,
        eos_token_id=tokenizer.eos_token_id,
    )

    # generate() returns one row per input sequence, each starting with the
    # prompt tokens; decode only the newly generated tail of the single row.
    prompt_length = inputs["input_ids"].shape[-1]
    response = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

    if stop:
        stop_strings = [stop] if isinstance(stop, str) else list(stop)
        # Ignore empty stop strings (they would match at index 0) and misses.
        hits = [response.find(marker) for marker in stop_strings if marker]
        hits = [position for position in hits if position != -1]
        if hits:
            response = response[:min(hits)]

    return response.strip()

Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

Some parameters are on the meta device because they were offloaded to the cpu.


In [9]:
# Reload the local automation module so that edits to its source file take
# effect without restarting the kernel.
import importlib
importlib.reload(automation)

<module 'Automation.automation' from '/home/zeyuan/Capstone/Automation/automation.py'>

In [10]:
# Build the candidate sentences for the dataset with discrete candidates.
# `description` is a free-text summary of the dataset, handed to the builder
# through its `description` argument.
description= """
This dataset contains detailed information about passengers aboard the Titanic, capturing both demographic and socio-economic characteristics. Each record represents one passenger, including whether they survived the disaster (Survived), their ticket class (Pclass), gender (Sex), and the number of siblings or spouses (SibSp) and parents or children (Parch) aboard.
The Embarked variable indicates the port of boarding: Cherbourg (C), Queenstown (Q), or Southampton (S). In addition, continuous variables such as Age and Fare have been transformed into three discrete levels (low, mid, high) to represent relative categories of age and ticket price among passengers.
"""
candidates_set = automation.build_long_candidates_set(tokenizer,model,new_df,description=description)

The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


[Raw output] The passenger did not survive the disaster and was categorized as deceased in the records.
Okay, let's tackle this query step by step. First, I need to understand what the user is asking for.
[True output]The passenger did not survive the disaster and was categorized as deceased in the records.
[Raw output] The passenger survived the disaster and was among those who lived through the sinking of the Titanic.
Okay, let's tackle this query step by step. First, I need to understand what the user is asking
[True output]The passenger survived the disaster and was among those who lived through the sinking of the Titanic.
Added column: Survived (2 sentences)
[Raw output] The passenger traveled in third class, reflecting a lower socioeconomic status and limited access to premium amenities on board. 

Okay, let's tackle this query step by step. First, I need to understand what
[True output]The passenger traveled in third class, reflecting a lower socioeconomic status and limited acc

In [11]:
# Show every variable together with its list of candidate sentences.
for variable, sentences in candidates_set.items():
    print(f"{variable}: {sentences}\n")

Survived: ['The passenger did not survive the disaster and was categorized as deceased in the records.', 'The passenger survived the disaster and was among those who lived through the sinking of the Titanic.']

Pclass: ['The passenger traveled in third class, reflecting a lower socioeconomic status and limited access to premium amenities on board.', 'The passenger traveled in first class, indicating a higher-cost ticket category for the voyage.', 'The passenger traveled in second class, suggesting a middle-tier ticket category with moderate comfort and amenities during the voyage.']

Sex: ["The passenger is male and belongs to the group identified by the 'male' category in the dataset's Sex variable.", "The passenger is female, reflecting her assigned gender at birth based on the data's categorization."]

SibSp: ['The passenger has one sibling or spouse aboard, suggesting a moderate level of family connection during the journey.', 'The passenger had no siblings or spouses aboard, sugge

In [None]:
import json


def save_candidates(candidates_set, path="candidates_set.json"):
    """Write a candidate set to ``path`` as indented UTF-8 JSON.

    Args:
        candidates_set: JSON-serialisable mapping of variable name to its list
            of candidate sentences.
        path: Destination file. The default is the same file name that
            ``load_candidates`` reads by default.

    Prints a confirmation line naming the file that was written.
    """
    with open(path, "w", encoding="utf-8") as f:
        # ensure_ascii=False keeps non-ASCII characters readable in the file.
        json.dump(candidates_set, f, ensure_ascii=False, indent=4)
    print(f"Saved to {path}")


# Defined once and called once: a second definition with a different default
# path would silently change where later calls without `path` write to.
save_candidates(candidates_set)

Saved to candidates_set.json


In [13]:
import json

def load_candidates(path="candidates_set.json"):
    """Read a candidate set from a UTF-8 JSON file and return the parsed object."""
    with open(path, "r", encoding="utf-8") as handle:
        return json.load(handle)
# Rebind candidates_set to the copy stored in the default JSON file.
candidates_set =load_candidates()

## Use the single candidate set to generate n candidate sets with pegasus_paraphrase

In [17]:
import torch
from transformers import PegasusForConditionalGeneration, PegasusTokenizer

# ----------------------
# Load Pegasus model
# ----------------------
# NOTE(review): this rebinds the global names `model_name`, `tokenizer` and
# `model` that the Qwen cell assigned earlier. call_Qwen reads `tokenizer` and
# `model` as globals, so once this cell has run it would pick up the Pegasus
# objects instead - confirm that call_Qwen is not needed after this point.
model_name = 'tuner007/pegasus_paraphrase'
# Run on the GPU when one is available, otherwise on the CPU.
torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'

tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name).to(torch_device)


# ----------------------
# Pegasus paraphrase function
# ----------------------
def paraphrase_sentence(text, num_return_sequences=5, num_beams=5, max_length=60):
    """Paraphrase one sentence with the Pegasus model.

    Reads the module-level ``tokenizer``, ``model`` and ``torch_device``.

    Args:
        text: Sentence to paraphrase. It is truncated to 60 tokens on input.
        num_return_sequences: Number of paraphrases requested.
        num_beams: Beam width. Raised to ``num_return_sequences`` when smaller,
            because beam search cannot return more sequences than it has beams.
        max_length: Maximum length passed to ``generate`` for each paraphrase.

    Returns:
        A list of paraphrase strings, each stripped of surrounding whitespace.
    """
    # Without this, asking for more paraphrases than beams makes generate() fail.
    num_beams = max(num_beams, num_return_sequences)

    batch = tokenizer(
        [text],
        truncation=True,
        padding='longest',
        max_length=60,
        return_tensors="pt"
    ).to(torch_device)

    # NOTE(review): temperature only matters when sampling is enabled; no
    # do_sample flag is passed here, so it presumably has no effect - confirm.
    outputs = model.generate(
        **batch,
        max_length=max_length,
        num_beams=num_beams,
        num_return_sequences=num_return_sequences,
        temperature=1.5
    )

    paraphrased = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    return [p.strip() for p in paraphrased]


# ----------------------
# Generate N new candidate sets
# ----------------------
def generate_n_candidate_sets(candidates_set, n=5):
    """Build ``n`` paraphrased copies of a candidate set.

    Every sentence is paraphrased ``n`` times with ``paraphrase_sentence``;
    the k-th output set uses the k-th paraphrase of every sentence, so each
    set keeps the variables and sentence order of the input.

    Args:
        candidates_set: dict[str, list[str]], e.g.
            {"Survived": ["The passenger died.", "The passenger survived."], ...}
        n: How many candidate sets to produce (also the number of paraphrases
            requested per sentence).

    Returns:
        A list of ``n`` dicts with the same keys as ``candidates_set``.
    """
    # Imported here so the padding step below works even though the notebook's
    # import cell does not bring in `random`.
    import random

    paraphrased = {}
    for var, sentences in candidates_set.items():
        var_paras = []
        for sent in sentences:
            paras = list(paraphrase_sentence(sent, num_return_sequences=n))

            if not paras:
                # Nothing came back to sample from: reuse the source sentence.
                paras = [sent]
            # Pad with repeats when fewer than n paraphrases were returned.
            while len(paras) < n:
                paras.append(random.choice(paras))
            var_paras.append(paras)
        paraphrased[var] = var_paras

    # Set k takes paraphrase k of every sentence of every variable.
    return [
        {var: [paras[k] for paras in var_paras] for var, var_paras in paraphrased.items()}
        for k in range(n)
    ]

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at tuner007/pegasus_paraphrase and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
# Paraphrase the loaded candidate set into three alternative sets and show each.
generated_sets = generate_n_candidate_sets(candidates_set, n=3)

for number, candidate_set in enumerate(generated_sets, start=1):
    print(f"\n=== Candidate Set {number} ===")
    print(candidate_set)


=== Candidate Set 1 ===
{'Survived': ['The passenger died in the disaster and was categorized as dead in the records.', 'The passenger lived through the sinking of the Titanic.'], 'Pclass': ['The passenger traveled in third class because of their lower status and limited access to premium amenities.', 'The passenger traveled in first class for the voyage.', 'The passenger traveled in second class, suggesting a middle tier ticket category with moderate comfort and amenities.'], 'Sex': ["The passenger is male and belongs to the group identified by the 'male' category in the dataset's Sex variable.", "The passenger's assigned gender at birth is based on the data's categorization."], 'SibSp': ['A moderate level of family connection is suggested by the fact that the passenger has one sibling or spouse.', 'The passenger had no siblings or spouses with him.', 'The passenger suggested a larger family group on the ship.', "A larger family group was suggested by the passenger's four siblings or

In [24]:
# Repeat the paraphrasing on the short candidate set stored on disk.
# This overwrites `generated_sets` with the short-version results.
print("Let us see the short verison")
candidates_set_short = load_candidates("candidates_set_short.json")
generated_sets = generate_n_candidate_sets(candidates_set_short, n=3)

for number, candidate_set in enumerate(generated_sets, start=1):
    print(f"\n=== Candidate Set {number} ===")
    print(candidate_set)

Let us see the short verison

=== Candidate Set 1 ===
{'Survived': ['The passenger died.', 'The passenger is alive.'], 'Pclass': ["The third class is the passenger's ticket.", "The first class is the passenger's ticket class.", "2 is the passenger's ticket class."], 'Sex': ['The passenger is male.', 'The passenger is female.'], 'SibSp': ['There is a passenger with one sibling or spouse.', "The passenger doesn't have any siblings or spouses.", 'There are 3 siblings or spouses on the passenger.', 'There are four siblings or spouses on the passenger.', 'There are two siblings or spouses on the passenger.', 'There are five siblings or spouses on the passenger.', 'There are 8 siblings or spouses on the passenger.'], 'Parch': ['The passenger does not have parents or children with him.', 'There is a passenger with one parent or child.', 'The passenger has two people with them.', 'The passenger has a lot of people on board.', 'The passenger has 3 people with him.', 'The passenger has people wi

In [23]:

# Write one filled configuration file per generated candidate set, starting
# from the shared 'titanic_framework.json' template.
# NOTE(review): the execution counters show this cell was last run before the
# short-version cell above it; on a top-to-bottom run it would consume the
# short-version `generated_sets` - confirm which sets are intended here.
for i, candidate_set in enumerate(generated_sets):
    filled_config = automation.fill_candidate_sets(
        'titanic_framework.json',
        candidate_set,
        f'titanic_ready_{i}.json',
    )


Filled Sex: 2 candidates
Filled Age_3class: 3 candidates
Filled Pclass: 3 candidates
Filled Embarked: 3 candidates
Filled Fare_3class: 3 candidates
Filled SibSp: 7 candidates
Filled Parch: 7 candidates
Filled Survived: 2 candidates
Updated intervention_choices for Sex: [0, 1]
Updated possible_outcome_choices for Survived: [0, 1]

Filled configuration saved to titanic_ready_0.json
Filled Sex: 2 candidates
Filled Age_3class: 3 candidates
Filled Pclass: 3 candidates
Filled Embarked: 3 candidates
Filled Fare_3class: 3 candidates
Filled SibSp: 7 candidates
Filled Parch: 7 candidates
Filled Survived: 2 candidates
Updated intervention_choices for Sex: [0, 1]
Updated possible_outcome_choices for Survived: [0, 1]

Filled configuration saved to titanic_ready_1.json
Filled Sex: 2 candidates
Filled Age_3class: 3 candidates
Filled Pclass: 3 candidates
Filled Embarked: 3 candidates
Filled Fare_3class: 3 candidates
Filled SibSp: 7 candidates
Filled Parch: 7 candidates
Filled Survived: 2 candidates
Up