In [1]:
import os
import re

import datasets
from datasets import load_dataset

# Suppress Hugging Face logs so only errors reach the cell output
datasets.utils.logging.set_verbosity_error()

# Load Cosmos QA dataset.
# The cache directory defaults to the original local path, but it can be
# redirected with the COSMOSQA_CACHE_DIR environment variable so the notebook
# is not tied to one machine's filesystem layout.
dataset_path = os.environ.get("COSMOSQA_CACHE_DIR", "/home/hao/colm/datasets/cosmosQA")
dataset = load_dataset("allenai/cosmos_qa", cache_dir=dataset_path)

# Choose the split that the rest of this cell works on
split = "train"

# Debug: print one raw entry so the field layout can be checked by eye
if split in dataset:
    print("Sample entry structure:", dataset[split][0])

def preprocess_cosmosqa(dataset, split):
    """Extract cleaned question / correct-answer pairs from one Cosmos QA split.

    For every record the function reads ``question``, the four options
    ``answer0`` .. ``answer3`` and ``label`` (the index of the correct option).
    The question and the selected answer are cleaned by removing every
    character other than ASCII letters, digits, ``.,!?`` and spaces, then
    stripped of surrounding whitespace.

    A record is skipped when:
      * the question is missing or empty,
      * any of the four options is missing or empty,
      * the label is not an integer inside the option range (this includes
        ``None`` and the default ``-1`` used when the key is absent),
      * the question or the selected answer is empty after cleaning.

    Args:
        dataset: Mapping from split name to an iterable of record dicts.
        split: Name of the split to process.

    Returns:
        A list of ``{"question": str, "answer": str}`` dicts, in input order.

    Raises:
        ValueError: If ``split`` is not a key of ``dataset``, or if no record
            survives validation and cleaning.
    """
    if split not in dataset:
        raise ValueError(f"Error: Split '{split}' not found. Available splits: {list(dataset.keys())}")

    # Compiled once: everything outside this whitelist is removed from the text.
    disallowed = re.compile(r"[^a-zA-Z0-9.,!? ]")
    option_keys = ("answer0", "answer1", "answer2", "answer3")

    processed_data = []
    for item in dataset[split]:
        # `or ""` covers a key that is present but holds None.
        question = (item.get("question") or "").strip()
        options = [item.get(key, "") for key in option_keys]
        correct_index = item.get("label", -1)  # Correct option index

        # A None / non-integer label cannot be range-compared or used as an index.
        if not isinstance(correct_index, int) or not 0 <= correct_index < len(options):
            continue
        if not question or not all(options):
            continue

        clean_question = disallowed.sub("", question).strip()
        clean_answer = disallowed.sub("", options[correct_index]).strip()

        # Cleaning can reduce a field to nothing (e.g. text made only of symbols);
        # such a pair is no more usable than one that was empty to begin with.
        if clean_question and clean_answer:
            processed_data.append({"question": clean_question, "answer": clean_answer})

    if not processed_data:
        raise ValueError("Error: No valid 'question' and 'answer' fields found in the dataset.")

    return processed_data

# Run the preprocessing and preview the first few cleaned pairs.
# Any failure is reported as a one-line message instead of a traceback.
try:
    processed_samples = preprocess_cosmosqa(dataset, split)
    print(f"Processed {len(processed_samples)} question-answer pairs.")

    # Preview at most five samples; each sample is four output lines,
    # emitted through a single print call with a newline separator.
    for i, sample in enumerate(processed_samples[:5]):
        print(
            f"Sample {i+1}:",
            f"Question: {sample['question']}",
            f"Answer: {sample['answer']}",
            "-",
            sep="\n",
        )
except Exception as err:
    print(f"An error occurred: {err}")


  from .autonotebook import tqdm as notebook_tqdm
Downloading data: 16.7MB [00:00, 214MB/s]                    
Downloading data: 5.61MB [00:00, 240MB/s]                    
Downloading data: 2.13MB [00:00, 214MB/s]                   
Generating train split: 100%|██████████| 25262/25262 [00:00<00:00, 27168.53 examples/s]
Generating test split: 100%|██████████| 6963/6963 [00:00<00:00, 30036.99 examples/s]
Generating validation split: 100%|██████████| 2985/2985 [00:00<00:00, 27138.62 examples/s]


Sample entry structure: {'id': '3Q9SPIIRWJKVQ8244310E8TUS6YWAC##34V1S5K3GTZMDUBNBIGY93FLDOB690##A1S1K7134S2VUC##Blog_1044056##q1_a1##3XU9MCX6VQQG7YPLCSAFDPQNH4GR20', 'context': "Good Old War and person L : I saw both of these bands Wednesday night , and they both blew me away . seriously . Good Old War is acoustic and makes me smile . I really can not help but be happy when I listen to them ; I think it 's the fact that they seemed so happy themselves when they played .", 'question': 'In the future , will this person go to see other bands play ?', 'answer0': 'None of the above choices .', 'answer1': 'This person likes music and likes to see the show , they will see other bands play .', 'answer2': 'This person only likes Good Old War and Person L , no other bands .', 'answer3': 'Other Bands is not on tour and this person can not see them .', 'label': 1}
Processed 25262 question-answer pairs.
Sample 1:
Question: In the future , will this person go to see other bands play ?
Answer: This p

In [3]:
import openai
import pandas as pd
import re

# OpenAI client (no credentials are passed explicitly in this cell)
client = openai.OpenAI()

# Prompt asking the model for 50 "(Cause, Effect)" lines
prompt = """
Generate a list of 50 meaningful (Cause, Effect) pairs related to common-sense reasoning from the Cosmos QA dataset.
The pairs should be concise, logically accurate, and formatted strictly as:
(Cause, Effect)

These pairs should reflect daily life, human behavior, and realistic consequences.

Example:
(Falling asleep late, Feeling tired in the morning)
(Eating too much sugar, Developing cavities)
(Not wearing a seatbelt, Increased injury risk in an accident)
(Studying hard, Scoring high on a test)

Now generate 50 more such cause-effect pairs:
"""

# Call GPT, parse the pairs out of the reply, and save them as a CSV.
# Any exception (including an API failure) is reported by the except branch below;
# in that case no CSV is written.
try:
    response = client.chat.completions.create(
        model="gpt-4",
        messages=[{"role": "user", "content": prompt}]
    )

    # The message content can be None; treat that as an empty reply
    # instead of crashing on .strip().
    gpt_output = (response.choices[0].message.content or "").strip()

    # Extract "(cause, effect)" pairs.
    # Parentheses and newlines are excluded from both fields so a match cannot
    # start at an unrelated "(" (e.g. a parenthesised aside in an intro sentence)
    # and run across lines into a real pair, which would produce a garbage row.
    pair_pattern = re.compile(r"\(([^(),\n]+),\s*([^(),\n]+)\)")
    causal_pairs = [
        (cause.strip(), effect.strip())
        for cause, effect in pair_pattern.findall(gpt_output)
    ]

    # If nothing could be parsed from the reply, use a hard-coded fallback list.
    # (An API error does not reach this point; it is handled by the except branch.)
    if not causal_pairs:
        print("⚠️ GPT returned an empty response. Using fallback data.")
        causal_pairs = [
            ("Skipping breakfast", "Feeling hungry before lunch"),
            ("Driving too fast", "Higher chance of accidents"),
            ("Not exercising regularly", "Weight gain"),
            ("Leaving food out overnight", "Food spoilage"),
            ("Not getting enough sleep", "Difficulty concentrating"),
            ("Spending too much time on screens", "Eye strain"),
            ("Drinking coffee late at night", "Difficulty falling asleep"),
            ("Not saving money", "Financial difficulties"),
            ("Forgetting an umbrella", "Getting wet in the rain"),
            ("Procrastinating on homework", "Missing deadlines"),
            ("Not locking the door", "Higher risk of burglary"),
            ("Skipping a meal", "Feeling weak"),
            ("Ignoring alarm clocks", "Oversleeping"),
            ("Not doing laundry", "Running out of clean clothes"),
            ("Texting while walking", "Bumping into objects"),
            ("Leaving the stove on", "Fire hazard"),
            ("Using a phone at full brightness", "Battery drains quickly"),
            ("Parking illegally", "Getting a parking ticket"),
            ("Littering in public places", "Fines for littering"),
            ("Not watering plants", "Plants wilting"),
        ]

    # Convert to DataFrame
    df = pd.DataFrame(causal_pairs, columns=["Cause", "Effect"])

    # Save to CSV (relative to the notebook's working directory)
    output_path = "cosmosQA_generated_causal_pairs.csv"
    df.to_csv(output_path, index=False)

    print(f"\n✅ Successfully generated {len(df)} causal pairs and saved to {output_path}")

except Exception as e:
    print(f"❌ Error generating causal pairs: {e}")



✅ Successfully generated 50 causal pairs and saved to cosmosQA_generated_causal_pairs.csv


In [4]:
import openai
import pandas as pd
import re

# OpenAI client (no credentials are passed explicitly in this cell)
client = openai.OpenAI()

# Prompt for Cosmos QA: asks for 50 "(Cause, Effect)" lines covering several reasoning types
prompt = """
Generate a list of 50 meaningful (Cause, Effect) pairs based on contextual commonsense reasoning from the Cosmos QA dataset.
The pairs should reflect daily life, human behavior, and logical consequences.

They should include:
- Causes and effects of events (e.g., "Running late, Missing the bus")
- Motivations behind actions (e.g., "Feeling lonely, Calling a friend")
- Reactions to situations (e.g., "Hearing a loud noise, Feeling startled")
- Temporal reasoning (e.g., "Sleeping late, Feeling tired the next morning")
- Counterfactuals (e.g., "Leaving an umbrella at home, Getting wet in the rain")

Each pair should be formatted as: (Cause, Effect)

Example:
(Waking up late, Rushing to work)
(Studying hard, Passing the exam)
(Eating spicy food, Feeling heartburn)
(Losing a wallet, Feeling stressed)

Now generate 50 more such meaningful (Cause, Effect) pairs:
"""

# Call GPT, parse the pairs out of the reply, and save them as a CSV.
# Any exception (including an API failure) is reported by the except branch below;
# in that case no CSV is written.
try:
    response = client.chat.completions.create(
        model="gpt-4",
        messages=[{"role": "user", "content": prompt}]
    )

    # The message content can be None; treat that as an empty reply
    # instead of crashing on .strip().
    gpt_output = (response.choices[0].message.content or "").strip()

    # Extract "(cause, effect)" pairs.
    # Parentheses and newlines are excluded from both fields so a match cannot
    # start at an unrelated "(" (e.g. a parenthesised aside in an intro sentence)
    # and run across lines into a real pair, which would produce a garbage row.
    pair_pattern = re.compile(r"\(([^(),\n]+),\s*([^(),\n]+)\)")
    causal_pairs = [
        (cause.strip(), effect.strip())
        for cause, effect in pair_pattern.findall(gpt_output)
    ]

    # If nothing could be parsed from the reply, use a hard-coded fallback list.
    # (An API error does not reach this point; it is handled by the except branch.)
    if not causal_pairs:
        print("⚠️ GPT returned an empty response. Using fallback data.")
        causal_pairs = [
            ("Waking up late", "Rushing to work"),
            ("Studying hard", "Passing the exam"),
            ("Skipping breakfast", "Feeling hungry before lunch"),
            ("Driving too fast", "Getting a speeding ticket"),
            ("Not checking the weather", "Getting caught in the rain"),
            ("Forgetting an umbrella", "Getting wet"),
            ("Texting while walking", "Bumping into an object"),
            ("Leaving food uncovered", "Attracting insects"),
            ("Ignoring alarm clocks", "Oversleeping"),
            ("Not saving money", "Struggling with unexpected expenses"),
            ("Leaving the stove on", "Fire hazard"),
            ("Not locking the door", "Higher risk of burglary"),
            ("Spending too much time on social media", "Feeling disconnected in real life"),
            ("Skipping lunch", "Overeating at dinner"),
            ("Not charging a phone overnight", "Phone dying midday"),
            ("Drinking coffee before bed", "Difficulty falling asleep"),
            ("Leaving a candle unattended", "Risk of fire"),
            ("Using a weak password", "Getting hacked"),
            ("Forgetting to study", "Failing the test"),
            ("Spilling water near electronics", "Device getting damaged"),
        ]

    # Convert to DataFrame
    df = pd.DataFrame(causal_pairs, columns=["Cause", "Effect"])

    # Save to CSV (relative to the notebook's working directory).
    # NOTE(review): the earlier generation cell in this notebook writes to this
    # same file name, so running this cell overwrites that output — confirm this
    # is intended.
    output_path = "cosmosQA_generated_causal_pairs.csv"
    df.to_csv(output_path, index=False)

    print(f"\n✅ Successfully generated {len(df)} causal pairs and saved to {output_path}")

except Exception as e:
    print(f"❌ Error generating causal pairs: {e}")



✅ Successfully generated 50 causal pairs and saved to cosmosQA_generated_causal_pairs.csv
