In [29]:
import pandas as pd
import numpy as np
import os

In [30]:
# Load the full article table from the CSV named factiva_articles_all.csv.
# NOTE: absolute, machine-specific Windows path.
df_articles = pd.read_csv(r"C:\Users\balaj\code_files\Documents\Brahmanda\context_aware_risk_methodology\event_causal_prediction_system\data\factiva_articles_all.csv")

In [31]:
# First 10 rows of the article table; later cells run the pipeline on this sample.
df_articles_sample = df_articles.head(10)

In [40]:
# Display the column names of the sample frame.
df_articles_sample.columns 

Index(['Doc_ID', 'Date', 'Headline', 'Content'], dtype='object')

In [32]:
# Display (rows, columns) of the full article table.
df_articles.shape

(7028, 4)

In [33]:
def run_ollama_prompt(model_name, prompt_text):
    """Send ``prompt_text`` to a local Ollama model and return its reply.

    Runs ``ollama run <model_name>``, writes the prompt to the process's
    standard input and captures stdout/stderr as UTF-8 text.

    Args:
        model_name: Name of the Ollama model to run (e.g. "llama3.2").
        prompt_text: Prompt written to the model's standard input.

    Returns:
        The model's stdout with surrounding whitespace stripped, or ``None``
        when the command exits with a non-zero status or cannot be run.
    """
    command = ['ollama', 'run', model_name]
    try:
        # subprocess.run kills and reaps the child when communication is
        # interrupted (e.g. the kernel is interrupted during a long batch),
        # so no ollama process is left behind; a bare Popen is not cleaned up.
        completed = subprocess.run(
            command,
            # An empty string still gives the child a closed stdin pipe
            # instead of letting it inherit the notebook's stdin.
            input=prompt_text if prompt_text is not None else "",
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            encoding='utf-8',
            # One undecodable byte should not discard the whole reply.
            errors='replace',
        )
    except Exception as e:
        print(f"An error occurred: {e}")
        return None

    if completed.returncode != 0:
        print(f"Error: {completed.stderr}")
        return None

    return completed.stdout.strip()

In [34]:
def create_prompt(article_content):
    """Build the gold-only summarization prompt for one article.

    Args:
        article_content: Article text; it is inserted with default string
            formatting between two ``---`` delimiter lines.

    Returns:
        The prompt text, which asks the model for a JSON object with a single
        ``gold_summary`` field.
    """
    intro = (
        "You are a financial summarizer focused only on gold-related events.\n"
        "\n"
        "Here is a news article:\n"
        "---\n"
    )
    instructions = (
        "\n---\n"
        "\n"
        "Task:\n"
        "- Summarize ONLY the parts related to gold: gold prices, gold futures, bullion, gold demand, safe haven, inflation impact on gold.\n"
        "- Ignore content about equities, bonds, crypto, general economy unless it causally impacts gold.\n"
        '- If no gold-related discussion found, output: {"gold_summary": "No relevant gold content."}\n'
        "\n"
        "Important: output ONLY valid JSON with this format:\n"
        '{"gold_summary": "..."}.'
    )
    return intro + format(article_content) + instructions

In [35]:
def extract_gold_summary(ollama_response):
    """Pull the ``gold_summary`` string out of a raw LLM reply.

    The reply is scanned for the first JSON object that carries a
    ``gold_summary`` key, so chatter before or after the object (including
    stray braces) no longer makes a valid answer unparsable.

    Args:
        ollama_response: Raw text returned by the model; may be ``None``.

    Returns:
        The summary string, or ``''`` when no parsable object with that key is
        found or its value is not a string.
    """
    if not isinstance(ollama_response, str):
        return ''

    decoder = json.JSONDecoder()
    start = ollama_response.find('{')
    while start != -1:
        try:
            # raw_decode stops at the end of the object it parsed and ignores
            # whatever text follows it.
            parsed, _ = decoder.raw_decode(ollama_response, start)
        except json.JSONDecodeError:
            parsed = None
        if isinstance(parsed, dict) and 'gold_summary' in parsed:
            value = parsed['gold_summary']
            return value if isinstance(value, str) else ''
        start = ollama_response.find('{', start + 1)
    return ''

In [36]:
def normalize_similarity(cos_sim):
    """Rescale a cosine similarity to a relevance score.

    The value is first mapped from (-1, 1) to (0, 1) and then to (0.1, 1.0);
    the result is rounded to 4 decimal places.
    """
    return round(0.1 + 0.9 * ((cos_sim + 1) / 2), 4)

In [None]:
def summarize_and_score(df, model_name, output_csv):
    """Summarize the gold-related content of each article and score it.

    For every row the ``Content`` text is sent to the Ollama model, the
    ``gold_summary`` field of the reply is extracted, and the summary is scored
    by cosine similarity against a fixed gold-themed prototype sentence.
    The result (input columns plus ``gold_summary`` and
    ``gold_relevance_score``) is written to ``output_csv``. The input frame is
    copied first and is not modified.

    Args:
        df: DataFrame with a ``Content`` column.
        model_name: Ollama model name passed to ``run_ollama_prompt``.
        output_csv: Path of the CSV file to write.
    """
    df = df.copy()  # Avoid modifying the original DataFrame
    df['Content'] = df['Content'].astype(str)  # Ensure content is string type

    # Load sentence-transformers model and embed the prototype once.
    embed_model = SentenceTransformer('all-MiniLM-L6-v2')
    prototype_sentence = "gold price, bullion, inflation, gold futures, safe haven"
    proto_emb = embed_model.encode(prototype_sentence, convert_to_tensor=True)

    summaries = []
    scores = []

    for _, row in tqdm(df.iterrows(), total=len(df), desc="Processing articles"):
        prompt = create_prompt(row['Content'])
        response = run_ollama_prompt(model_name, prompt)
        gold_summary = extract_gold_summary(response) if response else ''

        if isinstance(gold_summary, str) and gold_summary.strip():
            # Embed gold summary and compare it with the prototype.
            summary_emb = embed_model.encode(gold_summary, convert_to_tensor=True)
            cos_sim = util.cos_sim(summary_emb, proto_emb).item()
            relevance_score = normalize_similarity(cos_sim)
        else:
            # No reply, an unparsable reply, or an empty summary: use the same
            # sentinel and floor score for all of them instead of scoring the
            # embedding of an empty string.
            gold_summary = "No summary generated."
            relevance_score = 0.1

        summaries.append(gold_summary)
        scores.append(relevance_score)

    df['gold_summary'] = summaries
    df['gold_relevance_score'] = scores

    df.to_csv(output_csv, index=False)
    print(f"\n✅ Finished processing. Output saved to {output_csv}")


In [46]:
import os
import csv
import json
import re
import subprocess
import pandas as pd
from tqdm import tqdm
from sentence_transformers import SentenceTransformer, util

# --- Ollama runner ---
def run_ollama_prompt(model_name, prompt_text):
    """Send ``prompt_text`` to a local Ollama model and return its reply.

    Runs ``ollama run <model_name>``, writes the prompt to the process's
    standard input and captures stdout/stderr as UTF-8 text.

    Args:
        model_name: Name of the Ollama model to run (e.g. "llama3.2").
        prompt_text: Prompt written to the model's standard input.

    Returns:
        The model's stdout with surrounding whitespace stripped, or ``None``
        when the command exits with a non-zero status or cannot be run.
    """
    command = ['ollama', 'run', model_name]
    try:
        # subprocess.run kills and reaps the child when communication is
        # interrupted (e.g. the kernel is interrupted during a long batch),
        # so no ollama process is left behind; a bare Popen is not cleaned up.
        completed = subprocess.run(
            command,
            # An empty string still gives the child a closed stdin pipe
            # instead of letting it inherit the notebook's stdin.
            input=prompt_text if prompt_text is not None else "",
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            encoding='utf-8',
            # One undecodable byte should not discard the whole reply.
            errors='replace',
        )
    except Exception as e:
        print(f"An error occurred: {e}")
        return None

    if completed.returncode != 0:
        print(f"Error: {completed.stderr}")
        return None

    return completed.stdout.strip()

# --- Summarization Prompt ---
def create_prompt(article_content):
    """Return the gold-only summarization prompt for ``article_content``.

    The article is placed between two ``---`` delimiter lines and followed by
    instructions asking for a JSON object with one ``gold_summary`` field.
    """
    prompt_lines = [
        "You are a financial summarizer focused only on gold-related events.",
        "",
        "Here is a news article:",
        "---",
        f"{article_content}",
        "---",
        "",
        "Task:",
        "- Summarize ONLY the parts related to gold: gold prices, gold futures, bullion, gold demand, safe haven, inflation impact on gold.",
        "- Ignore content about equities, bonds, crypto, general economy unless it causally impacts gold.",
        '- If no gold-related discussion found, output: {"gold_summary": "No relevant gold content."}',
        "",
        "Important: output ONLY valid JSON with this format:",
        '{"gold_summary": "..."}.',
    ]
    return "\n".join(prompt_lines)

# --- Parse LLM JSON safely ---
def extract_gold_summary(ollama_response):
    """Pull the ``gold_summary`` string out of a raw LLM reply.

    The reply is scanned for the first JSON object that carries a
    ``gold_summary`` key, so chatter before or after the object (including
    stray braces) no longer makes a valid answer unparsable.

    Args:
        ollama_response: Raw text returned by the model; may be ``None``.

    Returns:
        The summary string, or ``''`` when no parsable object with that key is
        found or its value is not a string.
    """
    if not isinstance(ollama_response, str):
        return ''

    decoder = json.JSONDecoder()
    start = ollama_response.find('{')
    while start != -1:
        try:
            # raw_decode stops at the end of the object it parsed and ignores
            # whatever text follows it.
            parsed, _ = decoder.raw_decode(ollama_response, start)
        except json.JSONDecodeError:
            parsed = None
        if isinstance(parsed, dict) and 'gold_summary' in parsed:
            value = parsed['gold_summary']
            return value if isinstance(value, str) else ''
        start = ollama_response.find('{', start + 1)
    return ''

# --- Normalize cosine similarity ---
def normalize_similarity(cos_sim):
    """Map a cosine similarity from (-1, 1) onto a (0.1, 1.0) score.

    Returns the score rounded to 4 decimal places.
    """
    floor, span = 0.1, 0.9
    unit_interval = (cos_sim + 1) / 2  # (-1, 1) -> (0, 1)
    return round(floor + span * unit_interval, 4)

# --- Main Processor ---
def summarize_and_score(df, model_name, output_csv):
    """Summarize the gold-related content of each article and score it.

    For every row the ``Content`` text is sent to the Ollama model, the
    ``gold_summary`` field of the reply is extracted, and the summary is scored
    by cosine similarity against a fixed gold-themed prototype sentence.
    The columns ``gold_summary`` and ``gold_relevance_score`` are added to
    ``df`` in place and the frame is written to ``output_csv``.

    Args:
        df: DataFrame with a ``Content`` column; it is modified in place.
        model_name: Ollama model name passed to ``run_ollama_prompt``.
        output_csv: Path of the CSV file to write.
    """
    # Load sentence-transformers model and embed the prototype once.
    embed_model = SentenceTransformer('all-MiniLM-L6-v2')
    prototype_sentence = "gold price, bullion, inflation, gold futures, safe haven"
    proto_emb = embed_model.encode(prototype_sentence, convert_to_tensor=True)

    summaries = []
    scores = []

    for _, row in tqdm(df.iterrows(), total=len(df), desc="Processing articles"):
        content = row['Content']  # Important: Using "Content" not "content"

        prompt = create_prompt(content)
        response = run_ollama_prompt(model_name, prompt)
        gold_summary = extract_gold_summary(response) if response else ''

        if isinstance(gold_summary, str) and gold_summary.strip():
            # Embed gold summary and compare it with the prototype.
            summary_emb = embed_model.encode(gold_summary, convert_to_tensor=True)
            cos_sim = util.cos_sim(summary_emb, proto_emb).item()
            relevance_score = normalize_similarity(cos_sim)
        else:
            # No reply, an unparsable reply, or an empty summary: use the same
            # sentinel and floor score for all of them instead of scoring the
            # embedding of an empty string.
            gold_summary = "No summary generated."
            relevance_score = 0.1

        summaries.append(gold_summary)
        scores.append(relevance_score)

    df['gold_summary'] = summaries
    df['gold_relevance_score'] = scores

    df.to_csv(output_csv, index=False)
    print(f"\n✅ Finished processing. Output saved to {output_csv}")

# --- Running Section --- #
# Run the summarize-and-score pipeline on the 10-row sample with llama3.2 and
# write the result to a CSV at an absolute, machine-specific path.
if __name__ == "__main__":
    # Instead of loading CSV, using provided DataFrame
    df = df_articles_sample.copy()  # Your in-memory sample DataFrame
    output_csv = r"C:\Users\balaj\code_files\Documents\Brahmanda\context_aware_risk_methodology\event_causal_prediction_system\data\all_gold_summarized_scored_articles_llama3.2.csv"
    model_name = "llama3.2"  # or your local Llama model

    summarize_and_score(df, model_name, output_csv)


Processing articles: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10/10 [00:19<00:00,  1.94s/it]


‚úÖ Finished processing. Output saved to C:\Users\balaj\code_files\Documents\Brahmanda\context_aware_risk_methodology\event_causal_prediction_system\data\all_gold_summarized_scored_articles_llama3.2.csv





## Including the cause and effect

In [47]:
import os
import json
import re
import subprocess
import pandas as pd
from tqdm import tqdm
from sentence_transformers import SentenceTransformer, util

# --- Ollama LLM Runner ---
def run_ollama_prompt(model_name, prompt_text):
    """Send ``prompt_text`` to a local Ollama model and return its reply.

    Runs ``ollama run <model_name>``, writes the prompt to the process's
    standard input and captures stdout/stderr as UTF-8 text.

    Args:
        model_name: Name of the Ollama model to run (e.g. "llama3.1").
        prompt_text: Prompt written to the model's standard input.

    Returns:
        The model's stdout with surrounding whitespace stripped, or ``None``
        when the command exits with a non-zero status or cannot be run.
    """
    command = ['ollama', 'run', model_name]
    try:
        # subprocess.run kills and reaps the child when communication is
        # interrupted (e.g. the kernel is interrupted during a long batch),
        # so no ollama process is left behind; a bare Popen is not cleaned up.
        completed = subprocess.run(
            command,
            # An empty string still gives the child a closed stdin pipe
            # instead of letting it inherit the notebook's stdin.
            input=prompt_text if prompt_text is not None else "",
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            encoding='utf-8',
            # One undecodable byte should not discard the whole reply.
            errors='replace',
        )
    except Exception as e:
        print(f"An error occurred: {e}")
        return None

    if completed.returncode != 0:
        print(f"Error: {completed.stderr}")
        return None

    return completed.stdout.strip()

# --- Causal Gold Extraction Prompt ---
def create_prompt(article_content):
    """Build the cause/effect extraction prompt for one article.

    Args:
        article_content: Article text, inserted between two ``---`` lines.

    Returns:
        The prompt text, stripped of leading/trailing whitespace. It asks the
        model for a JSON object whose ``gold_causal_summary`` key holds a list
        of ``cause``/``effect`` pairs.
    """
    template = """
You are a causal summarizer focused on the gold market.

Here is a news article:
---
{article_content}
---

Task:
- Analyze only gold-related parts.
- Identify clear CAUSE (reason) and EFFECT (impact on gold prices, volatility, demand).
- Extract clean cause-effect pairs.
- Only output JSON with exact structure:

{{
  "gold_causal_summary": [
    {{
      "cause": "....",
      "effect": "...."
    }},
    {{
      "cause": "....",
      "effect": "...."
    }}
  ]
}}

Notes:
- Be very specific.
- If no gold-related causality is found, output: {{"gold_causal_summary": []}}
- Do not summarize unrelated parts of the article.
- No extra text outside JSON.
"""
    filled = template.format(article_content=article_content)
    return filled.strip()

# --- Parse LLM JSON Output ---
def extract_causal_pairs(ollama_response):
    """Pull the list of cause/effect pairs out of a raw LLM reply.

    The reply is scanned for the first JSON object that carries a
    ``gold_causal_summary`` key, so chatter before or after the object
    (including stray braces) no longer makes a valid answer unparsable.

    Args:
        ollama_response: Raw text returned by the model; may be ``None``.

    Returns:
        A list of dicts that each have string ``cause`` and ``effect`` values.
        Entries that are not dicts or lack either value are dropped. ``[]`` is
        returned when nothing usable is found.
    """
    if not isinstance(ollama_response, str):
        return []

    decoder = json.JSONDecoder()
    raw_pairs = None
    start = ollama_response.find('{')
    while start != -1:
        try:
            # raw_decode stops at the end of the object it parsed and ignores
            # whatever text follows it.
            parsed, _ = decoder.raw_decode(ollama_response, start)
        except json.JSONDecodeError:
            parsed = None
        if isinstance(parsed, dict) and 'gold_causal_summary' in parsed:
            raw_pairs = parsed['gold_causal_summary']
            break
        start = ollama_response.find('{', start + 1)

    if isinstance(raw_pairs, dict):
        raw_pairs = [raw_pairs]  # a single pair given without the list wrapper
    if not isinstance(raw_pairs, list):
        return []

    pairs = []
    for item in raw_pairs:
        if not isinstance(item, dict):
            continue
        cause, effect = item.get('cause'), item.get('effect')
        if cause is None or effect is None:
            continue
        # Callers concatenate these values, so make sure they are strings.
        pairs.append({**item, 'cause': str(cause), 'effect': str(effect)})
    return pairs

# --- Normalize Cosine Similarity ---
def normalize_similarity(cos_sim):
    """Rescale a cosine similarity to a relevance score.

    The value is first mapped from (-1, 1) to (0, 1) and then to (0.1, 1.0);
    the result is rounded to 4 decimal places.
    """
    return round(0.1 + 0.9 * ((cos_sim + 1) / 2), 4)

# --- Main Processor ---
def summarize_and_score(df, model_name, output_csv):
    """Extract gold cause/effect pairs per article and score their relevance.

    For every row the ``Content`` text is sent to the Ollama model, the
    cause/effect pairs in the reply are joined into text columns, and the
    combined cause-effect text is scored by cosine similarity against a fixed
    gold-themed prototype sentence. Four columns are added to ``df`` in place
    (``gold_cause``, ``gold_effect``, ``gold_cause_effect_summary``,
    ``gold_relevance_score``) and the frame is written to ``output_csv``.

    Args:
        df: DataFrame with a ``Content`` column; it is modified in place.
        model_name: Ollama model name passed to ``run_ollama_prompt``.
        output_csv: Path of the CSV file to write.
    """
    # Load sentence-transformers model and embed the prototype once.
    embed_model = SentenceTransformer('all-MiniLM-L6-v2')
    prototype_sentence = "gold price, bullion, inflation, gold futures, safe haven"
    proto_emb = embed_model.encode(prototype_sentence, convert_to_tensor=True)

    cause_list = []
    effect_list = []
    cause_effect_summary_list = []
    similarity_scores = []

    for idx, row in tqdm(df.iterrows(), total=len(df), desc="Processing articles"):
        content = row['Content']

        prompt = create_prompt(content)
        response = run_ollama_prompt(model_name, prompt)
        extracted = extract_causal_pairs(response) if response else []

        # The model does not always follow the requested schema. Keep only
        # entries that really are cause/effect pairs so one malformed reply
        # cannot raise KeyError/TypeError and abort the whole run.
        causes = [
            {'cause': str(c['cause']), 'effect': str(c['effect'])}
            for c in (extracted if isinstance(extracted, list) else [])
            if isinstance(c, dict)
            and c.get('cause') is not None
            and c.get('effect') is not None
        ]

        if causes:
            # If multiple causes-effects, combine them
            cause_text = " || ".join(c['cause'] for c in causes)
            effect_text = " || ".join(c['effect'] for c in causes)
            cause_effect_summary = " || ".join(
                "Cause: " + c['cause'] + " --> Effect: " + c['effect'] for c in causes
            )
        else:
            cause_text = "No gold cause identified."
            effect_text = "No gold effect identified."
            cause_effect_summary = "No gold causality found."

        # Embed the cause_effect_summary for scoring
        summary_emb = embed_model.encode(cause_effect_summary, convert_to_tensor=True)
        cos_sim = util.cos_sim(summary_emb, proto_emb).item()
        relevance_score = normalize_similarity(cos_sim)

        cause_list.append(cause_text)
        effect_list.append(effect_text)
        cause_effect_summary_list.append(cause_effect_summary)
        similarity_scores.append(relevance_score)

    # Add new columns
    df['gold_cause'] = cause_list
    df['gold_effect'] = effect_list
    df['gold_cause_effect_summary'] = cause_effect_summary_list
    df['gold_relevance_score'] = similarity_scores

    # Save
    df.to_csv(output_csv, index=False)
    print(f"\n✅ Finished processing. Output saved to {output_csv}")

# --- RUNNING SECTION --- #
# Run the cause/effect pipeline on the 10-row sample with llama3.1 and write
# the result to a CSV at an absolute, machine-specific path.
if __name__ == "__main__":
    df = df_articles_sample.copy()  # Your provided DataFrame
    output_csv = r"C:\Users\balaj\code_files\Documents\Brahmanda\context_aware_risk_methodology\event_causal_prediction_system\data\causal_gold_articles.csv"
    model_name = "llama3.1"

    summarize_and_score(df, model_name, output_csv)


Processing articles: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10/10 [01:24<00:00,  8.48s/it]


‚úÖ Finished processing. Output saved to C:\Users\balaj\code_files\Documents\Brahmanda\context_aware_risk_methodology\event_causal_prediction_system\data\causal_gold_articles.csv





## Extending it to the next level by also adding the BERT embeddings

In [48]:
import os
import json
import re
import subprocess
import pandas as pd
from tqdm import tqdm
from sentence_transformers import SentenceTransformer, util

# --- Ollama LLM Runner ---
def run_ollama_prompt(model_name, prompt_text):
    """Send ``prompt_text`` to a local Ollama model and return its reply.

    Runs ``ollama run <model_name>``, writes the prompt to the process's
    standard input and captures stdout/stderr as UTF-8 text.

    Args:
        model_name: Name of the Ollama model to run (e.g. "llama3.1").
        prompt_text: Prompt written to the model's standard input.

    Returns:
        The model's stdout with surrounding whitespace stripped, or ``None``
        when the command exits with a non-zero status or cannot be run.
    """
    command = ['ollama', 'run', model_name]
    try:
        # subprocess.run kills and reaps the child when communication is
        # interrupted (e.g. the kernel is interrupted during a long batch),
        # so no ollama process is left behind; a bare Popen is not cleaned up.
        completed = subprocess.run(
            command,
            # An empty string still gives the child a closed stdin pipe
            # instead of letting it inherit the notebook's stdin.
            input=prompt_text if prompt_text is not None else "",
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            encoding='utf-8',
            # One undecodable byte should not discard the whole reply.
            errors='replace',
        )
    except Exception as e:
        print(f"An error occurred: {e}")
        return None

    if completed.returncode != 0:
        print(f"Error: {completed.stderr}")
        return None

    return completed.stdout.strip()

# --- Causal Gold Extraction Prompt ---
def create_causal_prompt(article_content):
    """Return the cause/effect extraction prompt for ``article_content``.

    The article is placed between two ``---`` delimiter lines and followed by
    instructions asking for a JSON object whose ``gold_causal_summary`` key
    holds a list of ``cause``/``effect`` pairs.
    """
    prompt_lines = [
        "You are a causal summarizer focused on the gold market.",
        "",
        "Here is a news article:",
        "---",
        format(article_content),
        "---",
        "",
        "Task:",
        "- Analyze only gold-related parts.",
        "- Identify clear CAUSE (reason) and EFFECT (impact on gold prices, volatility, demand).",
        "- Extract clean cause-effect pairs.",
        "- Only output JSON with exact structure:",
        "",
        "{",
        '  "gold_causal_summary": [',
        "    {",
        '      "cause": "....",',
        '      "effect": "...."',
        "    },",
        "    {",
        '      "cause": "....",',
        '      "effect": "...."',
        "    }",
        "  ]",
        "}",
        "",
        "Notes:",
        '- If no gold-related causality found, output: {"gold_causal_summary": []}',
        "- No extra text, only valid JSON.",
    ]
    return "\n".join(prompt_lines)

# --- General Gold Summary Prompt ---
def create_general_prompt(article_content):
    """Build the short gold-summary prompt for one article.

    Args:
        article_content: Article text, inserted between two ``---`` lines.

    Returns:
        The prompt text, stripped of leading/trailing whitespace. It asks the
        model for a JSON object with a single ``gold_summary`` field.
    """
    template = """
You are a financial summarizer.

Here is a news article:
---
{article_content}
---

Task:
- Summarize briefly (2-3 sentences) any discussion related to GOLD: prices, futures, volatility, safe haven demand.
- If no gold-related content found, output: {{"gold_summary": "No gold-related content found."}}

Output ONLY valid JSON:

{{
  "gold_summary": "...."
}}
"""
    filled = template.format(article_content=article_content)
    return filled.strip()

# --- Parse LLM JSON Outputs ---
def extract_causal_pairs(ollama_response):
    """Pull the list of cause/effect pairs out of a raw LLM reply.

    The reply is scanned for the first JSON object that carries a
    ``gold_causal_summary`` key, so chatter before or after the object
    (including stray braces) no longer makes a valid answer unparsable.

    Args:
        ollama_response: Raw text returned by the model; may be ``None``.

    Returns:
        A list of dicts that each have string ``cause`` and ``effect`` values.
        Entries that are not dicts or lack either value are dropped. ``[]`` is
        returned when nothing usable is found.
    """
    if not isinstance(ollama_response, str):
        return []

    decoder = json.JSONDecoder()
    raw_pairs = None
    start = ollama_response.find('{')
    while start != -1:
        try:
            # raw_decode stops at the end of the object it parsed and ignores
            # whatever text follows it.
            parsed, _ = decoder.raw_decode(ollama_response, start)
        except json.JSONDecodeError:
            parsed = None
        if isinstance(parsed, dict) and 'gold_causal_summary' in parsed:
            raw_pairs = parsed['gold_causal_summary']
            break
        start = ollama_response.find('{', start + 1)

    if isinstance(raw_pairs, dict):
        raw_pairs = [raw_pairs]  # a single pair given without the list wrapper
    if not isinstance(raw_pairs, list):
        return []

    pairs = []
    for item in raw_pairs:
        if not isinstance(item, dict):
            continue
        cause, effect = item.get('cause'), item.get('effect')
        if cause is None or effect is None:
            continue
        # Callers concatenate these values, so make sure they are strings.
        pairs.append({**item, 'cause': str(cause), 'effect': str(effect)})
    return pairs

def extract_general_summary(ollama_response):
    """Pull the ``gold_summary`` string out of a raw LLM reply.

    The reply is scanned for the first JSON object that carries a
    ``gold_summary`` key, so chatter before or after the object (including
    stray braces) no longer makes a valid answer unparsable.

    Args:
        ollama_response: Raw text returned by the model; may be ``None``.

    Returns:
        The summary string, or ``''`` when no parsable object with that key is
        found or its value is not a string.
    """
    if not isinstance(ollama_response, str):
        return ''

    decoder = json.JSONDecoder()
    start = ollama_response.find('{')
    while start != -1:
        try:
            # raw_decode stops at the end of the object it parsed and ignores
            # whatever text follows it.
            parsed, _ = decoder.raw_decode(ollama_response, start)
        except json.JSONDecodeError:
            parsed = None
        if isinstance(parsed, dict) and 'gold_summary' in parsed:
            value = parsed['gold_summary']
            return value if isinstance(value, str) else ''
        start = ollama_response.find('{', start + 1)
    return ''

# --- Normalize Cosine Similarity ---
def normalize_similarity(cos_sim):
    """Map a cosine similarity from (-1, 1) onto a (0.1, 1.0) score.

    Returns the score rounded to 4 decimal places.
    """
    floor, span = 0.1, 0.9
    unit_interval = (cos_sim + 1) / 2  # (-1, 1) -> (0, 1)
    return round(floor + span * unit_interval, 4)

# --- Main Processor ---
def summarize_and_score(df, model_name, output_csv):
    """Extract causes/effects and a general gold summary per article.

    For every row the ``Content`` text is sent to the Ollama model twice: once
    for cause/effect pairs and once for a short gold summary. The summary is
    embedded, and the combined cause-effect text is scored by cosine similarity
    against a fixed gold-themed prototype sentence. Six result columns are
    added to ``df`` in place and the frame is written to ``output_csv``.

    A row whose processing raises is reported, collected in a separate
    ``*_bad_rows`` CSV, and keeps empty (``None``) values in the result
    columns, so the result lists always have one entry per input row.

    Args:
        df: DataFrame with a ``Content`` column; it is modified in place.
        model_name: Ollama model name passed to ``run_ollama_prompt``.
        output_csv: Path of the CSV file to write.
    """
    embed_model = SentenceTransformer('all-MiniLM-L6-v2')
    prototype_sentence = "gold price, bullion, inflation, gold futures, safe haven"
    proto_emb = embed_model.encode(prototype_sentence, convert_to_tensor=True)

    cause_list = []
    effect_list = []
    cause_effect_summary_list = []
    general_summary_list = []
    general_embedding_list = []
    similarity_scores = []
    bad_rows = []

    for idx, row in tqdm(df.iterrows(), total=len(df), desc="Processing articles"):
        try:
            content = row['Content']

            # Causal extraction
            causal_prompt = create_causal_prompt(content)
            causal_response = run_ollama_prompt(model_name, causal_prompt)
            causes = extract_causal_pairs(causal_response) if causal_response else []

            if causes:
                cause_texts = [c['cause'] for c in causes]
                effect_texts = [c['effect'] for c in causes]
                joined_cause_effect = ["Cause: " + c['cause'] + " --> Effect: " + c['effect'] for c in causes]

                cause_text = " || ".join(cause_texts)
                effect_text = " || ".join(effect_texts)
                cause_effect_summary = " || ".join(joined_cause_effect)
            else:
                cause_text = "No gold cause identified."
                effect_text = "No gold effect identified."
                cause_effect_summary = "No gold causality found."

            # General summarization
            general_prompt = create_general_prompt(content)
            general_response = run_ollama_prompt(model_name, general_prompt)
            gold_general_summary = extract_general_summary(general_response) if general_response else "No gold-related content found."

            # Embedding the general summary
            general_emb = embed_model.encode(gold_general_summary, convert_to_numpy=True).tolist()

            # Relevance scoring
            summary_emb = embed_model.encode(cause_effect_summary, convert_to_tensor=True)
            cos_sim = util.cos_sim(summary_emb, proto_emb).item()
            relevance_score = normalize_similarity(cos_sim)

        except Exception as e:
            print(f"⚠️ Warning: Skipping row {idx} due to error: {e}")
            bad_rows.append(row)
            # Placeholders keep every result list aligned with df. Appending
            # nothing for a failed row made the column assignment below raise
            # a length-mismatch ValueError and lose the whole run.
            cause_text = effect_text = cause_effect_summary = None
            gold_general_summary = general_emb = relevance_score = None

        # Exactly one entry per input row, whether it succeeded or failed.
        cause_list.append(cause_text)
        effect_list.append(effect_text)
        cause_effect_summary_list.append(cause_effect_summary)
        general_summary_list.append(gold_general_summary)
        general_embedding_list.append(general_emb)
        similarity_scores.append(relevance_score)

    # Final dataframe
    df['gold_cause'] = cause_list
    df['gold_effect'] = effect_list
    df['gold_cause_effect_summary'] = cause_effect_summary_list
    df['gold_general_summary'] = general_summary_list
    df['gold_general_embedding'] = general_embedding_list
    df['gold_relevance_score'] = similarity_scores

    df.to_csv(output_csv, index=False)
    print(f"\n✅ Finished processing. Output saved to {output_csv}")

    # Save bad rows separately
    if bad_rows:
        bad_rows_df = pd.DataFrame(bad_rows)
        # Only touch the file extension: str.replace(".csv", ...) would also
        # rewrite a ".csv" inside a directory name and, for a path without
        # ".csv", would return the main output path itself.
        root, ext = os.path.splitext(output_csv)
        bad_rows_output = f"{root}_bad_rows{ext or '.csv'}"
        bad_rows_df.to_csv(bad_rows_output, index=False)
        print(f"\n🚨 Saved bad rows separately to {bad_rows_output}")

# --- RUNNING SECTION --- #
# Run the causal + general-summary pipeline on the 10-row sample with llama3.1
# and write the result to a CSV at an absolute, machine-specific path.
if __name__ == "__main__":
    df = df_articles_sample.copy()  # your sample df
    output_csv = r"C:\Users\balaj\code_files\Documents\Brahmanda\context_aware_risk_methodology\event_causal_prediction_system\data\causal_gold_articles_full.csv"
    model_name = "llama3.1"

    summarize_and_score(df, model_name, output_csv)


Processing articles: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10/10 [02:34<00:00, 15.48s/it]


‚úÖ Finished processing. Output saved to C:\Users\balaj\code_files\Documents\Brahmanda\context_aware_risk_methodology\event_causal_prediction_system\data\causal_gold_articles_full.csv





## The problem before was that, even though we had causes and effects, it was giving blank general summaries; that is handled here

In [51]:
import os
import json
import re
import subprocess
import pandas as pd
from tqdm import tqdm
from sentence_transformers import SentenceTransformer, util

# --- Ollama LLM Runner ---
def run_ollama_prompt(model_name, prompt_text):
    """Send ``prompt_text`` to a local Ollama model and return its reply.

    Runs ``ollama run <model_name>``, writes the prompt to the process's
    standard input and captures stdout/stderr as UTF-8 text.

    Args:
        model_name: Name of the Ollama model to run (e.g. "llama3.2").
        prompt_text: Prompt written to the model's standard input.

    Returns:
        The model's stdout with surrounding whitespace stripped, or ``None``
        when the command exits with a non-zero status or cannot be run.
    """
    command = ['ollama', 'run', model_name]
    try:
        # subprocess.run kills and reaps the child when communication is
        # interrupted (e.g. the kernel is interrupted during a long batch),
        # so no ollama process is left behind; a bare Popen is not cleaned up.
        completed = subprocess.run(
            command,
            # An empty string still gives the child a closed stdin pipe
            # instead of letting it inherit the notebook's stdin.
            input=prompt_text if prompt_text is not None else "",
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            encoding='utf-8',
            # One undecodable byte should not discard the whole reply.
            errors='replace',
        )
    except Exception as e:
        print(f"An error occurred: {e}")
        return None

    if completed.returncode != 0:
        print(f"Error: {completed.stderr}")
        return None

    return completed.stdout.strip()

# --- Causal Gold Extraction Prompt ---
def create_causal_prompt(article_content):
    """Build the cause/effect extraction prompt for one article.

    Args:
        article_content: Article text, inserted between two ``---`` lines.

    Returns:
        The prompt text, stripped of leading/trailing whitespace. It asks the
        model for a JSON object whose ``gold_causal_summary`` key holds a list
        of ``cause``/``effect`` pairs.
    """
    template = """
You are a causal summarizer focused on the gold market.

Here is a news article:
---
{article_content}
---

Task:
- Analyze only gold-related parts.
- Identify clear CAUSE (reason) and EFFECT (impact on gold prices, volatility, demand).
- Extract clean cause-effect pairs.
- Only output JSON with exact structure:

{{
  "gold_causal_summary": [
    {{
      "cause": "....",
      "effect": "...."
    }},
    {{
      "cause": "....",
      "effect": "...."
    }}
  ]
}}

Notes:
- If no gold-related causality found, output: {{"gold_causal_summary": []}}
- No extra text, only valid JSON.
"""
    filled = template.format(article_content=article_content)
    return filled.strip()

# --- General Gold Summary Prompt ---
def create_general_prompt(article_content):
    """Return the short gold-summary prompt for ``article_content``.

    The article is placed between two ``---`` delimiter lines and followed by
    instructions asking for a JSON object with one ``gold_summary`` field.
    """
    prompt_lines = [
        "You are a financial summarizer.",
        "",
        "Here is a news article:",
        "---",
        format(article_content),
        "---",
        "",
        "Task:",
        "- Summarize briefly (2-3 sentences) any discussion related to GOLD: prices, futures, volatility, safe haven demand.",
        '- If no gold-related content found, output: {"gold_summary": "No gold-related content found."}',
        "",
        "Output ONLY valid JSON:",
        "",
        "{",
        '  "gold_summary": "...."',
        "}",
    ]
    return "\n".join(prompt_lines)

# --- Parse LLM JSON Outputs ---
def extract_causal_pairs(ollama_response):
    """Pull the list of cause/effect pairs out of a raw LLM reply.

    The reply is scanned for the first JSON object that carries a
    ``gold_causal_summary`` key, so chatter before or after the object
    (including stray braces) no longer makes a valid answer unparsable.

    Args:
        ollama_response: Raw text returned by the model; may be ``None``.

    Returns:
        A list of dicts that each have string ``cause`` and ``effect`` values.
        Entries that are not dicts or lack either value are dropped. ``[]`` is
        returned when nothing usable is found.
    """
    if not isinstance(ollama_response, str):
        return []

    decoder = json.JSONDecoder()
    raw_pairs = None
    start = ollama_response.find('{')
    while start != -1:
        try:
            # raw_decode stops at the end of the object it parsed and ignores
            # whatever text follows it.
            parsed, _ = decoder.raw_decode(ollama_response, start)
        except json.JSONDecodeError:
            parsed = None
        if isinstance(parsed, dict) and 'gold_causal_summary' in parsed:
            raw_pairs = parsed['gold_causal_summary']
            break
        start = ollama_response.find('{', start + 1)

    if isinstance(raw_pairs, dict):
        raw_pairs = [raw_pairs]  # a single pair given without the list wrapper
    if not isinstance(raw_pairs, list):
        return []

    pairs = []
    for item in raw_pairs:
        if not isinstance(item, dict):
            continue
        cause, effect = item.get('cause'), item.get('effect')
        if cause is None or effect is None:
            continue
        # Callers concatenate these values, so make sure they are strings.
        pairs.append({**item, 'cause': str(cause), 'effect': str(effect)})
    return pairs

def extract_general_summary(ollama_response):
    """Pull the ``gold_summary`` string out of a raw LLM reply.

    The reply is scanned for the first JSON object that carries a
    ``gold_summary`` key, so chatter before or after the object (including
    stray braces) no longer makes a valid answer unparsable.

    Args:
        ollama_response: Raw text returned by the model; may be ``None``.

    Returns:
        The summary string, or ``''`` when no parsable object with that key is
        found or its value is not a string.
    """
    if not isinstance(ollama_response, str):
        return ''

    decoder = json.JSONDecoder()
    start = ollama_response.find('{')
    while start != -1:
        try:
            # raw_decode stops at the end of the object it parsed and ignores
            # whatever text follows it.
            parsed, _ = decoder.raw_decode(ollama_response, start)
        except json.JSONDecodeError:
            parsed = None
        if isinstance(parsed, dict) and 'gold_summary' in parsed:
            value = parsed['gold_summary']
            return value if isinstance(value, str) else ''
        start = ollama_response.find('{', start + 1)
    return ''

# --- Normalize Cosine Similarity ---
def normalize_similarity(cos_sim):
    """Rescale a cosine similarity to a relevance score.

    The value is first mapped from (-1, 1) to (0, 1) and then to (0.1, 1.0);
    the result is rounded to 4 decimal places.
    """
    return round(0.1 + 0.9 * ((cos_sim + 1) / 2), 4)

# --- Main Processor ---
def summarize_and_score(df, model_name, output_csv):
    """Extract causes/effects and a general gold summary per article.

    For every row the ``Content`` text is sent to the Ollama model twice: once
    for cause/effect pairs and once for a short gold summary. When the summary
    is missing (blank, unparsable, or the "no content" sentinel) but causes
    were found, a summary is synthesised from the causes and the row is
    flagged ``causal_only``. The summary is embedded, and the combined
    cause-effect text is scored by cosine similarity against a fixed
    gold-themed prototype sentence. Seven result columns are added to ``df``
    in place and the frame is written to ``output_csv``.

    A row whose processing raises is reported, collected in a separate
    ``*_bad_rows`` CSV, and keeps empty (``None``) values in the result
    columns, so the result lists always have one entry per input row.

    Args:
        df: DataFrame with a ``Content`` column; it is modified in place.
        model_name: Ollama model name passed to ``run_ollama_prompt``.
        output_csv: Path of the CSV file to write.
    """
    embed_model = SentenceTransformer('all-MiniLM-L6-v2')
    prototype_sentence = "gold price, bullion, inflation, gold futures, safe haven"
    proto_emb = embed_model.encode(prototype_sentence, convert_to_tensor=True)

    no_gold_summary = "No gold-related content found."

    cause_list = []
    effect_list = []
    cause_effect_summary_list = []
    general_summary_list = []
    general_embedding_list = []
    similarity_scores = []
    causal_only_flags = []
    bad_rows = []

    for idx, row in tqdm(df.iterrows(), total=len(df), desc="Processing articles"):
        try:
            content = row['Content']

            # Causal extraction
            causal_prompt = create_causal_prompt(content)
            causal_response = run_ollama_prompt(model_name, causal_prompt)
            causes = extract_causal_pairs(causal_response) if causal_response else []

            if causes:
                cause_texts = [c['cause'] for c in causes]
                effect_texts = [c['effect'] for c in causes]
                joined_cause_effect = ["Cause: " + c['cause'] + " --> Effect: " + c['effect'] for c in causes]

                cause_text = " || ".join(cause_texts)
                effect_text = " || ".join(effect_texts)
                cause_effect_summary = " || ".join(joined_cause_effect)
            else:
                cause_text = "No gold cause identified."
                effect_text = "No gold effect identified."
                cause_effect_summary = "No gold causality found."

            # General summarization
            general_prompt = create_general_prompt(content)
            general_response = run_ollama_prompt(model_name, general_prompt)
            gold_general_summary = extract_general_summary(general_response) if general_response else no_gold_summary

            # Fallback: if no summary but causes exist. A parse failure yields
            # a blank summary rather than the sentinel text, so both must be
            # treated as "missing" for the fallback to fire.
            summary_missing = (
                not isinstance(gold_general_summary, str)
                or not gold_general_summary.strip()
                or gold_general_summary == no_gold_summary
            )
            causal_only = bool(summary_missing and causes)
            if causal_only:
                gold_general_summary = f"This article discusses gold causally, mentioning: {', '.join(cause_texts)}."
            elif summary_missing:
                gold_general_summary = no_gold_summary

            # Embedding the general summary
            general_emb = embed_model.encode(gold_general_summary, convert_to_numpy=True).tolist()

            # Relevance scoring
            summary_emb = embed_model.encode(cause_effect_summary, convert_to_tensor=True)
            cos_sim = util.cos_sim(summary_emb, proto_emb).item()
            relevance_score = normalize_similarity(cos_sim)

        except Exception as e:
            print(f"⚠️ Warning: Skipping row {idx} due to error: {e}")
            bad_rows.append(row)
            # Placeholders keep every result list aligned with df. Appending
            # nothing for a failed row made the column assignment below raise
            # a length-mismatch ValueError and lose the whole run.
            cause_text = effect_text = cause_effect_summary = None
            gold_general_summary = general_emb = relevance_score = None
            causal_only = None

        # Exactly one entry per input row, appended together so the flag list
        # cannot get ahead of the other lists when an embedding call fails.
        cause_list.append(cause_text)
        effect_list.append(effect_text)
        cause_effect_summary_list.append(cause_effect_summary)
        general_summary_list.append(gold_general_summary)
        general_embedding_list.append(general_emb)
        similarity_scores.append(relevance_score)
        causal_only_flags.append(causal_only)

    # Add final columns
    df['gold_cause'] = cause_list
    df['gold_effect'] = effect_list
    df['gold_cause_effect_summary'] = cause_effect_summary_list
    df['gold_general_summary'] = general_summary_list
    df['gold_general_embedding'] = general_embedding_list
    df['gold_relevance_score'] = similarity_scores
    df['causal_only'] = causal_only_flags

    df.to_csv(output_csv, index=False)
    print(f"\n✅ Finished processing. Output saved to {output_csv}")

    # Save bad rows separately
    if bad_rows:
        bad_rows_df = pd.DataFrame(bad_rows)
        # Only touch the file extension: str.replace(".csv", ...) would also
        # rewrite a ".csv" inside a directory name and, for a path without
        # ".csv", would return the main output path itself.
        root, ext = os.path.splitext(output_csv)
        bad_rows_output = f"{root}_bad_rows{ext or '.csv'}"
        bad_rows_df.to_csv(bad_rows_output, index=False)
        print(f"\n🚨 Saved bad rows separately to {bad_rows_output}")

# --- RUNNING SECTION --- #
# Run the pipeline on the full article table with llama3.2 and write the
# result to a CSV at an absolute, machine-specific path.
if __name__ == "__main__":
    # Work on a copy: this cell's summarize_and_score adds result columns to
    # the frame it is given, and df_articles is the shared source table that
    # other cells read. This is the full table, not the 10-row sample.
    df = df_articles.copy()
    output_csv = r"C:\Users\balaj\code_files\Documents\Brahmanda\context_aware_risk_methodology\event_causal_prediction_system\data\causal_gold_articles_full_articles.csv"
    model_name = "llama3.2"

    summarize_and_score(df, model_name, output_csv)


Processing articles:   0%|          | 6/7028 [00:19<6:01:06,  3.09s/it]



Processing articles:   0%|          | 20/7028 [01:24<8:15:12,  4.24s/it]


KeyboardInterrupt: 

# adding the good rows and the bad rows to the same csv file

In [None]:
import os
import json
import re
import subprocess
import pandas as pd
from tqdm import tqdm
from sentence_transformers import SentenceTransformer, util

# --- Ollama LLM Runner ---
def run_ollama_prompt(model_name, prompt_text):
    """Send ``prompt_text`` to a local Ollama model and return its reply.

    Runs ``ollama run <model_name>``, writes the prompt to the process's
    standard input and captures stdout/stderr as UTF-8 text.

    Args:
        model_name: Name of the Ollama model to run (e.g. "llama3.2").
        prompt_text: Prompt written to the model's standard input.

    Returns:
        The model's stdout with surrounding whitespace stripped, or ``None``
        when the command exits with a non-zero status or cannot be run.
    """
    command = ['ollama', 'run', model_name]
    try:
        # subprocess.run kills and reaps the child when communication is
        # interrupted (e.g. the kernel is interrupted during a long batch),
        # so no ollama process is left behind; a bare Popen is not cleaned up.
        completed = subprocess.run(
            command,
            # An empty string still gives the child a closed stdin pipe
            # instead of letting it inherit the notebook's stdin.
            input=prompt_text if prompt_text is not None else "",
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            encoding='utf-8',
            # One undecodable byte should not discard the whole reply.
            errors='replace',
        )
    except Exception as e:
        print(f"An error occurred: {e}")
        return None

    if completed.returncode != 0:
        print(f"Error: {completed.stderr}")
        return None

    return completed.stdout.strip()

# --- Causal Gold Extraction Prompt ---
def create_causal_prompt(article_content):
    """Assemble the cause/effect extraction prompt around ``article_content``.

    The article sits between two ``---`` delimiter lines; the text after it
    asks for a JSON object whose ``gold_causal_summary`` key holds a list of
    ``cause``/``effect`` pairs.
    """
    before_article = (
        "You are a causal summarizer focused on the gold market.\n"
        "\n"
        "Here is a news article:\n"
        "---\n"
    )
    after_article = (
        "\n"
        "---\n"
        "\n"
        "Task:\n"
        "- Analyze only gold-related parts.\n"
        "- Identify clear CAUSE (reason) and EFFECT (impact on gold prices, volatility, demand).\n"
        "- Extract clean cause-effect pairs.\n"
        "- Only output JSON with exact structure:\n"
        "\n"
        "{\n"
        '  "gold_causal_summary": [\n'
        "    {\n"
        '      "cause": "....",\n'
        '      "effect": "...."\n'
        "    },\n"
        "    {\n"
        '      "cause": "....",\n'
        '      "effect": "...."\n'
        "    }\n"
        "  ]\n"
        "}\n"
        "\n"
        "Notes:\n"
        '- If no gold-related causality found, output: {"gold_causal_summary": []}\n'
        "- No extra text, only valid JSON."
    )
    return f"{before_article}{article_content}{after_article}"

# --- General Gold Summary Prompt ---
def create_general_prompt(article_content):
    """Assemble the short gold-summary prompt around ``article_content``.

    The article sits between two ``---`` delimiter lines; the text after it
    asks for a JSON object with a single ``gold_summary`` field.
    """
    before_article = (
        "You are a financial summarizer.\n"
        "\n"
        "Here is a news article:\n"
        "---\n"
    )
    after_article = (
        "\n"
        "---\n"
        "\n"
        "Task:\n"
        "- Summarize briefly (2-3 sentences) any discussion related to GOLD: prices, futures, volatility, safe haven demand.\n"
        '- If no gold-related content found, output: {"gold_summary": "No gold-related content found."}\n'
        "\n"
        "Output ONLY valid JSON:\n"
        "\n"
        "{\n"
        '  "gold_summary": "...."\n'
        "}"
    )
    return f"{before_article}{article_content}{after_article}"

# --- Parse LLM JSON Outputs ---
def extract_causal_pairs(ollama_response):
    """Pull the list of cause/effect pairs out of a raw LLM reply.

    The reply is scanned for the first JSON object that carries a
    ``gold_causal_summary`` key, so chatter before or after the object
    (including stray braces) no longer makes a valid answer unparsable.

    Args:
        ollama_response: Raw text returned by the model; may be ``None``.

    Returns:
        A list of dicts that each have string ``cause`` and ``effect`` values.
        Entries that are not dicts or lack either value are dropped. ``[]`` is
        returned when nothing usable is found.
    """
    if not isinstance(ollama_response, str):
        return []

    decoder = json.JSONDecoder()
    raw_pairs = None
    start = ollama_response.find('{')
    while start != -1:
        try:
            # raw_decode stops at the end of the object it parsed and ignores
            # whatever text follows it.
            parsed, _ = decoder.raw_decode(ollama_response, start)
        except json.JSONDecodeError:
            parsed = None
        if isinstance(parsed, dict) and 'gold_causal_summary' in parsed:
            raw_pairs = parsed['gold_causal_summary']
            break
        start = ollama_response.find('{', start + 1)

    if isinstance(raw_pairs, dict):
        raw_pairs = [raw_pairs]  # a single pair given without the list wrapper
    if not isinstance(raw_pairs, list):
        return []

    pairs = []
    for item in raw_pairs:
        if not isinstance(item, dict):
            continue
        cause, effect = item.get('cause'), item.get('effect')
        if cause is None or effect is None:
            continue
        # Callers concatenate these values, so make sure they are strings.
        pairs.append({**item, 'cause': str(cause), 'effect': str(effect)})
    return pairs

def extract_general_summary(ollama_response):
    """Pull the ``gold_summary`` string out of a raw LLM reply.

    The reply is scanned for the first JSON object that carries a
    ``gold_summary`` key, so chatter before or after the object (including
    stray braces) no longer makes a valid answer unparsable.

    Args:
        ollama_response: Raw text returned by the model; may be ``None``.

    Returns:
        The summary string, or ``''`` when no parsable object with that key is
        found or its value is not a string.
    """
    if not isinstance(ollama_response, str):
        return ''

    decoder = json.JSONDecoder()
    start = ollama_response.find('{')
    while start != -1:
        try:
            # raw_decode stops at the end of the object it parsed and ignores
            # whatever text follows it.
            parsed, _ = decoder.raw_decode(ollama_response, start)
        except json.JSONDecodeError:
            parsed = None
        if isinstance(parsed, dict) and 'gold_summary' in parsed:
            value = parsed['gold_summary']
            return value if isinstance(value, str) else ''
        start = ollama_response.find('{', start + 1)
    return ''

# --- Normalize Cosine Similarity ---
def normalize_similarity(cos_sim):
    """Map a cosine similarity from (-1, 1) onto a (0.1, 1.0) score.

    Returns the score rounded to 4 decimal places.
    """
    floor, span = 0.1, 0.9
    unit_interval = (cos_sim + 1) / 2  # (-1, 1) -> (0, 1)
    return round(floor + span * unit_interval, 4)

# --- Main Processor ---
def summarize_and_score(df, model_name, output_csv):
    """Extract causes/effects and a general gold summary per article.

    For every row the ``Content`` text is sent to the Ollama model twice: once
    for cause/effect pairs and once for a short gold summary. When the summary
    is missing (blank, unparsable, or the "no content" sentinel) but causes
    were found, a summary is synthesised from the causes and the row is
    flagged ``causal_only``. The summary is embedded, and the combined
    cause-effect text is scored by cosine similarity against a fixed
    gold-themed prototype sentence.

    Successfully processed rows, with seven added result columns, are written
    to ``output_csv``. Rows whose processing raised are reported and written
    unchanged to a separate ``*_bad_rows`` CSV. ``df`` itself is not modified.

    Args:
        df: DataFrame with a ``Content`` column.
        model_name: Ollama model name passed to ``run_ollama_prompt``.
        output_csv: Path of the CSV file to write.
    """
    embed_model = SentenceTransformer('all-MiniLM-L6-v2')
    prototype_sentence = "gold price, bullion, inflation, gold futures, safe haven"
    proto_emb = embed_model.encode(prototype_sentence, convert_to_tensor=True)

    no_gold_summary = "No gold-related content found."
    result_columns = (
        'gold_cause',
        'gold_effect',
        'gold_cause_effect_summary',
        'gold_general_summary',
        'gold_general_embedding',
        'gold_relevance_score',
        'causal_only',
    )

    records = []         # one dict of result columns per successful row
    good_positions = []  # positional index in df of each successful row
    bad_rows = []

    progress = tqdm(df.iterrows(), total=len(df), desc="Processing articles")
    for pos, (idx, row) in enumerate(progress):
        try:
            content = row['Content']

            # Causal extraction
            causal_prompt = create_causal_prompt(content)
            causal_response = run_ollama_prompt(model_name, causal_prompt)
            causes = extract_causal_pairs(causal_response) if causal_response else []

            if causes:
                cause_texts = [c['cause'] for c in causes]
                effect_texts = [c['effect'] for c in causes]
                joined_cause_effect = ["Cause: " + c['cause'] + " --> Effect: " + c['effect'] for c in causes]

                cause_text = " || ".join(cause_texts)
                effect_text = " || ".join(effect_texts)
                cause_effect_summary = " || ".join(joined_cause_effect)
            else:
                cause_text = "No gold cause identified."
                effect_text = "No gold effect identified."
                cause_effect_summary = "No gold causality found."

            # General summarization
            general_prompt = create_general_prompt(content)
            general_response = run_ollama_prompt(model_name, general_prompt)
            gold_general_summary = extract_general_summary(general_response) if general_response else no_gold_summary

            # Fallback: if no summary but causes exist. A parse failure yields
            # a blank summary rather than the sentinel text, so both must be
            # treated as "missing" for the fallback to fire.
            summary_missing = (
                not isinstance(gold_general_summary, str)
                or not gold_general_summary.strip()
                or gold_general_summary == no_gold_summary
            )
            causal_only = bool(summary_missing and causes)
            if causal_only:
                gold_general_summary = f"This article discusses gold causally, mentioning: {', '.join(cause_texts)}."
            elif summary_missing:
                gold_general_summary = no_gold_summary

            # Embedding the general summary
            general_emb = embed_model.encode(gold_general_summary, convert_to_numpy=True).tolist()

            # Relevance scoring
            summary_emb = embed_model.encode(cause_effect_summary, convert_to_tensor=True)
            cos_sim = util.cos_sim(summary_emb, proto_emb).item()
            relevance_score = normalize_similarity(cos_sim)

            # Record everything for the row in one step, only after all the
            # fallible work succeeded. Appending the causal_only flag earlier
            # left that list one entry longer than the others whenever an
            # embedding call failed, breaking the column assignment below.
            records.append({
                'gold_cause': cause_text,
                'gold_effect': effect_text,
                'gold_cause_effect_summary': cause_effect_summary,
                'gold_general_summary': gold_general_summary,
                'gold_general_embedding': general_emb,
                'gold_relevance_score': relevance_score,
                'causal_only': causal_only,
            })
            good_positions.append(pos)

        except Exception as e:
            print(f"⚠️ Warning: Skipping row {idx} due to error: {e}")
            bad_rows.append(row)

    # Create clean DataFrame. Selecting by position (not by index label) keeps
    # the row count equal to len(records) even if index labels repeat.
    good_df = df.iloc[good_positions].copy()

    # Add processed columns
    for column in result_columns:
        good_df[column] = [record[column] for record in records]

    good_df.to_csv(output_csv, index=False)
    print(f"\n✅ Finished processing. Output saved to {output_csv}")

    # Save bad rows
    if bad_rows:
        bad_rows_df = pd.DataFrame(bad_rows)
        # Only touch the file extension: str.replace(".csv", ...) would also
        # rewrite a ".csv" inside a directory name and, for a path without
        # ".csv", would return the main output path itself.
        root, ext = os.path.splitext(output_csv)
        bad_rows_output = f"{root}_bad_rows{ext or '.csv'}"
        bad_rows_df.to_csv(bad_rows_output, index=False)
        print(f"\n🚨 Saved bad rows separately to {bad_rows_output}")

# --- RUNNING SECTION --- #
# Run the pipeline on the article table with llama3.2 and write the result to
# a CSV at an absolute, machine-specific path.
if __name__ == "__main__":
    df = df_articles  # Full article table loaded from the CSV, not the 10-row sample
    output_csv = r"C:\Users\balaj\code_files\Documents\Brahmanda\context_aware_risk_methodology\event_causal_prediction_system\data\causal_gold_articles_full_llama3.2.csv"
    model_name = "llama3.2"

    summarize_and_score(df, model_name, output_csv)
