In [1]:
import time

from google import genai

# For extracting vertex experiment details.
from google.cloud import aiplatform
from google.cloud.aiplatform.metadata import context
from google.cloud.aiplatform.metadata import utils as metadata_utils
from google.genai import types

# For data handling.
import jsonlines
import pandas as pd

# For visualization.
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# For evaluation metric computation.
from rouge_score import rouge_scorer
from tqdm import tqdm

# For fine tuning Gemini model.
import vertexai
import json 

In [2]:
# Prompt sent to the model for every record. The `{text}` placeholder is filled
# in with str.format, which is why the literal braces of the JSON example near
# the end are doubled (`{{` / `}}`).
# NOTE(review): the label list below runs from 1 to 18, yet the run output
# captured in this notebook shows {'main': 0} for every record — confirm which
# label numbering the endpoint was tuned on.
# NOTE(review): "retunn" in the last sentence is a typo for "return". It is left
# untouched because this text is part of what the model receives — presumably it
# should match the prompt used at tuning time; confirm before changing it.
PROMPT_TEMPLATE = """
You are an expert in propaganda analysis. Analyze the following text. 
Always keep in mind that you are only looking for russian propaganda, not any other type of propaganda. Any other type of propaganda should be ignored and labeled as "No Propaganda".

Text:
"{text}"

Identify the main label that best describes the text.

•1 — Loaded language: Use emotionally charged or stereotyped wording to sway feelings.
•2 — Appeal to fear/prejudice: Try to persuade by warning of frightening outcomes or playing on biases.
•3 — Doubt: Undermine credibility by questioning motives, facts, or sources.
•4 — Name calling/labeling: Attack a person or group with insulting or demeaning labels.
•5 — Flag-waving: Justify a position by appealing to patriotism or group identity.
•6 — Exaggeration or minimization: Overstate benefits/harms or downplay them to mislead.
•7 — Causal oversimplification: Claim a simple cause for a complex issue or outcome.
•8 — Red herring: Introduce an irrelevant point to distract from the main issue.
•9 — Black-and-white fallacy: Present only two options and ignore reasonable alternatives.
•10 — Reductio ad hitlerum: Dismiss an idea by associating it with Nazis/Hitler or similarly reviled groups.
•11 — Appeal to authority: Cite an authority’s support as proof rather than evidence.
•12 — Straw man: Misrepresent an opponent’s argument, then refute the weaker version.
•13 — Thought-terminating cliches: Use stock phrases that shut down questioning or debate.
•14 — Whataboutism: Deflect criticism by accusing others of similar or worse behavior.
•15 — Slogans: Use a short, catchy phrase to promote an idea.
•16 — Bandwagon: Argue something is good/true because it’s popular or widely adopted.
•17 — Repetition: Repeat the same claim or phrase many times to make it stick.
•18 — no_propaganda: No relevant propaganda detected.

Return the result in the following JSON format:
{{"main": integer}}

Follow the json format strictly. Do not add any additional text or explanations. Just use the integers for labeling, no strings.
Remember to always return Integers only! Only return one integer that best describes the text. Never retunn multiple integers or a list of integers!
"""

In [3]:
def _is_missing(value):
    """Return True when `value` is one of pandas' scalar missing markers (None/NaN/NA)."""
    if value is None:
        return True
    try:
        return bool(pd.isna(value))
    except (TypeError, ValueError):
        # Non-scalar cell (list/array): pd.isna works element-wise there and the
        # result has no single truth value, so treat the cell as present.
        return False


def _json_default(value):
    """Fallback for json.dump: unwrap NumPy-style scalars, stringify anything else."""
    item = getattr(value, "item", None)
    return item() if callable(item) else str(value)


def generate_from_pickle(
    input_pickle_file=r"C:\Users\david\PY\golden_test_set_with_ids_extended.pkl",
    output_json_path="gemini_high_only_golden_test.json",
    project="472207425752",
    location="us-central1",
    model="projects/472207425752/locations/us-central1/endpoints/2874918341913346048",
):
    """Classify every row of a pickled DataFrame with the model and save the results.

    For each row, the "text" column is inserted into PROMPT_TEMPLATE, the prompt
    is sent to the endpoint with a JSON schema requiring {"main": <integer>},
    and the parsed answer (or an error message) is collected. The collected
    records are written to `output_json_path` as a JSON list of
    {"id", "text", "analysis_result", "error"} objects.

    The results are written even when the loop is interrupted (for example by
    KeyboardInterrupt), so the records processed so far are not lost; the
    interruption is re-raised afterwards.

    Args:
        input_pickle_file: Path of the pickled DataFrame to read. Rows are
            expected to provide a "text" column and, optionally, an "id" column.
        output_json_path: Path of the JSON file to write.
        project: Project passed to genai.Client.
        location: Location passed to genai.Client.
        model: Model/endpoint resource name passed to generate_content_stream.

    Returns:
        None. Progress and errors are reported with print().
    """
    # --- Step 1: Load pickle file ---
    # SECURITY: unpickling executes arbitrary code; only load files you trust.
    try:
        df = pd.read_pickle(input_pickle_file)
        print(f"✅ Loaded pickle file with {len(df)} rows.")
    except Exception as e:
        print(f"❌ Error loading pickle file: {e}")
        return

    # --- Step 2: Initialize Gemini client ---
    try:
        client = genai.Client(
            vertexai=True,
            project=project,
            location=location,
        )
        print("✅ Gemini client and model path initialized successfully.")
    except Exception as e:
        print(f"❌ Error initializing Gemini client: {e}")
        return

    # --- Step 3: Define integer schema and the (loop-invariant) request config ---
    response_schema_json = {
        "type": "OBJECT",
        "properties": {
            "main": {
                "type": "INTEGER"
            }
        },
        "required": ["main"]
    }

    try:
        generate_content_config = types.GenerateContentConfig(
            temperature=0,
            top_p=1,
            seed=0,
            max_output_tokens=8192,
            safety_settings=[
                types.SafetySetting(category="HARM_CATEGORY_HATE_SPEECH", threshold="OFF"),
                types.SafetySetting(category="HARM_CATEGORY_DANGEROUS_CONTENT", threshold="OFF"),
                types.SafetySetting(category="HARM_CATEGORY_SEXUALLY_EXPLICIT", threshold="OFF"),
                types.SafetySetting(category="HARM_CATEGORY_HARASSMENT", threshold="OFF")
            ],
            response_mime_type="application/json",
            # Enforces the {"main": <integer>} output shape.
            response_schema=response_schema_json,
        )
    except Exception as e:
        print(f"❌ Error building generation config: {e}")
        return

    processed_data = []

    try:
        # --- Step 4: Loop through rows ---
        # `position` is the 1-based running count used in log messages; the
        # index label may be non-numeric or non-contiguous, so it is only used
        # to build a fallback id.
        for position, (index_label, row) in enumerate(df.iterrows(), start=1):
            raw_text = row.get("text", "")
            # Missing values would otherwise be stringified to "None"/"nan" and
            # sent to the model as if they were real text.
            text_to_analyze = "" if _is_missing(raw_text) else str(raw_text).strip()
            record_id = row.get("id", f"index-{index_label}")
            print(f"\n--- Processing record {position} (ID: {record_id}) ---")

            if not text_to_analyze:
                print("⚠️ Skipping empty text.")
                processed_data.append({
                    "id": record_id,
                    "text": "",
                    "analysis_result": None,
                    "error": "Empty input text"
                })
                continue

            analysis_result = None
            error_message = None
            full_response_text = ""

            try:
                full_prompt = PROMPT_TEMPLATE.format(text=text_to_analyze)
                text_part = types.Part.from_text(text=full_prompt)
                contents = [types.Content(role="user", parts=[text_part])]

                for chunk in client.models.generate_content_stream(
                    model=model,
                    contents=contents,
                    config=generate_content_config,
                ):
                    # A streamed chunk may carry no text (text is None); skip it
                    # instead of failing on str + None.
                    full_response_text += chunk.text or ""

                # --- Parse single integer JSON ---
                analysis_result = json.loads(full_response_text)
                print("✅ Analyzed record:", analysis_result)

            except json.JSONDecodeError as e:
                error_message = f"JSON parsing failed: {e}. Raw output: {full_response_text}"
                print(f"❌ JSON Error for record {position}: {error_message}")
            except Exception as e:
                error_message = f"API call failed: {e}"
                print(f"❌ API Error for record {position}: {error_message}")

            processed_data.append({
                "id": record_id,
                "text": text_to_analyze,
                "analysis_result": analysis_result,
                "error": error_message
            })
    except KeyboardInterrupt:
        print(f"\n⚠️ Interrupted after {len(processed_data)} records; saving partial results.")
        raise
    finally:
        # --- Step 5: Save results ---
        # Runs on normal completion and on interruption, so finished records
        # are never thrown away.
        try:
            with open(output_json_path, 'w', encoding='utf-8') as f:
                # `default` keeps NumPy scalars (e.g. int64 ids) from aborting the dump.
                json.dump(processed_data, f, ensure_ascii=False, indent=4, default=_json_default)
            print(f"\n✅ All results saved to '{output_json_path}'")
        except Exception as e:
            print(f"❌ Error saving JSON file: {e}")


# Run the whole pipeline: load the pickle, query the model once per row and
# write the collected results to the JSON file.
generate_from_pickle()

✅ Loaded pickle file with 200 rows.
✅ Gemini client and model path initialized successfully.

--- Processing record 1 (ID: 1502603074757709825) ---
✅ Analyzed record: {'main': 0}

--- Processing record 2 (ID: 1500847775663235077) ---
✅ Analyzed record: {'main': 0}

--- Processing record 3 (ID: 1530502111242473472) ---
✅ Analyzed record: {'main': 0}

--- Processing record 4 (ID: 1523284572015935490) ---
✅ Analyzed record: {'main': 0}

--- Processing record 5 (ID: 1504082826412662788) ---
✅ Analyzed record: {'main': 0}

--- Processing record 6 (ID: 1505686198274121735) ---
✅ Analyzed record: {'main': 0}

--- Processing record 7 (ID: 1495984898821312518) ---
✅ Analyzed record: {'main': 0}

--- Processing record 8 (ID: 1536468551577620480) ---
✅ Analyzed record: {'main': 0}

--- Processing record 9 (ID: 1504635793935159301) ---
✅ Analyzed record: {'main': 0}

--- Processing record 10 (ID: 1498264371738574848) ---
✅ Analyzed record: {'main': 0}

--- Processing record 11 (ID: 152074946908357

KeyboardInterrupt: 