In [2]:
import pandas as pd

In [None]:
    # Paths
llm_path = f"data/llm_crowdsource/prompt_1/llama3.2/author_results.csv"
crowd_path = f"data/crowdsource/author_type.csv"

# Load
df_llm = pd.read_csv(llm_path)
df_crowd = pd.read_csv(crowd_path)

In [2]:
import pandas as pd

In [3]:
def get_examples_by_label_from_crowd(df_crowd: pd.DataFrame, label_column="publisher_verification", labels=None):
    """
    Select one crowd-sourced example per label using majority vote, without requiring LLM agreement.

    Parameters:
        df_crowd: DataFrame with columns ['ref_value', label_column]
        label_column: e.g., 'author_type', 'publisher_type', etc.
        labels: list of label names to cover (optional). If None, uses all labels in data.

    Returns:
        List of tuples (ref_value, label) — one per label if possible.
    """
    if labels is None:
        labels = df_crowd[label_column].unique().tolist()

    # Compute majority label per reference
    majority = df_crowd.groupby("ref_value")[label_column].agg(lambda x: x.mode()[0]).reset_index()
    majority.columns = ["ref_value", "majority_label"]

    # Sample one reference per label
    examples = []
    for lbl in labels:
        subset = majority[majority["majority_label"] == lbl]
        if not subset.empty:
            sample = subset.sample(n=1, random_state=42)
            examples.append((sample.iloc[0]["ref_value"], lbl))

    return examples


In [None]:
pv = pd.read_csv('data/crowdsource/publisher_verification_type.csv')
unique_verification = pv['ref_value'].dropna().unique()
pt = pd.read_csv('data/crowdsource/publisher_type.csv')
unique_publisher = pt['ref_value'].dropna().unique()
at = pd.read_csv('data/crowdsource/author_type.csv')
unique_author = pt['ref_value'].dropna().unique()


In [67]:
properties = [
    "P31", "P1215", "P258", "P17", "P131", "P106", "P625", "P2215", "P3083",
    "P6257", "P6259", "P6258", "P21", "P2671", "P59", "P735", "P569", "P27",
    "P18", "P646", "P361", "P684", "P1476", "P734", "P2216", "P1566", "P2214",
    "P171", "P225", "P105", "P373", "P279", "P19", "P2583", "P214", "P570",
    "P1087", "P703", "P407", "P276", "P846", "P571", "P577", "P1412", "P1082",
    "P971", "P69", "P1435", "P421", "P195", "P527"
]


In [1]:
import pandas as pd
from predict import predict_author_type, predict_publisher_type, predict_verification_type, save_all_results
from prompt import author_type_prompt, publisher_type_prompt, publisher_verification_prompt
from llm_wrapper import LLMWrapper
from extract_example import get_diverse_examples_from_crowd

# Configuration
model = "deepseek-r1:1.5b"
prompt_id = "1"

# Load unique values from crowdsource
pv = pd.read_csv('data/crowdsource/publisher_verification_type.csv')
unique_verification = pv['ref_value'].dropna().unique()

pt = pd.read_csv('data/crowdsource/publisher_type.csv')
unique_publisher = pt['ref_value'].dropna().unique()

at = pd.read_csv('data/crowdsource/author_type.csv')
unique_author = at['ref_value'].dropna().unique()



# Init LLM
llm = LLMWrapper(model_name=model)

# Run predictions
author_results = predict_author_type(llm, "http://phoronix.com/scan.php?page=news_item&px=GNU-Binutils-2.27",examples=None)


print("✅ All predictions completed and saved.")


[LLMWrapper] Initialized Ollama model 'deepseek-r1:1.5b' at http://localhost:11434
⚠️ Skipped h at index 1 due to error: Expecting value: line 1 column 1 (char 0)
⚠️ Skipped t at index 2 due to error: Expecting value: line 1 column 1 (char 0)
⚠️ Skipped t at index 3 due to error: Expecting value: line 1 column 1 (char 0)
⚠️ Skipped p at index 4 due to error: Expecting value: line 1 column 1 (char 0)
⚠️ Skipped : at index 5 due to error: Expecting value: line 1 column 1 (char 0)
⚠️ Skipped / at index 6 due to error: Expecting value: line 1 column 1 (char 0)
⚠️ Skipped / at index 7 due to error: Expecting value: line 1 column 1 (char 0)
⚠️ Skipped p at index 8 due to error: Expecting value: line 1 column 1 (char 0)
⚠️ Skipped h at index 9 due to error: Expecting value: line 1 column 1 (char 0)


KeyboardInterrupt: 

In [4]:
def get_item_label(item):
    if re.match(r'[Q|P][0-9]{1,}', item):
        url = 'https://www.wikidata.org/wiki/Special:EntityData/' + item + '.json'
        r = requests.get(url, auth=('user', 'pass'))

        try:
            json_data = r.json()
            try:
                json_data['entities'][item]
            except KeyError:
                item = list(json_data['entities'].keys())[0]
            except json.decoder.JSONDecodeError:
                pass
            try:
                label = json_data['entities'][item]['labels']['en']['value']
            except KeyError:
                try:
                    label = json_data['entities'][item]['labels']['de']['value']
                except KeyError:
                    try:
                        label = json_data['entities'][item]['labels']['es']['value']
                    except KeyError:
                        try:
                            label = json_data['entities'][item]['labels']['fr']['value']
                        except KeyError:
                            try:
                                label = json_data['entities'][item]['labels']['nl']['value']
                            except KeyError:
                                label = json_data['entities'][item]['labels']
            except json.decoder.JSONDecodeError:
                return item
            return label
        except:
            return item
    else:
        return item

In [5]:
import json
import re

In [8]:
import pandas as pd
import os
import requests
# Load the processed file
crowd_path = "data/crowdsource/relevance/prediction_data.csv"
df = pd.read_csv(crowd_path, sep='\t', header=0)

# Add human-readable labels for item and stat_value
df['item_label'] = df['item_id'].apply(get_item_label)
df['stat_value_label'] = df['stat_value'].apply(get_item_label)

# Select only the necessary columns for LLM prompting
prompt_df = df[[
    'rev_id',
    'item_id',
    'item_label',
    'stat_property',
    'stat_value',
    'stat_value_label',
    'ref_value',
    'ref_domain'
]]

# Drop rows with missing values in any critical field
prompt_df = prompt_df.dropna(subset=['item_label', 'stat_property', 'stat_value_label', 'ref_value'])

# Ensure the output directory exists
os.makedirs("data/crowdsource", exist_ok=True)

# Save to CSV
output_path = "data/crowdsource/relevance/llm_prompt_input_relevance.csv"
prompt_df.to_csv(output_path, index=False)

print(f"✅ Prompt-ready DataFrame saved to: {output_path}")


KeyboardInterrupt: 

In [11]:
df.columns

Index(['rev_id', 'ref_value', 'ref_domain', 'ref_count', 'domain_count',
       'stat_property', 'stat_value', 'item_id', 'user_type', 'user_edits',
       'user_ref_edits', 'support_object', 'code_2', 'domain',
       'publisher_type', 'author_type', 'all_types', 'authoritative',
       'item_match', 'object_match', 'statement_match', 'user_ref_edits_pc',
       'item_text', 'item_text_clean', 'instance_of', 'subclass',
       'object_instance_of', 'object_subclass', 'property_instance_of',
       'object_text', 'authority_baseline'],
      dtype='object')