In [2]:
import pandas as pd

In [3]:
df_publisher = pd.read_csv("data/crowdsource/publisher_type.csv")
df_author = pd.read_csv("data/crowdsource/author_type.csv")
df_verification = pd.read_csv("data/crowdsource/publisher_verification_type.csv") 

In [7]:
# Extract unique entries for each file
author_series = pd.DataFrame({'author': df_author['ref_value'].dropna().drop_duplicates()})
publisher_series = pd.DataFrame({'publisher': df_publisher['ref_value'].dropna().drop_duplicates()})
verification_series = pd.DataFrame({'verification': df_verification['ref_value'].dropna().drop_duplicates()})


In [8]:
author_urls = [
    "http://8tracks.com/angelo8flight12/shake-it-up-dance",
    "http://akas.imdb.com/name/nm23804352"
]
publisher_urls = [
    "http://8tracks.com",
    "http://addons.mozilla.org"
]


In [26]:
from ollama import Client

class LLMWrapper:
    def __init__(
        self,
        model_name: str = "llama3.2",  # default to llama3.2 as you wrote
        host: str = "http://localhost:11434",
        verbose: bool = True
    ):
        """
        Initialize an Ollama LLM with customizable settings.

        Args:
            model_name (str): Name of the local Ollama model.
            host (str): Host for the Ollama server.
            verbose (bool): Whether to print init confirmation.
        """
        self.model_name = model_name
        self.client = Client(host=host)

        if verbose:
            print(f"[LLMWrapper] Initialized Ollama LLM: {self.model_name} at {host}")

    def run_prompt(self, prompt):
        """
        Send a prompt to the LLM and return the response content.
        """
        response = self.client.chat(
            model=self.model_name,
            messages=[{"role": "user", "content": prompt}]
        )
        return response["message"]["content"]


In [27]:
llm = LLMWrapper()

[LLMWrapper] Initialized Ollama LLM: llama3.2 at http://localhost:11434


In [17]:
def get_agreeing_examples_by_label(df_llm: pd.DataFrame, df_crowd: pd.DataFrame, label_column="author_type", labels=None):
    if labels is None:
        labels = ["organisation", "collective", "individual"]

    # Compute majority vote
    majority = df_crowd.groupby("ref_value")[label_column].agg(lambda x: x.mode()[0]).reset_index()
    majority.columns = ["ref_value", "majority_label"]

    # Merge with LLM
    df_llm = df_llm.rename(columns={label_column: "llm_label"})
    merged = pd.merge(majority, df_llm, on="ref_value")

    # Keep only agreeing samples
    agreeing = merged[merged["llm_label"] == merged["majority_label"]]

    # Get one example per specified label
    examples = []
    for lbl in labels:
        match = agreeing[agreeing["llm_label"] == lbl]
        if not match.empty:
            sample = match.sample(n=1, random_state=42)
            examples.append((sample.iloc[0]["ref_value"], lbl))

    return examples


In [None]:
    # Paths
llm_path = f"data/llm_crowdsource/prompt_1/llama3.2/author_results.csv"
crowd_path = f"data/crowdsource/author_type.csv"

# Load
df_llm = pd.read_csv(llm_path)
df_crowd = pd.read_csv(crowd_path)

In [20]:
examples = get_agreeing_examples_by_label(df_llm, df_crowd, label_column="author_type", labels=None)

In [22]:
examples.append(('https://lawandorder.fandom.com/wiki/Censure','collective'))

In [23]:
examples

[('http://www.cdc.gov/niosh/ershdb/emergencyresponsecard_29750036.html',
  'organisation'),
 ('http://amturing.acm.org/award_winners/mccarthy_0239596.cfm', 'individual'),
 ('https://lawandorder.fandom.com/wiki/Censure', 'collective')]

In [83]:
def publisher_verification_prompt(domain, examples=None):
    shots = ""
    if examples:
        shots = "\n\n".join([
            f"Domain: {ref}\nLabel: {lbl}"
            for ref, lbl in examples
        ]) + "\n\n"

    return f"""
You are evaluating the **verification type** of the publisher associated with a domain.

Your task is to classify the domain into **one** of the following labels:
- 'yes': if the publisher is verified or reputable.
- 'no': if the publisher is known to be unreliable.
- 'vendor': if it's a commercial seller or online marketplace.
- 'no_profit': if the site belongs to a nonprofit organization.
- 'political': if the domain represents a political entity or campaign.
- 'cultural': if it's a cultural or artistic institution.
- 'trad_news': a traditional media outlet (e.g., established newspapers).
- 'non_trad_news': blogs, YouTube channels, or alt-media sites.
- 'academia_uni': university-level academic publisher.
- 'academia_pub': peer-reviewed academic journal or press.
- 'academia_other': other academic institution.
- 'nw': not well-defined.
- 'ne': not enough evidence to decide.
- 'dn': does not apply.

{shots}Now classify this domain:
Domain: {domain}

Return your answer as a JSON object using this format:
{{"label": "<one_of_the_labels_above>"}}

Only output the JSON.  
STRICT INSTRUCTIONS:
Do not include any explanations.
"""


In [88]:
def get_examples_by_label_from_crowd(df_crowd: pd.DataFrame, label_column="publisher_verification", labels=None):
    """
    Select one crowd-sourced example per label using majority vote, without requiring LLM agreement.

    Parameters:
        df_crowd: DataFrame with columns ['ref_value', label_column]
        label_column: e.g., 'author_type', 'publisher_type', etc.
        labels: list of label names to cover (optional). If None, uses all labels in data.

    Returns:
        List of tuples (ref_value, label) — one per label if possible.
    """
    if labels is None:
        labels = df_crowd[label_column].unique().tolist()

    # Compute majority label per reference
    majority = df_crowd.groupby("ref_value")[label_column].agg(lambda x: x.mode()[0]).reset_index()
    majority.columns = ["ref_value", "majority_label"]

    # Sample one reference per label
    examples = []
    for lbl in labels:
        subset = majority[majority["majority_label"] == lbl]
        if not subset.empty:
            sample = subset.sample(n=1, random_state=42)
            examples.append((sample.iloc[0]["ref_value"], lbl))

    return examples


In [89]:
examples_verification = get_examples_by_label_from_crowd(df_crowd, label_column="publisher_verification", labels=[
    "yes", "no", "vendor", "no_profit", "political", "cultural",
    "trad_news", "non_trad_news", "academia_uni", "academia_pub",
    "academia_other", "nw", "ne", "dn"
])


In [90]:
examples_verification

[('http://www.census.gov', 'yes'),
 ('http://www.hardcoregamer.com', 'no'),
 ('http://www.tsmc.com', 'vendor'),
 ('http://www.openvas.org', 'no_profit'),
 ('http://www.catholic-hierarchy.org', 'political'),
 ('http://www.gmv.com', 'cultural'),
 ('http://electronicintifada.net', 'trad_news'),
 ('http://www.rollingstone.com', 'non_trad_news'),
 ('http://chibi.ubc.ca', 'academia_uni'),
 ('http://figshare.com', 'academia_pub'),
 ('http://www.artic.edu', 'academia_other'),
 ('http://iba-world.com', 'nw'),
 ('http://www.badmintonlink.com', 'ne')]

In [91]:
pv = pd.read_csv('data/crowdsource/publisher_verification_type.csv')
unique_verification = pv['ref_value'].dropna().unique()

In [92]:
#4. Predict Author Type
# author_results = []
# for i, ref in enumerate(unique_authors, start=1):
#     prompt = author_type_prompt(ref, examples)
#     response = llm.run_prompt(prompt)
    
#     try:
#         label = json.loads(response)["label"]
#         author_results.append((ref, label))
#     except Exception as e:
#         print(f"⚠️ Skipped {ref} at index {i} due to error: {e}")
#         continue

#     if i % 10 == 0 or i == len(unique_authors):
#         print(f"✅ Processed {i}/{len(unique_authors)} references")

# #4. Predict Author Type
# publisher_results = []
# for i, ref in enumerate(unique_publishers, start=1):
#     prompt = publisher_type_prompt(ref, examples_publisher)
#     response = llm.run_prompt(prompt)
    
#     try:
#         label = json.loads(response)["label"]
#         publisher_results.append((ref, label))
#     except Exception as e:
#         print(f"⚠️ Skipped {ref} at index {i} due to error: {e}")
#         continue

#     if i % 10 == 0 or i == len(unique_publishers):
#         print(f"✅ Processed {i}/{len(unique_publishers)} references")

verification_results = []
for i, ref in enumerate(unique_verification, start=1):
    prompt = publisher_verification_prompt(ref, examples_verification)
    response = llm.run_prompt(prompt)
    
    try:
        label = json.loads(response)["label"]
        verification_results.append((ref, label))
    except Exception as e:
        print(f"⚠️ Skipped {ref} at index {i} due to error: {e}")
        continue

    if i % 10 == 0 or i == len(unique_verification):
        print(f"✅ Processed {i}/{len(unique_verification)} references")


✅ Processed 10/293 references
✅ Processed 20/293 references
✅ Processed 30/293 references
✅ Processed 40/293 references
✅ Processed 50/293 references
✅ Processed 60/293 references
✅ Processed 70/293 references
✅ Processed 80/293 references
✅ Processed 90/293 references
✅ Processed 100/293 references
✅ Processed 110/293 references
✅ Processed 120/293 references
✅ Processed 130/293 references
✅ Processed 140/293 references
✅ Processed 150/293 references
✅ Processed 160/293 references
✅ Processed 170/293 references
✅ Processed 180/293 references
✅ Processed 190/293 references
✅ Processed 200/293 references
✅ Processed 210/293 references
✅ Processed 220/293 references
✅ Processed 230/293 references
✅ Processed 240/293 references
✅ Processed 250/293 references
✅ Processed 260/293 references
✅ Processed 270/293 references
✅ Processed 280/293 references
✅ Processed 290/293 references
✅ Processed 293/293 references


In [94]:
verification_results

[('http://8tracks.com', 'non_trad_news'),
 ('http://addons.mozilla.org', 'trad_news'),
 ('http://akas.imdb.com', 'nw'),
 ('http://americanart.si.edu', 'trad_news'),
 ('http://amturing.acm.org', 'academia_uni'),
 ('http://archive.org', 'trad_news'),
 ('http://art.famsf.org', 'ne'),
 ('http://backuppc.cvs.sourceforge.net', 'non_trad_news'),
 ('http://bbfc.co.uk', 'trad_news'),
 ('http://bbti.bodleian.ox.ac.uk', 'nw'),
 ('http://blog.mathieui.net', 'non_trad_news'),
 ('http://blog.process-one.net', 'non_trad_news'),
 ('http://blog.torproject.org', 'no_profit'),
 ('http://blogs.windows.com', 'non_trad_news'),
 ('http://books.google.co.uk', 'nw'),
 ('http://brightlightsfilm.com', 'non_trad_news'),
 ('http://chem.nlm.nih.gov', 'trad_news'),
 ('http://chibi.ubc.ca', 'academia_uni'),
 ('http://classify.oclc.org', 'academia_pub'),
 ('http://collection.cooperhewitt.org', 'academia_uni'),
 ('http://collections.artsmia.org', 'cultural'),
 ('http://collections.britishart.yale.edu', 'academia_uni'),

In [95]:
# Create the target directory if it doesn't exist
output_dir = "data/llm_crowdsource/prompt_2_onesl/llama_3.2"
os.makedirs(output_dir, exist_ok=True)

# Convert to DataFrames
# df_author = pd.DataFrame(author_results, columns=["ref_value", "author_type"])
# df_publisher = pd.DataFrame(publisher_results, columns=["domain", "publisher_type"])
df_verification = pd.DataFrame(verification_results, columns=["ref_value", "publisher_verification"])

# Save as CSV
# df_author.to_csv(os.path.join(output_dir, "author_results.csv"), index=False)
# df_publisher.to_csv(os.path.join(output_dir, "publisher_results.csv"), index=False)
df_verification.to_csv(os.path.join(output_dir, "verification_results.csv"), index=False)

In [67]:
properties = [
    "P31", "P1215", "P258", "P17", "P131", "P106", "P625", "P2215", "P3083",
    "P6257", "P6259", "P6258", "P21", "P2671", "P59", "P735", "P569", "P27",
    "P18", "P646", "P361", "P684", "P1476", "P734", "P2216", "P1566", "P2214",
    "P171", "P225", "P105", "P373", "P279", "P19", "P2583", "P214", "P570",
    "P1087", "P703", "P407", "P276", "P846", "P571", "P577", "P1412", "P1082",
    "P971", "P69", "P1435", "P421", "P195", "P527"
]


In [68]:
print('hello')

hello


In [None]:
from ollama import Client
client = Client(
  host='http://localhost:11434',
)
response = client.chat(model='llama3.2', messages=[
  {
    'role': 'user',
    'content': 'Why is the sky blue?',
  },
])

In [2]:
import pandas as pd
from metrics import computeFleissKappa

# Load crowdsource and LLM data
crowd = pd.read_csv("data/crowdsource/author_type.csv")
llm = pd.read_csv("data/llm_crowdsource/author_results.csv")
llm.columns = ['ref_value', 'author_type']

# Deduplicate crowdsource to 1 row per ref_value by taking first rater
#crowd_one = crowd[['ref_value', 'author_type']].drop_duplicates('ref_value')

# Merge
merged = pd.merge(crowd, llm, on='ref_value', suffixes=('_crowd', '_llm'))

# Possible labels
LABELS = ['organisation', 'collective', 'nw', 'individual', 'ne', 'dn']

# Count matrix
fleiss_ratings = []
for _, row in merged.iterrows():
    counts = [0] * len(LABELS)
    if row['author_type_crowd'] in LABELS:
        counts[LABELS.index(row['author_type_crowd'])] += 1
    if row['author_type_llm'] in LABELS:
        counts[LABELS.index(row['author_type_llm'])] += 1
    fleiss_ratings.append(counts)

# Compute Fleiss' Kappa
kappa = computeFleissKappa(fleiss_ratings)
print("Fleiss' Kappa (Author Type, 1 crowd + 1 LLM):", kappa)


Fleiss' Kappa (Author Type, 1 crowd + 1 LLM): 0.14419999243665924


In [3]:
crowd

Unnamed: 0,ref_value,X_worker_id,author_type
0,http://8tracks.com/angeloflight12/shake-it-up-...,42298345,organisation
1,http://8tracks.com/angeloflight12/shake-it-up-...,33163510,nw
2,http://8tracks.com/angeloflight12/shake-it-up-...,41983503,collective
3,http://8tracks.com/angeloflight12/shake-it-up-...,11746564,organisation
4,http://8tracks.com/angeloflight12/shake-it-up-...,42708352,organisation
...,...,...,...
9165,http://www.artistdirect.com/nad/store/artist/a...,6680559,organisation
9166,http://www.artistdirect.com/nad/store/artist/a...,31650069,organisation
9167,http://www.artistdirect.com/nad/store/artist/a...,32963386,organisation
9168,http://www.artistdirect.com/nad/store/artist/a...,34010900,organisation


In [4]:
llm

Unnamed: 0,ref_value,author_type
0,http://8tracks.com/angeloflight12/shake-it-up-...,ne
1,http://akas.imdb.com/name/nm2304352/,individual
2,http://americanart.si.edu/collections/search/a...,organisation
3,http://americanart.si.edu/collections/search/a...,organisation
4,http://americanart.si.edu/collections/search/a...,ne
...,...,...
1173,http://www.sports-reference.com/olympics/athle...,individual
1174,http://www.nfl.com/player/calvinhill/2516499/c...,individual
1175,http://www.historyofparliamentonline.org/volum...,individual
1176,https://git.kernel.org/cgit/network/connman/co...,organisation


# 1. Majority Voting + Cohen’s Kappa
This compares LLM’s prediction to the majority vote from human annotators per reference.

In [6]:
from sklearn.metrics import cohen_kappa_score

# Get majority vote per ref_value from crowd
majority = crowd.groupby('ref_value')['author_type'].agg(lambda x: x.mode()[0]).reset_index()
majority.columns = ['ref_value', 'majority_author_type']

# Merge with LLM predictions
merged = pd.merge(majority, llm, on='ref_value')
merged.columns = ['ref_value', 'human_majority', 'llm']

# Compute Cohen’s Kappa
kappa = cohen_kappa_score(merged['human_majority'], merged['llm'])
print(f"Cohen's Kappa (LLM vs Human Majority): {kappa:.3f}")


Cohen's Kappa (LLM vs Human Majority): 0.141


In [10]:
from statsmodels.stats.inter_rater import fleiss_kappa
from sklearn.preprocessing import LabelEncoder
import numpy as np

# Encode categories
le = LabelEncoder()
all_labels = pd.concat([crowd['author_type'], llm['author_type']])
le.fit(all_labels)

# Build voting matrix
category_names = le.classes_
label_count = len(category_names)

# Prepare a function to count votes per reference
def build_vote_row(ref):
    labels = crowd[crowd['ref_value'] == ref]['author_type'].tolist()
    llm_label = llm[llm['ref_value'] == ref]['author_type'].values[0]
    labels.append(llm_label)  # add LLM as 6th annotator

    counts = [labels.count(cls) for cls in category_names]
    return counts

# Unique references
refs = crowd['ref_value'].unique()
vote_matrix = np.array([build_vote_row(ref) for ref in refs])

# Fleiss' Kappa
fk = fleiss_kappa(vote_matrix)
print(f"Fleiss' Kappa (Human + LLM): {fk:.3f}")


AssertionError: 

In [11]:
# Unique users
users = crowd['X_worker_id'].unique()
results = []

for user in users:
    df_user = crowd[crowd['X_worker_id'] == user]
    merged = pd.merge(df_user, llm, on='ref_value', suffixes=('_human', '_llm'))

    # Ensure matching ref_values
    if len(merged) > 0:
        score = cohen_kappa_score(merged['author_type_human'], merged['author_type_llm'])
        results.append((user, score))

# Print results
for user, score in results:
    print(f"Cohen's Kappa (LLM vs User {user}): {score:.3f}")


Cohen's Kappa (LLM vs User 42298345): 0.099
Cohen's Kappa (LLM vs User 33163510): 0.093
Cohen's Kappa (LLM vs User 41983503): 0.126
Cohen's Kappa (LLM vs User 11746564): 0.295
Cohen's Kappa (LLM vs User 42708352): 0.118
Cohen's Kappa (LLM vs User 11376021): 0.085
Cohen's Kappa (LLM vs User 40652679): 0.401
Cohen's Kappa (LLM vs User 40263699): 0.044
Cohen's Kappa (LLM vs User 41141777): 0.268
Cohen's Kappa (LLM vs User 40893415): 0.353
Cohen's Kappa (LLM vs User 41881477): 0.552
Cohen's Kappa (LLM vs User 38687371): 0.467
Cohen's Kappa (LLM vs User 40914155): 0.448
Cohen's Kappa (LLM vs User 36624581): 0.240
Cohen's Kappa (LLM vs User 39419117): 0.211
Cohen's Kappa (LLM vs User 34307153): -0.065
Cohen's Kappa (LLM vs User 41399716): 0.114
Cohen's Kappa (LLM vs User 42160208): 0.125
Cohen's Kappa (LLM vs User 39632557): -0.084
Cohen's Kappa (LLM vs User 38514103): -0.037
Cohen's Kappa (LLM vs User 33599386): 0.287
Cohen's Kappa (LLM vs User 39499917): 0.012
Cohen's Kappa (LLM vs User 42

In [55]:
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.metrics import cohen_kappa_score

# Step 1: Compute per-user kappa and agreement count
results = []

users = crowd['X_worker_id'].unique()

for user in users:
    df_user = crowd[crowd['X_worker_id'] == user]
    merged = pd.merge(df_user, llm, on='ref_value', suffixes=('_human', '_llm'))

    if len(merged) > 0:
        kappa = cohen_kappa_score(merged['author_type_human'], merged['author_type_llm'])
        agree_count = sum(merged['author_type_human'] == merged['author_type_llm'])
        results.append({'user': user, 'kappa': kappa, 'agree_count': agree_count})

df_results = pd.DataFrame(results)

# Step 2: Bin by Kappa score
bins = [-1, 0, 0.2, 0.4, 0.6, 0.8, 1.0]
labels = ['<0', '0–0.2', '0.2–0.4', '0.4–0.6', '0.6–0.8', '0.8–1.0']
df_results['kappa_bin'] = pd.cut(df_results['kappa'], bins=bins, labels=labels)

# Step 3A: Sum of agreement counts per bin
bin_sum = df_results.groupby('kappa_bin')['agree_count'].sum()

# Step 3B: Average agreement counts per user in each bin (optional)
bin_avg = df_results.groupby('kappa_bin')['agree_count'].mean()

# Step 4: Bar chart
plt.figure(figsize=(10, 6))
bin_sum.plot(kind='bar', color='steelblue', edgecolor='black')
plt.title("Total Agreement Counts with LLM per Kappa Score Bin")
plt.xlabel("Cohen's Kappa Bin")
plt.ylabel("Total Agreements with LLM")
plt.grid(axis='y', linestyle='--', alpha=0.5)
plt.tight_layout()
plt.show()


TypeError: Can only merge Series or DataFrame objects, a <class '__main__.LLMWrapper'> was passed