In [1]:
import os

import pandas as pd

In [9]:
# Load the three crowdsourced annotation tables; the files are read in the
# order listed and unpacked into one frame per table.
df_publisher, df_author, df_verification = map(
    pd.read_csv,
    (
        "data/crowdsource/publisher_type.csv",
        "data/crowdsource/author_type.csv",
        "data/crowdsource/publisher_verification.csv",
    ),
)

In [10]:
# Build one single-column frame per table holding its distinct, non-null keys.
# Missing values are removed first, then repeated values; surviving rows keep
# the row labels they had in the source table.
author_series = df_author['ref_value'].dropna().drop_duplicates().to_frame(name='author')
publisher_series = df_publisher['domain'].dropna().drop_duplicates().to_frame(name='publisher')
verification_series = df_verification['domain'].dropna().drop_duplicates().to_frame(name='verification')


In [11]:
# Preview the first rows of each unique-key table, each under its own heading.
for heading, frame in (
    ("✅ Unique author ref_values:", author_series),
    ("✅ Unique publisher domains:", publisher_series),
    ("✅ Unique verification domains:", verification_series),
):
    print(heading)
    display(frame.head())

✅ Unique author ref_values:


Unnamed: 0,author
0,http://8tracks.com/angeloflight12/shake-it-up-...
5,http://akas.imdb.com/name/nm2304352/
10,http://americanart.si.edu/collections/search/a...
15,http://americanart.si.edu/collections/search/a...
20,http://americanart.si.edu/collections/search/a...


✅ Unique publisher domains:


Unnamed: 0,publisher
0,http://8tracks.com
5,http://addons.mozilla.org
14,http://akas.imdb.com
23,http://americanart.si.edu
28,http://archive.org


✅ Unique verification domains:


Unnamed: 0,verification
0,http://8tracks.com
7,http://addons.mozilla.org
13,http://akas.imdb.com
21,http://americanart.si.edu
27,http://amturing.acm.org


In [12]:
# Optional: persist the unique-key tables so they can be reused without
# re-reading the crowdsource files.
# `to_csv` does not create missing parent folders, so make sure the target
# directory exists first (mirrors what the LLM-results save cell does).
os.makedirs("data/unique", exist_ok=True)

# index=False: the row labels are leftovers from the source tables and are not
# part of the unique-key data.
author_series.to_csv("data/unique/unique_authors.csv", index=False)
publisher_series.to_csv("data/unique/unique_publishers.csv", index=False)
verification_series.to_csv("data/unique/unique_verifications.csv", index=False)

In [13]:
# Small hand-written sample inputs for trying the prompts manually.
# NOTE(review): both author URLs differ slightly from the ref_values shown in
# the dataset preview (each has an extra "8") — confirm whether that is
# intentional before relying on them.
author_urls, publisher_urls = (
    [
        "http://8tracks.com/angelo8flight12/shake-it-up-dance",
        "http://akas.imdb.com/name/nm23804352",
    ],
    [
        "http://8tracks.com",
        "http://addons.mozilla.org",
    ],
)


In [14]:
from ollama import Client

class LLMWrapper:
    """Small convenience layer around an Ollama chat client bound to one model."""

    def __init__(
        self,
        model_name: str = "llama3.2",
        host: str = "http://localhost:11434",
        verbose: bool = True
    ):
        """
        Create the Ollama client and remember which model to query.

        Args:
            model_name (str): Model name sent with every chat request.
            host (str): Address of the Ollama server handed to ``Client``.
            verbose (bool): When true, print a one-line confirmation.
        """
        self.model_name = model_name
        self.client = Client(host=host)

        if not verbose:
            return
        print(f"[LLMWrapper] Initialized Ollama LLM: {self.model_name} at {host}")

    def run_prompt(self, prompt):
        """
        Send ``prompt`` as a single user message and return the reply text.

        The reply is read from the ``["message"]["content"]`` entry of the
        chat response.
        """
        conversation = [{"role": "user", "content": prompt}]
        reply = self.client.chat(model=self.model_name, messages=conversation)
        return reply["message"]["content"]


In [15]:
llm = LLMWrapper()

[LLMWrapper] Initialized Ollama LLM: llama3.2 at http://localhost:11434


In [16]:
response = llm.run_prompt('who is the CEO of APPLE')

In [17]:
response

"As of my knowledge cutoff in 2023, the CEO of Apple Inc. is Tim Cook. He has been serving as the company's CEO since August 2011, after Steve Jobs' passing. However, please note that this information may have changed since my knowledge cutoff date."

In [18]:
author = pd.read_csv('data/crowdsource/author_type.csv')

In [19]:
author["author_type"].unique()

array(['organisation', 'nw', 'collective', 'individual', 'ne', 'dn'],
      dtype=object)

In [20]:
pt = pd.read_csv('data/crowdsource/publisher_type.csv')

In [21]:
pt['publisher_type'].unique()

array(['news', 'company', 'sp_source', 'academia', 'other', 'nw', 'ne',
       'govt'], dtype=object)

In [40]:
pv = pd.read_csv('data/crowdsource/publisher_verification.csv')

In [42]:
pv


Unnamed: 0,domain,X_worker_id,results
0,http://8tracks.com,29415754,vendor
1,http://8tracks.com,41189648,vendor
2,http://8tracks.com,36268643,vendor
3,http://8tracks.com,28375885,vendor
4,http://8tracks.com,39006856,vendor
...,...,...,...
1660,http://yum.baseurl.org,39332461,vendor
1661,http://yum.baseurl.org,41652222,vendor
1662,http://yum.baseurl.org,34630521,no
1663,http://yum.baseurl.org,35673676,no


In [24]:
def author_type_prompt(ref_value):
    """Build the author-type classification prompt for one reference.

    ``ref_value`` is interpolated verbatim after "Reference: ". The prompt
    lists six labels (individual, organisation, collective, nw, ne, dn) and
    asks for a JSON object of the form {"label": ...} with no explanation.

    Returns:
        The prompt text; it starts and ends with a newline.
    """
    prompt_lines = [
        "",
        "You are an expert assistant tasked with classifying the **type of author** for the following reference URL or citation:",
        "",
        f"Reference: {ref_value}",
        "",
        "Please determine the author type based on the reference and choose one of the following categories:",
        "- 'individual': if the content is authored by a named person.",
        "- 'organisation': if the content is authored by a company, institution, or group.",
        "- 'collective': if the article is signed by a collective group (e.g., editorial board).",
        "- 'nw' (not well-defined): if the URL or metadata is broken, unclear, or not informative.",
        "- 'ne' (not enough evidence): if there's not enough information to decide.",
        "- 'dn' (does not apply): if the reference is irrelevant or malformed.",
        "",
        "",
        "Your output should be **only one of these labels**: individual, organisation, collective, nw, ne, dn.",
        "Return your answer as a JSON object using this format:",
        '{"label": "<one_of_the_labels_above>"}',
        "",
        "Only output the JSON. Do not include any explanations.",
        "",
    ]
    return "\n".join(prompt_lines)


In [25]:
def publisher_type_prompt(domain):
    """Build the publisher-type classification prompt for one web domain.

    ``domain`` is interpolated verbatim after "Domain: ". The prompt lists
    nine labels (news, company, sp_source, academia, govt, other, nw, ne, dn)
    and asks for a JSON object of the form {"label": ...} with no explanation.

    Returns:
        The prompt text; it starts and ends with a newline.
    """
    prompt_lines = [
        "",
        "You are evaluating the **type of publisher** associated with this web domain:",
        "",
        f"Domain: {domain}",
        "",
        "Classify the domain into one of the following types:",
        "- 'news': News outlet (e.g., BBC, CNN).",
        "- 'company': Corporate website or brand.",
        "- 'sp_source': Special interest group (e.g., activist organizations).",
        "- 'academia': University, scholarly, or academic institution.",
        "- 'govt': Government-related website.",
        "- 'other': If it doesn't fit any of the above.",
        "- 'nw' (not well-defined): if the domain is ambiguous or lacks proper information.",
        "- 'ne' (not enough evidence): if you cannot confidently decide.",
        "- 'dn' (does not apply): if the domain is invalid or unrelated.",
        "",
        "Your response must be one of these labels: news, company, sp_source, academia, other, nw, ne, govt, dn.",
        "Return your answer as a JSON object using this format:",
        '{"label": "<one_of_the_labels_above>"}',
        "",
        "Only output the JSON. Do not include any explanations.",
        "",
    ]
    return "\n".join(prompt_lines)


In [53]:
def publisher_verification_prompt(domain):
    """Build the publisher-verification classification prompt for one domain.

    ``domain`` is interpolated verbatim after "Domain: ". The prompt lists
    fourteen labels and asks for a JSON object of the form {"label": ...}
    with no explanation.

    Returns:
        The prompt text; it starts and ends with a newline.
    """
    prompt_lines = [
        "",
        "You are evaluating the **verification type** of the publisher associated with this domain:",
        "",
        f"Domain: {domain}",
        "",
        "Choose the most appropriate label from the following options:",
        "- 'yes': if the publisher is verified or reputable.",
        "- 'no': if the publisher is known to be unreliable.",
        "- 'vendor': if it's a commercial seller or online marketplace.",
        "- 'no_profit': if the site belongs to a nonprofit organization.",
        "- 'political': if the domain represents a political entity or campaign.",
        "- 'cultural': if it's a cultural or artistic institution.",
        "- 'trad_news': a traditional media outlet (e.g., established newspapers).",
        "- 'non_trad_news': blogs, YouTube channels, or alt-media sites.",
        "- 'academia_uni': university-level academic publisher.",
        "- 'academia_pub': peer-reviewed academic journal or press.",
        "- 'academia_other': other academic institution.",
        "- 'nw': not well-defined.",
        "- 'ne': not enough evidence to decide.",
        "- 'dn': does not apply.",
        "",
        "Please return one label from the list above.",
        "Return your answer as a JSON object using this format:",
        '{"label": "<one_of_the_labels_above>"}',
        "",
        # The trailing space below is part of the prompt text and is kept as is.
        "Only output the JSON. ",
        "STRICT INSTRUCTIONS :",
        "Do not include any explanations.",
        "",
    ]
    return "\n".join(prompt_lines)


In [46]:
pv

Unnamed: 0,domain,X_worker_id,results
0,http://8tracks.com,29415754,vendor
1,http://8tracks.com,41189648,vendor
2,http://8tracks.com,36268643,vendor
3,http://8tracks.com,28375885,vendor
4,http://8tracks.com,39006856,vendor
...,...,...,...
1660,http://yum.baseurl.org,39332461,vendor
1661,http://yum.baseurl.org,41652222,vendor
1662,http://yum.baseurl.org,34630521,no
1663,http://yum.baseurl.org,35673676,no


In [51]:
# 2. Extract the unique, non-null items to classify from each crowdsource table.
# All three arrays must be defined here: the count summary further down reads
# `unique_authors` and `unique_publishers` as well as `unique_verifiers`, so
# leaving any of them commented out breaks a fresh top-to-bottom run.
unique_authors = author['ref_value'].dropna().unique()
unique_publishers = pt['domain'].dropna().unique()
unique_verifiers = pv['domain'].dropna().unique()

In [52]:
len(unique_verifiers)

293

In [28]:
# Fresh, independent accumulators for the (item, label) pairs produced by each
# classification task.
author_results, publisher_results, verification_results = [], [], []

In [29]:
# Report how many distinct items each classification task has to process.
for banner, unique_items in (
    ('---------- Number of unique ref for the AUTHORS --------------', unique_authors),
    ('---------- Number of unique ref for the PUBLISHER --------------', unique_publishers),
    ('---------- Number of unique ref for the VERIFICATION --------------', unique_verifiers),
):
    print(banner)
    print(len(unique_items))


---------- Number of unique ref for the AUTHORS --------------
1178
---------- Number of unique ref for the PUBLISHER --------------
278
---------- Number of unique ref for the VERIFICATION --------------
293


In [31]:
import json

In [30]:
prompt = author_type_prompt("http://8tracks.com/angeloflight12/shake-it-up-dance")
response = llm.run_prompt(prompt)

In [35]:
json.loads(response)["label"]

'nw'

In [None]:
# #4. Predict Author Type
# for ref in unique_authors:
#     prompt = author_type_prompt(ref)
#     response = llm.run_prompt(prompt) 
#     label = json.loads(response)["label"]
#     author_results.append((ref, label))
    
    

# # 5. Predict Publisher Type
# for domain in unique_publishers:
#     prompt = publisher_type_prompt(domain)
#     response = llm.run_prompt(prompt)
#     label = json.loads(response)["label"]
#     publisher_results.append((domain, label))

# 6. Predict Publisher Verification Type
# for domain in unique_verifiers:
#     prompt = publisher_verification_prompt(domain)
#     response = llm.run_prompt(prompt)
#     label = json.loads(response)["label"]
#     verification_results.append((domain, label))


In [55]:
import os

In [56]:
# Create the target directory if it doesn't exist
output_dir = "data/llm_crowdsource"
os.makedirs(output_dir, exist_ok=True)

# Convert the accumulated (item, label) pairs to DataFrames
df_author = pd.DataFrame(author_results, columns=["ref_value", "author_type"])
df_publisher = pd.DataFrame(publisher_results, columns=["domain", "publisher_type"])
df_verification = pd.DataFrame(verification_results, columns=["domain", "publisher_verification"])

# Save as CSV, but never replace a file with an empty table: the result lists
# are empty unless the prediction loops have been run in this session, and
# writing them out anyway would wipe previously saved LLM labels.
for result_frame, result_filename in (
    (df_author, "author_results.csv"),
    (df_publisher, "publisher_results.csv"),
    (df_verification, "verification_results.csv"),
):
    if len(result_frame) == 0:
        print(f"[skip] {result_filename}: no results collected, existing file left untouched")
        continue
    result_frame.to_csv(os.path.join(output_dir, result_filename), index=False)

In [67]:
# Fixed list of property identifiers, in their original order.
# NOTE(review): these look like Wikidata property IDs — confirm; the list is
# not used by any other cell visible in this notebook.
properties = (
    "P31 P1215 P258 P17 P131 P106 P625 P2215 P3083 "
    "P6257 P6259 P6258 P21 P2671 P59 P735 P569 P27 "
    "P18 P646 P361 P684 P1476 P734 P2216 P1566 P2214 "
    "P171 P225 P105 P373 P279 P19 P2583 P214 P570 "
    "P1087 P703 P407 P276 P846 P571 P577 P1412 P1082 "
    "P971 P69 P1435 P421 P195 P527"
).split()


In [68]:
print('hello')

hello


In [None]:
from ollama import Client
# Direct use of the Ollama client, without the wrapper class: send one user
# message to the local server and keep the raw chat response.
client = Client(host='http://localhost:11434')
response = client.chat(
    model='llama3.2',
    messages=[{'role': 'user', 'content': 'Why is the sky blue?'}],
)

In [1]:
import pandas as pd
from metrics import computeFleissKappa

# Load crowdsource and LLM data.
# NOTE(review): `llm` is a DataFrame here; if this cell runs in the same kernel
# as the LLMWrapper cells it replaces the wrapper instance of the same name.
crowd = pd.read_csv("data/crowdsource/author_type.csv")
llm = pd.read_csv("data/llm_crowdsource/author_results.csv")
llm.columns = ['ref_value', 'author_type']

# Reduce the crowd data to one rating per reference by keeping the first row
# seen for each ref_value.
crowd_one = crowd[['ref_value', 'author_type']].drop_duplicates('ref_value')

# Inner join: only references labelled by both the crowd and the LLM remain.
merged = pd.merge(crowd_one, llm, on='ref_value', suffixes=('_crowd', '_llm'))

# Possible labels; their order defines the columns of the count matrix.
LABELS = ['organisation', 'collective', 'nw', 'individual', 'ne', 'dn']
label_position = {label: position for position, label in enumerate(LABELS)}

# Count matrix: one row per reference, one column per label, each row holding
# exactly two ratings (crowd + LLM). Fleiss' kappa assumes every item has the
# same number of ratings, so references where either label is missing or not in
# LABELS are left out instead of contributing a short row.
fleiss_ratings = []
skipped_items = 0
for crowd_label, llm_label in zip(merged['author_type_crowd'], merged['author_type_llm']):
    if crowd_label not in label_position or llm_label not in label_position:
        skipped_items += 1
        continue
    counts = [0] * len(LABELS)
    counts[label_position[crowd_label]] += 1
    counts[label_position[llm_label]] += 1
    fleiss_ratings.append(counts)

if skipped_items:
    print(f"Skipped {skipped_items} of {len(merged)} references without two valid labels")

# Compute Fleiss' Kappa
kappa = computeFleissKappa(fleiss_ratings)
print("Fleiss' Kappa (Author Type, 1 crowd + 1 LLM):", kappa)


Fleiss' Kappa (Author Type, 1 crowd + 1 LLM): 0.14679739451374615


ModuleNotFoundError: No module named 'metrics'