In [1]:
import pandas as pd

In [2]:
df_publisher = pd.read_csv("data/publisher_type.csv")
df_author = pd.read_csv("data/author_type.csv")
df_verification = pd.read_csv("data/publisher_verification.csv") 

In [7]:
# Extract unique entries for each file
author_series = pd.DataFrame({'author': df_author['ref_value'].dropna().drop_duplicates()})
publisher_series = pd.DataFrame({'publisher': df_publisher['domain'].dropna().drop_duplicates()})
verification_series = pd.DataFrame({'verification': df_verification['domain'].dropna().drop_duplicates()})


In [8]:
# Display the results
print("✅ Unique author ref_values:")
display(author_series.head())

print("✅ Unique publisher domains:")
display(publisher_series.head())

print("✅ Unique verification domains:")
display(verification_series.head())

✅ Unique author ref_values:


Unnamed: 0,author
0,http://8tracks.com/angeloflight12/shake-it-up-...
5,http://akas.imdb.com/name/nm2304352/
10,http://americanart.si.edu/collections/search/a...
15,http://americanart.si.edu/collections/search/a...
20,http://americanart.si.edu/collections/search/a...


✅ Unique publisher domains:


Unnamed: 0,publisher
0,http://8tracks.com
5,http://addons.mozilla.org
14,http://akas.imdb.com
23,http://americanart.si.edu
28,http://archive.org


✅ Unique verification domains:


Unnamed: 0,verification
0,http://8tracks.com
7,http://addons.mozilla.org
13,http://akas.imdb.com
21,http://americanart.si.edu
27,http://amturing.acm.org


In [10]:
# Optional: save them
author_series.to_csv("data/unique/unique_authors.csv", index=False)
publisher_series.to_csv("data/unique/unique_publishers.csv", index=False)
verification_series.to_csv("data/unique/unique_verifications.csv", index=False)

In [11]:
author_urls = [
    "http://8tracks.com/angelo8flight12/shake-it-up-dance",
    "http://akas.imdb.com/name/nm23804352"
]
publisher_urls = [
    "http://8tracks.com",
    "http://addons.mozilla.org"
]


In [1]:
from groq import Groq

In [3]:
import os
from langchain_groq import ChatGroq

In [5]:
class LLMWrapper:
    def __init__(
        self,
        model_name: str = "meta-llama/llama-4-scout-17b-16e-instruct",
        temperature: float = 0.0,
        api_key: str = None,
        verbose: bool = True
    ):
        """
        Initialize a Groq LLM with customizable settings.

        Args:
            model_name (str): Name of the Groq model to use.
            temperature (float): Sampling temperature for output diversity.
            api_key (str): Groq API key (uses env var if None).
            verbose (bool): Whether to print init confirmation.
        """
        self.model_name = model_name
        self.temperature = temperature
        self.api_key = api_key or os.getenv("GROQ_API_KEY")

        if not self.api_key:
            raise ValueError("GROQ_API_KEY is missing. Set it via env or parameter.")

        self.llm = ChatGroq(
            groq_api_key=self.api_key,
            model_name=self.model_name,
            temperature=self.temperature
        )

        if verbose:
            print(f"[LLMWrapper] Initialized Groq LLM: {self.model_name} (temp={self.temperature})")

    def get_llm(self):
        """Return the internal LLM instance."""
        return self.llm

In [65]:
from dotenv import load_dotenv
load_dotenv()

True

In [8]:
llm = LLMWrapper().get_llm()

[LLMWrapper] Initialized Groq LLM: meta-llama/llama-4-scout-17b-16e-instruct (temp=0.0)


In [9]:
response = llm.invoke('who is the CEO of APPLE')

In [10]:
response

AIMessage(content="The CEO of Apple Inc. is **Tim Cook**. He has been in this position since August 24, 2011, when he succeeded Steve Jobs, the co-founder and former CEO of Apple. Under Cook's leadership, Apple has continued to innovate and expand its product lines, services, and global reach.\n\nHere are some key facts about Tim Cook:\n\n* **Background**: Cook joined Apple in 1994 and served in various roles, including Senior Vice President of Worldwide Sales and Operations.\n* **CEO tenure**: He became CEO on August 24, 2011, and has been leading the company for over 11 years.\n* **Net worth**: Estimated to be around $1.5 billion (mostly in Apple stock).\n\nLet me know if you'd like to know more about Tim Cook or Apple!", additional_kwargs={}, response_metadata={'token_usage': {'completion_tokens': 165, 'prompt_tokens': 17, 'total_tokens': 182, 'completion_time': 0.375586407, 'prompt_time': 0.002663772, 'queue_time': 0.084058639, 'total_time': 0.378250179}, 'model_name': 'meta-llama/

In [14]:
import pandas as pd

In [15]:
author = pd.read_csv('data/crowdsource/author_type.csv')

In [17]:
author["author_type"].unique()

array(['organisation', 'nw', 'collective', 'individual', 'ne', 'dn'],
      dtype=object)

In [18]:
pt = pd.read_csv('data/crowdsource/publisher_type.csv')

In [19]:
pt['publisher_type'].unique()

array(['news', 'company', 'sp_source', 'academia', 'other', 'nw', 'ne',
       'govt'], dtype=object)

In [20]:
pv = pd.read_csv('data/crowdsource/publisher_verification.csv')

In [21]:
pv['results'].unique()

array(['vendor', 'no_profit', 'ne', 'cultural', 'no', 'political', 'yes',
       'nw', 'trad_news', 'non_trad_news', 'academia_uni', 'academia_pub',
       'academia_other'], dtype=object)

In [40]:
def author_type_prompt(ref_value):
    return f"""
You are an expert assistant tasked with classifying the **type of author** for the following reference URL or citation:

Reference: {ref_value}

Please determine the author type based on the reference and choose one of the following categories:
- 'individual': if the content is authored by a named person.
- 'organisation': if the content is authored by a company, institution, or group.
- 'collective': if the article is signed by a collective group (e.g., editorial board).
- 'nw' (not well-defined): if the URL or metadata is broken, unclear, or not informative.
- 'ne' (not enough evidence): if there's not enough information to decide.
- 'dn' (does not apply): if the reference is irrelevant or malformed.


Your output should be **only one of these labels**: individual, organisation, collective, nw, ne, dn.
Return your answer as a JSON object using this format:
{{"label": "<one_of_the_labels_above>"}}

Only output the JSON. Do not include any explanations.
"""


In [41]:
def publisher_type_prompt(domain):
    return f"""
You are evaluating the **type of publisher** associated with this web domain:

Domain: {domain}

Classify the domain into one of the following types:
- 'news': News outlet (e.g., BBC, CNN).
- 'company': Corporate website or brand.
- 'sp_source': Special interest group (e.g., activist organizations).
- 'academia': University, scholarly, or academic institution.
- 'govt': Government-related website.
- 'other': If it doesn't fit any of the above.
- 'nw' (not well-defined): if the domain is ambiguous or lacks proper information.
- 'ne' (not enough evidence): if you cannot confidently decide.
- 'dn' (does not apply): if the domain is invalid or unrelated.

Your response must be one of these labels: news, company, sp_source, academia, other, nw, ne, govt, dn.
Return your answer as a JSON object using this format:
{{"label": "<one_of_the_labels_above>"}}

Only output the JSON. Do not include any explanations.
"""


In [42]:
def publisher_verification_prompt(domain):
    return f"""
You are evaluating the **verification type** of the publisher associated with this domain:

Domain: {domain}

Choose the most appropriate label from the following options:
- 'yes': if the publisher is verified or reputable.
- 'no': if the publisher is known to be unreliable.
- 'vendor': if it's a commercial seller or online marketplace.
- 'no_profit': if the site belongs to a nonprofit organization.
- 'political': if the domain represents a political entity or campaign.
- 'cultural': if it's a cultural or artistic institution.
- 'trad_news': a traditional media outlet (e.g., established newspapers).
- 'non_trad_news': blogs, YouTube channels, or alt-media sites.
- 'academia_uni': university-level academic publisher.
- 'academia_pub': peer-reviewed academic journal or press.
- 'academia_other': other academic institution.
- 'nw': not well-defined.
- 'ne': not enough evidence to decide.
- 'dn': does not apply.

Please return one label from the list above.
Return your answer as a JSON object using this format:
{{"label": "<one_of_the_labels_above>"}}

Only output the JSON. Do not include any explanations.
"""


In [26]:
# 2. Extract unique values
unique_authors = author['ref_value'].dropna().unique()
unique_publishers = pt['domain'].dropna().unique()
unique_verifiers = pv['domain'].dropna().unique()

In [43]:
author_results = []
publisher_results = []
verification_results = []

In [31]:
print('---------- Number of unique ref for the AUTHORS --------------')
print(len(unique_authors))
print('---------- Number of unique ref for the PUBLISHER --------------')
print(len(unique_publishers))
print('---------- Number of unique ref for the VERIFICATION --------------')
print(len(unique_verifiers))


---------- Number of unique ref for the AUTHORS --------------
1178
---------- Number of unique ref for the PUBLISHER --------------
278
---------- Number of unique ref for the VERIFICATION --------------
293


In [60]:
prompt = author_type_prompt("http://8tracks.com/angeloflight12/shake-it-up-dance")
response = llm.invoke(prompt)

In [61]:
import json

In [62]:
label = json.loads(response.content)["label"]

In [63]:
label

'nw'

In [66]:
# 4. Predict Author Type
# for ref in unique_authors:
#     prompt = author_type_prompt(ref)
#     response = llm.invoke(prompt)  
#     label = json.loads(response.content)["label"]
#     author_results.append((ref, label))
    
    

# 5. Predict Publisher Type
for domain in unique_publishers:
    prompt = publisher_type_prompt(domain)
    response = llm.invoke(prompt)
    label = json.loads(response.content)["label"]
    publisher_results.append((domain, label))

# 6. Predict Publisher Verification Type
for domain in unique_verifiers:
    prompt = publisher_verification_prompt(domain)
    response = llm.invoke(prompt)
    label = json.loads(response.content)["label"]
    verification_results.append((domain, label))


RateLimitError: Error code: 429 - {'error': {'message': 'Rate limit reached for model `meta-llama/llama-4-scout-17b-16e-instruct` in organization `org_01j9xv8kpbf88bkesy5wtxvnca` service tier `on_demand` on requests per day (RPD): Limit 1000, Used 1000, Requested 1. Please try again in 1m26.242s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'requests', 'code': 'rate_limit_exceeded'}}

In [56]:
author_results

[('http://8tracks.com/angeloflight12/shake-it-up-dance', 'nw'),
 ('http://akas.imdb.com/name/nm2304352/', 'individual'),
 ('http://americanart.si.edu/collections/search/artwork/?id=11098',
  'organisation'),
 ('http://americanart.si.edu/collections/search/artwork/?id=11416',
  'organisation'),
 ('http://americanart.si.edu/collections/search/artwork/?id=11819',
  'organisation'),
 ('http://americanart.si.edu/collections/search/artwork/?id=14493',
  'organisation'),
 ('http://americanart.si.edu/collections/search/artwork/?id=20476',
  'organisation'),
 ('http://americanart.si.edu/collections/search/artwork/?id=21513',
  'organisation'),
 ('http://americanart.si.edu/collections/search/artwork/?id=22834',
  'organisation'),
 ('http://americanart.si.edu/collections/search/artwork/?id=25639',
  'organisation'),
 ('http://americanart.si.edu/collections/search/artwork/?id=25825',
  'organisation'),
 ('http://americanart.si.edu/collections/search/artwork/?id=25858',
  'organisation'),
 ('http:/

In [58]:
len(author_results)

967

In [59]:
len(unique_authors)

1178

In [67]:
properties = [
    "P31", "P1215", "P258", "P17", "P131", "P106", "P625", "P2215", "P3083",
    "P6257", "P6259", "P6258", "P21", "P2671", "P59", "P735", "P569", "P27",
    "P18", "P646", "P361", "P684", "P1476", "P734", "P2216", "P1566", "P2214",
    "P171", "P225", "P105", "P373", "P279", "P19", "P2583", "P214", "P570",
    "P1087", "P703", "P407", "P276", "P846", "P571", "P577", "P1412", "P1082",
    "P971", "P69", "P1435", "P421", "P195", "P527"
]


In [68]:
print('hello')

hello
