In [6]:
from openai import OpenAI
import os
from dotenv import load_dotenv
load_dotenv() 

api_key = os.getenv("API_KEY") 
data_folder = os.getenv("DATA_FOLDER")
base_url = "https://chat-ai.academiccloud.de/v1"
model = "meta-llama-3.1-70b-instruct"  # Choose any available model
temperature = 0.01

# Start OpenAI client
client = OpenAI(
    api_key=api_key,
    base_url=base_url
    )

def process_prompt(prompt):
    # Get response
    chat_completion = client.chat.completions.create(
        messages=[
            {"role": "system", "content": "SYSTEM MESSAGE"},
            {"role": "user", "content": prompt}
        ],
        model=model,
        temperature=temperature
    )

    return chat_completion.choices[0].message.content

print(process_prompt("Hello!"))

Hello. How can I help you today?


Get the prompt template and the pre-processed articles.

In [7]:
#open the prompt
version = 3
prompt_path = f"../prompts/prompt_v{version}.txt"
with open(prompt_path, "r", encoding="utf-8") as f:
    prompt = f.read()

print(prompt)

We define a claim as an elementary information unit in a sentence, which no longer
needs to be further split. Furthermore, core claims are ones that best describe 
the core ideas of an article.



For example, given the following article:
INPUT:
{'headline': 'Three Russia Hoax Bombshells Hidden In IG Report On DOJ Surveillance Of Congress', 1: 'By: Mollie Hemingway\nDecember 17, 2024\n9 min read\nSen. Adam Schiff\nImage Credit\nSen. Adam Schiff/YouTube\n\nThese revelations show why the DOJ needs massive reform in the next administration.', 2: 'Author Mollie Hemingway profile\nMollie Hemingway\nVisit on Twitter\n@mzhemingway\nMore Articles\nShare\n\n    Share Article on Facebook\n    Share Article on Twitter\n\nShare Article on Truth Social\n\n    Share Article via Email\n\nLast week the Department of Justice’s inspector general released a report on some of the DOJ’s tracking of communications from media and congressional figures as part of its purported investigation into who was leaki

In [23]:
import pandas as pd
article_file_path = f"{data_folder}/TRUE_articles_pre_processed.csv" 

df = pd.read_csv(article_file_path, sep=";", index_col=0)
df

Unnamed: 0,URL,Response,Wayback URL,Title,Body,id_body
0,www.washingtontimes.com/news/2021/jul/19/arizo...,,,Majority of Arizona Republicans believe electi...,\nMaricopa County ballots cast in the 2020 gen...,{'headline': 'Majority of Arizona Republicans ...
1,www.nbcnews.com/news/olympics/member-u-s-women...,,,"Kara Eaker, U.S. women's gymnastics alternate,...",U.S. women's gymnastics alternates Kara Eaker ...,"{'headline': ""Kara Eaker, U.S. women's gymnast..."
2,www.dailywire.com/news/shock-nbc-poll-shows-am...,,,Shock NBC Poll Shows Americans Have ‘Lost Thei...,A whopping 71% of Americans believe the U.S. i...,{'headline': 'Shock NBC Poll Shows Americans H...
3,www.newsmax.com/us/ama-medical-doctor-langauge...,,,AMA Document: Doctors Should Use Language 'Ins...,he American Medical Association on Thursday re...,"{'headline': ""AMA Document: Doctors Should Use..."
4,www.huffpost.com/entry/steve-buscemi-30-rock-m...,,,Steve Buscemi Hands Out Candy Dressed As His O...,“Fargo” star Steve Buscemi handed out Hallowee...,{'headline': 'Steve Buscemi Hands Out Candy Dr...
5,crooksandliars.com/2021/11/major-ivermectin-st...,,,Ivermectin Study Retracted After Data Found To...,Remember all those studies that purportedly sh...,{'headline': 'Ivermectin Study Retracted After...
6,www.newsmax.com/headline/terry-mcauliffe-glenn...,,,McAuliffe Concedes Virginia Governor's Race,Democrat Terry McAuliffe conceded defeat in th...,"{'headline': ""McAuliffe Concedes Virginia Gove..."
7,news.yahoo.com/qanon-supporters-gather-over-th...,,,QAnon supporters gather over theory that JFK J...,Some supporters of the QAnon conspiracy gather...,{'headline': 'QAnon supporters gather over the...
8,bipartisanreport.com/2021/11/07/liz-cheney-app...,,,Liz Cheney Appears On ‘Fox Sunday’ To Hand Tru...,Rep. Liz Cheney (R-Wyo.) is continuing to face...,{'headline': 'Liz Cheney Appears On ‘Fox Sunda...
9,www.foxnews.com/politics/biden-approval-harris...,,,Nearly half of voters say Biden worse presiden...,With exactly one year until the midterm electi...,"{'headline': ""Nearly half of voters say Biden ..."


In [24]:
import time
last_call_time = time.time()

def get_throttled_prompt_response(value):
    global last_call_time
    elapsed_time = time.time() - last_call_time #dynamic buffer
    if elapsed_time < 8:
        time.sleep(8 - elapsed_time)
    try:
        last_call_time = time.time()
        return process_prompt(f"{prompt}{value}")
    except:
        return "ERROR: " + str(value)

In [26]:
df['claims'] = df['id_body'].apply(get_throttled_prompt_response) #run the last 700 entries again
print(df['claims'])
df.to_csv(f"{data_folder}/true_base_articles_with_claims.csv", sep=";") #for FM articles: this was run over two sessions (first one completing after 550mins, with the last ca. 700 entries being faulty (502 Error code))

0    {\n    "article_title": "Majority of Arizona R...
1    {\n    "article_title": "Kara Eaker, U.S. wome...
2    {\n    "article_title": "Shock NBC Poll Shows ...
3    {\n    "article_title": "AMA Document: Doctors...
4    {\n    "article_title": "Steve Buscemi Hands O...
5    {\n    "article_title": "Ivermectin Study Retr...
6    {\n    "article_title": "McAuliffe Concedes Vi...
7    {\n    "article_title": "QAnon supporters gath...
8    {\n    "article_title": "Liz Cheney Appears On...
9    {\n    "article_title": "Nearly half of voters...
Name: claims, dtype: object


When running over multiple sessions, reimport:

In [None]:
import pandas as pd
# df1 = pd.read_csv(f"{data_folder}/articles_with_claims.csv", sep=";", index_col=0)
# df2 = pd.read_csv(f"{data_folder}/articles_with_claims_part2.csv", sep=";", index_col=0)
df = pd.read_csv(f"{data_folder}/true_articles_with_claims.csv", sep=";", index_col=0) 

In [None]:
# df = pd.concat([df1.head(1069), df2.tail(700)]) #joining the results from the two sessions
# df

Unnamed: 0,url,title,body,id_body,claims
1,https://abc13.com/us-shortages-gas-shortage-20...,"US shortages 2021: Gas, lumber prices soar; Ke...","WATCH LIVE NEW YORK -- Chicken, lumber, microc...","{'headline': 'US shortages 2021: Gas, lumber p...","{\n""article_title"": ""US shortages 2021: Gas, l..."
2,https://abc7.com/covid-supply-chain-shortage-2...,Global supply chain problems now leading to em...,WATCH LIVE LOS ANGELES (KABC) -- At the beginn...,{'headline': 'Global supply chain problems now...,"{\n""article_title"": ""Global supply chain probl..."
3,https://abcnews.go.com/Politics/biden-replace-...,Biden to replace White House doctor with long-...,O'Connor will take on a role that faced scruti...,{'headline': 'Biden to replace White House doc...,"{\n ""article_title"": ""Biden to replace White H..."
4,https://abcnews.go.com/Politics/whats-causing-...,What’s causing America’s massive supply-chain ...,Untangling supply chain woes could take much l...,{'headline': 'What’s causing America’s massive...,"{\n""article_title"": ""What’s causing America’s ..."
5,https://abcnews.go.com/US/nature-based-man-mad...,Nature-based or lab leak? Unraveling the debat...,Accomplished scientists and public health offi...,{'headline': 'Nature-based or lab leak? Unrave...,"{\n""article_title"": ""Nature-based or lab leak?..."
...,...,...,...,...,...
1765,https://www.washingtonpost.com/politics/trump-...,Trump clings to one marker as a sign of succes...,clockThis article was published more than 3 ye...,{'headline': 'Trump clings to one marker as a ...,"{\n""article_title"": ""Trump clings to one marke..."
1766,https://www.washingtonpost.com/politics/trump-...,Trump says ‘there was no reason’ for officer t...,clockThis article was published more than 3 ye...,{'headline': 'Trump says ‘there was no reason’...,"{\n""article_title"": ""Trump says ‘there was no ..."
1767,https://www.washingtonpost.com/politics/trump-...,Trump Organization removes indicted top financ...,clockThis article was published more than 3 ye...,{'headline': 'Trump Organization removes indic...,"{\n""article_title"": ""Trump Organization remove..."
1768,https://www.washingtonpost.com/politics/trump-...,Trump business and its longtime chief financia...,clockThis article was published more than 3 ye...,{'headline': 'Trump business and its longtime ...,ERROR: {'headline': 'Trump business and its lo...


In [None]:
#df.to_csv(f"{data_folder}/articles_with_claims_merged.csv", sep=";") 

In [9]:
def mark_errors(claim_output):
    if "ERROR:" in claim_output: #checking for model output errors (serverside)
        return True
    if "INVALID INPUT" in claim_output: #checking for input errors (that occured during sentencesplitting)
        return True
    else:
        return False

In [18]:
df['output_errors'] = df['claims'].apply(mark_errors)
df['pre_processing_errors'] = df['id_body'].apply(mark_errors) 
df['claims'].loc[df['output_errors'] == True] #FM: three iterations bringing the number of faulty outputs down from: 157 -> 1 -> 0; TRUE: 202 -> 1

2757    ERROR: {'headline': 'Covid-19 news archive: Pf...
Name: claims, dtype: object

In [19]:
len(df['claims'].loc[df['pre_processing_errors'] == True]) #16 faulty inputs (FM); 2 for TRUE sample

2

In [17]:
#running until no output errors are left
df['claims'].loc[df['output_errors'] == True] = df['id_body'].loc[df['output_errors'] == True].apply(get_throttled_prompt_response)

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df['claims'].loc[df['output_errors'] == True] = df['id_body'].loc[df['output_errors'] == True].apply(get_throttled_prompt_response)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-d

In [20]:
df.to_csv(f"{data_folder}/true_articles_with_claims.csv", sep=";") #articles_with_claims.csv

In [30]:
import json

def safe_json_loads(json_str):
    try:
        return json.loads(json_str)
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON: {json_str}")
        print(e)
        return {}


In [16]:
import ast
df['id_body'] = df['id_body'].apply(ast.literal_eval) #convert the string literals back to a dict
df['Claims'] = df['Claims'].apply(ast.literal_eval)

SyntaxError: unexpected EOF while parsing (<unknown>, line 1)

In [33]:
#export example

with open("newsmax_example.json", "w") as j:
    json.dump(df['Claims'][14], j)

In [14]:
# example = json.loads(df['Claims'][0])
# example_claims = example['claims']
# example_reference = df['ID_Body'][0]

def test_claim(claims, reference):
    for claim in claims:
        print(f"""
Claim: {claim['claim']}, 
    Position {claim['position']}, 
    Corresponding sentence: {reference[claim['position'] if claim['position'] != 0 else 'headline']}
        """)

for claim_data, reference in zip(df['Claims'], df['ID_Body']):
    claim_data =  r"" + claim_data
    claim_data = safe_json_loads(claim_data)
    claims = claim_data['claims']
    test_claim(claims, reference)

# test_claim(example_claims, example_reference)


Claim: Arizona state Sen. Wendy Rogers is demanding that the state’s electors — which went for President Joe Biden — be recalled, and a new election be conducted in the state., 
    Position 1, 
    Corresponding sentence: C. Douglas Golden, The Western Journal By C. Douglas Golden, The Western Journal
Published July 17, 2021 at 12:12pm
Share on Facebook
Tweet
Mewe Share
P Share
Email

In the wake of the Maricopa County audit of the 2020 election results, one GOP Arizona state senator is demanding that the state’s electors — which went for President Joe Biden — be recalled, and a new election be conducted in the state.
        

Claim: The Maricopa County audit of the 2020 election results found that thousands of names were added to the voting rolls after election day, but are recorded as casting a ballot in the 2020 election., 
    Position 11, 
    Corresponding sentence: According to Newsweek, Cyber Ninjas founder Doug Logan told the Arizona state Senate this week, “Thousands of na