In [20]:
import pandas as pd
import os
import numpy as np
from dotenv import load_dotenv
load_dotenv()

data_folder = os.getenv("DATA_FOLDER")

#load the three relevant dataframes
claim_df = pd.read_csv(f'{data_folder}/articles_with_claims.csv', sep=";",index_col=0)
search_result_df = pd.read_csv(f'{data_folder}/All_Search_Results_Combined.csv', index_col=0)
search_result_df.drop(columns="X", inplace=True)
fm_articles_df = pd.read_excel(f'{data_folder}/FM_Articles_with_ids_processed.xlsx', index_col=0)

In [21]:
claim_df.rename(columns={'url': 'URL', 'title': 'Title', 'body': 'Body', 'claims': 'Claims'}, inplace=True) #rename columns to make them consistent between dataframes, enabling the flexibility of functions

Claims were extracted for 1769 articles. Note, however, that these are stored as strings, which can result in potential evaluation errors, making them unusuable.

In [22]:
claim_df

Unnamed: 0,URL,Title,Body,id_body,Claims,errors,output_errors,pre_processing_errors
1,https://abc13.com/us-shortages-gas-shortage-20...,"US shortages 2021: Gas, lumber prices soar; Ke...","WATCH LIVE NEW YORK -- Chicken, lumber, microc...","{'headline': 'US shortages 2021: Gas, lumber p...","{\n""article_title"": ""US shortages 2021: Gas, l...",False,False,False
2,https://abc7.com/covid-supply-chain-shortage-2...,Global supply chain problems now leading to em...,WATCH LIVE LOS ANGELES (KABC) -- At the beginn...,{'headline': 'Global supply chain problems now...,"{\n""article_title"": ""Global supply chain probl...",False,False,False
3,https://abcnews.go.com/Politics/biden-replace-...,Biden to replace White House doctor with long-...,O'Connor will take on a role that faced scruti...,{'headline': 'Biden to replace White House doc...,"{\n ""article_title"": ""Biden to replace White H...",False,False,False
4,https://abcnews.go.com/Politics/whats-causing-...,What’s causing America’s massive supply-chain ...,Untangling supply chain woes could take much l...,{'headline': 'What’s causing America’s massive...,"{\n""article_title"": ""What’s causing America’s ...",False,False,False
5,https://abcnews.go.com/US/nature-based-man-mad...,Nature-based or lab leak? Unraveling the debat...,Accomplished scientists and public health offi...,{'headline': 'Nature-based or lab leak? Unrave...,"{\n""article_title"": ""Nature-based or lab leak?...",False,False,False
...,...,...,...,...,...,...,...,...
1765,https://www.washingtonpost.com/politics/trump-...,Trump clings to one marker as a sign of succes...,clockThis article was published more than 3 ye...,{'headline': 'Trump clings to one marker as a ...,"{\n""article_title"": ""Trump clings to one marke...",False,False,False
1766,https://www.washingtonpost.com/politics/trump-...,Trump says ‘there was no reason’ for officer t...,clockThis article was published more than 3 ye...,{'headline': 'Trump says ‘there was no reason’...,"{\n""article_title"": ""Trump says ‘there was no ...",False,False,False
1767,https://www.washingtonpost.com/politics/trump-...,Trump Organization removes indicted top financ...,clockThis article was published more than 3 ye...,{'headline': 'Trump Organization removes indic...,"{\n""article_title"": ""Trump Organization remove...",False,False,False
1768,https://www.washingtonpost.com/politics/trump-...,Trump business and its longtime chief financia...,clockThis article was published more than 3 ye...,{'headline': 'Trump business and its longtime ...,"{\n ""article_title"": ""Trump business and its l...",True,False,False


We only care about the search queries that relate to the Fake or Misleading articles

In [23]:
search_result_df = search_result_df[search_result_df['URL'].isin(fm_articles_df['URL'])] #filter down to only contain results that have F/M articles as URL
search_result_df.reset_index(drop=True, inplace=True) 
search_result_df

Unnamed: 0,Article_day,ResponseId,Day,Category,True_Dummy,Seven_Ordinal,Four_Ordinal,Age,Gender,FC_Eval,...,only_rel_80,only_rel_85,only_rel_90,Unrel_contain,Unrel_contain_65,Unrel_contain_70,avg_score,list_domains,list_scores,all_clean_urls
0,Day_1_1,R_10pkUNEtcV6OMCU,Day_1,Coul,0,5,3,35,1,FM,...,0.0,0.0,0.0,1.0,1.0,1.0,91.055556,"newsweek.com,yahoo.com,azmirror.com,fr24news.c...","100,100,57,92.5,100,100,75,100,95",'https://www.newsweek.com/arizona-state-senato...
1,Day_1_1,R_1Gv8iAs1HEqGuUe,Day_1,Misl,0,1,1,35,1,FM,...,1.0,1.0,0.0,0.0,0.0,0.0,95.277778,"abc15.com,salon.com,cnn.com,politico.com,cbsne...","100,87.5,87.5,100,95,95,100,100,92.5",'https://www.abc15.com/news/state/poll-many-re...
2,Day_1_1,R_1Gv8iAs1HEqGuUe,Day_1,Misl,0,1,1,35,1,FM,...,0.0,0.0,0.0,1.0,1.0,1.0,83.450000,"cnn.com,thehill.com,salon.com,washingtontimes....","87.5,80,87.5,42,87.5,100,92.5,87.5,85,85",'https://www.cnn.com/2021/07/18/politics/fact-...
3,Day_1_1,R_3KT6q7Vntwvmg8Z,Day_1,True,1,7,3,33,1,FM,...,0.0,0.0,0.0,0.0,0.0,0.0,92.812500,"thehill.com,recorder.maricopa.gov,washingtonpo...","80,100,100,100,100,92.5,75,95",'https://thehill.com/homenews/campaign/563100-...
4,Day_1_1,R_1Gv8iAs1HEqGuUe,Day_1,Misl,0,1,1,35,1,FM,...,1.0,1.0,0.0,0.0,0.0,0.0,96.000000,"apnews.com,detroitnews.com,cnn.com,apnews.com,...","95,92.5,87.5,95,95,100,100,100,100,95",'https://apnews.com/article/technology-joe-bid...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1262,Day_9_3,R_1P6MQAwGMgs0Loo,Day_9,True,1,5,3,39,0,FM,...,0.0,0.0,0.0,0.0,0.0,0.0,94.166667,"googleadservices.com,scientificamerican.com,wh...","100,77.5,87.5,100,100,100","'https://www.googleadservices.com/','https://w..."
1263,Day_9_3,R_rcgF4h6LylUXPUd,Day_9,Misl,0,1,2,23,0,FM,...,1.0,1.0,0.0,0.0,0.0,0.0,92.500000,"covid19.nih.gov,grants.nih.gov,grants.nih.gov,...","100,87.5,100,87.5,87.5",'https://covid19.nih.gov/funding#:~:text=NIH%2...
1264,Day_9_3,R_3JqGYdN07AFVzOf,Day_9,Misl,0,1,1,41,0,FM,...,0.0,0.0,0.0,1.0,1.0,1.0,85.777778,"science.org,the-scientist.com,rollcall.com,van...","100,100,75,82.5,82.5,95,80,57,100",'https://www.science.org/content/article/nih-s...
1265,Day_9_3,R_SHIu8anABwVwNMZ,Day_9,True,1,7,4,33,1,FM,...,0.0,0.0,0.0,1.0,1.0,1.0,7.500000,"',zerohedge.com,madisonarealymesupportgroup.co...",7.5,'/advanced_search?q=https://www.zerohedge.com/...


As shown below, a noticeable portion of the Aslett et al. data does not contain any clean urls at all, making them unusuable for our purposes. This brings the potential number of valid search queries down from 1267 to 961, a decrease of 24.15%.

In [24]:
search_result_df[search_result_df['all_clean_urls'].isna()]

Unnamed: 0,Article_day,ResponseId,Day,Category,True_Dummy,Seven_Ordinal,Four_Ordinal,Age,Gender,FC_Eval,...,only_rel_80,only_rel_85,only_rel_90,Unrel_contain,Unrel_contain_65,Unrel_contain_70,avg_score,list_domains,list_scores,all_clean_urls
6,Day_1_1,R_3LXaF7Z7z740d2k,Day_1,Misl,0,1,1,26,1,FM,...,,,,,,,,,,
9,Day_1_1,R_3LXaF7Z7z740d2k,Day_1,Misl,0,1,1,26,1,FM,...,,,,,,,,,,
10,Day_1_1,R_2PpsctGRoIizGAS,Day_1,Misl,0,1,1,39,1,FM,...,,,,,,,,,,
12,Day_1_1,R_2PpsctGRoIizGAS,Day_1,Misl,0,1,1,39,1,FM,...,,,,,,,,,,
14,Day_1_1,R_2PpsctGRoIizGAS,Day_1,Misl,0,1,1,39,1,FM,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1252,Day_9_3,R_2QEWqk6A2nnsg7O,Day_9,True,1,6,4,26,0,FM,...,,,,,,,,,,
1255,Day_9_3,R_1mQbS6eXGog8n3j,Day_9,Coul,0,4,3,57,1,FM,...,,,,,,,,,,
1257,Day_9_3,R_2QEWqk6A2nnsg7O,Day_9,True,1,6,4,26,0,FM,...,,,,,,,,,,
1259,Day_9_3,R_2QEWqk6A2nnsg7O,Day_9,True,1,6,4,26,0,FM,...,,,,,,,,,,


In [25]:
search_result_df = search_result_df[search_result_df['all_clean_urls'].notna()]
search_result_df.reset_index(drop=True, inplace=True) 
search_result_df

Unnamed: 0,Article_day,ResponseId,Day,Category,True_Dummy,Seven_Ordinal,Four_Ordinal,Age,Gender,FC_Eval,...,only_rel_80,only_rel_85,only_rel_90,Unrel_contain,Unrel_contain_65,Unrel_contain_70,avg_score,list_domains,list_scores,all_clean_urls
0,Day_1_1,R_10pkUNEtcV6OMCU,Day_1,Coul,0,5,3,35,1,FM,...,0.0,0.0,0.0,1.0,1.0,1.0,91.055556,"newsweek.com,yahoo.com,azmirror.com,fr24news.c...","100,100,57,92.5,100,100,75,100,95",'https://www.newsweek.com/arizona-state-senato...
1,Day_1_1,R_1Gv8iAs1HEqGuUe,Day_1,Misl,0,1,1,35,1,FM,...,1.0,1.0,0.0,0.0,0.0,0.0,95.277778,"abc15.com,salon.com,cnn.com,politico.com,cbsne...","100,87.5,87.5,100,95,95,100,100,92.5",'https://www.abc15.com/news/state/poll-many-re...
2,Day_1_1,R_1Gv8iAs1HEqGuUe,Day_1,Misl,0,1,1,35,1,FM,...,0.0,0.0,0.0,1.0,1.0,1.0,83.450000,"cnn.com,thehill.com,salon.com,washingtontimes....","87.5,80,87.5,42,87.5,100,92.5,87.5,85,85",'https://www.cnn.com/2021/07/18/politics/fact-...
3,Day_1_1,R_3KT6q7Vntwvmg8Z,Day_1,True,1,7,3,33,1,FM,...,0.0,0.0,0.0,0.0,0.0,0.0,92.812500,"thehill.com,recorder.maricopa.gov,washingtonpo...","80,100,100,100,100,92.5,75,95",'https://thehill.com/homenews/campaign/563100-...
4,Day_1_1,R_1Gv8iAs1HEqGuUe,Day_1,Misl,0,1,1,35,1,FM,...,1.0,1.0,0.0,0.0,0.0,0.0,96.000000,"apnews.com,detroitnews.com,cnn.com,apnews.com,...","95,92.5,87.5,95,95,100,100,100,100,95",'https://apnews.com/article/technology-joe-bid...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
956,Day_9_3,R_3JqGYdN07AFVzOf,Day_9,Misl,0,1,1,41,0,FM,...,1.0,1.0,0.0,0.0,0.0,0.0,98.437500,"factcheck.org,yahoo.com,theintercept.com,polit...","100,100,87.5,100,100,100,100,100",'https://www.factcheck.org/2021/10/scicheck-re...
957,Day_9_3,R_1P6MQAwGMgs0Loo,Day_9,True,1,5,3,39,0,FM,...,0.0,0.0,0.0,0.0,0.0,0.0,94.166667,"googleadservices.com,scientificamerican.com,wh...","100,77.5,87.5,100,100,100","'https://www.googleadservices.com/','https://w..."
958,Day_9_3,R_rcgF4h6LylUXPUd,Day_9,Misl,0,1,2,23,0,FM,...,1.0,1.0,0.0,0.0,0.0,0.0,92.500000,"covid19.nih.gov,grants.nih.gov,grants.nih.gov,...","100,87.5,100,87.5,87.5",'https://covid19.nih.gov/funding#:~:text=NIH%2...
959,Day_9_3,R_3JqGYdN07AFVzOf,Day_9,Misl,0,1,1,41,0,FM,...,0.0,0.0,0.0,1.0,1.0,1.0,85.777778,"science.org,the-scientist.com,rollcall.com,van...","100,100,75,82.5,82.5,95,80,57,100",'https://www.science.org/content/article/nih-s...


In [40]:
search_result_df.groupby('Category')['avg_score'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Coul,151.0,80.324021,20.866727,5.0,79.7,89.375,93.638889,100.0
Misl,420.0,86.607946,14.821564,15.555556,84.705357,92.113095,95.232955,100.0
True,359.0,83.667999,17.965217,5.0,82.5,90.0,93.880952,100.0


The available clean urls are evaluated as tuples

In [7]:
import ast

def safe_literal_eval(input):
    try:
        return ast.literal_eval(input)
    except:
        return ()
    
#search_result_df['URL']
search_result_df['all_clean_urls'] = search_result_df['all_clean_urls'].apply(safe_literal_eval)
search_result_df['all_clean_urls']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  search_result_df['all_clean_urls'] = search_result_df['all_clean_urls'].apply(safe_literal_eval)


0      (https://www.newsweek.com/arizona-state-senato...
1      (https://www.abc15.com/news/state/poll-many-re...
2      (https://www.cnn.com/2021/07/18/politics/fact-...
3      (https://thehill.com/homenews/campaign/563100-...
4      (https://apnews.com/article/technology-joe-bid...
                             ...                        
956    (https://www.factcheck.org/2021/10/scicheck-re...
957    (https://www.googleadservices.com/, https://ww...
958    (https://covid19.nih.gov/funding#:~:text=NIH%2...
959    (https://www.science.org/content/article/nih-s...
960    (/advanced_search?q=https://www.zerohedge.com/...
Name: all_clean_urls, Length: 961, dtype: object

Initilization of log data

In [8]:
#declaring the possible errors for the log file
domain_processing_errors, ng_dict_errors, ng_score_retrieval_errors = 0, 0, 0 #newsguard score related errors

claim_dict_eval_errors = 0 #times a dictionary of claims could not be evaluated (ast.literal_eval)

fm_article_url_not_found = 0
serp_article_not_found = 0 #times a SERP article's could not be retrieved via its url
response_errors = 0 #times an API request failed


claim_retrieval_errors, claim_headline_retrieval_errors = 0, 0 # errors occuring during the retrieval of the claim positions and the headline bool

post_processing_error = 0

serp_dict_empty_error = 0

successful_parse = 0

def reset_error_count():
    global domain_processing_errors, ng_dict_errors, ng_score_retrieval_errors
    global claim_dict_eval_errors
    global fm_article_url_not_found
    global serp_article_not_found
    global response_errors
    global claim_retrieval_errors, claim_headline_retrieval_errors
    global post_processing_error
    global serp_dict_empty_error
    global successful_parse 

    domain_processing_errors, ng_dict_errors, ng_score_retrieval_errors = 0, 0, 0

    claim_dict_eval_errors = 0

    serp_article_not_found = 0
    response_errors = 0 

    claim_retrieval_errors, claim_headline_retrieval_errors = 0, 0

    post_processing_error = 0

    successful_parse = 0

def create_log(file_path):
    global domain_processing_errors, ng_dict_errors, ng_score_retrieval_errors
    global claim_dict_eval_errors
    global fm_article_url_not_found
    global serp_article_not_found
    global response_errors
    global claim_retrieval_errors, claim_headline_retrieval_errors
    global post_processing_error
    global serp_dict_empty_error
    global successful_parse

    error_dict = {'Successful_Parses': successful_parse,
                  'Domain_Processing_Errors': domain_processing_errors,
                  'Newsguard_Dict_Errors': ng_dict_errors,
                  'Newsguard_Score_Retrieval_Errors': ng_score_retrieval_errors,
                  'Claim_Dict_Eval_Errors:': claim_dict_eval_errors,
                  'FM_Article_Retrieval_Errors': fm_article_url_not_found,
                  'SERP_Article_Retrieval_Errors': serp_article_not_found,
                  'SERP_Dict_Empty_Errors': serp_dict_empty_error,
                  'API_Response_Errors': response_errors,
                  'Claim_Position_Retrieval_Errors': claim_retrieval_errors,
                  'Claim_Headline_Retrieval_Errors': claim_headline_retrieval_errors,
                  'Post_Processing_Error': post_processing_error
                  }
    
    with open(file_path, 'w') as f:
        f.write(str(error_dict))

In [9]:

#adds quotes around each domain in order for ast.literal_eval to interpret it properly as a list

def pre_process_domains(domain_string):  
    global domain_processing_errors 
    try:
        return ast.literal_eval(f"'{domain_string}'")
    except:
        print('ERROR: could not convert string of domains to list')
        domain_processing_errors += 1
        return ()

In [10]:
def normalize_quotes(string):
    return string.replace('“', '"').replace('”', '"').replace('‘', "'").replace('’', "'")

def get_claim_dict(url, df):
    global claim_dict_eval_errors
    claims = df.loc[df['URL'] == url, 'Claims']
    if not claims.empty:
        specific_string_content = claims.iloc[0]
        specific_string_content = normalize_quotes(specific_string_content)
        try:
            dict_content = ast.literal_eval(specific_string_content)
            return dict_content
        except:
            claim_dict_eval_errors += 1
            print("ERROR: could not eval string as dict: " + specific_string_content)
            return {}
    else:
        return {}

In [11]:
label_assignment_prompt = """For each claim in article A, check for all claims in article B, 
whether the claim made in article B contradicts the claim made in article A (-1), 
reinforces the claim made inarticle A (1), or whether the two claims are unrelated (0). 

Adhere to the following output format:
{"comparisons": [
    {
        'claim_article_a': "This is a claim made in article a"
        'claim_article_b': "This is a claim made in article b"
        'relation': -1 
    },
    {
        'claim_article_a': "This is a claim made in article a"
        'claim_article_b': "This is another claim made in article b"
        'relation': 1
    },
    {
        'claim_article_a': "This is a claim made in article a"
        'claim_article_b': "This is yet another claim made in article b"
        'relation': 0
    }
]
}

Please consider the following:

• Your response must be directly the JSON, do not add any other text, such as
”Here is the output:” and similar.
• Make sure that the JSON is properly formatted, especially with correct types of braces and brackets
• Make sure to consider all of the possible claim pairs, which should add up to 5*5 = 25


Now do this task for the following input:
"""

In [12]:
def extract_claims(claim_dict):
    return [claim['claim'] for claim in claim_dict['claims']]

Prompting the model

In [13]:
from openai import OpenAI
import os
from dotenv import load_dotenv
load_dotenv() 

api_key = os.getenv("API_KEY") 
base_url = "https://chat-ai.academiccloud.de/v1"
model = "meta-llama-3.1-70b-instruct"  # Choose any available model
temperature = 0.01

# Start OpenAI client
client = OpenAI(
    api_key=api_key,
    base_url=base_url
    )

def process_prompt(prompt):
    # Get response
    chat_completion = client.chat.completions.create(
        messages=[
            {"role": "system", "content": "SYSTEM MESSAGE"},
            {"role": "user", "content": prompt}
        ],
        model=model,
        temperature=temperature
    )

    return chat_completion.choices[0].message.content

print(process_prompt("Hello!"))

Hello. How can I assist you today?


In [14]:
import time
last_call_time = time.time()

def get_throttled_prompt_response(value):
    global last_call_time
    global response_errors
    elapsed_time = time.time() - last_call_time #dynamic buffer
    if elapsed_time < 8:
        time.sleep(8 - elapsed_time)
    try:
        last_call_time = time.time()
        return process_prompt(f"{label_assignment_prompt}{value}")
    except:
        response_errors += 1
        return "ERROR: " + str(value)

In [15]:
#can be used to retrieve domains later
def get_newsguard_dict(d, s):
    domains = pre_process_domains(d)
    try:
        domains_newsguard = ast.literal_eval(s)
    except:
        return {}

    try:
        ng_dict = {k:v for k, v in zip(domains, domains_newsguard)}
        return ng_dict
    except:
        return {}

In [16]:
search_result_df['list_domains']

0      newsweek.com,yahoo.com,azmirror.com,fr24news.c...
1      abc15.com,salon.com,cnn.com,politico.com,cbsne...
2      cnn.com,thehill.com,salon.com,washingtontimes....
3      thehill.com,recorder.maricopa.gov,washingtonpo...
4      apnews.com,detroitnews.com,cnn.com,apnews.com,...
                             ...                        
956    factcheck.org,yahoo.com,theintercept.com,polit...
957    googleadservices.com,scientificamerican.com,wh...
958    covid19.nih.gov,grants.nih.gov,grants.nih.gov,...
959    science.org,the-scientist.com,rollcall.com,van...
960    ',zerohedge.com,madisonarealymesupportgroup.co...
Name: list_domains, Length: 961, dtype: object

In [17]:
search_result_df['list_domains'] = search_result_df['list_domains'].str.replace(",", "', '")
search_result_df['list_domains']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  search_result_df['list_domains'] = search_result_df['list_domains'].str.replace(",", "', '")


0      newsweek.com', 'yahoo.com', 'azmirror.com', 'f...
1      abc15.com', 'salon.com', 'cnn.com', 'politico....
2      cnn.com', 'thehill.com', 'salon.com', 'washing...
3      thehill.com', 'recorder.maricopa.gov', 'washin...
4      apnews.com', 'detroitnews.com', 'cnn.com', 'ap...
                             ...                        
956    factcheck.org', 'yahoo.com', 'theintercept.com...
957    googleadservices.com', 'scientificamerican.com...
958    covid19.nih.gov', 'grants.nih.gov', 'grants.ni...
959    science.org', 'the-scientist.com', 'rollcall.c...
960    '', 'zerohedge.com', 'madisonarealymesupportgr...
Name: list_domains, Length: 961, dtype: object

In [18]:
search_result_df['newsguard_dict'] = search_result_df.apply(lambda row: get_newsguard_dict(row['list_domains'], row['list_scores']), axis=1)

ERROR: could not convert string of domains to list
ERROR: could not convert string of domains to list
ERROR: could not convert string of domains to list
ERROR: could not convert string of domains to list
ERROR: could not convert string of domains to list
ERROR: could not convert string of domains to list
ERROR: could not convert string of domains to list
ERROR: could not convert string of domains to list
ERROR: could not convert string of domains to list
ERROR: could not convert string of domains to list
ERROR: could not convert string of domains to list
ERROR: could not convert string of domains to list
ERROR: could not convert string of domains to list
ERROR: could not convert string of domains to list
ERROR: could not convert string of domains to list
ERROR: could not convert string of domains to list
ERROR: could not convert string of domains to list
ERROR: could not convert string of domains to list
ERROR: could not convert string of domains to list
ERROR: could not convert string

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  search_result_df['newsguard_dict'] = search_result_df.apply(lambda row: get_newsguard_dict(row['list_domains'], row['list_scores']), axis=1)


In [19]:
from urllib.parse import urlparse

def get_newsguard_score(raw_url, score_dict):
    global ng_dict_errors
    global ng_score_retrieval_errors
    if not score_dict:
        ng_dict_errors += 1
        return None
    try:
        domain = urlparse(raw_url).netloc #to get the the desired format
        return(score_dict['.'.join(domain.split('.')[-2:])])
    except:
        ng_score_retrieval_errors += 1
        return None
    

In [20]:
def get_claim_positions(claim_dict, relative=True):
    global claim_retrieval_errors
    positions = []
    try:
        for c in claim_dict['claims']:
            if relative == False:
                positions.append(c['position'])
            else:
                positions.append((c['position']/claim_dict['article_length']))
    except:
         claim_retrieval_errors += 1
         print("ERROR: could not extract positions")
    return positions

def get_claim_headline_info(claim_dict):
    global claim_headline_retrieval_errors
    h_info = []
    try:
        for c in claim_dict['claims']:
            h_info.append(c['headline'])
    except:
         claim_headline_retrieval_errors += 1
         print("ERROR: could not extract headline info")
    return h_info

In [21]:
def process_response(response, fm_url, serp_article, newsguard_dict, fm_dict):
    response_dict = ast.literal_eval(response)
    response_dict['fm_url'] = fm_url
    response_dict['fm_article_length'] = fm_dict['article_length']
    response_dict['serp_url'] = serp_article['url']
    response_dict['serp_position'] = serp_article['serp_position']
    response_dict['serp_article_length'] = serp_article['article_length']
    try:
        response_dict['serp_newsguard'] = get_newsguard_score(serp_article['url'], newsguard_dict)
    except:
        response_dict['serp_newsguard'] = None

    #get the relative positions of the claims
    positions_fake_article = get_claim_positions(fm_dict)
    positions_fake_article = [item for x in positions_fake_article for item in [x]*5]
    for comp, position in zip(response_dict['comparisons'], positions_fake_article):
        comp['fm_claim_position'] =  position
    positions_search_article = get_claim_positions(serp_article)
    positions_search_article = positions_search_article*5
    for comp, position in zip(response_dict['comparisons'], positions_search_article):
        comp['serp_claim_position'] =  position

    #get the absolute positions of the claims
    positions_fake_article = get_claim_positions(fm_dict, relative=False)
    positions_fake_article = [item for x in positions_fake_article for item in [x]*5]
    for comp, position in zip(response_dict['comparisons'], positions_fake_article):
        comp['fm_claim_position_abs'] =  position
    positions_search_article = get_claim_positions(serp_article, relative=False)
    positions_search_article = positions_search_article*5
    for comp, position in zip(response_dict['comparisons'], positions_search_article):
        comp['serp_claim_position_abs'] =  position 

    h_info_fake_article = get_claim_headline_info(fm_dict)
    h_info_fake_article = [item for x in h_info_fake_article for item in [x]*5]
    for comp, h_info in zip(response_dict['comparisons'], h_info_fake_article):
        comp['fm_claim_headline'] =  h_info

    h_info_serp_article = get_claim_headline_info(serp_article)
    h_info_serp_article = h_info_serp_article*5
    for comp, h_info in zip(response_dict['comparisons'], h_info_serp_article):
        comp['serp_claim_headline'] =  h_info
        
    return response_dict

In [22]:
#catch empty cases
def label_assignment(fm_url, serp_urls, newsguard_dict):
    label_list = []
    global fm_article_url_not_found
    global post_processing_error
    global serp_dict_empty_error

    fm_article_dict = get_claim_dict(fm_url, fm_articles_df) #get fm_claim_dict, this is also called in the get_serp_articles function, but that one also has some more details
    if not fm_article_dict:
        fm_article_url_not_found += 1
        return "ERROR: FM Article Claims could not be retrieved"
    fm_article_claims = extract_claims(fm_article_dict)

    serp_dict = get_serp_dict(serp_urls=serp_urls)
    if not serp_dict: #check for empty list
        serp_dict_empty_error += 1
        return None
    for serp_article in serp_dict:
        if not serp_article:
            label_list.append({})
            continue
        serp_claims = extract_claims(serp_article) #extract claims from the entire dict
        comparison_dict = {'claims_article_a': fm_article_claims, 'claims_article_b': serp_claims} #pass only the pure claims into the model
        response = get_throttled_prompt_response(comparison_dict)
        if "ERROR" in response:
            label_list.append({})
            continue
        try:
            processed_response = process_response(response, fm_url, serp_article, newsguard_dict, fm_article_dict)       
            label_list.append(processed_response)
        except:
            print("ERROR: Post-Processing Error")
            post_processing_error += 1
            # catch faulty responses
            label_list.append({})
    return label_list
    

#returns serp articles where a valid claim_dict could be retrieved

def get_serp_dict(serp_urls):
    global serp_article_not_found
    global successful_parse
    serp_articles = []  
    for i, serp_url in enumerate(serp_urls):
        try: 
            article = get_claim_dict(serp_url, claim_df)
            if isinstance(article, dict) and article:
                article['url'] = serp_url
                article['serp_position'] = i + 1
                serp_articles.append(article)
                successful_parse += 1
            else:
                serp_article_not_found += 1
                serp_articles.append({})
        except TypeError as e:
            print(f"TypeError for URL {serp_url}: {e}")
            continue
        except Exception as e:
            print(f"Unexpected error for URL {serp_url}: {e}")
    return serp_articles

In [23]:
#change these variables to conduct over multiple sessions, 100 at a time, since we have up to 10 API calls for each entry (1 for each SERP article) -> 1000 API calls per session
current_start = 100 #Completed runs: 0-100; 100-200 (but 215 API response errors); 200-300; 300-400; 400-500; 500-600; 600-700; 700-800; 800-900
current_end = current_start + 100 #last run for 61

In [24]:
current_slice = search_result_df.iloc[current_start:current_end]
current_slice

Unnamed: 0,Article_day,ResponseId,Day,Category,True_Dummy,Seven_Ordinal,Four_Ordinal,Age,Gender,FC_Eval,...,only_rel_85,only_rel_90,Unrel_contain,Unrel_contain_65,Unrel_contain_70,avg_score,list_domains,list_scores,all_clean_urls,newsguard_dict
100,Day_1_3,R_3hfIgiMxT4QVOM6,Day_1,True,1,5,3,51,0,FM,...,0.0,0.0,0.0,0.0,0.0,93.055556,"forbes.com', 'forbes.com', 'vox.com', 'abcnews...","95,95,87.5,82.5,100,95,87.5,100,95",(https://www.forbes.com/sites/roberthart/2021/...,"{'forbes.com': 87.5, 'vox.com': 87.5, 'abcnews..."
101,Day_1_3,R_Wie14Y6yLeam4mt,Day_1,Misl,0,6,2,33,0,FM,...,0.0,0.0,0.0,0.0,0.0,94.722222,"nytimes.com', 'ourworldindata.org', 'usnews.co...","100,87.5,95,100,100,82.5,87.5,100,100",(https://www.nytimes.com/interactive/2021/worl...,"{'nytimes.com': 100, 'ourworldindata.org': 87...."
102,Day_1_3,R_bqJ3HoBP0lqi7CN,Day_1,Misl,0,2,1,30,0,FM,...,1.0,1.0,0.0,0.0,0.0,98.888889,"apnews.com', 'fortune.com', 'health.gov.au', '...",9510010095100100100100100,(https://apnews.com/article/africa-coronavirus...,"{'apnews.com': 95, 'fortune.com': 100, 'health..."
103,Day_1_3,R_5ovNt0RyjDe4zhD,Day_1,True,1,6,3,37,1,FM,...,0.0,0.0,1.0,1.0,1.0,83.000000,"totalhealth.co.uk', 'yahoo.com', ''http:', 'ga...","100,92.5,22.5,100,100",(https://www.totalhealth.co.uk/blog/are-people...,{}
104,Day_1_3,R_3PhjRWT2oK9dOnT,Day_1,Coul,0,1,1,35,1,FM,...,1.0,0.0,0.0,0.0,0.0,95.000000,"galvnews.com', 'padailypost.com', 'yahoo.com',...","92.5,100,100,92.5,87.5,92.5,100",(https://www.galvnews.com/opinion/editorials/f...,"{'galvnews.com': 92.5, 'padailypost.com': 100,..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,Day_10_3,R_ZKPyrIM3fhnsZfH,Day_10,Misl,0,5,3,33,1,FM,...,0.0,0.0,0.0,0.0,0.0,88.750000,"googleadservices.com', 'feedchildreneverywhere...","77.5,100","(https://www.googleadservices.com/, https://ww...","{'googleadservices.com': 77.5, 'feedchildrenev..."
196,Day_10_3,R_2q46cBzIiH3UPNG,Day_10,Misl,0,3,2,22,0,FM,...,0.0,0.0,1.0,1.0,1.0,44.000000,"noqreport.com', 'naturalnews.com', 'nytimes.co...",1051001005,(https://noqreport.com/2021/11/01/u-s-faces-en...,"{'noqreport.com': 10, 'naturalnews.com': 5, 'n..."
197,Day_10_3,R_2sGvyvdHbUmZ02Z,Day_10,Misl,0,6,3,32,0,FM,...,0.0,0.0,0.0,0.0,0.0,92.083333,"googleadservices.com', 'totesnewsworthy.com', ...","82.5,82.5,87.5,100,100,100","(https://www.googleadservices.com/, https://to...","{'googleadservices.com': 82.5, 'totesnewsworth..."
198,Day_10_3,R_1jrH8MMHgTNb3Xq,Day_10,True,1,7,4,24,1,FM,...,0.0,0.0,0.0,0.0,0.0,78.333333,"""https:', 'parade.com', 'ezprepping.com', 'thr...","70,82.5,82.5",(https://www.greenmatters.com/p/food-shortage-...,"{'""https:': 70, 'parade.com': 82.5, 'ezpreppin..."


In [25]:
reset_error_count()
current_slice['Labels'] = current_slice.apply(lambda row: label_assignment(row['URL'], row['all_clean_urls'], row['newsguard_dict']), axis=1)
create_log(f'./logs/error_log{current_start}_{current_end}.txt')
current_slice.to_csv(f"{data_folder}/label_data_{current_start}_{current_end}.csv", sep=";")
current_slice['Labels'].to_json(f"{data_folder}/label_data_{current_start}_{current_end}.json")

ERROR: could not eval string as dict: {
"article_title": "As Delta drives COVID surge, vaccines, strategies under scrutiny | Coronavirus pandemic News | Al Jazeera",
"url": "https://www.aljazeera.com/news/2021/07/17/as-delta-drives-covid-surge-vaccines-strategies-under-scrutiny",
"article_length": 56,
"claims": [
{
"claim": "Malaysia's health ministry has announced that it will stop using the COVID-19 vaccine produced by China's Sinovac once its supplies end.",
"headline": 0,
"position": 2
},
{
"claim": "The World Health Organization (WHO) has approved the Sinovac and Sinopharm vaccines for emergency use, but questions have been raised about their efficacy against new variants of the coronavirus.",
"headline": 0,
"position": 7
},
{
"claim": "Thailand is the first country to publicly announce a plan to mix and match vaccines produced in China and ones developed by Western manufacturers.",
"headline": 0,
"position": 11
},
{
"claim": "The World Health Organization (WHO) has warned against

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  current_slice['Labels'] = current_slice.apply(lambda row: label_assignment(row['URL'], row['all_clean_urls'], row['newsguard_dict']), axis=1)
