# Label Assignment (for the True Sample/Control Group)

In [1]:
import pandas as pd
import os
import numpy as np
from dotenv import load_dotenv
load_dotenv()

data_folder = os.getenv("DATA_FOLDER")

#load the three relevant dataframes
claim_df = pd.read_csv(f'{data_folder}/true_articles_with_claims.csv', sep=";",index_col=0) #the articles obtained from the user's queries with its urls, title, body, and claims; /articles_with_claims.csv'
search_result_df = pd.read_csv(f'{data_folder}/All_Search_Results_Combined.csv', index_col=0) #the Aslett et al. data
search_result_df.drop(columns="X", inplace=True)
true_articles_df = pd.read_csv(f'{data_folder}/true_base_articles_with_claims.csv', sep=";", index_col=0) #the 17 FM articles with its contents and claims (/FM_Articles_with_ids_processed.xlsx')

In [2]:
claim_df.rename(columns={'url': 'URL', 'title': 'Title', 'body': 'Body', 'claims': 'Claims'}, inplace=True) #rename columns to make them consistent between dataframes, enabling the flexibility of functions

In [3]:
true_articles_df.rename(columns={'claims': 'Claims'}, inplace=True)

Claims were extracted for 1769 articles. Note, however, that these are stored as strings, which can result in potential evaluation errors, making them unusuable.

In [4]:
claim_df['URL']

0       https://www.newsweek.com/donald-trump-bashes-b...
2       https://www.businessinsider.com/arizona-trump-...
3       https://kutv.com/news/nation-world/ap-fact-che...
4       https://www.azcentral.com/story/opinion/op-ed/...
5       https://apnews.com/article/business-government...
                              ...                        
5641    https://fivethirtyeight.com/features/republica...
5642    https://iop.harvard.edu/youth-poll/spring-2021...
5674    https://news.gallup.com/poll/356591/congress-a...
5675    https://news.gallup.com/poll/1600/congress-pub...
5687    https://www.dailymail.co.uk/news/article-10175...
Name: URL, Length: 1028, dtype: object

We only care about the search queries that relate to the True articles

In [5]:
search_result_df = search_result_df[search_result_df['URL'].isin(true_articles_df['URL'])] #filter down to only contain results that have F/M articles as URL
search_result_df.reset_index(drop=True, inplace=True) 
search_result_df

Unnamed: 0,Article_day,ResponseId,Day,Category,True_Dummy,Seven_Ordinal,Four_Ordinal,Age,Gender,FC_Eval,...,only_rel_80,only_rel_85,only_rel_90,Unrel_contain,Unrel_contain_65,Unrel_contain_70,avg_score,list_domains,list_scores,all_clean_urls
0,Day_1_4,R_2y8dRM3TUWjSPIo,Day_1,Misl,0,1,1,20,0,True,...,,,,,,,,,,
1,Day_1_4,R_2y8dRM3TUWjSPIo,Day_1,Misl,0,1,1,20,0,True,...,,,,,,,,,,
2,Day_1_4,R_2y8dRM3TUWjSPIo,Day_1,Misl,0,1,1,20,0,True,...,1.0,0.0,0.0,0.0,0.0,0.0,92.750000,"factcheck.org,thehill.com,newsweek.com,busines...","100,80,100,100,90,92.5,95,87.5,87.5,95",'https://www.factcheck.org/2021/07/debunking-t...
3,Day_1_4,R_bqJ3HoBP0lqi7CN,Day_1,True,1,5,3,30,0,True,...,1.0,1.0,0.0,0.0,0.0,0.0,94.642857,"recorder.maricopa.gov,',azcentral.com,politico...","92.5,100,87.5,95,95,92.5,100",'https://recorder.maricopa.gov/electionresults...
4,Day_1_4,R_5ovNt0RyjDe4zhD,Day_1,True,1,5,3,37,1,True,...,1.0,0.0,0.0,0.0,0.0,0.0,95.833333,"azcentral.com,abc15.com,thehill.com,politico.c...","92.5,100,80,100,95,95,100,100,100",'https://www.azcentral.com/story/opinion/op-ed...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
951,Day_12_4,R_2WCmKU4ViN13BYG,Day_12,True,1,7,4,39,0,True,...,0.0,0.0,0.0,0.0,0.0,1.0,89.000000,"news.gallup.com,news.gallup.com,usatoday.com,f...","100,69.5,100,77,87.5,100",'https://news.gallup.com/poll/356591/congress-...
952,Day_12_4,R_3EmSqPx3YNGR6hc,Day_12,Misl,0,3,2,33,1,True,...,0.0,0.0,0.0,0.0,0.0,1.0,84.050000,"foxnews.com,usatoday.com,nypost.com,cnn.com,sf...","69.5,100,69.5,87.5,100,69.5,95,100,80,69.5",'https://www.foxnews.com/opinion/biden-approva...
953,Day_12_4,R_2WCmKU4ViN13BYG,Day_12,True,1,7,4,39,0,True,...,,,,,,,,,,
954,Day_12_4,R_28GQQm0b1EgpuTq,Day_12,True,1,5,3,30,0,True,...,,,,,,,,,,


In [6]:
search_result_df[search_result_df['all_clean_urls'].isna()] 

Unnamed: 0,Article_day,ResponseId,Day,Category,True_Dummy,Seven_Ordinal,Four_Ordinal,Age,Gender,FC_Eval,...,only_rel_80,only_rel_85,only_rel_90,Unrel_contain,Unrel_contain_65,Unrel_contain_70,avg_score,list_domains,list_scores,all_clean_urls
0,Day_1_4,R_2y8dRM3TUWjSPIo,Day_1,Misl,0,1,1,20,0,True,...,,,,,,,,,,
1,Day_1_4,R_2y8dRM3TUWjSPIo,Day_1,Misl,0,1,1,20,0,True,...,,,,,,,,,,
5,Day_1_4,R_2y8dRM3TUWjSPIo,Day_1,Misl,0,1,1,20,0,True,...,,,,,,,,,,
7,Day_1_4,R_20M3irK9yP7zOGl,Day_1,Misl,0,3,2,29,0,True,...,,,,,,,,,,
10,Day_1_4,R_2y8dRM3TUWjSPIo,Day_1,Misl,0,1,1,20,0,True,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
929,Day_12_4,R_3Hi52z8zMbrTa1c,Day_12,True,1,7,4,41,1,True,...,,,,,,,,,,
941,Day_12_4,R_UlpQI4c5kmFOM81,Day_12,True,1,7,4,20,1,True,...,,,,,,,,,,
946,Day_12_4,R_3CVCE05kSa0jk7W,Day_12,True,1,7,4,32,1,True,...,,,,,,,,,,
953,Day_12_4,R_2WCmKU4ViN13BYG,Day_12,True,1,7,4,39,0,True,...,,,,,,,,,,


In [7]:
search_result_df = search_result_df[search_result_df['all_clean_urls'].notna()]
search_result_df.reset_index(drop=True, inplace=True) 
search_result_df

Unnamed: 0,Article_day,ResponseId,Day,Category,True_Dummy,Seven_Ordinal,Four_Ordinal,Age,Gender,FC_Eval,...,only_rel_80,only_rel_85,only_rel_90,Unrel_contain,Unrel_contain_65,Unrel_contain_70,avg_score,list_domains,list_scores,all_clean_urls
0,Day_1_4,R_2y8dRM3TUWjSPIo,Day_1,Misl,0,1,1,20,0,True,...,1.0,0.0,0.0,0.0,0.0,0.0,92.750000,"factcheck.org,thehill.com,newsweek.com,busines...","100,80,100,100,90,92.5,95,87.5,87.5,95",'https://www.factcheck.org/2021/07/debunking-t...
1,Day_1_4,R_bqJ3HoBP0lqi7CN,Day_1,True,1,5,3,30,0,True,...,1.0,1.0,0.0,0.0,0.0,0.0,94.642857,"recorder.maricopa.gov,',azcentral.com,politico...","92.5,100,87.5,95,95,92.5,100",'https://recorder.maricopa.gov/electionresults...
2,Day_1_4,R_5ovNt0RyjDe4zhD,Day_1,True,1,5,3,37,1,True,...,1.0,0.0,0.0,0.0,0.0,0.0,95.833333,"azcentral.com,abc15.com,thehill.com,politico.c...","92.5,100,80,100,95,95,100,100,100",'https://www.azcentral.com/story/opinion/op-ed...
3,Day_1_4,R_1ILDrpPAAn0CdBx,Day_1,True,1,7,4,45,1,True,...,0.0,0.0,0.0,1.0,1.0,1.0,84.611111,"thehill.com,azcentral.com,abc15.com,washington...","80,92.5,100,42,100,95,100,57,95",'https://thehill.com/homenews/campaign/563658-...
4,Day_1_4,R_3MycWKTJGEI3fkc,Day_1,True,1,7,4,44,1,True,...,1.0,0.0,0.0,0.0,0.0,0.0,95.833333,"azcentral.com,thehill.com,abc15.com,bbc.com,nb...","92.5,80,100,95,100,95,100,100,100",'https://www.azcentral.com/story/opinion/op-ed...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
676,Day_12_4,R_vDFQr0QAhdjLCDf,Day_12,True,1,5,3,66,0,True,...,0.0,0.0,0.0,0.0,0.0,1.0,88.900000,"cnn.com,foxnews.com,usatoday.com,news.yahoo.co...","87.5,69.5,100,100,87.5,95,100,69.5,100,80",'https://www.cnn.com/videos/politics/2021/11/0...
677,Day_12_4,R_6x2lhuqmicuHOJX,Day_12,True,1,7,4,53,0,True,...,0.0,0.0,0.0,0.0,0.0,1.0,88.100000,"usatoday.com,sfgate.com,independent.co.uk,foxb...","100,100,100,69.5,69.5,80,69.5,100,100,92.5",'https://www.usatoday.com/story/news/politics/...
678,Day_12_4,R_2WCmKU4ViN13BYG,Day_12,True,1,7,4,39,0,True,...,0.0,0.0,0.0,0.0,0.0,1.0,89.000000,"news.gallup.com,news.gallup.com,usatoday.com,f...","100,69.5,100,77,87.5,100",'https://news.gallup.com/poll/356591/congress-...
679,Day_12_4,R_3EmSqPx3YNGR6hc,Day_12,Misl,0,3,2,33,1,True,...,0.0,0.0,0.0,0.0,0.0,1.0,84.050000,"foxnews.com,usatoday.com,nypost.com,cnn.com,sf...","69.5,100,69.5,87.5,100,69.5,95,100,80,69.5",'https://www.foxnews.com/opinion/biden-approva...


In [8]:
search_result_df.groupby('Category')['avg_score'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Coul,51.0,82.845855,18.764782,35.0,73.625,89.285714,96.9375,100.0
Misl,82.0,88.92321,10.147226,35.0,87.784722,91.152778,94.263393,100.0
True,540.0,88.919272,10.32029,21.25,87.428571,90.925,94.392361,100.0


In [9]:
search_result_df = search_result_df[search_result_df['Category'] != 'Coul']

The available clean urls are evaluated as tuples

In [10]:
import ast

def safe_literal_eval(input):
    try:
        return ast.literal_eval(input)
    except:
        return ()
    
#search_result_df['URL']
search_result_df['all_clean_urls'] = search_result_df['all_clean_urls'].apply(safe_literal_eval)
search_result_df['all_clean_urls']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  search_result_df['all_clean_urls'] = search_result_df['all_clean_urls'].apply(safe_literal_eval)


0      (https://www.factcheck.org/2021/07/debunking-t...
1      (https://recorder.maricopa.gov/electionresults...
2      (https://www.azcentral.com/story/opinion/op-ed...
3      (https://thehill.com/homenews/campaign/563658-...
4      (https://www.azcentral.com/story/opinion/op-ed...
                             ...                        
676    (https://www.cnn.com/videos/politics/2021/11/0...
677    (https://www.usatoday.com/story/news/politics/...
678    (https://news.gallup.com/poll/356591/congress-...
679    (https://www.foxnews.com/opinion/biden-approva...
680    (https://www.cnn.com/videos/politics/2021/11/0...
Name: all_clean_urls, Length: 629, dtype: object

Initilization of log data

In [11]:
#declaring the possible errors for the log file
domain_processing_errors, ng_dict_errors, ng_score_retrieval_errors = 0, 0, 0 #newsguard score related errors

claim_dict_eval_errors = 0 #times a dictionary of claims could not be evaluated (ast.literal_eval)

fm_article_url_not_found = 0
serp_article_not_found = 0 #times a SERP article's could not be retrieved via its url
response_errors = 0 #times an API request failed


claim_retrieval_errors, claim_headline_retrieval_errors = 0, 0 # errors occuring during the retrieval of the claim positions and the headline bool

post_processing_error = 0

serp_dict_empty_error = 0

successful_parse = 0

def reset_error_count():
    global domain_processing_errors, ng_dict_errors, ng_score_retrieval_errors
    global claim_dict_eval_errors
    global fm_article_url_not_found
    global serp_article_not_found
    global response_errors
    global claim_retrieval_errors, claim_headline_retrieval_errors
    global post_processing_error
    global serp_dict_empty_error
    global successful_parse 

    domain_processing_errors, ng_dict_errors, ng_score_retrieval_errors = 0, 0, 0

    claim_dict_eval_errors = 0

    serp_article_not_found = 0
    response_errors = 0 

    claim_retrieval_errors, claim_headline_retrieval_errors = 0, 0

    post_processing_error = 0

    successful_parse = 0

def create_log(file_path):
    global domain_processing_errors, ng_dict_errors, ng_score_retrieval_errors
    global claim_dict_eval_errors
    global fm_article_url_not_found
    global serp_article_not_found
    global response_errors
    global claim_retrieval_errors, claim_headline_retrieval_errors
    global post_processing_error
    global serp_dict_empty_error
    global successful_parse

    error_dict = {'Successful_Parses': successful_parse,
                  'Domain_Processing_Errors': domain_processing_errors,
                  'Newsguard_Dict_Errors': ng_dict_errors,
                  'Newsguard_Score_Retrieval_Errors': ng_score_retrieval_errors,
                  'Claim_Dict_Eval_Errors:': claim_dict_eval_errors,
                  'FM_Article_Retrieval_Errors': fm_article_url_not_found,
                  'SERP_Article_Retrieval_Errors': serp_article_not_found,
                  'SERP_Dict_Empty_Errors': serp_dict_empty_error,
                  'API_Response_Errors': response_errors,
                  'Claim_Position_Retrieval_Errors': claim_retrieval_errors,
                  'Claim_Headline_Retrieval_Errors': claim_headline_retrieval_errors,
                  'Post_Processing_Error': post_processing_error
                  }
    
    with open(file_path, 'w') as f:
        f.write(str(error_dict))

In [12]:

#adds quotes around each domain in order for ast.literal_eval to interpret it properly as a list

def pre_process_domains(domain_string):  
    global domain_processing_errors 
    try:
        return ast.literal_eval(f"'{domain_string}'")
    except:
        print('ERROR: could not convert string of domains to list')
        domain_processing_errors += 1
        return ()

In [13]:
import json
def normalize_quotes(string):
    return string.replace('“', '"').replace('”', '"').replace('‘', "'").replace('’', "'")

def get_claim_dict(url, df):
    global claim_dict_eval_errors
    claims = df.loc[df['URL'] == url, 'Claims']
    if not claims.empty:
        # specific_string_content = claims.iloc[0]
        # specific_string_content = normalize_quotes(specific_string_content)
        try:
            # dict_content = ast.literal_eval(claims.iloc[0])
            return json.loads(claims.iloc[0])
        except:
            claim_dict_eval_errors += 1
            print("ERROR: could not eval string as dict: " + claims.iloc[0])
            return {}
    else:
        return {}

In [14]:
true_articles_df['URL'][0]

'www.washingtontimes.com/news/2021/jul/19/arizona-election-audit-will-show-trump-won-majorit/'

In [15]:
import json
get_claim_dict('www.washingtontimes.com/news/2021/jul/19/arizona-election-audit-will-show-trump-won-majorit/', true_articles_df)

{'article_title': 'Majority of Arizona Republicans believe election audit will show Trump won, poll shows',
 'url': None,
 'article_length': 15,
 'claims': [{'claim': "The lion's share of Republican voters in Arizona believe the ongoing audit of election results in Maricopa County will turn up enough evidence to show that former President Trump was the real winner in the state last year.",
   'headline': 0,
   'position': 3},
  {'claim': 'An Arizona Public Opinion Pulse survey found that 62% of registered Republican voters in Arizona said the audit will show Mr. Trump beat President Biden in Arizona, compared to 2% of Democrats and 25% of independents who believe that.',
   'headline': 0,
   'position': 4},
  {'claim': 'Mr. Biden was certified as the winner in Arizona after he won the state by about 11,000 of the 3.4 million votes cast.',
   'headline': 0,
   'position': 7},
  {'claim': 'The survey showed that 39% of Republicans believe there was no evidence of widespread fraud, compar

In [16]:
label_assignment_prompt = """For each claim in article A, check for all claims in article B, 
whether the claim made in article B contradicts the claim made in article A (-1), 
reinforces the claim made inarticle A (1), or whether the two claims are unrelated (0). 

Adhere to the following output format:
{"comparisons": [
    {
        'claim_article_a': "This is a claim made in article a"
        'claim_article_b': "This is a claim made in article b"
        'relation': -1 
    },
    {
        'claim_article_a': "This is a claim made in article a"
        'claim_article_b': "This is another claim made in article b"
        'relation': 1
    },
    {
        'claim_article_a': "This is a claim made in article a"
        'claim_article_b': "This is yet another claim made in article b"
        'relation': 0
    }
]
}

Please consider the following:

• Your response must be directly the JSON, do not add any other text, such as
”Here is the output:” and similar.
• Make sure that the JSON is properly formatted, especially with correct types of braces and brackets
• Make sure to consider all of the possible claim pairs, which should add up to 5*5 = 25


Now do this task for the following input:
"""

In [17]:
def extract_claims(claim_dict):
    return [claim['claim'] for claim in claim_dict['claims']]

Prompting the model

In [18]:
from openai import OpenAI
import os
from dotenv import load_dotenv
load_dotenv() 

api_key = os.getenv("API_KEY") 
base_url = "https://chat-ai.academiccloud.de/v1"
model = "meta-llama-3.1-70b-instruct"  # Choose any available model
temperature = 0.01

# Start OpenAI client
client = OpenAI(
    api_key=api_key,
    base_url=base_url
    )

def process_prompt(prompt):
    # Get response
    chat_completion = client.chat.completions.create(
        messages=[
            {"role": "system", "content": "SYSTEM MESSAGE"},
            {"role": "user", "content": prompt}
        ],
        model=model,
        temperature=temperature
    )

    return chat_completion.choices[0].message.content

print(process_prompt("Hello!"))

Hello. How can I help you today?


In [19]:
import time
last_call_time = time.time()

def get_throttled_prompt_response(value):
    global last_call_time
    global response_errors
    elapsed_time = time.time() - last_call_time #dynamic buffer
    if elapsed_time < 8:
        time.sleep(8 - elapsed_time)
    try:
        last_call_time = time.time()
        return process_prompt(f"{label_assignment_prompt}{value}")
    except:
        response_errors += 1
        return "ERROR: " + str(value)

In [20]:
#can be used to retrieve domains later
def get_newsguard_dict(d, s):
    domains = pre_process_domains(d)
    try:
        domains_newsguard = ast.literal_eval(s)
    except:
        return {}

    try:
        ng_dict = {k:v for k, v in zip(domains, domains_newsguard)}
        return ng_dict
    except:
        return {}

In [21]:
search_result_df['list_domains']

0      factcheck.org,thehill.com,newsweek.com,busines...
1      recorder.maricopa.gov,',azcentral.com,politico...
2      azcentral.com,abc15.com,thehill.com,politico.c...
3      thehill.com,azcentral.com,abc15.com,washington...
4      azcentral.com,thehill.com,abc15.com,bbc.com,nb...
                             ...                        
676    cnn.com,foxnews.com,usatoday.com,news.yahoo.co...
677    usatoday.com,sfgate.com,independent.co.uk,foxb...
678    news.gallup.com,news.gallup.com,usatoday.com,f...
679    foxnews.com,usatoday.com,nypost.com,cnn.com,sf...
680    cnn.com,foxnews.com,news.yahoo.com,dailymail.c...
Name: list_domains, Length: 629, dtype: object

In [22]:
search_result_df['list_domains'] = search_result_df['list_domains'].str.replace(",", "', '")
search_result_df['list_domains']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  search_result_df['list_domains'] = search_result_df['list_domains'].str.replace(",", "', '")


0      factcheck.org', 'thehill.com', 'newsweek.com',...
1      recorder.maricopa.gov', ''', 'azcentral.com', ...
2      azcentral.com', 'abc15.com', 'thehill.com', 'p...
3      thehill.com', 'azcentral.com', 'abc15.com', 'w...
4      azcentral.com', 'thehill.com', 'abc15.com', 'b...
                             ...                        
676    cnn.com', 'foxnews.com', 'usatoday.com', 'news...
677    usatoday.com', 'sfgate.com', 'independent.co.u...
678    news.gallup.com', 'news.gallup.com', 'usatoday...
679    foxnews.com', 'usatoday.com', 'nypost.com', 'c...
680    cnn.com', 'foxnews.com', 'news.yahoo.com', 'da...
Name: list_domains, Length: 629, dtype: object

In [23]:
search_result_df['newsguard_dict'] = search_result_df.apply(lambda row: get_newsguard_dict(row['list_domains'], row['list_scores']), axis=1)

ERROR: could not convert string of domains to list
ERROR: could not convert string of domains to list
ERROR: could not convert string of domains to list
ERROR: could not convert string of domains to list
ERROR: could not convert string of domains to list
ERROR: could not convert string of domains to list
ERROR: could not convert string of domains to list
ERROR: could not convert string of domains to list
ERROR: could not convert string of domains to list
ERROR: could not convert string of domains to list
ERROR: could not convert string of domains to list
ERROR: could not convert string of domains to list
ERROR: could not convert string of domains to list
ERROR: could not convert string of domains to list
ERROR: could not convert string of domains to list
ERROR: could not convert string of domains to list
ERROR: could not convert string of domains to list
ERROR: could not convert string of domains to list
ERROR: could not convert string of domains to list


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  search_result_df['newsguard_dict'] = search_result_df.apply(lambda row: get_newsguard_dict(row['list_domains'], row['list_scores']), axis=1)


In [24]:
from urllib.parse import urlparse

def get_newsguard_score(raw_url, score_dict):
    global ng_dict_errors
    global ng_score_retrieval_errors
    if not score_dict:
        ng_dict_errors += 1
        return None
    try:
        domain = urlparse(raw_url).netloc #to get the the desired format
        return(score_dict['.'.join(domain.split('.')[-2:])])
    except:
        ng_score_retrieval_errors += 1
        return None
    

In [25]:
def get_claim_positions(claim_dict, relative=True):
    global claim_retrieval_errors
    positions = []
    try:
        for c in claim_dict['claims']:
            if relative == False:
                positions.append(c['position'])
            else:
                positions.append((c['position']/claim_dict['article_length']))
    except:
         claim_retrieval_errors += 1
         print("ERROR: could not extract positions")
    return positions

def get_claim_headline_info(claim_dict):
    global claim_headline_retrieval_errors
    h_info = []
    try:
        for c in claim_dict['claims']:
            h_info.append(c['headline'])
    except:
         claim_headline_retrieval_errors += 1
         print("ERROR: could not extract headline info")
    return h_info

In [26]:
def process_response(response, fm_url, serp_article, newsguard_dict, fm_dict):
    response_dict = ast.literal_eval(response)
    response_dict['fm_url'] = fm_url
    response_dict['fm_article_length'] = fm_dict['article_length']
    response_dict['serp_url'] = serp_article['url']
    response_dict['serp_position'] = serp_article['serp_position']
    response_dict['serp_article_length'] = serp_article['article_length']
    try:
        response_dict['serp_newsguard'] = get_newsguard_score(serp_article['url'], newsguard_dict)
    except:
        response_dict['serp_newsguard'] = None

    #get the relative positions of the claims
    positions_fake_article = get_claim_positions(fm_dict)
    positions_fake_article = [item for x in positions_fake_article for item in [x]*5]
    for comp, position in zip(response_dict['comparisons'], positions_fake_article):
        comp['fm_claim_position'] =  position
    positions_search_article = get_claim_positions(serp_article)
    positions_search_article = positions_search_article*5
    for comp, position in zip(response_dict['comparisons'], positions_search_article):
        comp['serp_claim_position'] =  position

    #get the absolute positions of the claims
    positions_fake_article = get_claim_positions(fm_dict, relative=False)
    positions_fake_article = [item for x in positions_fake_article for item in [x]*5]
    for comp, position in zip(response_dict['comparisons'], positions_fake_article):
        comp['fm_claim_position_abs'] =  position
    positions_search_article = get_claim_positions(serp_article, relative=False)
    positions_search_article = positions_search_article*5
    for comp, position in zip(response_dict['comparisons'], positions_search_article):
        comp['serp_claim_position_abs'] =  position 

    h_info_fake_article = get_claim_headline_info(fm_dict)
    h_info_fake_article = [item for x in h_info_fake_article for item in [x]*5]
    for comp, h_info in zip(response_dict['comparisons'], h_info_fake_article):
        comp['fm_claim_headline'] =  h_info

    h_info_serp_article = get_claim_headline_info(serp_article)
    h_info_serp_article = h_info_serp_article*5
    for comp, h_info in zip(response_dict['comparisons'], h_info_serp_article):
        comp['serp_claim_headline'] =  h_info
        
    return response_dict

In [27]:
#catch empty cases
def label_assignment(fm_url, serp_urls, newsguard_dict):
    label_list = []
    global fm_article_url_not_found
    global post_processing_error
    global serp_dict_empty_error

    fm_article_dict = get_claim_dict(fm_url, true_articles_df) #get fm_claim_dict, this is also called in the get_serp_articles function, but that one also has some more details
    if not fm_article_dict:
        fm_article_url_not_found += 1
        return "ERROR: FM Article Claims could not be retrieved"
    fm_article_claims = extract_claims(fm_article_dict)

    serp_dict = get_serp_dict(serp_urls=serp_urls)
    if not serp_dict: #check for empty list
        serp_dict_empty_error += 1
        return None
    for serp_article in serp_dict:
        if not serp_article:
            label_list.append({})
            continue
        serp_claims = extract_claims(serp_article) #extract claims from the entire dict
        comparison_dict = {'claims_article_a': fm_article_claims, 'claims_article_b': serp_claims} #pass only the pure claims into the model
        response = get_throttled_prompt_response(comparison_dict)
        if "ERROR" in response:
            label_list.append({})
            continue
        try:
            processed_response = process_response(response, fm_url, serp_article, newsguard_dict, fm_article_dict)       
            label_list.append(processed_response)
        except:
            print("ERROR: Post-Processing Error")
            post_processing_error += 1
            # catch faulty responses
            label_list.append({})
    return label_list
    

#returns serp articles where a valid claim_dict could be retrieved

def get_serp_dict(serp_urls):
    global serp_article_not_found
    global successful_parse
    serp_articles = []  
    for i, serp_url in enumerate(serp_urls):
        try: 
            article = get_claim_dict(serp_url, claim_df)
            if isinstance(article, dict) and article:
                article['url'] = serp_url
                article['serp_position'] = i + 1
                serp_articles.append(article)
                successful_parse += 1
            else:
                serp_article_not_found += 1
                serp_articles.append({})
        except TypeError as e:
            print(f"TypeError for URL {serp_url}: {e}")
            continue
        except Exception as e:
            print(f"Unexpected error for URL {serp_url}: {e}")
    return serp_articles

In [28]:
#change these variables to conduct over multiple sessions, 100 at a time, since we have up to 10 API calls for each entry (1 for each SERP article) -> 1000 API calls per session
current_start = 550 #Completed runs: 0-100; 100-200 (but 215 API response errors); 200-300; 300-400; 400-500; 500-600; 600-700; 700-800; 800-900
current_end = current_start + 50 #TRUE sample run for 0-100 (20 hours) 100-150, 150-200, 200-250 

In [29]:
current_slice = search_result_df.iloc[current_start:current_end]
current_slice

Unnamed: 0,Article_day,ResponseId,Day,Category,True_Dummy,Seven_Ordinal,Four_Ordinal,Age,Gender,FC_Eval,...,only_rel_85,only_rel_90,Unrel_contain,Unrel_contain_65,Unrel_contain_70,avg_score,list_domains,list_scores,all_clean_urls,newsguard_dict
600,Day_12_2,R_2zdA2oVVoNtgtdL,Day_12,True,1,7,4,22,1,True,...,0.0,0.0,0.0,0.0,1.0,87.428571,"thedailybeast.com', 'rollingstone.com', 'chene...","87.5,77.5,95,87.5,95,100,69.5",(https://www.thedailybeast.com/liz-cheney-tell...,"{'thedailybeast.com': 87.5, 'rollingstone.com'..."
601,Day_12_2,R_UlpQI4c5kmFOM81,Day_12,True,1,7,4,20,1,True,...,1.0,0.0,0.0,0.0,0.0,98.125,"cbsnews.com', 'cnn.com', 'politico.com', 'theg...","95,87.5,100,100,100,100,100,100,100,100,100,95",(https://www.cbsnews.com/news/liz-cheney-donal...,"{'cbsnews.com': 95, 'cnn.com': 87.5, 'politico..."
603,Day_12_2,R_UlpQI4c5kmFOM81,Day_12,True,1,7,4,20,1,True,...,0.0,0.0,0.0,0.0,0.0,95.0,"cheney.house.gov', 'cheney.house.gov', 'cheney...",10010010080,"(https://cheney.house.gov/, https://cheney.hou...",{'cheney.house.gov': 80}
604,Day_12_2,R_UlpQI4c5kmFOM81,Day_12,True,1,7,4,20,1,True,...,0.0,0.0,0.0,0.0,0.0,94.642857,"statista.com', 'nytimes.com', 'yahoo.com', 'fi...","100,100,92.5,95,100,100,75",(https://www.statista.com/statistics/1201793/f...,"{'statista.com': 100, 'nytimes.com': 100, 'yah..."
605,Day_12_2,R_3feqhugqaYYwLlx,Day_12,True,1,7,4,36,1,True,...,1.0,0.0,0.0,0.0,0.0,97.5,"washingtonpost.com', 'businessinsider.com', 'l...","100,100,100,95,100,100,100,87.5,95,95,100",(https://www.washingtonpost.com/politics/2021/...,"{'washingtonpost.com': 100, 'businessinsider.c..."
606,Day_12_2,R_zfpFPdC9vAGgxRT,Day_12,True,1,6,4,27,1,True,...,1.0,0.0,0.0,0.0,0.0,97.5,"npr.org', 'usatoday.com', 'newyorker.com', 'th...","100,100,100,100,92.5,100,87.5,100",(https://www.npr.org/2021/02/01/962246187/spur...,"{'npr.org': 100, 'usatoday.com': 100, 'newyork..."
607,Day_12_2,R_2EbMwHK8uo6c61n,Day_12,True,1,7,3,33,0,True,...,0.0,0.0,1.0,1.0,1.0,89.5,"bipartisanreport.com', 'cnn.com', 'politico.co...","52.5,87.5,100,80,69.5,95,100,100,100,100,100",(https://bipartisanreport.com/2021/11/07/liz-c...,"{'bipartisanreport.com': 52.5, 'cnn.com': 87.5..."
608,Day_12_2,R_2UVPhYZE6VHVE2l,Day_12,True,1,5,3,30,0,True,...,0.0,0.0,1.0,1.0,1.0,86.333333,"bipartisanreport.com', 'foxnews.com', 'politic...","52.5,69.5,100,80,80,95,100,100,100",(https://bipartisanreport.com/2021/11/07/liz-c...,"{'bipartisanreport.com': 52.5, 'foxnews.com': ..."
609,Day_12_2,R_DuFCDfdUjBXKc7L,Day_12,True,1,7,4,41,0,True,...,0.0,0.0,1.0,1.0,1.0,76.642857,"thedailybeast.com', 'foxnews.com', 'foxnews.co...","87.5,69.5,69.5,77.5,100,80,52.5",(https://www.thedailybeast.com/liz-cheney-tell...,"{'thedailybeast.com': 87.5, 'foxnews.com': 69...."
610,Day_12_2,R_vDFQr0QAhdjLCDf,Day_12,True,1,6,4,66,0,True,...,0.0,0.0,0.0,0.0,1.0,91.888889,"theguardian.com', 'news.yahoo.com', 'sheridanm...","100,100,100,80,100,77.5,100,100,69.5",(https://www.theguardian.com/us-news/2021/nov/...,"{'theguardian.com': 100, 'news.yahoo.com': 100..."


In [30]:
current_slice['all_clean_urls']

600    (https://www.thedailybeast.com/liz-cheney-tell...
601    (https://www.cbsnews.com/news/liz-cheney-donal...
603    (https://cheney.house.gov/, https://cheney.hou...
604    (https://www.statista.com/statistics/1201793/f...
605    (https://www.washingtonpost.com/politics/2021/...
606    (https://www.npr.org/2021/02/01/962246187/spur...
607    (https://bipartisanreport.com/2021/11/07/liz-c...
608    (https://bipartisanreport.com/2021/11/07/liz-c...
609    (https://www.thedailybeast.com/liz-cheney-tell...
610    (https://www.theguardian.com/us-news/2021/nov/...
611    (https://www.rollingstone.com/politics/politic...
612    (https://www.foxnews.com/politics/trumps-targe...
613    (https://www.theblaze.com/news/trump-calls-rep...
614    (https://apnews.com/hub/liz-cheney, https://ap...
615    (https://www.googleadservices.com/, https://gi...
616    (https://www.politico.com/news/2021/11/09/trum...
617    (https://www.reuters.com/world/us/us-republica...
618    (https://bipartisanrepor

Running the mainloop:

In [31]:
reset_error_count()
current_slice['Labels'] = current_slice.apply(lambda row: label_assignment(row['URL'], row['all_clean_urls'], row['newsguard_dict']), axis=1)
create_log(f'./logs/true_error_log{current_start}_{current_end}.txt')
current_slice.to_csv(f"{data_folder}/true_label_data_{current_start}_{current_end}.csv", sep=";")
current_slice['Labels'].to_json(f"{data_folder}/true_label_data_{current_start}_{current_end}.json")

ERROR: could not eval string as dict: {
    "article_title": "Liz Cheney calls out Fox News â\x80¦ on Fox News",
    "url": null,
    "article_length": 15,
    "claims": [
        {
            "claim": "Liz Cheney calls out Fox News on Fox News",
            "headline": 1,
            "position": 0
        }
    ]
}
ERROR: could not eval string as dict: {
    "article_title": "Chris Wallace presses Liz Cheney on Trump supporters: 'Why alienate them?' | Fox News",
    "url": null,
    "article_length": 27,
    "claims": [
        {
            "claim": "Rep. Liz Cheney defended her outspoken criticism of former President Donald Trump during an interview on Fox News Sunday",
            "headline": 0,
            "position": 9
        },
        {
            "claim": "Cheney said that the 74 million Americans who voted for Trump were misled and betrayed",
            "headline": 0,
            "position": 9
        },
        {
            "claim": "GOP lawmakers voted to remove Cheney

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  current_slice['Labels'] = current_slice.apply(lambda row: label_assignment(row['URL'], row['all_clean_urls'], row['newsguard_dict']), axis=1)


In [32]:
current_slice['Labels']

600    [{'comparisons': [{'claim_article_a': 'Rep. Li...
601    [{'comparisons': [{'claim_article_a': 'Rep. Li...
603    [{}, {}, {}, {}, {}, {}, {'comparisons': [{'cl...
604    [{'comparisons': [{'claim_article_a': 'Rep. Li...
605    [{}, {'comparisons': [{'claim_article_a': 'Rep...
606    [{}, {'comparisons': [{'claim_article_a': 'Rep...
607    [{'comparisons': [{'claim_article_a': 'Rep. Li...
608    [{'comparisons': [{'claim_article_a': 'Rep. Li...
609    [{'comparisons': [{'claim_article_a': 'Rep. Li...
610    [{}, {'comparisons': [{'claim_article_a': 'Rep...
611    [{'comparisons': [{'claim_article_a': 'Rep. Li...
612    [{'comparisons': [{'claim_article_a': 'Rep. Li...
613    [{'comparisons': [{'claim_article_a': 'Rep. Li...
614    [{'comparisons': [{'claim_article_a': 'Rep. Li...
615    [{}, {}, {'comparisons': [{'claim_article_a': ...
616    [{}, {'comparisons': [{'claim_article_a': 'Rep...
617             [{}, {}, {}, {}, {}, {}, {}, {}, {}, {}]
618    [{'comparisons': [{'clai