In [104]:
import pandas as pd
import os
import numpy as np
from dotenv import load_dotenv
import ast
import glob


load_dotenv()

data_folder = os.getenv("DATA_FOLDER")

file_prefix = 'label_data'

# Use glob to find all CSV files with the given filename start
file_list = glob.glob(f"{data_folder}/{file_prefix}*.csv")

# Read each file into a DataFrame and concatenate them into a single DataFrame
dataframes = [pd.read_csv(file, sep=";",index_col=0) for file in file_list]
df = pd.concat(dataframes, ignore_index=True)

In [105]:

from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, accuracy_score, f1_score, recall_score, precision_score, ndcg_score
from scipy.optimize import minimize
from scipy.special import expit


In [106]:
df['Labels']

0      [{'comparisons': [{'claim_article_a': "Arizona...
1      [{}, {}, {}, {}, {}, {}, {}, {}, {}, {'compari...
2      [{'comparisons': [{'claim_article_a': "Arizona...
3      [{}, {}, {'comparisons': [{'claim_article_a': ...
4      [{}, {}, {'comparisons': [{'claim_article_a': ...
                             ...                        
956    [{'comparisons': [{'claim_article_a': "The NIH...
957    [{}, {'comparisons': [{'claim_article_a': "The...
958    [{}, {}, {}, {}, {}, {'comparisons': [{'claim_...
959             [{}, {}, {}, {}, {}, {}, {}, {}, {}, {}]
960             [{}, {}, {}, {}, {}, {}, {}, {}, {}, {}]
Name: Labels, Length: 961, dtype: object

In [107]:
faulty_labels = 0
def safe_literal_eval(x):
    global faulty_labels
    try:
        return ast.literal_eval(x)
    except:
        faulty_labels += 1

In [108]:
df['Labels'] = df['Labels'].apply(safe_literal_eval)
print(faulty_labels)

101


In [109]:
df[df['Labels'].isna()] #101 None entries

Unnamed: 0,Article_day,ResponseId,Day,Category,True_Dummy,Seven_Ordinal,Four_Ordinal,Age,Gender,FC_Eval,...,only_rel_90,Unrel_contain,Unrel_contain_65,Unrel_contain_70,avg_score,list_domains,list_scores,all_clean_urls,newsguard_dict,Labels
561,Day_4_3,R_3LZpxw4gLOfQL9m,Day_4,Coul,0,5,3,25,0,FM,...,0.0,1.0,1.0,1.0,25.666667,"aier.org', 'twitter.com', 'ne-np.facebook.com'...","62,7.5,7.5",('https://www.aier.org/article/zero-covid-cata...,"{'aier.org': 62, 'twitter.com': 7.5, 'ne-np.fa...",
562,Day_4_3,R_3frX1vnowyoNcQL,Day_4,Misl,0,3,2,46,1,FM,...,0.0,0.0,0.0,0.0,95.000000,"apnews.com', 'washingtonpost.com', 'abcnews.go...","95,100,82.5,87.5,100,100,100,95,95,95",('https://apnews.com/article/donald-trump-capi...,"{'apnews.com': 95, 'washingtonpost.com': 100, ...",
563,Day_4_3,R_rfmuHm1RiwH3ToJ,Day_4,Misl,0,5,3,26,0,FM,...,0.0,0.0,0.0,0.0,91.562500,"telegraph.co.uk', 'nytimes.com', 'scmp.com', '...","87.5,100,85,85,85,90,100,100",('https://www.telegraph.co.uk/news/2021/07/23/...,"{'telegraph.co.uk': 87.5, 'nytimes.com': 100, ...",
564,Day_4_3,R_1rOER7NAoM3RCR3,Day_4,True,1,5,3,41,1,FM,...,0.0,0.0,1.0,1.0,90.900000,"aier.org', 'thelancet.com', 'irishtimes.com', ...","62,92.5,100,100,100",('https://www.aier.org/article/zero-covid-cata...,"{'aier.org': 62, 'thelancet.com': 92.5, 'irish...",
565,Day_4_3,R_3frX1vnowyoNcQL,Day_4,Misl,0,3,2,46,1,FM,...,0.0,0.0,1.0,1.0,77.750000,"otandp.com', 'otandp.com', 'isglobal.org', 'in...","100,62,62,87.5,82.5,72.5",('https://www.otandp.com/blog/is-zero-covid-a-...,"{'otandp.com': 62, 'isglobal.org': 62, 'instit...",
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
861,Day_6_1,R_2uWgH4WTF4qBMb0,Day_6,True,1,6,3,24,0,FM,...,0.0,1.0,1.0,1.0,49.750000,"google.co.in', 'westernjournal.com', 'wnd.com'...",8217.5,('https://www.google.co.in/intl/en/about/produ...,{},
862,Day_6_1,R_Y084AN74VgOMcmd,Day_6,Misl,0,1,1,49,1,FM,...,0.0,0.0,0.0,0.0,97.272727,"statesman.com', 'cnn.com', 'forbes.com', 'wash...","100,87.5,95,100,100,100,95,100,92.5,100,100",('https://www.statesman.com/story/news/politic...,"{'statesman.com': 100, 'cnn.com': 87.5, 'forbe...",
863,Day_6_1,R_2331kLNovuSPrlz,Day_6,Misl,0,1,1,26,0,FM,...,0.0,0.0,0.0,0.0,95.277778,"justice.gov', 'washingtonpost.com', 'bbc.com',...","100,95,87.5,100,92.5,92.5,100,95,95",('https://www.justice.gov/usao-dc/pr/departmen...,"{'justice.gov': 100, 'washingtonpost.com': 95,...",
864,Day_6_1,R_3iU9nT30PN5iPB9,Day_6,Coul,0,4,3,27,1,FM,...,0.0,0.0,0.0,1.0,89.700000,"npr.org', 'washingtonpost.com', 'msnbc.com', '...","100,100,70,87.5,100,82.5,69.5,100,100,87.5",('https://www.npr.org/2021/07/28/1021113538/4-...,"{'npr.org': 100, 'washingtonpost.com': 100, 'm...",


In [110]:
df = df[df['Labels'].notna()]
df

Unnamed: 0,Article_day,ResponseId,Day,Category,True_Dummy,Seven_Ordinal,Four_Ordinal,Age,Gender,FC_Eval,...,only_rel_90,Unrel_contain,Unrel_contain_65,Unrel_contain_70,avg_score,list_domains,list_scores,all_clean_urls,newsguard_dict,Labels
0,Day_1_1,R_10pkUNEtcV6OMCU,Day_1,Coul,0,5,3,35,1,FM,...,0.0,1.0,1.0,1.0,91.055556,"newsweek.com', 'yahoo.com', 'azmirror.com', 'f...","100,100,57,92.5,100,100,75,100,95",('https://www.newsweek.com/arizona-state-senat...,"{'newsweek.com': 100, 'yahoo.com': 100, 'azmir...","[{'comparisons': [{'claim_article_a': ""Arizona..."
1,Day_1_1,R_1Gv8iAs1HEqGuUe,Day_1,Misl,0,1,1,35,1,FM,...,0.0,0.0,0.0,0.0,95.277778,"abc15.com', 'salon.com', 'cnn.com', 'politico....","100,87.5,87.5,100,95,95,100,100,92.5",('https://www.abc15.com/news/state/poll-many-r...,"{'abc15.com': 100, 'salon.com': 87.5, 'cnn.com...","[{}, {}, {}, {}, {}, {}, {}, {}, {}, {'compari..."
2,Day_1_1,R_1Gv8iAs1HEqGuUe,Day_1,Misl,0,1,1,35,1,FM,...,0.0,1.0,1.0,1.0,83.450000,"cnn.com', 'thehill.com', 'salon.com', 'washing...","87.5,80,87.5,42,87.5,100,92.5,87.5,85,85",('https://www.cnn.com/2021/07/18/politics/fact...,"{'cnn.com': 87.5, 'thehill.com': 80, 'salon.co...","[{'comparisons': [{'claim_article_a': ""Arizona..."
3,Day_1_1,R_3KT6q7Vntwvmg8Z,Day_1,True,1,7,3,33,1,FM,...,0.0,0.0,0.0,0.0,92.812500,"thehill.com', 'recorder.maricopa.gov', 'washin...","80,100,100,100,100,92.5,75,95",('https://thehill.com/homenews/campaign/563100...,"{'thehill.com': 80, 'recorder.maricopa.gov': 1...","[{}, {}, {'comparisons': [{'claim_article_a': ..."
4,Day_1_1,R_1Gv8iAs1HEqGuUe,Day_1,Misl,0,1,1,35,1,FM,...,0.0,0.0,0.0,0.0,96.000000,"apnews.com', 'detroitnews.com', 'cnn.com', 'ap...","95,92.5,87.5,95,95,100,100,100,100,95",('https://apnews.com/article/technology-joe-bi...,"{'apnews.com': 95, 'detroitnews.com': 92.5, 'c...","[{}, {}, {'comparisons': [{'claim_article_a': ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
956,Day_9_3,R_3JqGYdN07AFVzOf,Day_9,Misl,0,1,1,41,0,FM,...,0.0,0.0,0.0,0.0,98.437500,"factcheck.org', 'yahoo.com', 'theintercept.com...","100,100,87.5,100,100,100,100,100",('https://www.factcheck.org/2021/10/scicheck-r...,"{'factcheck.org': 100, 'yahoo.com': 100, 'thei...","[{'comparisons': [{'claim_article_a': ""The NIH..."
957,Day_9_3,R_1P6MQAwGMgs0Loo,Day_9,True,1,5,3,39,0,FM,...,0.0,0.0,0.0,0.0,94.166667,"googleadservices.com', 'scientificamerican.com...","100,77.5,87.5,100,100,100","('https://www.googleadservices.com/', 'https:/...","{'googleadservices.com': 100, 'scientificameri...","[{}, {'comparisons': [{'claim_article_a': ""The..."
958,Day_9_3,R_rcgF4h6LylUXPUd,Day_9,Misl,0,1,2,23,0,FM,...,0.0,0.0,0.0,0.0,92.500000,"covid19.nih.gov', 'grants.nih.gov', 'grants.ni...","100,87.5,100,87.5,87.5",('https://covid19.nih.gov/funding#:~:text=NIH%...,"{'covid19.nih.gov': 100, 'grants.nih.gov': 87.5}","[{}, {}, {}, {}, {}, {'comparisons': [{'claim_..."
959,Day_9_3,R_3JqGYdN07AFVzOf,Day_9,Misl,0,1,1,41,0,FM,...,0.0,1.0,1.0,1.0,85.777778,"science.org', 'the-scientist.com', 'rollcall.c...","100,100,75,82.5,82.5,95,80,57,100",('https://www.science.org/content/article/nih-...,"{'science.org': 100, 'the-scientist.com': 100,...","[{}, {}, {}, {}, {}, {}, {}, {}, {}, {}]"


In [111]:
def get_num_articles(label_lists):
    return sum(bool(d) for d in label_lists)

df['num_valid_articles'] = df['Labels'].apply(get_num_articles)

In [112]:
df['num_valid_articles'].describe()

count    860.000000
mean       2.481395
std        2.001950
min        0.000000
25%        1.000000
50%        2.000000
75%        4.000000
max        9.000000
Name: num_valid_articles, dtype: float64

If a Newsguard Score could not be retrieved for an article, it will be filled by the placeholder score, in this case the median of the average newsguard scores as given by Aslett et al.

In [113]:
placeholder_ng_score = df['avg_score'].median()
placeholder_ng_score

np.float64(90.8333333333333)

In [114]:
import math

def sigmoid(x):
  return 1 / (1 + math.exp(-x))

In [115]:
def weighted_metric(weights, label_dict_list, positions=False):
    #create three distinct lists to control each parameter
    # if label_dict_list is empty return NaN
    if not label_dict_list:
        return np.nan
    article_stances = []
    article_ng_scores = []
    article_serp_positions = []
    
    for article in label_dict_list: #this is the search level; here we can retrieve SERP position and Newsguard score
        if not article: #catch empty entries
            continue
        if len(article['comparisons']) == 0:
               continue
        #retrieve variables
        serp_newsguard = article['serp_newsguard']
        serp_position = article['serp_position']

        #add to lists for decomposed calculation
        article_ng_scores.append(serp_newsguard)
        article_serp_positions.append(serp_position)

        article_stance = 0
        for claim_pair in article['comparisons']: #this is the "article level"; here we can retrieve relative claim position and the bool headline info
            relation = claim_pair['relation']
            if relation == 0:
                continue

            try:
                #retrieve relative claim positions
                fm_claim_position = claim_pair['fm_claim_position']
                serp_claim_position = claim_pair['serp_claim_position']

                #retrieve headline bool
                fm_claim_headline = claim_pair['fm_claim_headline']
                serp_claim_headline = claim_pair['serp_claim_headline']
            except:
                if positions:
                    return np.nan

            article_stance += relation

        article_stances.append(article_stance/len(article['comparisons']))

    #if article_stances could not be successfully retrieved, return NaN
    if not article_stances:
        return np.nan
    
    

    article_stances = np.array(article_stances)
    article_ng_scores = np.array(article_ng_scores, dtype=float)
    article_ng_scores = np.nan_to_num(article_ng_scores, nan=placeholder_ng_score) #replace NaN with the placeholder
    article_ng_scores = article_ng_scores / 100 #normalize newsguard scores between 0 and 1
    article_serp_positions = np.array(article_serp_positions)
    article_serp_positions = 1/article_serp_positions #1/x

    bias = weights[0]
    weights = weights[1:]

    # print(article_stances)
    # print(article_ng_scores)
    # print(article_serp_positions)
    
    #perceptron setup
    metrics =  np.vstack((article_stances, article_ng_scores, article_serp_positions))
    weighted_preds = np.dot(weights, metrics) + bias
    return sigmoid(np.mean(weighted_preds))


In [116]:
label_mapping = {
    'Coul' : np.nan,
    'Misl' : 0,
    'True' : 1
}

df['User_Labels'] = df['Category'].map(label_mapping)

In [117]:
df = df[df['User_Labels'].notna()] #728
df

Unnamed: 0,Article_day,ResponseId,Day,Category,True_Dummy,Seven_Ordinal,Four_Ordinal,Age,Gender,FC_Eval,...,Unrel_contain_65,Unrel_contain_70,avg_score,list_domains,list_scores,all_clean_urls,newsguard_dict,Labels,num_valid_articles,User_Labels
1,Day_1_1,R_1Gv8iAs1HEqGuUe,Day_1,Misl,0,1,1,35,1,FM,...,0.0,0.0,95.277778,"abc15.com', 'salon.com', 'cnn.com', 'politico....","100,87.5,87.5,100,95,95,100,100,92.5",('https://www.abc15.com/news/state/poll-many-r...,"{'abc15.com': 100, 'salon.com': 87.5, 'cnn.com...","[{}, {}, {}, {}, {}, {}, {}, {}, {}, {'compari...",1,0.0
2,Day_1_1,R_1Gv8iAs1HEqGuUe,Day_1,Misl,0,1,1,35,1,FM,...,1.0,1.0,83.450000,"cnn.com', 'thehill.com', 'salon.com', 'washing...","87.5,80,87.5,42,87.5,100,92.5,87.5,85,85",('https://www.cnn.com/2021/07/18/politics/fact...,"{'cnn.com': 87.5, 'thehill.com': 80, 'salon.co...","[{'comparisons': [{'claim_article_a': ""Arizona...",4,0.0
3,Day_1_1,R_3KT6q7Vntwvmg8Z,Day_1,True,1,7,3,33,1,FM,...,0.0,0.0,92.812500,"thehill.com', 'recorder.maricopa.gov', 'washin...","80,100,100,100,100,92.5,75,95",('https://thehill.com/homenews/campaign/563100...,"{'thehill.com': 80, 'recorder.maricopa.gov': 1...","[{}, {}, {'comparisons': [{'claim_article_a': ...",6,1.0
4,Day_1_1,R_1Gv8iAs1HEqGuUe,Day_1,Misl,0,1,1,35,1,FM,...,0.0,0.0,96.000000,"apnews.com', 'detroitnews.com', 'cnn.com', 'ap...","95,92.5,87.5,95,95,100,100,100,100,95",('https://apnews.com/article/technology-joe-bi...,"{'apnews.com': 95, 'detroitnews.com': 92.5, 'c...","[{}, {}, {'comparisons': [{'claim_article_a': ...",6,0.0
5,Day_1_1,R_yx69bWSwS7KJTDb,Day_1,Misl,0,3,2,25,1,FM,...,0.0,0.0,,"twitter.com', 'twitter.com', 'beckernews.com',...",,('https://twitter.com/kylenabecker?ref_src=tws...,{},"[{}, {}, {}, {}, {}, {}, {}, {}, {}, {}]",0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
956,Day_9_3,R_3JqGYdN07AFVzOf,Day_9,Misl,0,1,1,41,0,FM,...,0.0,0.0,98.437500,"factcheck.org', 'yahoo.com', 'theintercept.com...","100,100,87.5,100,100,100,100,100",('https://www.factcheck.org/2021/10/scicheck-r...,"{'factcheck.org': 100, 'yahoo.com': 100, 'thei...","[{'comparisons': [{'claim_article_a': ""The NIH...",5,0.0
957,Day_9_3,R_1P6MQAwGMgs0Loo,Day_9,True,1,5,3,39,0,FM,...,0.0,0.0,94.166667,"googleadservices.com', 'scientificamerican.com...","100,77.5,87.5,100,100,100","('https://www.googleadservices.com/', 'https:/...","{'googleadservices.com': 100, 'scientificameri...","[{}, {'comparisons': [{'claim_article_a': ""The...",5,1.0
958,Day_9_3,R_rcgF4h6LylUXPUd,Day_9,Misl,0,1,2,23,0,FM,...,0.0,0.0,92.500000,"covid19.nih.gov', 'grants.nih.gov', 'grants.ni...","100,87.5,100,87.5,87.5",('https://covid19.nih.gov/funding#:~:text=NIH%...,"{'covid19.nih.gov': 100, 'grants.nih.gov': 87.5}","[{}, {}, {}, {}, {}, {'comparisons': [{'claim_...",3,0.0
959,Day_9_3,R_3JqGYdN07AFVzOf,Day_9,Misl,0,1,1,41,0,FM,...,1.0,1.0,85.777778,"science.org', 'the-scientist.com', 'rollcall.c...","100,100,75,82.5,82.5,95,80,57,100",('https://www.science.org/content/article/nih-...,"{'science.org': 100, 'the-scientist.com': 100,...","[{}, {}, {}, {}, {}, {}, {}, {}, {}, {}]",0,0.0


In [118]:
def cost_function(weights, label_list, y):
    pred_list = np.array([weighted_metric(weights, l) for l in label_list]) 
    #scaler = MinMaxScaler(feature_range=(-1, 1))
    #normalized_preds = scaler.fit_transform(pred_list.reshape(-1, 1)).flatten()
    y = np.array(y)
    y = y[~np.isnan(pred_list)]
    pred_list = pred_list[~np.isnan(pred_list)]
    return mean_squared_error(y, pred_list)

In [119]:
initial_weights = np.array([0.25, 0.25, 0.25, 0.25])
cost_function(initial_weights, df['Labels'].tolist(), df['User_Labels'].tolist())

0.2696409631753377

In [120]:
initial_weights = np.array([0.25, 0.25, 0.25, 0.25]) #Bias (W0) and Weights for article stance, ng score, serp position (W1-3)

# def weight_constraint(weights):
#     return np.sum(weights) - 1
# constraints = ({'type': 'eq', 'fun': weight_constraint})

result = minimize(cost_function, initial_weights, args=(df['Labels'].tolist(), df['User_Labels'].tolist()), method='SLSQP', bounds=[(-1, 1), (0, 1), (0, 1), (0, 1)], options={'maxiter': 1000, 'ftol': 1e-6})

optimal_weights = result.x
print(f"Optimal Weights: {optimal_weights}")

cost_function(optimal_weights, df['Labels'].tolist(), df['User_Labels'])

Optimal Weights: [-0.32701601  1.          0.          0.59339975]


0.24506996413027402

In [121]:
def get_metrics(weights, label_list):
    pred_list = np.array([weighted_metric(weights, l) for l in label_list])
    return pred_list
    # scaler = MinMaxScaler(feature_range=(-1, 1))
    # normalized_preds = scaler.fit_transform(pred_list.reshape(-1, 1)).flatten()
    # return normalized_preds

In [122]:
df ['Metric'] = get_metrics(optimal_weights, df['Labels'].tolist())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df ['Metric'] = get_metrics(optimal_weights, df['Labels'].tolist())


In [123]:
df['Metric']

1      0.433478
2      0.461163
3      0.504681
4      0.336193
5           NaN
         ...   
956    0.504124
957    0.443534
958    0.475732
959         NaN
960         NaN
Name: Metric, Length: 728, dtype: float64

In [124]:
(df['num_valid_articles'] == 0).sum()

np.int64(128)

In [125]:
df = df[df['Metric'].notna()]
df

Unnamed: 0,Article_day,ResponseId,Day,Category,True_Dummy,Seven_Ordinal,Four_Ordinal,Age,Gender,FC_Eval,...,Unrel_contain_70,avg_score,list_domains,list_scores,all_clean_urls,newsguard_dict,Labels,num_valid_articles,User_Labels,Metric
1,Day_1_1,R_1Gv8iAs1HEqGuUe,Day_1,Misl,0,1,1,35,1,FM,...,0.0,95.277778,"abc15.com', 'salon.com', 'cnn.com', 'politico....","100,87.5,87.5,100,95,95,100,100,92.5",('https://www.abc15.com/news/state/poll-many-r...,"{'abc15.com': 100, 'salon.com': 87.5, 'cnn.com...","[{}, {}, {}, {}, {}, {}, {}, {}, {}, {'compari...",1,0.0,0.433478
2,Day_1_1,R_1Gv8iAs1HEqGuUe,Day_1,Misl,0,1,1,35,1,FM,...,1.0,83.450000,"cnn.com', 'thehill.com', 'salon.com', 'washing...","87.5,80,87.5,42,87.5,100,92.5,87.5,85,85",('https://www.cnn.com/2021/07/18/politics/fact...,"{'cnn.com': 87.5, 'thehill.com': 80, 'salon.co...","[{'comparisons': [{'claim_article_a': ""Arizona...",4,0.0,0.461163
3,Day_1_1,R_3KT6q7Vntwvmg8Z,Day_1,True,1,7,3,33,1,FM,...,0.0,92.812500,"thehill.com', 'recorder.maricopa.gov', 'washin...","80,100,100,100,100,92.5,75,95",('https://thehill.com/homenews/campaign/563100...,"{'thehill.com': 80, 'recorder.maricopa.gov': 1...","[{}, {}, {'comparisons': [{'claim_article_a': ...",6,1.0,0.504681
4,Day_1_1,R_1Gv8iAs1HEqGuUe,Day_1,Misl,0,1,1,35,1,FM,...,0.0,96.000000,"apnews.com', 'detroitnews.com', 'cnn.com', 'ap...","95,92.5,87.5,95,95,100,100,100,100,95",('https://apnews.com/article/technology-joe-bi...,"{'apnews.com': 95, 'detroitnews.com': 92.5, 'c...","[{}, {}, {'comparisons': [{'claim_article_a': ...",6,0.0,0.336193
6,Day_1_1,R_3LXaF7Z7z740d2k,Day_1,Misl,0,1,1,26,1,FM,...,1.0,70.500000,"westernjournal.com', 'newsweek.com', 'flipboar...","82,100,12.5,87.5",('https://www.westernjournal.com/az-state-sena...,"{'westernjournal.com': 82, 'newsweek.com': 100...","[{'comparisons': [{'claim_article_a': ""Arizona...",2,0.0,0.529474
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
954,Day_9_3,R_2QEWqk6A2nnsg7O,Day_9,True,1,6,4,26,0,FM,...,0.0,91.666667,"cnn.com', 'cnn.com', 'cnn.com', 'cnn.com', 'cn...","87.5,87.5,87.5,87.5,87.5,87.5,100,100,100",('https://www.cnn.com/videos/us/2021/10/25/nih...,"{'cnn.com': 87.5, 'npr.org': 100, 'nytimes.com...","[{'comparisons': [{'claim_article_a': ""The NIH...",1,1.0,0.780123
955,Day_9_3,R_3P7x5JWi3Xcbsqi,Day_9,True,1,7,4,25,1,FM,...,0.0,90.000000,"nih.gov', 'nih.gov', ''', 'jacksonville.com', ...","82.5,82.5,100,95",('https://www.nih.gov/news-events/news-release...,{},"[{'comparisons': [{'claim_article_a': ""The NIH...",2,1.0,0.489510
956,Day_9_3,R_3JqGYdN07AFVzOf,Day_9,Misl,0,1,1,41,0,FM,...,0.0,98.437500,"factcheck.org', 'yahoo.com', 'theintercept.com...","100,100,87.5,100,100,100,100,100",('https://www.factcheck.org/2021/10/scicheck-r...,"{'factcheck.org': 100, 'yahoo.com': 100, 'thei...","[{'comparisons': [{'claim_article_a': ""The NIH...",5,0.0,0.504124
957,Day_9_3,R_1P6MQAwGMgs0Loo,Day_9,True,1,5,3,39,0,FM,...,0.0,94.166667,"googleadservices.com', 'scientificamerican.com...","100,77.5,87.5,100,100,100","('https://www.googleadservices.com/', 'https:/...","{'googleadservices.com': 100, 'scientificameri...","[{}, {'comparisons': [{'claim_article_a': ""The...",5,1.0,0.443534


In [126]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, accuracy_score, f1_score, recall_score, precision_score

In [127]:
# Calculate Mean Squared Error
mse = mean_squared_error(df['User_Labels'], df['Metric'])
print(f"Mean Squared Error: {mse}")

# Calculate Mean Absolute Error
mae = mean_absolute_error(df['User_Labels'], df['Metric'])
print(f"Mean Absolute Error: {mae}")

# Calculate R-squared
r2 = r2_score(df['User_Labels'], df['Metric'])
print(f"R-squared: {r2}")


Mean Squared Error: 0.24506996413027402
Mean Absolute Error: 0.4900401437892773
R-squared: 0.019650914675477305


In [128]:
df['Binary_Metric'] = [1 if (x) > 0.5 else 0 for x in df['Metric']]
df['Binary_Metric']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Binary_Metric'] = [1 if (x) > 0.5 else 0 for x in df['Metric']]


1      0
2      0
3      1
4      0
6      1
      ..
954    1
955    0
956    1
957    0
958    0
Name: Binary_Metric, Length: 595, dtype: int64

In [129]:
# Calculate Accuracy
acc = accuracy_score(df['User_Labels'], df['Binary_Metric'])
print(f"Accuracy: {acc}")

# Calculate F1
f1 = f1_score(df['User_Labels'], df['Binary_Metric'], average="micro")
print(f"F1: {f1}")

# Calculate Precision
prec = precision_score(df['User_Labels'], df['Binary_Metric'], average="micro")
print(f"Precision: {prec}")

# Calculate Recall
recall = recall_score(df['User_Labels'], df['Binary_Metric'], average="micro")
print(f"Recall: {recall}")

Accuracy: 0.5714285714285714
F1: 0.5714285714285714
Precision: 0.5714285714285714
Recall: 0.5714285714285714


In [130]:
last_search_df = df.drop_duplicates(subset='ResponseId', keep='last')
last_search_df

Unnamed: 0,Article_day,ResponseId,Day,Category,True_Dummy,Seven_Ordinal,Four_Ordinal,Age,Gender,FC_Eval,...,avg_score,list_domains,list_scores,all_clean_urls,newsguard_dict,Labels,num_valid_articles,User_Labels,Metric,Binary_Metric
4,Day_1_1,R_1Gv8iAs1HEqGuUe,Day_1,Misl,0,1,1,35,1,FM,...,96.000000,"apnews.com', 'detroitnews.com', 'cnn.com', 'ap...","95,92.5,87.5,95,95,100,100,100,100,95",('https://apnews.com/article/technology-joe-bi...,"{'apnews.com': 95, 'detroitnews.com': 92.5, 'c...","[{}, {}, {'comparisons': [{'claim_article_a': ...",6,0.0,0.336193,0
7,Day_1_1,R_3LXaF7Z7z740d2k,Day_1,Misl,0,1,1,26,1,FM,...,43.333333,"thetruereporter.com', 'washingtonexaminer.com'...","92.5,0,37.5",('https://thetruereporter.com/breaking-arizona...,{},"[{}, {'comparisons': [{'claim_article_a': ""Ari...",1,0.0,0.394051,0
15,Day_1_1,R_2PpsctGRoIizGAS,Day_1,Misl,0,1,1,39,1,FM,...,96.500000,"politico.com', 'bbc.com', 'apnews.com', 'abc15...","100,95,95,100,92.5,100,95,92.5,95,100",('https://www.politico.com/news/2021/06/29/ari...,"{'politico.com': 100, 'bbc.com': 95, 'apnews.c...","[{}, {}, {'comparisons': [{'claim_article_a': ...",7,0.0,0.373991,0
18,Day_1_1,R_Xj4sI7wBqGSbsC5,Day_1,Misl,0,3,2,48,1,FM,...,94.200000,"apnews.com', 'reuters.com', 'newsweek.com', 'y...",95951001001001001001009557,('https://apnews.com/article/technology-joe-bi...,"{'apnews.com': 95, 'reuters.com': 95, 'newswee...","[{}, {}, {'comparisons': [{'claim_article_a': ...",3,0.0,0.415461,0
25,Day_1_1,R_ONBQ9F4K4eiEEff,Day_1,True,1,7,4,33,0,FM,...,93.437500,"newsweek.com', 'fr24news.com', 'azcentral.com'...","100,92.5,100,95,75,85,100,100",('https://www.newsweek.com/arizona-state-senat...,{},"[{'comparisons': [{'claim_article_a': ""Arizona...",2,1.0,0.517139,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
954,Day_9_3,R_2QEWqk6A2nnsg7O,Day_9,True,1,6,4,26,0,FM,...,91.666667,"cnn.com', 'cnn.com', 'cnn.com', 'cnn.com', 'cn...","87.5,87.5,87.5,87.5,87.5,87.5,100,100,100",('https://www.cnn.com/videos/us/2021/10/25/nih...,"{'cnn.com': 87.5, 'npr.org': 100, 'nytimes.com...","[{'comparisons': [{'claim_article_a': ""The NIH...",1,1.0,0.780123,1
955,Day_9_3,R_3P7x5JWi3Xcbsqi,Day_9,True,1,7,4,25,1,FM,...,90.000000,"nih.gov', 'nih.gov', ''', 'jacksonville.com', ...","82.5,82.5,100,95",('https://www.nih.gov/news-events/news-release...,{},"[{'comparisons': [{'claim_article_a': ""The NIH...",2,1.0,0.489510,0
956,Day_9_3,R_3JqGYdN07AFVzOf,Day_9,Misl,0,1,1,41,0,FM,...,98.437500,"factcheck.org', 'yahoo.com', 'theintercept.com...","100,100,87.5,100,100,100,100,100",('https://www.factcheck.org/2021/10/scicheck-r...,"{'factcheck.org': 100, 'yahoo.com': 100, 'thei...","[{'comparisons': [{'claim_article_a': ""The NIH...",5,0.0,0.504124,1
957,Day_9_3,R_1P6MQAwGMgs0Loo,Day_9,True,1,5,3,39,0,FM,...,94.166667,"googleadservices.com', 'scientificamerican.com...","100,77.5,87.5,100,100,100","('https://www.googleadservices.com/', 'https:/...","{'googleadservices.com': 100, 'scientificameri...","[{}, {'comparisons': [{'claim_article_a': ""The...",5,1.0,0.443534,0


In [131]:
# Calculate Accuracy
acc = accuracy_score(last_search_df['User_Labels'], last_search_df['Binary_Metric'])
print(f"Accuracy: {acc}")

# Calculate F1
f1 = f1_score(last_search_df['User_Labels'], last_search_df['Binary_Metric'], average="micro")
print(f"F1: {f1}")

# Calculate Precision
prec = precision_score(last_search_df['User_Labels'], last_search_df['Binary_Metric'], average="micro")
print(f"Precision: {prec}")

# Calculate Recall
recall = recall_score(last_search_df['User_Labels'], last_search_df['Binary_Metric'], average="micro")

Accuracy: 0.5811320754716981
F1: 0.5811320754716981
Precision: 0.5811320754716981
