# Load data from annotators and reviewer

In [2]:
import pandas as pd
import gradio as gr

def get_BFCs(results_df):
    """Return only the 'hash' and 'bfc' columns of ``results_df``.

    Args:
        results_df: DataFrame containing at least 'hash' and 'bfc' columns.

    Returns:
        A DataFrame restricted to those two columns, in that order.
    """
    wanted_columns = ['hash', 'bfc']
    return results_df[wanted_columns]

def get_BPCs(results_df):
    """Return only the 'hash' and 'bpc' columns of ``results_df``.

    Args:
        results_df: DataFrame containing at least 'hash' and 'bpc' columns.

    Returns:
        A DataFrame restricted to those two columns, in that order.
    """
    wanted_columns = ['hash', 'bpc']
    return results_df[wanted_columns]

def get_PRCs(results_df):
    """Return only the 'hash' and 'prc' columns of ``results_df``.

    Args:
        results_df: DataFrame containing at least 'hash' and 'prc' columns.

    Returns:
        A DataFrame restricted to those two columns, in that order.
    """
    wanted_columns = ['hash', 'prc']
    return results_df[wanted_columns]

def get_NFCs(results_df):
    """Return only the 'hash' and 'nfc' columns of ``results_df``.

    Args:
        results_df: DataFrame containing at least 'hash' and 'nfc' columns.

    Returns:
        A DataFrame restricted to those two columns, in that order.
    """
    wanted_columns = ['hash', 'nfc']
    return results_df[wanted_columns]

# One annotation CSV per person: A = Michel, B = Abhishek, C = David
results_A = pd.read_csv('annotations_Michel.csv')
results_B = pd.read_csv('annotations_Abhishek.csv')
results_C = pd.read_csv('annotations_David.csv')

# Project every table down to (hash, <metric>) for each of the four metrics
all_results = (results_A, results_B, results_C)
BFCs_A, BFCs_B, BFCs_C = (get_BFCs(table) for table in all_results)
BPCs_A, BPCs_B, BPCs_C = (get_BPCs(table) for table in all_results)
PRCs_A, PRCs_B, PRCs_C = (get_PRCs(table) for table in all_results)
NFCs_A, NFCs_B, NFCs_C = (get_NFCs(table) for table in all_results)

# Build one table per metric with columns <metric>A, <metric>B, <metric>C.
# A and B collide on the metric column, so the merge suffixes name them;
# C's column is renamed up front because it no longer collides with anything.

# BFCs
BFCs = (
    BFCs_A
    .merge(BFCs_B, on='hash', how='inner', suffixes=('A', 'B'))
    .merge(BFCs_C.rename(columns={'bfc': 'bfcC'}), on='hash', how='inner')
)

# BPCs
BPCs = (
    BPCs_A
    .merge(BPCs_B, on='hash', how='inner', suffixes=('A', 'B'))
    .merge(BPCs_C.rename(columns={'bpc': 'bpcC'}), on='hash', how='inner')
)

# PRCs
PRCs = (
    PRCs_A
    .merge(PRCs_B, on='hash', how='inner', suffixes=('A', 'B'))
    .merge(PRCs_C.rename(columns={'prc': 'prcC'}), on='hash', how='inner')
)

# NFCs
NFCs = (
    NFCs_A
    .merge(NFCs_B, on='hash', how='inner', suffixes=('A', 'B'))
    .merge(NFCs_C.rename(columns={'nfc': 'nfcC'}), on='hash', how='inner')
)

# Single wide table: one row per hash present in every per-metric table
final_df = (
    BFCs
    .merge(BPCs, on='hash', how='inner')
    .merge(PRCs, on='hash', how='inner')
    .merge(NFCs, on='hash', how='inner')
)


print(final_df.head())


         hash  bfcA  bfcB  bfcC  bpcA  bpcB  bpcC  prcA  prcB  prcC  nfcA  \
0  0704a8586f     0     0     0     0     3     0     4     4     4     0   
1  c5e97ed154     0     0     0     0     4     0     4     4     4     0   
2  4f9f531e15     0     0     0     1     4     1     4     4     4     0   
3  258030acc9     0     0     0     0     4     0     4     4     4     0   
4  5b2c5540b8     4     4     3     1     0     2     0     0     0     0   

   nfcB  nfcC  
0     0     0  
1     0     0  
2     0     0  
3     0     0  
4     0     0  


In [3]:
# A commit counts as a disagreement for a metric when at least one pair of
# the A/B/C values differs by more than 1.

# Disagreement BFCs
bfc_gap_query = (
    "(abs(bfcA - bfcB) > 1) "
    "or (abs(bfcA - bfcC) > 1) "
    "or (abs(bfcB - bfcC) > 1)"
)
BFCs_disagree_1 = BFCs.query(bfc_gap_query)
total_BFCs_disagree_1 = BFCs_disagree_1.shape[0]
print(f"Total commits with disagreement in BFC: {total_BFCs_disagree_1}")

# Disagreement BPCs
bpc_gap_query = (
    "(abs(bpcA - bpcB) > 1) "
    "or (abs(bpcA - bpcC) > 1) "
    "or (abs(bpcB - bpcC) > 1)"
)
BPCs_disagree_1 = BPCs.query(bpc_gap_query)
total_BPCs_disagree_1 = BPCs_disagree_1.shape[0]
print(f"Total commits with disagreement in BPC: {total_BPCs_disagree_1}")

# Disagreement PRCs
prc_gap_query = (
    "(abs(prcA - prcB) > 1) "
    "or (abs(prcA - prcC) > 1) "
    "or (abs(prcB - prcC) > 1)"
)
PRCs_disagree_1 = PRCs.query(prc_gap_query)
total_PRCs_disagree_1 = PRCs_disagree_1.shape[0]
print(f"Total commits with disagreement in PRC: {total_PRCs_disagree_1}")

# Disagreement NFCs
nfc_gap_query = (
    "(abs(nfcA - nfcB) > 1) "
    "or (abs(nfcA - nfcC) > 1) "
    "or (abs(nfcB - nfcC) > 1)"
)
NFCs_disagree_1 = NFCs.query(nfc_gap_query)
total_NFCs_disagree_1 = NFCs_disagree_1.shape[0]
print(f"Total commits with disagreement in NFC: {total_NFCs_disagree_1}")


Total commits with disagreement in BFC: 93
Total commits with disagreement in BPC: 248
Total commits with disagreement in PRC: 163
Total commits with disagreement in NFC: 71


In [5]:
# Collect the hashes of the commits flagged as disagreements, per metric
hashes_BFC = set(BFCs_disagree_1['hash'])
hashes_BPC = set(BPCs_disagree_1['hash'])
hashes_PRC = set(PRCs_disagree_1['hash'])
hashes_NFC = set(NFCs_disagree_1['hash'])

# Union over the four metrics, so each hash appears once even if it was
# flagged for several metrics.
all_hashes = hashes_BFC | hashes_BPC | hashes_PRC | hashes_NFC

# Sets have no defined iteration order (and for strings it can change from
# one interpreter run to the next), so sort before writing anything out to
# keep the exported CSVs reproducible.  key=str keeps the ordering total even
# if the column were to hold mixed types.
unique_hashes_list = sorted(all_hashes, key=str)

print(f"Total unique hashes: {len(unique_hashes_list)}")

# To DF
unique_hashes_df = pd.DataFrame(unique_hashes_list, columns=['hash'])

# Save disagreements into CSV
unique_hashes_df.to_csv('unique_hashes.csv', index=False)


# Only for BFC (sorted for the same reproducibility reason as above)
unique_hashes_bfc_df = pd.DataFrame(sorted(hashes_BFC, key=str), columns=['hash'])

# Save disagreements into CSV
unique_hashes_bfc_df.to_csv('unique_hashes_bfc.csv', index=False)

# Only for BPC (sorted for the same reproducibility reason as above)
unique_hashes_bpc_df = pd.DataFrame(sorted(hashes_BPC, key=str), columns=['hash'])

# Save disagreements into CSV
unique_hashes_bpc_df.to_csv('unique_hashes_bpc.csv', index=False)


Total unique hashes: 368
