# Load data from the annotators and the reviewer, and update the rows that have new annotations

In [1]:
import pandas as pd

def replace_rows_by_hash(original_df, new_df):
    """
    Overwrite values in original_df with those from new_df for the rows
    whose 'hash' value matches.

    Both frames are indexed by 'hash' and combined with
    ``DataFrame.update``: rows are aligned on that index, only non-NA
    values from new_df are copied over, and hashes that appear only in
    new_df are not added.

    Parameters:
    - original_df (pd.DataFrame): Original DataFrame with the annotations.
    - new_df (pd.DataFrame): DataFrame with the new annotations.

    Returns:
    - pd.DataFrame: Updated DataFrame, with 'hash' restored as a
      regular column.
    """
    # set_index returns a new frame, so the in-place update below is
    # applied to that new frame rather than to the caller's DataFrame.
    indexed = original_df.set_index('hash')
    indexed.update(new_df.set_index('hash'))
    return indexed.reset_index()

# Load the round-1 annotation CSVs, one per annotator
results_A, results_B, results_C = (
    pd.read_csv(_csv_path)
    for _csv_path in (
        '../data/round1_annotations_Michel.csv',
        '../data/round1_annotations_Abhishek.csv',
        '../data/round1_annotations_David.csv',
    )
)

# Load the CSVs holding the newer annotations
results_A1, results_B1, results_C1 = (
    pd.read_csv(_csv_path)
    for _csv_path in (
        '../data/annotations_Michel2.csv',
        '../data/annotations_Abhishek2.csv',
        '../data/annotations_David2.csv',
    )
)

# Overlay the newer annotations on the round-1 ones, matching rows by hash
updated_A, updated_B, updated_C = (
    replace_rows_by_hash(_round1, _newer)
    for _round1, _newer in (
        (results_A, results_A1),
        (results_B, results_B1),
        (results_C, results_C1),
    )
)

# Save the updated annotations as the round-2 files
for _updated, _out_path in (
    (updated_A, '../data/round2_annotations_Michel.csv'),
    (updated_B, '../data/round2_annotations_Abhishek.csv'),
    (updated_C, '../data/round2_annotations_David.csv'),
):
    _updated.to_csv(_out_path, index=False)

FileNotFoundError: [Errno 2] No such file or directory: '../data/annotations_Michel2.csv'

# Get BFC, BPC, PRC and NFC

In [3]:
def get_BFCs(results_df):
    """Return the 'hash' and 'bfc' columns of results_df as a DataFrame."""
    bfc_columns = ['hash', 'bfc']
    return results_df[bfc_columns]

def get_BPCs(results_df):
    """Return the 'hash' and 'bpc' columns of results_df as a DataFrame."""
    bpc_columns = ['hash', 'bpc']
    return results_df[bpc_columns]

def get_PRCs(results_df):
    """Return the 'hash' and 'prc' columns of results_df as a DataFrame."""
    prc_columns = ['hash', 'prc']
    return results_df[prc_columns]

def get_NFCs(results_df):
    """Return the 'hash' and 'nfc' columns of results_df as a DataFrame."""
    nfc_columns = ['hash', 'nfc']
    return results_df[nfc_columns]
    
# Extract each metric's (hash, value) columns for annotators A, B and C
_updated_frames = (updated_A, updated_B, updated_C)

BFCs_A, BFCs_B, BFCs_C = map(get_BFCs, _updated_frames)

BPCs_A, BPCs_B, BPCs_C = map(get_BPCs, _updated_frames)

PRCs_A, PRCs_B, PRCs_C = map(get_PRCs, _updated_frames)

NFCs_A, NFCs_B, NFCs_C = map(get_NFCs, _updated_frames)

# Combine metrics: one frame per metric, with one column per annotator
# (suffix A, B, C), keeping only hashes present for all three annotators.
# After the second merge the third annotator's column still carries the
# plain metric name, so it is renamed to get its 'C' suffix.

# BFCs
BFCs = (
    BFCs_A
    .merge(BFCs_B, on='hash', how='inner', suffixes=('A', 'B'))
    .merge(BFCs_C, on='hash', how='inner', suffixes=('', 'C'))
    .rename(columns={'bfc': 'bfcC'})
)

# BPCs
BPCs = (
    BPCs_A
    .merge(BPCs_B, on='hash', how='inner', suffixes=('A', 'B'))
    .merge(BPCs_C, on='hash', how='inner', suffixes=('', 'C'))
    .rename(columns={'bpc': 'bpcC'})
)

# PRCs
PRCs = (
    PRCs_A
    .merge(PRCs_B, on='hash', how='inner', suffixes=('A', 'B'))
    .merge(PRCs_C, on='hash', how='inner', suffixes=('', 'C'))
    .rename(columns={'prc': 'prcC'})
)

# NFCs
NFCs = (
    NFCs_A
    .merge(NFCs_B, on='hash', how='inner', suffixes=('A', 'B'))
    .merge(NFCs_C, on='hash', how='inner', suffixes=('', 'C'))
    .rename(columns={'nfc': 'nfcC'})
)

# Join the four per-metric frames into a single dataframe keyed by hash
final_df = (
    BFCs
    .merge(BPCs, on='hash', how='inner')
    .merge(PRCs, on='hash', how='inner')
    .merge(NFCs, on='hash', how='inner')
)



# Get new disagreements and save the hashes of the commits that have a disagreement in BFC

In [5]:
# A commit counts as a disagreement for a metric when at least one pair of
# annotators (A-B, A-C or B-C) differs by more than 1 on that metric.

# Disagreement BFCs
BFCs_disagree_1 = BFCs.query(
    "(abs(bfcA - bfcB) > 1) or (abs(bfcA - bfcC) > 1) or (abs(bfcB - bfcC) > 1)"
)
total_BFCs_disagree_1 = BFCs_disagree_1.shape[0]
print(f"Total commits with disagreement in BFC: {total_BFCs_disagree_1}")

# Disagreement BPCs
BPCs_disagree_1 = BPCs.query(
    "(abs(bpcA - bpcB) > 1) or (abs(bpcA - bpcC) > 1) or (abs(bpcB - bpcC) > 1)"
)
total_BPCs_disagree_1 = BPCs_disagree_1.shape[0]
print(f"Total commits with disagreement in BPC: {total_BPCs_disagree_1}")

# Disagreement PRCs
PRCs_disagree_1 = PRCs.query(
    "(abs(prcA - prcB) > 1) or (abs(prcA - prcC) > 1) or (abs(prcB - prcC) > 1)"
)
total_PRCs_disagree_1 = PRCs_disagree_1.shape[0]
print(f"Total commits with disagreement in PRC: {total_PRCs_disagree_1}")

# Disagreement NFCs
NFCs_disagree_1 = NFCs.query(
    "(abs(nfcA - nfcB) > 1) or (abs(nfcA - nfcC) > 1) or (abs(nfcB - nfcC) > 1)"
)
total_NFCs_disagree_1 = NFCs_disagree_1.shape[0]
print(f"Total commits with disagreement in NFC: {total_NFCs_disagree_1}")

# Collect, per metric, the set of hashes that show a disagreement
hashes_BFC = set(BFCs_disagree_1['hash'])
hashes_BPC = set(BPCs_disagree_1['hash'])
hashes_PRC = set(PRCs_disagree_1['hash'])
hashes_NFC = set(NFCs_disagree_1['hash'])

# Union of the four sets: every hash with a disagreement in at least one metric
all_hashes = hashes_BFC | hashes_BPC | hashes_PRC | hashes_NFC
unique_hashes_list = [*all_hashes]

print(f"Total unique hashes: {len(unique_hashes_list)}")

# To DF
#unique_hashes_df = pd.DataFrame(unique_hashes_list, columns=['hash'])
# Save disagreements into CSV
#unique_hashes_df.to_csv('unique_hashes_first_round.csv', index=False)

# Only for BFC
# hashes_BFC is a set, and set iteration order is not stable between
# interpreter runs (string hashing is randomized), so the hashes are sorted
# to make the saved CSV reproducible. key=str keeps the sort from failing
# if a non-string value (e.g. a missing hash) is present.
unique_hashes_bfc_df = pd.DataFrame(sorted(hashes_BFC, key=str), columns=['hash'])
# Save disagreements into CSV
unique_hashes_bfc_df.to_csv('../data/round2_disagreement_bfc_hashes.csv', index=False)

# Only for BPC
#unique_hashes_bpc_df = pd.DataFrame(hashes_BPC, columns=['hash'])
# Save disagreements into CSV
#unique_hashes_bpc_df.to_csv('unique_hashes_bpc_first_round.csv', index=False)

Total commits with disagreement in BFC: 38
Total commits with disagreement in BPC: 223
Total commits with disagreement in PRC: 138
Total commits with disagreement in NFC: 62
Total unique hashes: 325
