# User Rep Score

In [1]:
import numpy as np
import pandas as pd

## Loading Data

In [2]:
# "Parent" question labels, one list per review type.
evidence_parents = ["T1.Q1", "T1.Q12"]
language_parents = ["T1.Q1", "T1.Q12"]
probability_parents = ["T1.Q1", "T1.Q5", "T1.Q6", "T1.Q11"]
reasoning_parents = ["T1.Q1"]

# Lookup from review type name to its list of parent questions.
parents = dict(
    Evidence=evidence_parents,
    Language=language_parents,
    Probability=probability_parents,
    Reasoning=reasoning_parents,
)

# Question type of every parent question. The lists are positional:
# language_parents[n] has the type language_parents_types[n], and so on.
evidence_parents_types = ["select_one", "ordinal"]
language_parents_types = ["select_all", "ordinal"]
probability_parents_types = ["ordinal", "ordinal", "select_one", "ordinal"]
reasoning_parents_types = ["select_all"]

# Max number of answers of every parent question (same positional mapping),
# used for scoring ordinal questions.
evidence_num_answers = [3, 4]
language_num_answers = [13, 4]
probability_num_answers = [3, 3, 3, 4]
reasoning_num_answers = [6]

In [3]:
def format_feed(feed, feed_type):
    '''
    Format feed into a more processable format:
        Row: article number (the feed's 'article_number' column)
        Column: question label (the feed's 'question_label' column)
        Value: set of the 'gold_standard_label' values recorded for that
               article/question pair; an empty set when there are none

    Parameter:
        feed: a data csv read by pandas
        feed_type: type of the data csv (evidence, for example);
                   currently not used by the function

    Return:
        A DataFrame in which every cell is a set that contains no NaN.
    '''
    tbl = pd.pivot_table(feed,
                         values='gold_standard_label',
                         index='article_number',
                         columns='question_label',
                         aggfunc=set)

    def _clean(cell):
        # Article/question pairs absent from the feed come out of the pivot
        # as NaN rather than as a set; turn them into empty sets.
        if not isinstance(cell, (set, frozenset)):
            return set()
        # Drop missing labels by value. Subtracting {np.nan} only removes a
        # NaN that is the very same object as np.nan, so it can miss others.
        return {label for label in cell if not pd.isna(label)}

    # Assign whole columns back on the frame. The previous
    # tbl[col].loc[mask] = ... form is chained assignment, which does not
    # reliably write through to tbl.
    for col in tbl.columns:
        tbl[col] = tbl[col].apply(_clean)
    return tbl

In [4]:
# Reviews of the gold standard article, and the review type they belong to.
# TODO: iterate over multiple csvs in the future.
data_type = 'Language'
data = pd.read_csv("Gold Standard Column/Source Data for Reference/Language1020.csv")

# Reviewers' current reputation scores, keyed by contributor uuid.
# Reviewers that are not in score.csv yet start at 0.5.
old_rep_scores = pd.read_csv("./User Rep Score/score.csv").set_index('contributor_uuid')
old_rep_scores = {
    user: 0.5 if user not in old_rep_scores.index else old_rep_scores.loc[user, "score"]
    for user in data["contributor_uuid"].unique()
}

# One (initially empty) list of per-article scores for every reviewer.
rep_scores = dict((user, []) for user in old_rep_scores)

# Gold standard answers (experts' reviews of the gold standard article),
# one formatted table per review type.
gold_answers = {
    feed_name: format_feed(pd.read_csv(csv_path), feed_name)
    for feed_name, csv_path in (
        ("Evidence", "Gold Standard Column/Data with Gold Column/evidence_with_gold.csv"),
        ("Language", "Gold Standard Column/Data with Gold Column/language_with_gold.csv"),
        ("Probability", "Gold Standard Column/Data with Gold Column/probability_with_gold.csv"),
        ("Reasoning", "Gold Standard Column/Data with Gold Column/reasoning_with_gold.csv"),
    )
}

## Update User Rep Score With Golden Standards

In [5]:
# Score every reviewer against the gold standard, article by article.
#
# For one article, a question is scored only when the gold standard has at
# least one answer for it. A reviewer is correct on a scored question when
# at least one of their answers is also a gold answer. The article score is
# (correct questions) / (scored questions).

# Per-article scores are collected in a fresh dict instead of being appended
# to the shared `rep_scores` lists, so this cell can be re-run safely.
article_scores = {user: [] for user in old_rep_scores}

# Gold table of the review type being processed (see `data_type`).
gold_table = gold_answers[data_type]

# One sub-frame per article. groupby visits every article once (in order of
# first appearance) instead of re-filtering `data` once per row.
sliced_by_articles = {article_id: article_reviews
                      for article_id, article_reviews
                      in data.groupby("article_number", sort=False)}

for article_id, article_reviews in sliced_by_articles.items():
    if article_id not in gold_table.index:
        # No gold standard row for this article: nothing to score against.
        continue
    gold_row = gold_table.loc[article_id]

    # Rows: reviewers, columns: questions, values: set of chosen answers
    # (NaN where a reviewer has no answer for a question).
    answers_by_user = pd.pivot_table(article_reviews,
                                     values='answer_label',
                                     index='contributor_uuid',
                                     columns='question_label',
                                     aggfunc=set)

    # Questions of this article that have a non-empty gold answer set.
    gold_sets = {}
    for question_label in answers_by_user.columns:
        if question_label in gold_row.index:
            gold_set = gold_row.loc[question_label]
            if isinstance(gold_set, set) and len(gold_set) > 0:
                gold_sets[question_label] = gold_set
    if not gold_sets:
        # Avoid dividing by zero when no question can be scored.
        continue

    for user, review in answers_by_user.iterrows():
        n_correct = 0
        for question_label, gold_set in gold_sets.items():
            user_answers = review.loc[question_label]
            # A missing answer (NaN) counts as incorrect rather than crashing.
            if isinstance(user_answers, set) and len(user_answers.intersection(gold_set)) > 0:
                n_correct += 1
        article_scores[user].append(n_correct / len(gold_sets))

# New reputation = half the mean article score + half the old reputation.
# Reviewers with no scorable article keep their old reputation.
rep_scores = {}
for user, scores in article_scores.items():
    if scores:
        rep_scores[user] = 0.5 * sum(scores)/len(scores) + 0.5 * old_rep_scores[user]
    else:
        rep_scores[user] = old_rep_scores[user]
    

In [17]:
# Update the rep score csv file with the freshly computed scores.
csv = pd.read_csv("./User Rep Score/score.csv").set_index('contributor_uuid')
for user in rep_scores.keys():
    # .loc assignment overwrites the score of a known reviewer and adds a
    # new row for an unseen one. This replaces DataFrame.append, which no
    # longer exists in pandas 2.x.
    csv.loc[user, 'score'] = rep_scores[user]
csv = csv.reset_index()
# Restore the expected header in case the index name was lost on the way.
csv.columns = ['contributor_uuid', 'score']
csv.to_csv("./User Rep Score/score.csv", index=False)

## Debugging/Helper Code

In [12]:
# Seed score.csv with placeholder data: every listed reviewer gets 0.5.
# Should be from a csv file, currently using fake data.
# NOTE(review): this writes to the same "./User Rep Score/score.csv" that
# the update cell writes, so running it afterwards replaces the updated
# scores with these placeholders.
df = pd.DataFrame({
    'contributor_uuid': [
        'de68bbf8-46d7-45f0-9111-92e64ab9499a',
        'f9143626-bfe0-4e69-b652-6d1525ab4eb0',
        '85579cf2-e01c-45c5-b9e7-34b40467148d',
        '00f548b7-6b63-4b47-828e-8e416b6ca0e2',
        '95dc40d9-3710-47d7-abf0-24b825a1d0c5',
        'bd786026-bad5-4fa8-9a3a-38ca03a16412',
        'e1ae8875-a398-4dde-8f4e-4b21109784e3',
    ],
    'score': 0.5,
})
df.to_csv("./User Rep Score/score.csv", index=False)