In [2]:
import json
import os
import numpy as np
import datefinder
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

# Relative paths to the four input CSV files of the Prolific export.
# They are resolved against the current working directory, so the notebook
# has to be launched from its own folder for them to be found.
workers_data_path = "../data/prolific/Dataframe/workers_answers.csv"
workers_acl_path = "../data/prolific/Dataframe/workers_acl.csv"
workers_url_path = "../data/prolific/Dataframe/workers_urls.csv"
workers_dim_sel = "../data/prolific/Dataframe/workers_dimensions_selection.csv"

def load_json(p):
    """Parse the JSON file at path ``p``.

    Returns the decoded JSON content, or an empty dict when nothing exists
    at ``p``. The file is decoded as latin1 before being parsed.
    """
    if not os.path.exists(p):
        return {}
    with open(p, "r", encoding='latin1') as handle:
        return json.load(handle)

# Output folders: one for sanity-check tables, one for the filtered dataframes.
checks_path = "../checks/"
dataframe_path = "../dataframe/"
os.makedirs(checks_path, exist_ok=True)
os.makedirs(dataframe_path, exist_ok=True)

# Raw inputs. The ACL table is restricted to rows whose 'paid' flag is True.
df_data = pd.read_csv(workers_data_path)
df_url = pd.read_csv(workers_url_path)
df_dim_sel = pd.read_csv(workers_dim_sel)
df_acl = pd.read_csv(workers_acl_path).loc[lambda acl: acl['paid'] == True]

# Sorted, de-duplicated ids of the paid workers.
worker_ids = np.unique(df_acl['worker_id'].values)

In [3]:
def filter_df(ids, df, answers=False):
    """Keep, for every worker in ``ids``, only the rows of their latest try.

    Parameters
    ----------
    ids : iterable
        Worker ids to keep. Ids that have no rows in ``df`` are skipped.
    df : pandas.DataFrame
        Must contain the columns 'worker_id' and 'try_current'. When
        ``answers`` is True it must also contain 'action', 'doc_id' and
        'time_submit_parsed'.
    answers : bool, default False
        When True and a worker's latest try has more than 8 rows, only rows
        whose 'action' is 'Next' or 'Finish' are kept, and for each 'doc_id'
        only the row with the greatest 'time_submit_parsed' survives.

    Returns
    -------
    pandas.DataFrame
        Concatenation of the per-worker slices, in the order of ``ids``.
        An empty frame with the columns of ``df`` when nothing is selected.
        ``df`` itself is never modified.
    """
    slices = []
    for worker_id in ids:
        data = df.loc[df['worker_id'] == worker_id]
        if data.empty:
            # The id list may come from another table; a worker without rows
            # here would otherwise make the max lookup fail.
            continue
        most_rec_try = data['try_current'].max()
        data = data.loc[data['try_current'] == most_rec_try]
        # NOTE(review): 8 is presumably the number of documents per unit, so
        # "> 8" would mean some document was submitted more than once - confirm.
        if answers and data.shape[0] > 8:
            data = data.loc[data['action'].isin(['Next', 'Finish'])]
            # One row per document: the most recent submission. sort_values
            # returns a new frame, so the slice of `data` is not mutated.
            doc_slices = [
                data.loc[data['doc_id'] == doc_id]
                .sort_values(by='time_submit_parsed', ascending=False)
                .head(1)
                for doc_id in np.unique(data['doc_id'].values)
            ]
            if doc_slices:
                data = pd.concat(doc_slices)
        slices.append(data)
    if not slices:
        # pd.concat refuses an empty list; return an empty frame instead.
        return df.iloc[0:0]
    return pd.concat(slices)

In [4]:
# Restrict every table to the paid workers' most recent try. Only the answers
# table additionally goes through the per-document de-duplication.
df_data = filter_df(worker_ids, df_data, answers=True)
df_urls = filter_df(worker_ids, df_url)
df_dim_sel = filter_df(worker_ids, df_dim_sel)
df_acl = filter_df(worker_ids, df_acl)

In [5]:
# Distinct (worker, unit) pairs, ordered by unit.
df_data_filt = (
    df_data[['worker_id', 'unit_id']]
    .sort_values(by='unit_id')
    .drop_duplicates()
)

# Count of distinct units per worker, and of distinct workers per unit.
workers_unit_unique = df_data_filt.groupby('worker_id').agg("count").reset_index()
unit_workers_unique = df_data_filt.groupby('unit_id').agg("count").reset_index()

workers_unit_unique.to_csv(f"{checks_path}prolific_200_worker_x_unit.csv", index=False)
unit_workers_unique.to_csv(f"{checks_path}prolific_200_unit_x_worker.csv", index=False)

In [6]:
# Persist the four filtered tables without their index column.
df_data.to_csv(path_or_buf=f"{dataframe_path}workers_data_prolific_200.csv", index=False)
df_urls.to_csv(path_or_buf=f"{dataframe_path}workers_urls_prolific_200.csv", index=False)
df_dim_sel.to_csv(path_or_buf=f"{dataframe_path}workers_dim_sel_prolific_200.csv", index=False)
df_acl.to_csv(path_or_buf=f"{dataframe_path}workers_acl_prolific_200.csv", index=False)

In [7]:
# Number of non-null truthfulness judgments collected for each document.
judgment_per_doc = (
    df_data[['doc_id', 'doc_truthfulness_value']]
    .groupby('doc_id')
    .agg("count")
    .reset_index()
)
judgment_per_doc.to_csv(f"{checks_path}prolific_200_judgments_x_doc.csv", index=False)

# Number of non-null worker ids recorded for each document.
worker_per_doc = (
    df_data[['worker_id', 'doc_id']]
    .groupby('doc_id')
    .agg("count")
    .reset_index()
)
worker_per_doc.to_csv(f"{checks_path}prolific_200_worker_x_doc.csv", index=False)

In [8]:
# Timing columns per (worker, document), longest elapsed time first;
# rows with a missing elapsed time end up at the bottom.
df_data_filt = df_data[
    ['worker_id', 'doc_id', 'doc_time_start', 'doc_time_elapsed', 'doc_index', 'doc_time_end']
].sort_values(by='doc_time_elapsed', ascending=False)
display(df_data_filt)

Unnamed: 0,worker_id,doc_id,doc_time_start,doc_time_elapsed,doc_index,doc_time_end
1690,62bb53a41c9cfff9c9a148c9,joe-biden-said-mass-shootings-tripled-when-ass...,1.661257e+09,1949.66,3.0,1.661259e+09
2164,5d9f0981b22ffb00118e84cd,among-children-firearms-leading-cause-death-2020,1.661249e+09,1870.50,3.0,1.661251e+09
1599,59d0ec2446447f00011f0254,new-york-state-so-many-bills,1.661183e+09,1850.01,2.0,1.661185e+09
1456,60f722b3a83e22a28e18860e,marjorie-taylor-greene-shared-edited-image-hig...,1.661257e+09,1772.58,3.0,1.661259e+09
1574,62b091722350e1bcc3d73528,warren-wades-debate-health-care-costs-and-bank...,1.661257e+09,1273.25,1.0,1.661258e+09
...,...,...,...,...,...,...
590,613e002e81f4b2409b531dbd,big-four-meat-packers-are-seeing-record-profit...,1.661253e+09,6.45,2.0,1.661253e+09
252,56f9364e895094000c8f4967,democrats-say-david-jolly-david-jolly-supports...,1.661255e+09,,3.0,1.661255e+09
1366,5e7027c5a2522d000bddb8f3,GOLD_LOW,1.661260e+09,,3.0,1.661260e+09
327,6056bcf1cee7a41435889255,kathleen-falk-says-gov-scott-walker-cut-school...,1.661183e+09,,3.0,1.661183e+09
