In [181]:
import os

from tqdm import tqdm as tqdm
import pandas as pd
import numpy as np

In [182]:
# list every entry under rdata_odyc, leaving out the ".DS_Store" file
doi_directs = [
    entry
    for entry in os.listdir("rdata_odyc")
    if entry != ".DS_Store"
]

In [183]:
# accumulators for the loading step:
#   my_df      -- empty dataframe that the run logs of all the files are added to
#   error_dois -- empty list that collects the problem doi's
my_df, error_dois = pd.DataFrame(), []

In [184]:
# iterate through directories and collect each dataset's run log
run_logs = []
for my_doi in tqdm(doi_directs):
    # assemble path: rdata_odyc/<doi>/prov_data/run_log.csv
    my_path = os.path.join("rdata_odyc", my_doi, "prov_data", "run_log.csv")
    try:
        run_logs.append(pd.read_csv(my_path))
    except (OSError, ValueError):
        # OSError covers a missing or unreadable file; ValueError covers an
        # empty or malformed CSV (pandas' EmptyDataError and ParserError are
        # ValueError subclasses) and undecodable bytes. Anything else, such
        # as a keyboard interrupt, is no longer silently swallowed.
        error_dois.append(my_doi)

# concatenate once instead of re-copying the growing frame on every iteration;
# ignore_index is not set, so row labels still restart at 0 for each file
my_df = pd.concat([my_df] + run_logs)

100%|██████████| 608/608 [00:01<00:00, 545.07it/s]


In [185]:
# show the doi's that were appended in the loading loop's except branch
error_dois

[]

In [186]:
# (rows, columns) of the concatenated run logs
my_df.shape

(3897, 4)

In [187]:
# write the concatenated run logs to disk; index=False leaves the row index out of the file
my_df.to_csv("master_run_log.csv", index=False)

In [188]:
# preview the first rows of the concatenated run logs
my_df.head()

Unnamed: 0,doi,filename,run_type,error
0,rdata_odyc/doi--10.7910-DVN-AZHTBT,3. bes_twitter_facebook.R,source,Error in setwd(C:/Users/Jon/Dropbox/BES backup...
1,rdata_odyc/doi--10.7910-DVN-AZHTBT,3. bes_twitter_facebook_preproc0.R,source,Error in setwd(C:/Users/Jon/Dropbox/BES backup...
0,rdata_odyc/doi--10.7910-DVN-ERX5D1,Final_Thesis.R,source,Error in source(r_file) : Final_Thesis.R:789:1...
1,rdata_odyc/doi--10.7910-DVN-ERX5D1,Final_Thesis_preproc0.R,source,Error in source(r_file) : Final_Thesis_preproc...
0,rdata_odyc/doi--10.7910-DVN-3UXBOJ,classification.R,source,Error : 'Final_data/percentage_data.csv' does ...


In [189]:
# fraction of all logged runs whose error column reads "success"
(my_df['error'] == "success").sum() / float(len(my_df))

0.22530151398511675

In [190]:
# helper functions to extract the original file names to allow better grouping
def extract_filename(file_string):
    """Return file_string with every "_preproc0" marker removed.

    Uses plain str.replace: the marker contains no regex metacharacters, so
    the result matches re.sub('_preproc0', '', file_string) without needing
    the re module, which the notebook's import cell does not import.
    """
    return file_string.replace('_preproc0', '')

def extract_doi(doi_path):
    """Return the part of doi_path after its last "/" (all of it if there is none)."""
    _head, _sep, last_part = doi_path.rpartition("/")
    return last_part

def orig_name(row):
    """Build "<last component of row['doi']>/<row['filename'] without _preproc0>".

    row only needs to support lookup by the 'doi' and "filename" keys.
    """
    doi_part = extract_doi(row['doi'])
    script_part = extract_filename(row["filename"])
    return doi_part + '/' + script_part

# add helper columns to the dataframe for the analysis cells
script_names = my_df['filename']
error_messages = my_df["error"]
# True when the script name contains the "_preproc0" marker
my_df['is_preproc'] = script_names.str.contains("_preproc0")
# "<doi>/<script name without the marker>", shared by a script and its preprocessed copy
my_df["orig_file"] = my_df.apply(orig_name, axis=1)
# True when the recorded error text contains "library"
my_df["is_library_error"] = error_messages.str.contains("library")

In [191]:
# share of logged runs that belong to preprocessed scripts
sum(my_df['is_preproc']) / float(len(my_df))

0.48267898383371827

## Did preprocessing the files help them run?

Proportion of preprocessed files which ran:

In [213]:
# successful preprocessed runs divided by all preprocessed runs
(
    (my_df['is_preproc'] & (my_df["error"] == "success")).sum()
    / float(my_df['is_preproc'].sum())
)

0.19670388091440724

In [193]:
# number of preprocessed runs that ended in "success"
(my_df['is_preproc'] & (my_df["error"] == "success")).sum()

370

In [210]:
# number of preprocessed runs whose run_type is something other than 'source'
(my_df['is_preproc'] & (my_df['run_type'] != 'source')).sum()

188

In [211]:
# number of non-preprocessed runs whose run_type is something other than 'source'
(~my_df['is_preproc'] & (my_df['run_type'] != 'source')).sum()

257

Proportion of non-preprocessed files which ran:

In [212]:
# successful non-preprocessed runs divided by all non-preprocessed runs
(
    (~my_df['is_preproc'] & (my_df["error"] == "success")).sum()
    / float((~my_df['is_preproc']).sum())
)

0.25198412698412698

In [195]:
# number of non-preprocessed runs that ended in "success"
(~my_df['is_preproc'] & (my_df["error"] == "success")).sum()

508

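In aggregate, no: about 19.7% of preprocessed runs succeeded versus about 25.2% of non-preprocessed runs, so preprocessing helped in at most a few individual cases.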

## Did preprocessing the files resolve library errors?

In [196]:
# number of preprocessed runs whose error text contains "library"
(my_df['is_preproc'] & my_df["error"].str.contains("library")).sum()

20

In [197]:
# number of non-preprocessed runs whose error text contains "library"
(~my_df['is_preproc'] & my_df["error"].str.contains("library")).sum()

236

In [198]:
# orig_file values of non-preprocessed 'source' runs whose error text contains "library"
lib_error_files = my_df.loc[
    ~my_df['is_preproc']
    & my_df["error"].str.contains("library")
    & (my_df['run_type'] == 'source'),
    'orig_file'
]
# every 'source' run (original or preprocessed) of those scripts
my_df_lib_error = my_df.loc[
    (my_df['run_type'] == 'source') & my_df['orig_file'].isin(lib_error_files)
]

In [199]:
# number of non-preprocessed rows in the library-error subset
(~my_df_lib_error['is_preproc']).sum()

236

In [200]:
# number of preprocessed rows in the subset that still have a library error
(my_df_lib_error['is_preproc'] & my_df_lib_error['is_library_error']).sum()

9

Yes: of the 236 original runs that hit a library error, only 9 still hit a library error after preprocessing (the rows listed below).

In [201]:
# show the preprocessed rows in the subset that still have a library error
my_df_lib_error.loc[
    my_df_lib_error['is_preproc'] & my_df_lib_error['is_library_error']
]

Unnamed: 0,doi,filename,run_type,error,is_preproc,orig_file,is_library_error
3,rdata_odyc/doi--10.7910-DVN-TDSJHE,Millerfigure_preproc0.R,source,"Error in library(plotrix, lib.loc = F:/R/R-2.1...",True,doi--10.7910-DVN-TDSJHE/Millerfigure.R,True
4,rdata_odyc/doi--10.7910-DVN-TDSJHE,Nelson_mturk_figure_preproc0.R,source,"Error in library(plotrix, lib.loc = F:/R/R-2.1...",True,doi--10.7910-DVN-TDSJHE/Nelson_mturk_figure.R,True
5,rdata_odyc/doi--10.7910-DVN-TDSJHE,rubio_mturk_figure_preproc0.R,source,"Error in library(plotrix, lib.loc = F:/R/R-2.1...",True,doi--10.7910-DVN-TDSJHE/rubio_mturk_figure.R,True
1,rdata_odyc/doi--10.7910-DVN-NO90AJ,Lupu-Selios-Warner-EMD-code_preproc0.R,source,Error in source(r_file) : [newline] Lupu-Seli...,True,doi--10.7910-DVN-NO90AJ/Lupu-Selios-Warner-EMD...,True
1,rdata_odyc/doi--10.7910-DVN-1GHCOT,EP and Volatility Replication_preproc0.R,source,"Error in library(rstan, lib.loc = ~/R/win-libr...",True,doi--10.7910-DVN-1GHCOT/EP and Volatility Repl...,True
1,rdata_odyc/doi--10.7910-DVN-JLKTEE,KopralevaVink_EFAR2015_reanalysis csQCA fsQCA_...,source,"Error in library(QCA, lib.loc = /Library/Frame...",True,doi--10.7910-DVN-JLKTEE/KopralevaVink_EFAR2015...,True
4,rdata_odyc/doi--10.7910-DVN-FYJ6YT,space_answers_preproc0.R,source,"Error in library(reshape, lib.loc = /Library/F...",True,doi--10.7910-DVN-FYJ6YT/space_answers.R,True
6,rdata_odyc/doi--10.7910-DVN-FYJ6YT,time_answers_preproc0.R,source,"Error in library(robustHD, lib.loc = /Library/...",True,doi--10.7910-DVN-FYJ6YT/time_answers.R,True
7,rdata_odyc/doi--10.7910-DVN-FYJ6YT,time_RT_preproc0.R,source,"Error in library(robustHD, lib.loc = /Library/...",True,doi--10.7910-DVN-FYJ6YT/time_RT.R,True


## Did ProvR run without errors anytime source ran without errors?

In [202]:
# rows whose error column reads "success"
successful_runs = my_df.loc[my_df['error'] == "success"]

In [203]:
# number of successful runs with run_type "source"
(successful_runs['run_type'] == "source").sum()

449

In [204]:
# number of successful runs with run_type "provR"
(successful_runs['run_type'] == "provR").sum()

429

`ProvR` worked most of the time but failed to complete in about 20 cases.

In [205]:
# export every 'provR' run that did not end in "success"
my_df.loc[
    (my_df['error'] != "success") & (my_df['run_type'] == 'provR')
].to_csv("provr_errors.csv", index=False)

## Examining preprocessed library errors

In [None]:
# NOTE(review): this cell rebuilds lib_error_files and my_df_lib_error with the
# same selection used in the earlier library-error section of this notebook.
# orig_file values of non-preprocessed 'source' runs whose error text contains "library"
lib_error_files = my_df.loc[
    ~my_df['is_preproc']
    & my_df["error"].str.contains("library")
    & (my_df['run_type'] == 'source'),
    'orig_file'
]
# every 'source' run (original or preprocessed) of those scripts
my_df_lib_error = my_df.loc[
    (my_df['run_type'] == 'source') & my_df['orig_file'].isin(lib_error_files)
]

In [206]:
# orig_file values of non-preprocessed 'source' runs that ended in "success"
success_unpreproc = my_df.loc[
    ~my_df['is_preproc']
    & (my_df["error"] == "success")
    & (my_df['run_type'] == 'source'),
    'orig_file'
]
# every 'source' run (original or preprocessed) of those scripts
my_df_preproc_error = my_df.loc[
    (my_df['run_type'] == 'source') & my_df['orig_file'].isin(success_unpreproc)
]

In [207]:
# number of preprocessed rows among the scripts whose original run succeeded
my_df_preproc_error['is_preproc'].sum()

232

In [208]:
# number of non-preprocessed rows among the scripts whose original run succeeded
(~my_df_preproc_error['is_preproc']).sum()

259

In [209]:
# scripts whose preprocessed 'source' run did not end in "success";
# my_df_preproc_error only holds scripts whose original run succeeded
no_orig_error = my_df_preproc_error.loc[
    my_df_preproc_error['is_preproc'] & (my_df_preproc_error['error'] != 'success'),
    'orig_file'
]
# export the original and preprocessed rows of those scripts; index=False
# matches the other CSV exports in this notebook and keeps the repeated
# per-file row labels out of the file
my_df_preproc_error[my_df_preproc_error['orig_file'].isin(no_orig_error)].to_csv(
    'preproc_failure.csv', index=False
)