In [37]:
import os
import re

from tqdm import tqdm as tqdm
import pandas as pd
import numpy as np

In [93]:
# Load the run log; later cells read its 'filename', 'doi', 'error' and 'run_type' columns.
my_df = pd.read_csv("master_run_log.csv")

In [94]:
# (rows, columns) of the run log exactly as loaded from the CSV
my_df.shape

(4298, 4)

In [95]:
# fraction of all logged runs whose 'error' field is exactly "success"
(my_df['error'] == "success").mean()

0.27338296882270824

In [96]:
# helper functions to extract the original file names to allow better grouping
def extract_filename(file_string):
    """Return `file_string` with every '__preproc__' marker removed."""
    pieces = re.split('__preproc__', file_string)
    return ''.join(pieces)

def extract_doi(doi_path):
    """Return the last '/'-separated component of `doi_path`."""
    _head, _sep, tail = doi_path.rpartition("/")
    return tail

def orig_name(row):
    """Build the '<doi directory>/<filename without __preproc__>' key for one log row.

    `row` must support item access for 'doi' and 'filename'.
    """
    dataset = extract_doi(row['doi'])
    script = extract_filename(row["filename"])
    return dataset + '/' + script

# add helpful columns to dataframe to facilitate analysis
# - the patterns are regular expressions; the raw string keeps '\(' from being
#   an invalid *string* escape while still matching a literal "setwd("
# - na=False makes rows with a missing value count as "no match", so every
#   flag column is strictly boolean and safe to use with `~` and `&`
my_df['is_preproc'] = my_df['filename'].str.contains("__preproc__", na=False)
my_df["orig_file"] = my_df.apply(orig_name, axis=1)
my_df["is_library_error"] = my_df["error"].str.contains("library", na=False)
my_df['is_wd_error'] = my_df['error'].str.contains(r'setwd\(', na=False)
my_df['is_mirror_error'] = my_df['error'].str.contains('without setting a mirror', na=False)
my_df['is_file_error'] = my_df['error'].str.contains('file', na=False)

In [97]:
# share of logged runs that are preprocessed scripts; the vectorised mean avoids
# a Python-level loop over the Series and yields NaN (not ZeroDivisionError) on an empty frame
my_df['is_preproc'].mean()

0.47603536528617962

## See if run data is missing for any R files

In [98]:
all_r_files = []
# list dataset directories
for dataset_dir in os.listdir("rdata_odyc"):
    dataset_path = os.path.join("rdata_odyc", dataset_dir)
    # skip hidden entries (names starting with '.') and anything that is not a
    # directory, since os.listdir cannot descend into a plain file
    if dataset_dir.startswith(".") or not os.path.isdir(dataset_path):
        continue
    for r_file in os.listdir(dataset_path):
        # keep names ending in ".R", leaving out those containing "_preproc0"
        if "_preproc0" not in r_file and r_file.endswith(".R"):
            # stored as '<dataset_dir>/<file>' to line up with my_df['orig_file']
            all_r_files.append(dataset_dir + "/" + r_file)

In [99]:
# R scripts found on disk that have no row at all in the run log
not_run_files = list(set(all_r_files) - set(my_df['orig_file'].drop_duplicates()))

In [100]:
# dataset directories (text before the first '/') that contain at least one never-run script
incomplete_dois = list({path.split('/')[0] for path in not_run_files})

In [101]:
# also flag DOIs whose set of original scripts differs from their set of
# preprocessed scripts (a script missing on either side counts)
# groupby splits the frame once instead of re-filtering it for every DOI;
# sort=False keeps the DOIs in order of first appearance
for doi, doi_df in my_df.groupby('doi', sort=False):
    vanilla_files = set(doi_df.loc[~doi_df['is_preproc'], 'orig_file'].unique())
    preproc_files = set(doi_df.loc[doi_df['is_preproc'], 'orig_file'].unique())
    if vanilla_files != preproc_files:
        # use the same last-path-component rule that built 'orig_file'
        incomplete_dois.append(extract_doi(doi))

In [102]:
# de-duplicate and prefix with 'rdata_odyc/' so entries can be matched against my_df['doi'];
# taking the last path component first means re-running this cell cannot double the prefix,
# and sorting gives a deterministic order
incomplete_dois = sorted({'rdata_odyc/' + doi.split('/')[-1] for doi in incomplete_dois})

In [103]:
# keep only rows whose DOI is not on the incomplete list
my_df = my_df.loc[~my_df['doi'].isin(incomplete_dois)]

In [104]:
# (rows, columns) after dropping incomplete DOIs; columns now include the helper flags added above
my_df.shape

(3959, 10)

## Base stats for trying to run the original scripts

In [105]:
# original (non-preprocessed) scripts whose run_type is 'source'
vanilla_df = my_df.loc[(my_df['run_type'] == 'source') & ~my_df['is_preproc']]

In [106]:
# (rows, columns) of the original-script subset
vanilla_df.shape

(1724, 10)

Proportion of original scripts that ran

In [129]:
# number of original scripts whose 'error' field is exactly 'success'
(vanilla_df['error'] == 'success').sum()

250

In [107]:
# same count expressed as a fraction of all original scripts
(vanilla_df['error'] == 'success').mean()

0.14501160092807425

Number of errors

In [108]:
# original scripts whose 'error' field is anything other than "success"
(vanilla_df['error'] != "success").sum()

1474

In [109]:
# failing original scripts only, kept as a separate dataframe
error_vanilla = vanilla_df.loc[vanilla_df['error'] != 'success']

Number of library errors

In [110]:
# failing original scripts whose error message matches "library"
error_vanilla['is_library_error'].sum()

213

In [111]:
# library errors as a fraction of all failing original scripts
error_vanilla['is_library_error'].sum() / float(len(error_vanilla))

0.14450474898236093

Number of working directory errors

In [112]:
# failing original scripts whose error message contains "setwd("
error_vanilla['is_wd_error'].sum()

433

In [113]:
# working-directory errors as a fraction of all failing original scripts
error_vanilla['is_wd_error'].sum() / float(len(error_vanilla))

0.2937584803256445

File Errors

In [114]:
# failing original scripts whose error message matches "file"; reads the
# precomputed flag column, as the proportion cell and the preprocessed section do
np.sum(error_vanilla['is_file_error'])

460

In [115]:
# file errors as a fraction of all failing original scripts
error_vanilla['is_file_error'].sum() / float(len(error_vanilla))

0.31207598371777479

Using CRAN without a mirror

In [116]:
# "without setting a mirror" errors as a fraction of all failing original scripts
error_vanilla['is_mirror_error'].sum() / float(len(error_vanilla))

0.035956580732700139

## Base stats for preprocessed scripts

Select the preprocessed scripts that were run with `source`:

In [117]:
# preprocessed scripts whose run_type is 'source'
preproc_df = my_df.loc[(my_df['run_type'] == 'source') & my_df['is_preproc']]

In [118]:
# (rows, columns) of the preprocessed-script subset
preproc_df.shape

(1723, 10)

Proportion of preprocessed files which ran:

In [128]:
# number of preprocessed scripts whose 'error' field is exactly 'success'
(preproc_df['error'] == 'success').sum()

267

In [120]:
# same count expressed as a fraction of all preprocessed scripts
(preproc_df['error'] == 'success').mean()

0.15496227510156704

In [121]:
# preprocessed scripts whose 'error' field is anything other than "success"
(preproc_df['error'] != "success").sum()

1456

In [122]:
# failing preprocessed scripts only, kept as a separate dataframe
error_preproc = preproc_df.loc[preproc_df['error'] != 'success']

In [123]:
# failing preprocessed scripts whose error message matches "library"
error_preproc['is_library_error'].sum()

12

In [124]:
# failing preprocessed scripts whose error message contains "setwd("
error_preproc['is_wd_error'].sum()

16

In [125]:
# failing preprocessed scripts whose error message matches "file"
error_preproc['is_file_error'].sum()

913

In [126]:
# failing preprocessed scripts whose error message contains "without setting a mirror"
error_preproc['is_mirror_error'].sum()

7

In [127]:
# Write the failing preprocessed runs to disk; with default arguments to_csv
# also writes the dataframe index as the first column.
error_preproc.to_csv('preproc_errors.csv')