In [1]:
import os
import re

from tqdm import tqdm as tqdm
import pandas as pd
import numpy as np

In [2]:
# Load the run log that the rest of this notebook analyses.
my_df = pd.read_csv(filepath_or_buffer="master_run_log.csv")

In [3]:
# (rows, columns) of the run log as loaded, before any derived columns are added
my_df.shape

(3948, 4)

In [4]:
# Share of all logged runs whose `error` field is exactly "success".
my_df['error'].eq("success").sum() / float(len(my_df))

0.25126646403242148

In [5]:
# helper functions to extract the original file names to allow better grouping
def extract_filename(file_string):
    """Return ``file_string`` with every ``_preproc0`` occurrence removed."""
    return re.sub(pattern='_preproc0', repl='', string=file_string)

def extract_doi(doi_path):
    """Return the last ``/``-separated component of ``doi_path``."""
    _head, _sep, last_part = doi_path.rpartition("/")
    return last_part

def orig_name(row):
    """Build the ``<doi>/<filename>`` key for one row of the run log.

    The DOI part is the last path component of ``row['doi']``; the file part
    is ``row['filename']`` with the ``_preproc0`` marker stripped.
    """
    doi_part = extract_doi(row['doi'])
    file_part = extract_filename(row["filename"])
    return doi_part + '/' + file_part

# add helpful columns to dataframe to facilitate analysis
# NOTE(review): `is_preproc` keys on "__preproc__" whereas extract_filename
# strips "_preproc0" -- confirm both markers really occur in the log's filenames.
my_df['is_preproc'] = my_df['filename'].str.contains("__preproc__")
# grouping key shared by an original script and its preprocessed counterpart
my_df["orig_file"] = my_df.apply(orig_name, axis=1)
# Coarse error categories: each flag is a regex search over the raw error text.
my_df["is_library_error"] = my_df["error"].str.contains("library")
# Raw string: the pattern needs a literal backslash before "(", and "\(" in a
# normal string literal is an invalid escape sequence (warns on modern Python).
my_df['is_wd_error'] = my_df['error'].str.contains(r'setwd\(')
my_df['is_mirror_error'] = my_df['error'].str.contains('without setting a mirror')
my_df['is_file_error'] = my_df['error'].str.contains('file')

In [6]:
# Share of logged runs that belong to preprocessed scripts.
my_df['is_preproc'].sum() / float(len(my_df))

0.48885511651469099

## See if run data is missing for any R files

In [7]:
# Collect "<dataset_dir>/<script>" for every original R script on disk.
all_r_files = []
# list dataset directories
for dataset_dir in os.listdir("rdata_odyc"):
    # skip hidden entries (names starting with ".")
    if dataset_dir.startswith("."):
        continue
    for r_file in os.listdir(os.path.join("rdata_odyc", dataset_dir)):
        # Keep only non-preprocessed scripts ending in ".R". The pattern is a
        # raw string because "\." in a normal literal is an invalid escape.
        if "_preproc0" not in r_file and re.match(r".*\.R$", r_file):
            # always join with "/" so the key matches the `orig_file` column format
            all_r_files.append(dataset_dir + "/" + r_file)

In [8]:
# Scripts present on disk that never show up in the run log.
not_run_files = list(set(all_r_files) - set(my_df['orig_file'].tolist()))

In [9]:
# Distinct dataset directories (DOIs) that own at least one never-run script.
incomplete_dois = list({path.split('/', 1)[0] for path in not_run_files})

In [10]:
# Also flag DOIs that have more distinct original scripts than distinct
# preprocessed scripts in the log.
# A single groupby pass replaces re-filtering the whole frame once per DOI;
# sort=False keeps the DOIs in first-appearance order.
for doi, doi_df in my_df.groupby('doi', sort=False):
    preproc_mask = doi_df['is_preproc']
    num_vanilla = doi_df.loc[~preproc_mask, 'orig_file'].nunique(dropna=False)
    num_preproc = doi_df.loc[preproc_mask, 'orig_file'].nunique(dropna=False)
    if num_vanilla > num_preproc:
        # Use the shared helper (last path component) so these entries are
        # derived the same way as the DOI part of `orig_file`.
        incomplete_dois.append(extract_doi(doi))

In [11]:
# De-duplicate, then put the "rdata_odyc/" prefix back on each entry.
# NOTE(review): this rebinds `incomplete_dois`, so re-running the cell on its
# own would add the prefix a second time.
incomplete_dois = ['rdata_odyc/' + doi for doi in set(incomplete_dois)]

In [12]:
# Drop every run whose DOI was flagged as incomplete.
my_df = my_df.loc[~my_df['doi'].isin(incomplete_dois)]

In [13]:
# (rows, columns) after removing incomplete DOIs; includes the derived columns
my_df.shape

(3795, 10)

## Base stats for trying to run the original scripts

In [14]:
# Original (non-preprocessed) scripts, restricted to runs with run_type 'source'.
vanilla_df = my_df[(my_df['run_type'] == 'source') & ~my_df['is_preproc']]

In [15]:
# (rows, columns) of the original-script subset
vanilla_df.shape

(1663, 10)

Proportion of original scripts that ran

In [17]:
# successful runs divided by all original-script runs
vanilla_df['error'].eq('success').sum() / float(len(vanilla_df))

0.13770294648226097

Number of errors

In [18]:
# count of original-script runs whose error field is anything but "success"
vanilla_df['error'].ne("success").sum()

1434

In [19]:
# Keep only the failed original-script runs for the error breakdown.
error_vanilla = vanilla_df[vanilla_df['error'] != 'success']

Number of library errors

In [20]:
# failed original runs whose error text matches "library"
error_vanilla['is_library_error'].sum()

202

In [21]:
# library errors as a fraction of all failed original runs
error_vanilla['is_library_error'].sum() / float(len(error_vanilla))

0.14086471408647142

Number of working directory errors

In [22]:
# failed original runs whose error text matches "setwd("
error_vanilla['is_wd_error'].sum()

404

In [23]:
# working-directory errors as a fraction of all failed original runs
error_vanilla['is_wd_error'].sum() / float(len(error_vanilla))

0.28172942817294283

File Errors

In [24]:
# Failed original runs whose error text matches "file".
# Reads the precomputed `is_file_error` flag instead of re-running the regex,
# so this count uses the same column as the file-error ratio and the other
# per-category counts.
np.sum(error_vanilla['is_file_error'])

450

In [25]:
# file errors as a fraction of all failed original runs
error_vanilla['is_file_error'].sum() / float(len(error_vanilla))

0.31380753138075312

Using CRAN without a mirror

In [26]:
# "without setting a mirror" errors as a fraction of all failed original runs
error_vanilla['is_mirror_error'].sum() / float(len(error_vanilla))

0.038354253835425386

## Base stats for preprocessed scripts

Restrict to preprocessed scripts whose `run_type` is `source`:

In [27]:
# Preprocessed scripts, restricted to runs with run_type 'source'.
preproc_df = my_df[(my_df['run_type'] == 'source') & my_df['is_preproc']]

In [28]:
# (rows, columns) of the preprocessed-script subset
preproc_df.shape

(1663, 10)

Proportion of preprocessed files which ran:

In [29]:
# successful runs divided by all preprocessed-script runs
preproc_df['error'].eq('success').sum() / float(len(preproc_df))

0.14491882140709561

In [30]:
# count of preprocessed-script runs whose error field is anything but "success"
preproc_df['error'].ne("success").sum()

1422

In [31]:
# Keep only the failed preprocessed-script runs for the error breakdown.
error_preproc = preproc_df[preproc_df['error'] != 'success']

In [32]:
# failed preprocessed runs whose error text matches "library"
error_preproc['is_library_error'].sum()

15

In [33]:
# failed preprocessed runs whose error text matches "setwd("
error_preproc['is_wd_error'].sum()

18

In [34]:
# failed preprocessed runs whose error text matches "file"
error_preproc['is_file_error'].sum()

865

In [35]:
# failed preprocessed runs whose error text matches "without setting a mirror"
error_preproc['is_mirror_error'].sum()

7

In [36]:
# Persist the failed preprocessed runs (row index included) for offline inspection.
error_preproc.to_csv('preproc_errors.csv', index=True)