In [85]:
import os
import re
import pickle

from tqdm import tqdm as tqdm
import pandas as pd
import numpy as np

In [86]:
my_df = pd.read_csv("master_run_log.csv")

In [87]:
my_df.shape

(3993, 4)

In [88]:
# look at how many scripts successfully ran
np.sum(my_df['error'] == "success") / (len(my_df) * 1.0)

0.27022289005760081

In [149]:
# helper functions to extract the original file names to allow better grouping
def extract_filename(file_string):
    return re.sub('__preproc__', '', file_string)

def extract_doi(doi_path):
    return doi_path.split("/")[-1]

def orig_name(row):
    return extract_doi(row['doi']) + '/' + extract_filename(row["filename"])

def doi_without_path(row):
    return extract_doi(row['doi'])

# add helpful columns to dataframe to facilitate analysis
my_df['is_preproc'] = my_df['filename'].str.contains("__preproc__")
my_df["orig_file"] = my_df.apply(orig_name, axis=1)
my_df['doi'] = my_df.apply(doi_without_path, axis=1)
my_df["is_library_error"] = my_df["error"].str.contains("library")
my_df['is_wd_error'] = my_df['error'].str.contains('setwd\(')
my_df['is_mirror_error'] = my_df['error'].str.contains('without setting a mirror')
my_df['is_file_error'] = my_df['error'].str.contains('file')

In [150]:
sum(my_df['is_preproc']) / (len(my_df) * 1.0)

0.48559979964938643

## See if run data is missing for any R files

In [91]:
all_r_files = []
# list dataset directories
for dataset_dir in os.listdir("rdata_odyc"):
    if not re.match("^\.", dataset_dir):
        for r_file in os.listdir("rdata_odyc/" + dataset_dir):
            if not "_preproc0" in r_file and re.match(".*\.R$", r_file):
                all_r_files.append(dataset_dir + "/" + r_file)

In [92]:
# files that run data was never collected for
not_run_files = list(set(all_r_files) - set(my_df['orig_file'].unique()))

In [93]:
# extract list of dois for which this was the case
incomplete_dois = list(set([myfile.split('/')[0] for myfile in not_run_files]))

In [94]:
# add list of dois for which more original files ran than prov files
for doi in my_df['doi'].unique():
    doi_df = my_df[my_df['doi'] == doi]
    vanilla_files = set(doi_df[~doi_df['is_preproc']]['orig_file'].unique())
    preproc_files = set(doi_df[doi_df['is_preproc']]['orig_file'].unique())
    if vanilla_files != preproc_files:
        incomplete_dois.append(doi.split('/')[1])

In [95]:
incomplete_dois = ['rdata_odyc/' + doi for doi in list(set(incomplete_dois))]

In [96]:
# filter out incomplete DOIs
my_df = my_df[~my_df['doi'].isin(incomplete_dois)]

In [97]:
my_df.shape

(3993, 10)

In [98]:
# write this list of DOIs to file to only run for these DOIs
with open('non_timeout_dois.txt', 'w') as outfile:
    for doi in my_df['doi'].unique(): 
        outfile.write(doi.split('/')[-1] + '\n')

## Base stats for trying to run the original scripts

In [154]:
vanilla_df = my_df[~my_df['is_preproc'] & (my_df['run_type'] == 'source')]

In [155]:
vanilla_df.drop_duplicates(['orig_file'], keep="last", inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [156]:
vanilla_df.shape

(1777, 10)

Proportion of original scripts that ran

In [157]:
np.sum(vanilla_df['error'] == 'success') 

278

In [158]:
np.sum(vanilla_df['error'] == 'success') / (len(vanilla_df) * 1.0)

0.15644344400675295

Number of errors

In [159]:
np.sum(~(vanilla_df['error'] == "success"))

1499

In [160]:
# isolate errors in their own dataframe
error_vanilla = vanilla_df[~(vanilla_df['error'] == 'success')]

Number of library errors

In [161]:
np.sum(error_vanilla['is_library_error'])

227

In [162]:
np.sum(error_vanilla['is_library_error']) / (len(error_vanilla) * 1.0)

0.15143428952635091

Number of working directory errors

In [163]:
np.sum(error_vanilla['is_wd_error']) 

434

In [164]:
np.sum(error_vanilla['is_wd_error']) / (len(error_vanilla) * 1.0)

0.28952635090060042

File Errors

In [165]:
np.sum(error_vanilla['error'].str.contains('file'))

455

In [166]:
np.sum(error_vanilla['is_file_error']) / (len(error_vanilla) * 1.0)

0.30353569046030687

Using cran without a mirror

In [167]:
np.sum(error_vanilla['is_mirror_error']) / (len(error_vanilla) * 1.0)

0.033355570380253503

## Base stats for preprocessed scripts

Proportion of preprocessed files which ran:

In [168]:
preproc_df = my_df[my_df['is_preproc'] & (my_df['run_type'] == 'source')]

In [169]:
preproc_df.drop_duplicates(['orig_file'], keep="last", inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [170]:
preproc_df.shape

(1669, 10)

Proportion of preprocessed files which ran:

In [171]:
np.sum(preproc_df['error'] == 'success')

271

In [172]:
np.sum(preproc_df['error'] == 'success') / (len(preproc_df) * 1.0)

0.16237267825044938

In [173]:
np.sum(~(preproc_df['error'] == "success"))

1398

In [174]:
# isolate errors in their own dataframe
error_preproc = preproc_df[~(preproc_df['error'] == 'success')]

In [175]:
np.sum(error_preproc['is_library_error'])

19

In [176]:
np.sum(error_preproc['is_wd_error'])

12

In [177]:
np.sum(error_preproc['is_file_error'])

802

In [178]:
np.sum(error_preproc['is_mirror_error'])

8

In [179]:
error_preproc[error_preproc['is_library_error'] | 
              error_preproc['is_wd_error'] |
              error_preproc['is_mirror_error']].to_csv('preproc_errors.csv')

## Analyzing Missing Files

In [180]:
with open("missing_files.pkl", 'rb') as handle:
    missing_files = pickle.load(handle)

In [181]:
missing_files = {key:value for key,value in missing_files.items() if value}

In [182]:
missing_file_dois = missing_files.keys()

In [183]:
file_error_dois = list(error_preproc[error_preproc['is_file_error']]['doi'].unique())

In [187]:
error_preproc[error_preproc['is_file_error'] &\
              error_preproc['doi'].isin(set(file_error_dois) -\
                                        set(missing_file_dois))].to_csv("preproc_file_errors.csv")

In [140]:
len(missing_files)

449