In [1]:
import pandas
import collections
import itertools
import re
import os
import glob

In [2]:
# Keywords looked up in every README as plain, case-insensitive substrings.
# Each keyword becomes one boolean column of the resulting table.
WORDS = ['CRAN', 'bioconductor', 'omegahat', 'github', 'rforge', 'r-forge',
        'devtools', 'dependency', 'dependencies', 'installation', 'stable', 
        'development', 'stable version', 'development version', 
         'release', 'released version', 'to install', 'replication', 'replicat']

# Patterns whose full matches (group 0) are collected for every README.
# Each key becomes one list-valued column of the resulting table.
REGEX = {
    # Any install_github(...) call; greedy up to the last ')' on the line.
    'install_github': re.compile(r'install_github\(.+\)'),
    # install.packages() with a single, optionally quoted, package name.
    # R package names may contain dots (data.table, R.utils), so '.' is allowed
    # after the first character; every name the previous alphanumeric-only
    # class accepted is still accepted.
    'install_packages': re.compile(r'install\.packages\((\'|"|)[A-Za-z0-9][A-Za-z0-9.]*(\'|"|)\)'),
    #'install_packages_github': re.compile(r'install\.packages\(.+repos.+github.+\)'),
    # Rest of the line after "git clone". The trailing newline is kept in the
    # match when present; end-of-input is accepted too, so a command on an
    # unterminated last line is no longer missed.
    'git_clone': re.compile(r'git clone.*(?:\n|$)'),
}

# Root directory under which repositories are looked up as <PATH>/<Owner>/<Repository>.
PATH = '/data/github/'

In [3]:
# Load the package table (first CSV column used as the index), keep only rows
# flagged InGitHub == 1, and reduce to one row per package: after sorting by
# Date, keep='last' retains the row with the greatest Date for each Package.
# The result is indexed by Package.
github = (
    pandas.read_csv('../data/github-cran-150601.csv', index_col=0)
    .query('InGitHub == 1')
    .sort_values(by='Date')
    .drop_duplicates('Package', keep='last')
    .set_index('Package')
)

In [4]:
# Load the raw repository listing, restricted to the four columns used below,
# and reduce to one row per package: after sorting by CommitDate, keep='last'
# retains the row with the greatest CommitDate for each Package.
# The result is indexed by Package.
repositories = (
    pandas.read_csv('../data/github-raw-150601.csv')
    [['Package', 'Owner', 'Repository', 'CommitDate']]
    .sort_values(by='CommitDate')
    .drop_duplicates('Package', keep='last')
    .set_index('Package')
)

In [5]:
# github[[]] carries no columns, only the Package index, so the right join
# merely selects which packages to keep: every package present in `github`.
# Packages without a matching row in `repositories` get NaN Owner/Repository.
candidates = (
    repositories
    .join(github[[]], how='right')
    [['Owner', 'Repository']]
)

In [6]:
# Debugging aid: uncomment to restrict the run to a single package (here ggplot2).
#candidates = candidates.reset_index().query('Package == "ggplot2"').set_index('Package')

In [7]:
# One dict per README file found; turned into a table after the loop.
data = []

# Case-insensitive README pattern. The previous pair of literal patterns
# ('README*' and 'readme*') missed mixed-case names such as 'Readme.md' on
# case-sensitive filesystems.
readme_pattern = '[Rr][Ee][Aa][Dd][Mm][Ee]*'

# Apply to each candidate package
for i, (package, row) in enumerate(candidates.iterrows()):
    if i % 500 == 0:
        # Progress marker; this form prints the same under Python 2 and 3.
        print(i)
    owner, repository = row['Owner'], row['Repository']

    # The right join that builds `candidates` can leave Owner/Repository as NaN
    # for packages without a repository row; there is no directory to scan then.
    if pandas.isnull(owner) or pandas.isnull(repository):
        continue
    path = os.path.join(PATH, owner, repository)

    # Every README* file of the repository (glob yields nothing if the
    # directory does not exist); sorted so the row order is deterministic.
    for readme in sorted(glob.glob(os.path.join(path, readme_pattern))):
        if os.path.isdir(readme):
            continue

        # NOTE(review): the file is read with the platform default text mode;
        # under Python 3 a README in an unexpected encoding could raise here.
        with open(readme, 'r') as readme_file:
            lines = readme_file.readlines()
        content = ''.join(lines)
        # Lower-cased once instead of once per keyword.
        content_lower = content.lower()

        readme_data = {
            'Package': package,
            'Readme': os.path.basename(readme),
            'nb_lines': len(lines),
            'nb_chars': len(content),
        }

        # Look for words: one boolean per keyword (case-insensitive substring).
        for word in WORDS:
            readme_data[word] = word.lower() in content_lower

        # Look for regexes: list of every full match, in order of appearance.
        for re_name, re_ex in REGEX.items():
            readme_data[re_name] = [match.group(0) for match in re_ex.finditer(content)]

        data.append(readme_data)

0
500
1000
1500
2000
2500
3000
3500
4000
4500


In [8]:
# Write one row per scanned README to CSV (the default integer index is
# written as the first column). `data` is a list of per-README dicts, i.e.
# records, so it is passed to the DataFrame constructor; DataFrame.from_dict
# is documented for dict input and only accepted the list incidentally.
pandas.DataFrame(data).to_csv('../data/readme.csv')