In [1]:
from lxml import html
import requests
from time import sleep
import json

In [9]:
base_url = "https://cran.r-project.org/"

In [40]:
def get_package_names(base_url, max_iter=10000):
    url = base_url + "web/packages/available_packages_by_name.html"
    page = requests.get(url)
    tree = html.fromstring(page.content)
    
    result = []
    for i in range(2,max_iter):   
        pkg_xpath = '//tr[{0}]/td[1]/a/text()'
        pkg_name = tree.xpath(pkg_xpath.format(i))
        if pkg_name != []:
            result.append(str(pkg_name[0]))
            miss_count = 0
        else: # might be end, but might be just a skipped row
            miss_count += 1
            if miss_count == 2: # if two misses then call it quits
                break
    return result

def build_urls(base_url, package_names):
    
    if type(package_names) == list:
        urls = [base_url + 
                'web/packages/{0}/index.html'.format(x) 
                for x in package_names]
        return urls
    elif type(package_names) == str:
        urls = base_url + 'web/packages/{0}/index.html'.format(package_names) 
        return urls
    else:
        raise TypeError("package_names must be string or list")

def get_depends(base_url, package_names):
    
    i = 1
    result = {}
    
    for pkg in package_names:    
        url = build_urls(base_url, pkg)
        page = requests.get(url)
        tree = html.fromstring(page.content)
        depends = tree.xpath("*//table[1]/tr[2]/td[2]/a/text()")
        result[pkg] = depends
        sleep(2)
        i += 1
        if i%100 == 0:
            print("retrieved {0} of {1} package dependencies"\
                  .format(i, len(package_names)))
    
    return(result)
    

In [43]:
pkgs = get_package_names(base_url)

d = get_depends(base_url, pkgs[:10])

with open('test.json', 'w') as fp:
    json.dump(d, fp)

## Part 2: Download Logs


In [114]:
import urllib, os, gzip
import pandas as pd
from datetime import timedelta, date

In [119]:
def build_url(dt):
    return "http://cran-logs.rstudio.com/{0}/{0}-{1:02d}-{2:02d}.csv.gz".format(*dt.timetuple()[0:3])

def daterange(start_date, end_date):
    for n in range(int ((end_date - start_date).days)):
        yield start_date + timedelta(n)

def get_logs(start_date, end_date):
    directory_name = "cran_logs-" + time.strftime('%Y-%m-%d', time.gmtime())
    if not os.path.exists(directory_name):
        os.makedirs(directory_name)
    
    for single_date in daterange(start_date, end_date):
        url = build_url(single_date)
        fname = directory_name + "/" + url.split('/')[-1]
        urllib.request.urlretrieve(url, fname)

    return None

def count_pkg_downloads(logdir):
    res = pd.Series()
    for subdir, dirs, files in os.walk(logdir):
        for file in files:
            with gzip.open(filename) as f:
                log = pd.read_csv(f, usecols=[6]).package.value_counts()
                res = res.add(log, fill_value=0)
    return res


In [120]:
get_logs(date(2016,8,30), date(2016,9,5))

res = count_pkg_downloads('cran_logs-2016-09-07')

In [161]:
res.sort_values(inplace=False, ascending=False).head(50)

stringr         86574.0
Rcpp            76290.0
jsonlite        67836.0
digest          66804.0
ggplot2         61410.0
stringi         60396.0
plyr            57978.0
magrittr        54900.0
R6              50136.0
reshape2        48318.0
scales          47028.0
base64enc       46728.0
RColorBrewer    46392.0
rmarkdown       45906.0
mime            44874.0
curl            44472.0
gtable          43176.0
munsell         42966.0
colorspace      42360.0
tibble          42120.0
labeling        40794.0
dichromat       40506.0
DBI             40062.0
knitr           40032.0
dplyr           38718.0
RCurl           36186.0
bitops          34572.0
htmltools       34380.0
httr            34326.0
yaml            33714.0
BH              33318.0
lazyeval        32694.0
evaluate        31152.0
markdown        30510.0
XML             30480.0
formatR         29844.0
openssl         29658.0
highr           29466.0
Matrix          29190.0
caTools         28866.0
assertthat      28002.0
rJava           

References:
* http://blog.revolutionanalytics.com/2015/06/fishing-for-packages-in-cran.html
* https://renkun.me/blog/2014/07/29/scraping-information-of-cran-packages.html

In [46]:
with open('test.json','r') as f:
    e = json.load(f)