In [1]:
import pandas as pd
import re
from pprint import pprint 
from datetime import date, datetime as dt
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_colwidth', -1)

In [2]:
import requests
import bs4
from bs4 import BeautifulSoup

In [3]:
date = date.today()
# date_cran = dt(2019, 1, 1).date()
date_cran = date.today()

## Scrape from cran

In [4]:
url = 'https://cran.r-project.org/web/packages/available_packages_by_name.html'
page = requests.get(url)

# Ensure kaspersky is turned off

In [5]:
html = BeautifulSoup(page.text, 'html.parser')
table = html.find('table', {'summary': 'Available CRAN packages by name.'})
li1 = [x for x in table.find_all('tr')]
li2a = [x.find_all('td')[0].text for x in li1 if len(x.find_all('td')) == 2]
li2b = [re.sub('\\n', ' ', x.find_all('td')[1].text) for x in li1 if len(x.find_all('td')) == 2]
li2c = [re.sub('..\/..\/web/packages', 'https://cran.r-project.org/web/packages', x.find('a')['href']) 
            for x in li1 if len(x.find_all('td')) == 2]

In [6]:
df_out = pd.DataFrame({'name': li2a, 'description': li2b, 'url': li2c})
df_out.head()

Unnamed: 0,name,description,url
0,A3,"Accurate, Adaptable, and Accessible Error Metrics for Predictive Models",https://cran.r-project.org/web/packages/A3/index.html
1,abbyyR,Access to Abbyy Optical Character Recognition (OCR) API,https://cran.r-project.org/web/packages/abbyyR/index.html
2,abc,Tools for Approximate Bayesian Computation (ABC),https://cran.r-project.org/web/packages/abc/index.html
3,abc.data,Data Only: Tools for Approximate Bayesian Computation (ABC),https://cran.r-project.org/web/packages/abc.data/index.html
4,ABC.RAP,Array Based CpG Region Analysis Pipeline,https://cran.r-project.org/web/packages/ABC.RAP/index.html


In [7]:
# Write data out
df_out.to_csv(f'data/01-in/cran-packages-{date}.csv', index=False)

## Process data

In [8]:
# Read/mangle dataframe
df = pd.read_csv(f'data/01-in/cran-packages-{date_cran}.csv', encoding='latin-1')
print(df.shape)
df = df[~df.name.isnull()]
print(df.shape)

# Convert to lower for querying purposes
df = df.assign(name_lower = df.name.str.lower())
df = df.assign(description_lower = df.description.str.lower())

(14188, 3)
(14188, 3)


Packages captured from https://cran.r-project.org on 7 May 2019.

In [9]:
# Testing that it works
print(df[df.name_lower == 'inext']['description_lower'])

5642    interpolation and extrapolation for species diversity
Name: description_lower, dtype: object


In [10]:
# Figuring out keywords to use
pd.set_option('display.max_colwidth', 100)
df[df.description_lower.str.contains('species')][['name', 'description']]

Unnamed: 0,name,description
156,AGHmatrix,Relationship Matrices for Diploid and Autopolyploid Species
640,BarcodingR,Species Identification using DNA Barcodes
671,BatchMap,Software for the Creation of High Density Linkage Maps in Outcrossing Species
869,betalink,Beta-Diversity of Species Interactions
1018,biomod2,Ensemble Platform for Species Distribution Modeling
1199,bossMaps,Convert Binary Species Range Maps into Continuous Surfaces Based on Distance to Range Edge
1233,breakaway,Species Richness Estimation and Modeling
1873,coexist,Species coexistence modeling and analysis
2056,cooccur,Probabilistic Species Co-Occurrence Analysis in R
2838,dismo,Species Distribution Modeling


Looks like 'species', together with 'distribution', 'occupancy', 'diversity', 'niche', 'spatial' are important keywords. 'occurrence' are tentative.

In [11]:
keywords = [
    ['species', 'distribution'],
    ['SDM'],
    ['niche'],
    ['species', 'range'],
    ['species', 'occupancy'],
    ['species', 'diversity', 'extrapolation']
]

In [12]:
df

Unnamed: 0,name,description,url,name_lower,description_lower
0,A3,"Accurate, Adaptable, and Accessible Error Metrics for Predictive Models",https://cran.r-project.org/web/packages/A3/index.html,a3,"accurate, adaptable, and accessible error metrics for predictive models"
1,abbyyR,Access to Abbyy Optical Character Recognition (OCR) API,https://cran.r-project.org/web/packages/abbyyR/index.html,abbyyr,access to abbyy optical character recognition (ocr) api
2,abc,Tools for Approximate Bayesian Computation (ABC),https://cran.r-project.org/web/packages/abc/index.html,abc,tools for approximate bayesian computation (abc)
3,abc.data,Data Only: Tools for Approximate Bayesian Computation (ABC),https://cran.r-project.org/web/packages/abc.data/index.html,abc.data,data only: tools for approximate bayesian computation (abc)
4,ABC.RAP,Array Based CpG Region Analysis Pipeline,https://cran.r-project.org/web/packages/ABC.RAP/index.html,abc.rap,array based cpg region analysis pipeline
5,ABCanalysis,Computed ABC Analysis,https://cran.r-project.org/web/packages/ABCanalysis/index.html,abcanalysis,computed abc analysis
6,abcdeFBA,ABCDE_FBA: A-Biologist-Can-Do-Everything of Flux Balance Analysis with this package,https://cran.r-project.org/web/packages/abcdeFBA/index.html,abcdefba,abcde_fba: a-biologist-can-do-everything of flux balance analysis with this package
7,ABCoptim,Implementation of Artificial Bee Colony (ABC) Optimization,https://cran.r-project.org/web/packages/ABCoptim/index.html,abcoptim,implementation of artificial bee colony (abc) optimization
8,ABCp2,Approximate Bayesian Computational Model for Estimating P2,https://cran.r-project.org/web/packages/ABCp2/index.html,abcp2,approximate bayesian computational model for estimating p2
9,abcrf,Approximate Bayesian Computation via Random Forests,https://cran.r-project.org/web/packages/abcrf/index.html,abcrf,approximate bayesian computation via random forests


In [13]:
pd.set_option('display.max_colwidth', 100)

df_init = pd.DataFrame(columns=['name', 'description', 'keywords', 'url'])

for i, keyword in enumerate(keywords):
    
    keys = " ".join(keyword)
    
    print('======================================================')
    print(f'Table {i+1} - keywords: {keys} \n')
    
    
    for j, word in enumerate(keyword):
        print(f'Checking for word {word}')
        check = df.description_lower.str.contains(word)
        print(f'Number of matches: {sum(check)}\n')
        if j == 0:
            check_combined = check 
        else:
            check_combined = check & check_combined
    
    df_filtered = df[check_combined][['name', 'description', 'url']]
    print(df[check_combined][['name']])
    df_filtered = df_filtered.assign(keywords=keys)
    df_init = df_init.append(df_filtered, sort=True)
    print(f'\nTotal: {df_filtered.shape}')
    print('\n\n')
    

Table 1 - keywords: species distribution 

Checking for word species
Number of matches: 87

Checking for word distribution
Number of matches: 381

                 name
1018          biomod2
2838            dismo
4254         fuzzySim
4286           gambin
5345             hSDM
5829             iSDM
6789      marinespeed
6858          maxlike
6861           maxnet
7196          MigClim
7498             mopa
11305          SADISA
11307            sads
11501             sdm
11502         SDMPlay
11503   sdmpredictors
11504        SDMTools
11505     sdmvspecies
11512          sdStaf
12319     SPEDInstabR
12450            SSDM
12452        ssdtools
13610            usdm
13761  virtualspecies
13833         wallace
14179            zoon

Total: (26, 4)



Table 2 - keywords: SDM 

Checking for word SDM
Number of matches: 0

Empty DataFrame
Columns: [name]
Index: []

Total: (0, 4)



Table 3 - keywords: niche 

Checking for word niche
Number of matches: 9

                          name
1533 

In [14]:
uniques = df_init[~df_init['name'].duplicated()]
dups = df_init[df_init['name'].duplicated()]

In [15]:
dups

Unnamed: 0,description,keywords,name,url
13833,A Modular Platform for Reproducible Modeling of Species Niches and Distributions,niche,wallace,https://cran.r-project.org/web/packages/wallace/index.html


In [16]:
for i, row in dups.iterrows():
    print(i, row['name'])
    uniques.at[uniques.name==row['name'], 'keywords'] = \
        ''.join(list(uniques[uniques.name==row['name']]['keywords'])) + f'; {row["keywords"]}'

13833 wallace


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.loc[index, col] = value


In [17]:
uniques[uniques.name.isin(list(dups.name))]

Unnamed: 0,description,keywords,name,url
13833,A Modular Platform for Reproducible Modeling of Species Niches and Distributions,species distribution; niche,wallace,https://cran.r-project.org/web/packages/wallace/index.html


In [18]:
uniques = uniques.assign(prog_lang='R', source='CRAN', stars="nan")
uniques = uniques[['name', 'description', 'url', 'keywords', 'prog_lang', 'source', 'stars']]
uniques.to_csv(f'data-out/pkgs-cran-{date}.csv', index=False)

In [19]:
uniques

Unnamed: 0,name,description,url,keywords,prog_lang,source,stars
1018,biomod2,Ensemble Platform for Species Distribution Modeling,https://cran.r-project.org/web/packages/biomod2/index.html,species distribution,R,CRAN,
2838,dismo,Species Distribution Modeling,https://cran.r-project.org/web/packages/dismo/index.html,species distribution,R,CRAN,
4254,fuzzySim,Fuzzy Similarity in Species Distributions,https://cran.r-project.org/web/packages/fuzzySim/index.html,species distribution,R,CRAN,
4286,gambin,Fit the Gambin Model to Species Abundance Distributions,https://cran.r-project.org/web/packages/gambin/index.html,species distribution,R,CRAN,
5345,hSDM,Hierarchical Bayesian Species Distribution Models,https://cran.r-project.org/web/packages/hSDM/index.html,species distribution,R,CRAN,
5829,iSDM,Invasive Species Distribution Modelling,https://cran.r-project.org/web/packages/iSDM/index.html,species distribution,R,CRAN,
6789,marinespeed,Benchmark Data Sets and Functions for Marine Species Distribution Modelling,https://cran.r-project.org/web/packages/marinespeed/index.html,species distribution,R,CRAN,
6858,maxlike,Model Species Distributions by Estimating the Probability of Occurrence Using Presence-Only Data,https://cran.r-project.org/web/packages/maxlike/index.html,species distribution,R,CRAN,
6861,maxnet,Fitting 'Maxent' Species Distribution Models with 'glmnet',https://cran.r-project.org/web/packages/maxnet/index.html,species distribution,R,CRAN,
7196,MigClim,Implementing dispersal into species distribution models,https://cran.r-project.org/web/packages/MigClim/index.html,species distribution,R,CRAN,
