In [1]:
from datetime import date
import glob
import pandas as pd
import seaborn as sns

In [2]:
# Run date; it is interpolated into the input glob pattern and the output file
# names further down, so the notebook only picks up files stamped with this date.
today = date.today()

In [3]:
# Glob pattern for the input CSVs: any file directly under data/02-out/ whose
# name ends with the run date followed by '.csv'.
files = f'data/02-out/*{today}.csv'

## Read data

Combine the GitHub and CRAN data into a single DataFrame.

In [4]:
# Load every CSV matching the dated pattern and stack them into one frame.
# glob returns paths in arbitrary, OS-dependent order; sorting keeps the row
# order of the combined frame stable from run to run.
paths = sorted(glob.glob(files))
if not paths:
    # Without this guard pd.concat([]) fails with the opaque
    # "No objects to concatenate".
    raise FileNotFoundError(f'No input files match {files!r}')

df_arr = [pd.read_csv(path) for path in paths]

print(len(df_arr))
# Each file keeps its own row labels (no ignore_index), so index labels can
# repeat across sources in the combined frame.
df = pd.concat(df_arr)
print(len(df))

2
625


In [5]:
df.columns

Index(['name', 'description', 'url', 'keywords', 'prog_lang', 'source',
       'stars'],
      dtype='object')

## Remove duplicated packages cross-listed in python/R 

Treatment: if a package is cross-listed, count it only towards R. The GitHub rows are flagged with `duplicate = 'Yes'` rather than dropped.

In [6]:
# Names that occur more than once in the combined frame.
repeated_mask = df['name'].duplicated()
dup = set(df.loc[repeated_mask, 'name'])

In [7]:
# All rows whose name appears more than once, selected in a single pass.
# This replaces a loop that grew the frame with DataFrame.append, which was
# removed in pandas 2.0 and copied the whole frame on every iteration.
# Rows now come out in `df` order rather than grouped per name; the next cell
# sorts by name and source before writing, so the exported file is unaffected.
df_dups = df[df.name.isin(dup)]

In [8]:
# Write the duplicated rows to disk, ordered by name and then source so that
# the copies of one package sit next to each other.
dups_path = f'data/02-out/clean/pkgs-comb-{today}-dups.csv'
dups_sorted = df_dups.sort_values(by=['name', 'source'])
dups_sorted.to_csv(dups_path, index=False)

In [9]:
# Start with every row marked as not a duplicate ...
df = df.assign(duplicate='No')

# ... then flag the GitHub rows whose name is in the duplicated-name set.
is_github_duplicate = df['name'].isin(dup) & df['source'].eq("Github")
df.loc[is_github_duplicate, 'duplicate'] = 'Yes'


## Check SDM and remove irrelevant packages
Treatment: remove

In [10]:
# Show full cell contents and up to 1000 rows so descriptions can be read
# during the manual review below.
# None means "no column-width limit"; the former -1 sentinel was deprecated in
# pandas 1.0 and is rejected by newer releases.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', 1000)

# Spot-check one description to confirm it is no longer truncated.
df.loc[df.name == 'py-sdm', 'description']

200    Python implementation of nonparametric nearest-neighbor-based estimators for divergences between distributions.
Name: description, dtype: object

In [11]:
# Manual step: eyeball the rows whose keywords are exactly 'SDM' to decide
# which names go on the keep-list.
review_columns = ['name', 'description', 'url']
df.loc[df['keywords'] == 'SDM', review_columns]

Unnamed: 0,name,description,url
192,SDMUSIC,Search && Download Music from multi-platform,https://github.com/pcdack/SDMUSIC
193,menpofit,Menpo's 2D deformable modelling toolkit (AAMs/CLMs/SDMs),https://github.com/menpo/menpofit
194,landmark_py,"Landmark with Regressition in Python （LBF(3000fps), ESR and SDM）",https://github.com/FacialLandmark/landmark_py
195,pandaSDMX,Python interface to SDMX,https://github.com/dr-leo/pandaSDMX
197,Linux-SDM-Downloader,Downloader SDM files via SDX file (DreamSpark) on Linux,https://github.com/RadekSimkanic/Linux-SDM-Downloader
199,AANE_Python,"Accelerated Attributed Network Embedding, SDM 2017",https://github.com/xhuang31/AANE_Python
200,py-sdm,Python implementation of nonparametric nearest-neighbor-based estimators for divergences between distributions.,https://github.com/dougalsutherland/py-sdm
201,ArcSDM,Spatial Data Modeler 5 for ArcGis pro,https://github.com/gtkfi/ArcSDM
202,SDML-Final,Transportation Mode Detection For HTC,https://github.com/BaiiYuan/SDML-Final
204,sdmpy-old,"Updated May 19, 2016",https://github.com/caseyjlaw/sdmpy-old


In [12]:
# Hand-curated keep-list for rows whose keywords are exactly 'SDM'.
# Inclusion rule: must be packages or substantial repos about SDM.
names_to_keep = [
    'nhSDM',
    'MinBAR',
    'SDM_ENSO_Butterflies',
    'SDM_course_git',
    'Severn-Estuary-SDMS',
    'Local-Ecological-knowledge-Model-evaluation',
    'biodiversity-sdm-lesson',
    '2016_Ecology_SDM',
    'coral_traits_SDM',
    'SDM.Virtual.Species_Bell.Schlaepfer',
    'speciesdistributionmodeling',
    'DynamicEnsembleSDM',
    'Plant_SDM_code',
    'Lyon.MS.Thesis-SDM.Project',
    'BioScen1.5_SDM',
    'bbs',
    'STManaged',
    'Predictor_decomposition',
    'oak_gap_analysis',
    'Mapping_phylogenetic_diversity',
    'PROs',
    'shiny_STM-managed',
    'GIST_Thesis',
    'ebutterfly-sdm',
    'sdmTMB',
    'rmaxent',
    'SDMpriors',
    'eSDM',
    'SDM-CLMcomp',
    'SDM-teaching',
    'Composition',
    '244_SMLW',
    'Light-Pollution-Project',
    'SDMexoticPlants',
    'Makrooekologie',
    'SDMProjectA',
    'STA546_SDM2',
    'SDMDataAnalyticsProject',
    'RobiniaSDM',
    'things',
    'BioticSDMs',
    'tapir_sdms',
]

In [13]:
print(len(df))
# Rows whose keywords are exactly 'SDM' and whose name is on the keep-list.
df_sdm = df[(df.name.isin(names_to_keep)) & (df.keywords == 'SDM')]
# Drop every exactly-'SDM' row, then add the kept ones back at the end.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0; the
# resulting rows and their order are the same.
df = pd.concat([df[df.keywords != 'SDM'], df_sdm])
print(len(df))

625
476


## Check niche and remove irrelevant packages
Treatment: remove

In [14]:
# Manual step: eyeball the rows whose keywords are exactly 'niche' to decide
# which names go on the keep-list.
review_columns = ['name', 'description', 'url']
df.loc[df['keywords'] == 'niche', review_columns]

Unnamed: 0,name,description,url
26,CENFA,Climate and Ecological Niche Factor Analysis,https://cran.r-project.org/web/packages/CENFA/index.html
27,ENiRG,Ecological Niche in R and GRASS,https://cran.r-project.org/web/packages/ENiRG/index.html
28,ENMeval,Automated Runs and Evaluations of Ecological Niche Models,https://cran.r-project.org/web/packages/ENMeval/index.html
29,EnvNicheR,Niche Estimation,https://cran.r-project.org/web/packages/EnvNicheR/index.html
30,MaxentVariableSelection,Selecting the Best Set of Relevant Environmental Variables along with the Optimal Regularization Multiplier for Maxent Niche Modeling,https://cran.r-project.org/web/packages/MaxentVariableSelection/index.html
31,nicheROVER,(Niche) (R)egion and Niche (Over)lap Metrics for Multidimensional Ecological Niches,https://cran.r-project.org/web/packages/nicheROVER/index.html
32,phyloclim,Integrating Phylogenetics and Climatic Niche Modeling,https://cran.r-project.org/web/packages/phyloclim/index.html
33,rKIN,(Kernel) Isotope Niche Estimation,https://cran.r-project.org/web/packages/rKIN/index.html
0,xpost-bot,Reddit bot to scan and repost submissions of interest to niche subreddits,https://github.com/git2samus/xpost-bot
2,fynat,Find Your Niche At Tech,https://github.com/dadams22/fynat


In [15]:
# Hand-curated keep-list for rows whose keywords are exactly 'niche'.
# Inclusion rule: must be packages or substantial repos about niche modelling.
# Two entries were cleaned up: 'lump.split.pool.ENM' previously carried trailing
# whitespace inside the quotes, so an exact-match lookup could never hit it, and
# a stray empty-string entry was removed.
names_to_keep = ['ENiRG', 'MunichBFOR', 'multi-modal-ga', 'fuzzy_habitat_modelling', 'niche_modelling',
                'ENMGadgets_orig', 'ENM_TheMetaLand', 'spaa', 'Model-R', 'atlasr', 'marburg_zoonotic',
                'anhu', 'Mammal-hypervolumes', 'ACERVNicheSpaceFlorida2011-2015', 'RSFSA_R', 'benmR', 'coastal_ENM',
                'ENM_manuals', 'humboldt', 'nppen', 'Ecological-replacement', 'phyloclim', 'CENFA', 'PLNT-model',
                'ENM', 'envdianthus', 'MaxentModelEvaluations', 'Strassburg_NEE_ENM', 'recline', 'Merow_et_al_2016_GEB_Minxent_Examples',
                'Persistence-Times-Code', 'MaxentVariableSelection', 'NunesPearson2016', 'nicheTrackR', 'ebola_zoonotic',
                'biomod2ez', 'modelr_pkg', 'nicheModeling', 'nicheROVER', 'NicheModel', 'NicheModelling_FBaletaud',
                'NicheModellingGBIF', 'NicheOverlapR', 'phyloENM', 'Niche-Modeling-Workshop', 'GerminationNiche',
                'wost-clim-niche', 'Wild-boars', 'ENMOD', 'Isotopic-niche-diversity', 'EnvNicheR', 'mobula_japanica-niche',
                'Niche-Breadth-Range-Size', 'Ecological-niche-modelling', 'asian_niche-1', 'Himalayan-Carex-Climatic-niche-Evolution',
                'RLS-thermal-niche-V2', 'ENFA', 'ENM_heterogeneity', 'kali', 'Rossman_etal_2016_EcolAndEvol', 'trochilidae',
                'vuln', 'trait-geo-diverse-ungulates', 'Hybrid-niche-modelling-Pacific-Bluefin-tuna--Allgayer', 'Chapter2', 'Bucanetes',
                'futurescenarios', 'FranceCG', 'lump.split.pool.ENM']

In [16]:
print(len(df))
# Rows whose keywords are exactly 'niche' and whose name is on the keep-list.
df_niche = df[(df.name.isin(names_to_keep)) & (df.keywords == 'niche')]
# Drop every exactly-'niche' row, then add the kept ones back at the end.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0; the
# resulting rows and their order are the same.
df = pd.concat([df[df.keywords != 'niche'], df_niche])
print(len(df))

476
360


## Marking repositories vs. packages

In [17]:
# Default every row to "package"; GitHub rows may be re-labelled below.
df = df.assign(type_repo="package")

# A GitHub row is labelled 'code_base' unless its description contains the
# literal, case-sensitive text 'package'.
# na=False treats a missing description as "does not mention it"; without it
# the mask would contain NaN, which `~` cannot negate. regex=False makes the
# plain-substring intent explicit.
mentions_package = df.description.str.contains('package', na=False, regex=False)
df.loc[~mentions_package & (df.source == "Github"), 'type_repo'] = 'code_base'

In [18]:
# Preview the first five rows, including the new duplicate and type_repo columns.
df.head(n=5)

Unnamed: 0,name,description,url,keywords,prog_lang,source,stars,duplicate,type_repo
0,biomod2,Ensemble Platform for Species Distribution Modeling,https://cran.r-project.org/web/packages/biomod2/index.html,species distribution,R,CRAN,,No,package
1,dismo,Species Distribution Modeling,https://cran.r-project.org/web/packages/dismo/index.html,species distribution,R,CRAN,,No,package
2,fuzzySim,Fuzzy Similarity in Species Distributions,https://cran.r-project.org/web/packages/fuzzySim/index.html,species distribution,R,CRAN,,No,package
3,gambin,Fit the Gambin Model to Species Abundance Distributions,https://cran.r-project.org/web/packages/gambin/index.html,species distribution,R,CRAN,,No,package
4,hSDM,Hierarchical Bayesian Species Distribution Models,https://cran.r-project.org/web/packages/hSDM/index.html,species distribution,R,CRAN,,No,package


## Sanity checks

In [19]:
# Sanity check: list names beginning with a lowercase 'i'; iNEXT should be among them.
starts_with_i = df['name'].str.startswith('i')
df[starts_with_i]

Unnamed: 0,name,description,url,keywords,prog_lang,source,stars,duplicate,type_repo
5,iSDM,Invasive Species Distribution Modelling,https://cran.r-project.org/web/packages/iSDM/index.html,species distribution,R,CRAN,,No,package
39,iNEXT,Interpolation and Extrapolation for Species Diversity,https://cran.r-project.org/web/packages/iNEXT/index.html,species diversity extrapolation,R,CRAN,,No,package
303,iSDM,iSDM is an open-source R package that implements a few functions useful for modeling the spatial distribution of inva…,https://github.com/TarekHattab/iSDM,SDM; species distribution,R,Github,7.0,Yes,package
553,iNEXT,❗️ This is a read-only mirror of the CRAN R package repository. iNEXT — Interpolation and Extrapolation for Species D…,https://github.com/cran/iNEXT,species diversity extrapolation,R,Github,,Yes,package


In [20]:
# Sanity check: rows whose keywords mention 'species diversity extrapolation'.
has_keyword = df['keywords'].str.contains('species diversity extrapolation')
df.loc[has_keyword]

Unnamed: 0,name,description,url,keywords,prog_lang,source,stars,duplicate,type_repo
39,iNEXT,Interpolation and Extrapolation for Species Diversity,https://cran.r-project.org/web/packages/iNEXT/index.html,species diversity extrapolation,R,CRAN,,No,package
553,iNEXT,❗️ This is a read-only mirror of the CRAN R package repository. iNEXT — Interpolation and Extrapolation for Species D…,https://github.com/cran/iNEXT,species diversity extrapolation,R,Github,,Yes,package


# Summary stats

In [21]:
# Row counts per keyword string (rows), split by source (columns).
pd.crosstab(index=df['keywords'], columns=df['source'])

source,CRAN,Github
keywords,Unnamed: 1_level_1,Unnamed: 2_level_1
SDM,0,42
SDM; species distribution,0,42
SDM; species distribution; species range,0,1
niche,6,68
niche; SDM,0,1
niche; SDM; species distribution,0,3
niche; species distribution,0,4
niche; species range,0,2
species distribution,25,118
species distribution; niche,1,0


In [22]:
# Row counts per repository type (rows), split by source (columns).
pd.crosstab(index=df['type_repo'], columns=df['source'])

source,CRAN,Github
type_repo,Unnamed: 1_level_1,Unnamed: 2_level_1
code_base,0,282
package,38,40


In [23]:
# Remove a stray 'index' column, if one is present, so it is not exported.
if 'index' in df.columns:
    df = df.drop(columns='index')

# NOTE(review): this writes under 'data-out/clean/', whereas the duplicates file
# written earlier in this notebook goes under 'data/02-out/clean/' — confirm
# that the different directory is intended.
out_path = f'data-out/clean/pkgs-comb-{today}.csv'
df.to_csv(out_path, index=False)