# Software Mentions Data Analysis Notebook
Sample notebook meant to serve as a starting point for further analyses

In [1]:
import pandas as pd

In [2]:
# Location of the gzip-compressed, tab-separated mentions file, relative to
# the directory the notebook is run from.
data_path = "data/input_files/comm_curated.tsv.gz"

# Read the data

In [3]:
# The separator is the two-character pattern backslash + "t". pandas treats
# separators longer than one character as regular expressions (here: a tab),
# and regex separators are only supported by the Python parsing engine.
data_df = pd.read_csv(
    data_path,
    compression='gzip',
    engine='python',
    sep=r'\t',
)

In [5]:
# Preview the first five rows of the loaded frame.
data_df.head(n=5)

Unnamed: 0,license,location,pmcid,pmid,doi,pubdate,source,number,text,software,version,ID
0,comm,comm/Micropl/PMC8475362.nxml,8475362,,10.1186/s43591-021-00017-9,2021,Particle selection and identification of polym...,7,"Then, all items were photographed under a bino...",Olympus CellSens,,SM0
1,comm,comm/Micropl/PMC8475362.nxml,8475362,,10.1186/s43591-021-00017-9,2021,Particle selection and identification of polym...,8,"Spectra were then vector normalized (OPUS, ver...",OPUS,,SM1
2,comm,comm/Micropl/PMC8475362.nxml,8475362,,10.1186/s43591-021-00017-9,2021,Statistical analysis,12,Model fit was assessed through functions provi...,R package DHARMa,,SM2
3,comm,comm/Micropl/PMC8475362.nxml,8475362,,10.1186/s43591-021-00017-9,2021,Statistical analysis,12,Analyses and plotting were performed in R (ver...,R,,SM3
4,comm,comm/Micropl/PMC8475362.nxml,8475362,,10.1186/s43591-021-00017-9,2021,Statistical analysis,12,Analyses and plotting were performed in R (ver...,ggplot2,,SM4


In [6]:
# List the column labels of the loaded frame.
data_df.columns

Index(['license', 'location', 'pmcid', 'pmid', 'doi', 'pubdate', 'source',
       'number', 'text', 'software', 'version', 'ID'],
      dtype='object')

# Example: Most frequent terms
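Note that this aggregates over the plain-text software mention strings exactly as they appear in the `software` column, so name variants of the same tool (e.g. `GraphPad` and `GraphPad Prism`) are counted separately. Each cell in the table is the number of non-null values in that column for the given software.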

In [8]:
# Rank software by its total number of mentions (rows). `count()` skips
# missing values, so ordering by the non-null count of a single column such
# as `pmid` would under-rank software whose mentions lack that field.
# `.tolist()` keeps the `software` index name on the result of `.loc`.
top_software = data_df['software'].value_counts().head(20).index.tolist()

# Per-column non-null counts, restricted to (and ordered by) the 20 most
# frequently mentioned software names.
data_df.groupby('software').count().loc[top_software]

Unnamed: 0_level_0,license,location,pmcid,pmid,doi,pubdate,source,number,text,version,ID
software,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
SPSS,353065,353065,353065,351229,344044,353065,349464,353065,353065,97537,353065
R,342853,342853,342853,341642,341662,342853,328366,342853,342853,19365,342853
ImageJ,189854,189854,189854,189385,188193,189854,173963,189854,189854,8508,189854
GraphPad Prism,148887,148887,148887,148514,147641,148887,143642,148887,148887,69481,148887
BLAST,107660,107660,107660,107475,107132,107660,100902,107660,107660,729,107660
Excel,102179,102179,102179,101642,101246,102179,97687,102179,102179,17754,102179
SAS,99679,99679,99679,99410,98997,99679,97178,99679,99679,19348,99679
GraphPad,84220,84220,84220,84029,83433,84220,82753,84220,84220,1935,84220
MATLAB,83858,83858,83858,83441,83632,83858,80099,83858,83858,12842,83858
Stata,57095,57095,57095,56861,56765,57095,55820,57095,57095,18203,57095


# Example: Query an entry

In [9]:
# Show up to 100 characters per cell so more of each `text` value is visible.
pd.set_option('display.max_colwidth', 100)

# First few mentions whose plain-text software name is exactly 'scikit-learn'.
data_df.loc[data_df['software'].eq('scikit-learn')].head()

Unnamed: 0,license,location,pmcid,pmid,doi,pubdate,source,number,text,software,version,ID
7999,comm,comm/ACS_Nano/PMC7905882.nxml,7905882,33556239.0,10.1021/acsnano.0c10632,2021,Cluster Analysis,37,Local cluster density analysis was performed by fitting 2D KDE maps from the point clouds of clu...,scikit-learn,,SM3075
8001,comm,comm/ACS_Nano/PMC7905882.nxml,7905882,33556239.0,10.1021/acsnano.0c10632,2021,Cluster Analysis,38,"Nearest neighbor analysis was performed as a k-nearest neighbor analysis, with k = 4 (excluding ...",scikit-learn,,SM3075
17059,comm,comm/AMB_Express/PMC5047870.nxml,5047870,27699703.0,10.1186/s13568-016-0260-6,2016,Support vector machine model,53,The software scikit-learn 0.17 (http://scikit-learn.org) (Pedregosa et al,scikit-learn,0.17,SM3075
33933,comm,comm/Acta_Crystallogr_D_Biol_Crystallogr/PMC4427199.nxml,4427199,25945580.0,10.1107/S1399004715004241,2015,SVM setup,10,"We then used scikit-learn v.0.13.1 (Pedregosa et al., 2012 ▶) to perform recursive feature elimi...",scikit-learn,,SM3075
38181,comm,comm/Acta_Crystallogr_D_Biol_Crystallogr/PMC4356372.nxml,4356372,25760616.0,10.1107/S1399004715000383,2015,Software used and Brickworx implementation,22,The SVM classifier was implemented with the use of the scikit-learn suite v.0.14.1,scikit-learn,,SM3075


# Example: Most popular plain-text mentions in 2021

In [10]:
# Keep only the mentions whose publication year is 2021.
data_2021_df = data_df[data_df['pubdate'] == 2021]

# Rank software by its total number of 2021 mentions (rows). `count()` skips
# missing values, so ordering by the non-null count of `pmid` would
# under-rank software whose mentions lack a PMID.
# `.tolist()` keeps the `software` index name on the result of `.loc`.
top_software_2021 = data_2021_df['software'].value_counts().head(20).index.tolist()

# Per-column non-null counts for the 20 most frequently mentioned names.
data_2021_df.groupby('software').count().loc[top_software_2021]

Unnamed: 0_level_0,license,location,pmcid,pmid,doi,pubdate,source,number,text,version,ID
software,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
R,74613,74613,74613,73823,74592,74613,70805,74613,74613,4059,74613
SPSS,53991,53991,53991,53272,53946,53991,53265,53991,53991,14588,53991
ImageJ,32329,32329,32329,31977,32328,32329,29515,32329,32329,1440,32329
GraphPad Prism,31668,31668,31668,31365,31653,31668,30274,31668,31668,15688,31668
Excel,17216,17216,17216,16979,17211,17216,16432,17216,17216,2982,17216
GSEA,15747,15747,15747,15618,15741,15747,13293,15747,15747,133,15747
GraphPad,15661,15661,15661,15499,15653,15661,15314,15661,15661,535,15661
MATLAB,14703,14703,14703,14579,14697,14703,13782,14703,14703,2585,14703
SAS,12799,12799,12799,12696,12793,12799,12342,12799,12799,2508,12799
Cytoscape,10658,10658,10658,10552,10645,10658,9777,10658,10658,1262,10658
