# The Unconquerables of Open Access

## Merge of journals with PubMed references

Project for the EAHIL conference 2023 : https://eahil2023.org/
Authors : **Floriane Muller & Pablo Iriarte**, University of Geneva  
Last update : 24.05.2023

This purpose of this notebook is to merge the information about journals with all the publications metadata extracted from PubMed


### Sources

* Journal information obtained with notebooks 1-6 : file results/2023/MedlineJournals.tsv
* PubMed references metadata obtained with notebook 7: 1166 tsv files in data/sources/pubmed/pubmed23n[0001-1166].tsv


### Choice of fields extracted

 * PMID: PubmedArticleSet/PubmedArticle/MedlineCitation/PMID
 * DOI: PubmedArticleSet/PubmedArticle/PubmedData/ArticleIdList/ArticleId@IdType="doi"
 * Publication date: PubmedArticleSet/PubmedArticle/MedlineCitation/Article/Journal/JournalIssue/PubDate/Year
 * Journal ID: PubmedArticleSet/PubmedArticle/MedlineCitation/MedlineJournalInfo/NlmUniqueID
 

### Extract content using XPATH and export in TSV format

In [1]:
import pandas as pd
import os
# display the full content of rows (non truncated)
pd.set_option('display.max_colwidth', -1)
# display all the columns
pd.set_option('display.max_columns', None)
# folder of PubMed tsv files
myfolderpubmed = 'data/sources/pubmed/'
# dataframe
df_pubmed = pd.DataFrame()

In [2]:
# loop into the TSV files
for file in os.listdir(myfolderpubmed):
    # skip x files in case of error
    # errors:
    # file pubmed23n0303.tsv, Expected 6 fields in line 19219, saw 7: "9127676	1996		10.1076/apab.104.7.814.13110		9510153	"
    # file pubmed23n0325.tsv, Expected 6 fields in line 17666, saw 7: "9803271	1998		10.1046/j.1469-1809.1998.6230271.x		0416661	"
    # file pubmed23n0389.tsv, Expected 6 fields in line 14430, saw 7: "11772327	2002		10.1517/13543784.11.1.125		9434197	"
    # file pubmed23n0396.tsv, Expected 6 fields in line 15897, saw 7: "11996645	2002		10.1517/13543784.11.5.631	9434197	"
    # file pubmed23n0417.tsv, Expected 6 fields in line 24429, saw 7: "12745873	2003		10.1080/0049825031000066259		1306665	"
    # file pubmed23n0422.tsv, Expected 6 fields in line 16816, saw 7: "12901678	2003		10.1080/713610419	8914274	"
    # file pubmed23n0422.tsv, Expected 6 fields in line 16817, saw 7: "12901679	2003		10.1080/713610420	8914274	"
    # file pubmed23n0507.tsv, Expected 6 fields in line 21839, saw 7: "15545080	2004		10.1080/10611860400005697	9312476	"
    # file pubmed23n0592.tsv, Expected 6 fields in line 19800, saw 7: "18274931	2008		10.1080/10611860802095494		9312476	"
    # file pubmed23n0598.tsv, Expected 6 fields in line 3122, saw 7: "18446603	2008		10.1080/10611860802088523		9312476	"
    # file pubmed23n0599.tsv, Expected 6 fields in line 28353, saw 7: "18504824	2008		10.1038/nrg2381		100962779	"
    # file pubmed23n0602.tsv, Expected 6 fields in line 27008, saw 7: "18604657	2008		10.1080/10611860802088523		9312476	"
    # file pubmed23n0602.tsv, Expected 6 fields in line 27010, saw 7: "18604657	2008		10.1080/10611860802088523		9312476	"
    # file pubmed23n0603.tsv, Expected 6 fields in line 2968, saw 7: "18611112	2008		10.1517/17425255.4.6.697		101228422	"
    # file pubmed23n0615.tsv, Expected 6 fields in line 13590, saw 7: "19005871	2009		10.1080/14756360902784425		101150203	"
    if (int(file[9:13]) > 0):
        print(file[9:13])
        df_temp = pd.read_csv(myfolderpubmed + file, sep='\t', header=0, usecols=['PMID', 'Year', 'MedlineDate', 'DOI', 'NlmUniqueID'], dtype={'MedlineDate' : str, 'DOI' : str, 'NlmUniqueID' : str})
        df_pubmed = df_pubmed.append(df_temp, ignore_index=True)

0001
0002
0003
0004
0005
0006
0007
0008
0009
0010
0011
0012
0013
0014
0015
0016
0017
0018
0019
0020
0021
0022
0023
0024
0025
0026
0027
0028
0029
0030
0031
0032
0033
0034
0035
0036
0037
0038
0039
0040
0041
0042
0043
0044
0045
0046
0047
0048
0049
0050
0051
0052
0053
0054
0055
0056
0057
0058
0059
0060
0061
0062
0063
0064
0065
0066
0067
0068
0069
0070
0071
0072
0073
0074
0075
0076
0077
0078
0079
0080
0081
0082
0083
0084
0085
0086
0087
0088
0089
0090
0091
0092
0093
0094
0095
0096
0097
0098
0099
0100
0101
0102
0103
0104
0105
0106
0107
0108
0109
0110
0111
0112
0113
0114
0115
0116
0117
0118
0119
0120
0121
0122
0123
0124
0125
0126
0127
0128
0129
0130
0131
0132
0133
0134
0135
0136
0137
0138
0139
0140
0141
0142
0143
0144
0145
0146
0147
0148
0149
0150
0151
0152
0153
0154
0155
0156
0157
0158
0159
0160
0161
0162
0163
0164
0165
0166
0167
0168
0169
0170
0171
0172
0173
0174
0175
0176
0177
0178
0179
0180
0181
0182
0183
0184
0185
0186
0187
0188
0189
0190
0191
0192
0193
0194
0195
0196
0197
0198
0199
0200


In [3]:
df_pubmed

Unnamed: 0,PMID,Year,MedlineDate,DOI,NlmUniqueID
0,1,1975.0,,10.1016/0006-2944(75)90147-7,0151424
1,2,1975.0,,10.1016/0006-291x(75)90482-9,0372516
2,4,1975.0,,10.1016/0006-291x(75)90506-9,0372516
3,3,1975.0,,10.1016/0006-291x(75)90498-2,0372516
4,5,1975.0,,10.1016/0006-291x(75)90508-2,0372516
...,...,...,...,...,...
34960695,36475580,2022.0,,10.1021/acsami.2c16255,101504991
34960696,36475581,2022.0,,10.1021/acsami.2c17703,101504991
34960697,36475582,2022.0,,10.1021/acsami.2c16205,101504991
34960698,36475583,2022.0,,10.1021/acsami.2c17147,101504991


In [4]:
# csv export
df_pubmed.to_csv('data/temp/2023/pubmed_all.tsv', sep='\t', index=False)

In [5]:
# refs with empty year
df_pubmed.loc[df_pubmed['Year'].isna()]

Unnamed: 0,PMID,Year,MedlineDate,DOI,NlmUniqueID
93,93,,1975 Jul-Aug,,0372666
95,94,,1975 Jul-Aug,,0372666
97,95,,1975 Jul-Aug,,0372666
112,111,,1975 Jan-Feb,,7611945
171,168,,1975 Nov-Dec,10.1097/00003086-197511000-00030,0075674
...,...,...,...,...,...
34960613,36475491,,2022 Nov-Dec,10.31857/S0026898422060027,0105454
34960616,36475494,,2022 Nov-Dec,10.31857/S0026898422060064,0105454
34960617,36475495,,2022 Nov-Dec,10.31857/S0026898422060076,0105454
34960619,36475493,,2022 Nov-Dec,10.31857/S0026898422060179,0105454


In [6]:
# fixing empty Years with first chars from MedlineDate
df_pubmed.loc[df_pubmed['Year'].isna(), 'Year'] = df_pubmed['MedlineDate'].astype(str).str[0:4]

In [7]:
# refs with empty year
df_pubmed.loc[df_pubmed['Year'].isna()]

Unnamed: 0,PMID,Year,MedlineDate,DOI,NlmUniqueID


In [8]:
# refs by year
df_pubmed['Year'].value_counts()[0:50]

2022.0    1554222
2021.0    1515685
2020.0    1353157
2019.0    1180575
2018.0    1119795
2017.0    1070770
2016.0    1054072
2015.0    1026530
2014.0    991457 
2013.0    946318 
2012.0    893101 
2011.0    826889 
2010.0    773128 
2009.0    733193 
2008.0    700232 
2007.0    660374 
2006.0    633771 
2005.0    602491 
2004.0    570883 
2003.0    537534 
2002.0    514549 
2001.0    499442 
2000.0    489258 
1999.0    453242 
1998.0    436114 
1996.0    423232 
1997.0    418667 
1995.0    413233 
1994.0    403463 
1993.0    391136 
1992.0    383548 
1991.0    378349 
1990.0    375225 
1989.0    368442 
1988.0    354960 
1987.0    336517 
1986.0    319122 
1985.0    304404 
1984.0    288876 
1983.0    282728 
1982.0    270533 
1981.0    259926 
1979.0    258460 
1980.0    257624 
1978.0    248722 
1977.0    238614 
1976.0    231587 
1975.0    226576 
1974.0    215524 
1973.0    211763 
Name: Year, dtype: int64

In [9]:
# refs by year
df_pubmed['Year'].value_counts()[50:100]

1972.0    209243
1971.0    204139
1970.0    200601
1969.0    196590
1968.0    188248
1967.0    173389
1966.0    161712
1965.0    157090
1964.0    144106
1963.0    126098
1962.0    111837
1961.0    107461
1960.0    100485
1959.0    98246 
1953.0    97781 
1957.0    97578 
1958.0    97520 
1952.0    96930 
1955.0    95809 
1954.0    94317 
1956.0    93482 
1951.0    92647 
1950.0    76394 
1948.0    65560 
2015      62811 
2016      60870 
1947.0    60090 
1949.0    57895 
2017      55176 
2005      54301 
2018      52835 
2014      52819 
2021      51834 
2009      51678 
2006      51664 
2008      51464 
2020      51038 
2007      50525 
2013      50456 
1946.0    50431 
2010      50352 
2004      49686 
2011      49574 
2012      48187 
2019      47923 
2003      46987 
2002      44559 
2001      44374 
2000      41265 
2022      40769 
Name: Year, dtype: int64

In [10]:
# refs by year
df_pubmed['Year'].value_counts()[100:150]

1999      40541
1998      38553
1997      38109
1995      35880
1993      35586
1990      35389
1996      35338
1991      34825
1994      34775
1992      34449
1989      34330
1988      32060
1987      31467
1985      30945
1986      30598
1984      29365
1983      26791
1982      25229
1978      23845
1981      23696
1979      23657
1977      23574
1976      23386
1980      22979
1975      22612
1968      19907
1965      19647
1974      19584
1971      19517
1973      19363
1970      18823
1969      18756
1967      18685
1972      18680
1945.0    18677
1966      18472
1964      18004
1963      15441
2023.0    14179
1957      13950
1962      13856
1956      13533
1955      12753
1961      12719
1958      11947
1959      11754
1960      11737
1954      11625
1952      11252
1951      11174
Name: Year, dtype: int64

In [11]:
# refs by year
df_pubmed['Year'].value_counts()[150:200]

1953      11143
1950      9398 
1927.0    5978 
1938.0    5874 
1929.0    5793 
1935.0    5763 
1930.0    5733 
1933.0    5703 
1939.0    5661 
1932.0    5654 
1931.0    5608 
1934.0    5607 
1928.0    5564 
1936.0    5496 
1937.0    5428 
1926.0    5427 
1948      5197 
1914.0    5191 
1913.0    5183 
1920.0    5130 
1925.0    5080 
1940.0    5073 
1911.0    4990 
1924.0    4989 
1947      4960 
1949      4931 
1941.0    4918 
1912.0    4896 
1942.0    4843 
1910.0    4814 
1921.0    4776 
1909.0    4662 
1915.0    4660 
1944.0    4579 
1923.0    4554 
1919.0    4539 
1943.0    4526 
1916.0    4470 
1908.0    4438 
1897.0    4386 
1898.0    4306 
1922.0    4232 
1907.0    4224 
1917.0    4223 
1918.0    4176 
1896.0    4167 
1904.0    3893 
1905.0    3883 
1893.0    3855 
1895.0    3806 
Name: Year, dtype: int64

In [12]:
# refs by year
df_pubmed['Year'].value_counts()[200:250]

1899.0    3784
1903.0    3719
1906.0    3707
1902.0    3686
1901.0    3661
1900.0    3621
1889.0    3567
1892.0    3551
1946      3500
1894.0    3477
1890.0    3345
1887.0    3221
1888.0    3131
1891.0    2959
1886.0    2641
1885.0    2576
1883.0    2411
1884.0    2406
1881.0    2238
1880.0    1941
1878.0    1904
1874.0    1889
1873.0    1842
1882.0    1839
1879.0    1837
1945      1836
1872.0    1829
1877.0    1805
1875.0    1741
1876.0    1677
1871.0    1577
1869.0    1336
1867.0    1304
1868.0    1246
1870.0    1229
1858.0    1185
1856.0    1168
1857.0    1132
1866.0    1107
1859.0    1060
1860.0    1020
1855.0    997 
1842.0    972 
1853.0    918 
Fall      867 
1848.0    827 
1843.0    813 
1841.0    805 
1851.0    782 
1861.0    780 
Name: Year, dtype: int64

In [13]:
# refs by year
df_pubmed['Year'].value_counts()[250:300]

1852.0    779
1847.0    777
Wint      776
1849.0    760
1850.0    758
1846.0    750
Spri      728
1854.0    692
1845.0    684
1864.0    680
1862.0    658
Summ      625
1844.0    609
1865.0    576
1829.0    568
1863.0    546
1830.0    510
2023      482
1828.0    409
1831.0    403
1800.0    387
1826.0    385
1801.0    368
1840.0    361
1827.0    327
1811.0    322
1816.0    315
1805.0    314
1804.0    304
1799.0    300
1807.0    295
1823.0    291
1802.0    291
1808.0    289
1817.0    288
1814.0    285
1832.0    285
1806.0    282
1837.0    281
1818.0    279
1821.0    270
1803.0    269
1824.0    269
1810.0    267
1809.0    265
1813.0    265
1838.0    265
1825.0    262
1839.0    253
1822.0    247
Name: Year, dtype: int64

In [14]:
# refs by year
df_pubmed['Year'].value_counts()[300:350]

1812.0    239
1819.0    225
1835.0    222
1815.0    222
1833.0    221
1836.0    220
1820.0    213
1944      212
1834.0    209
1831      176
1797.0    69 
1781.0    69 
1786.0    60 
1868      58 
Autu      55 
1865      48 
1790.0    45 
1785.0    43 
1787.0    43 
1871      43 
1929      38 
1839      36 
1943      35 
1840      34 
1788.0    33 
1792.0    33 
1789.0    32 
1922      32 
1784      31 
1783      26 
1923      25 
1841      24 
1860      24 
1782      22 
1873      21 
1933      21 
1796.0    20 
1876      20 
1791.0    19 
1798.0    19 
1794.0    18 
1932      17 
1793.0    17 
1928      15 
1795.0    13 
1925      13 
1935      6  
1918      5  
1924      3  
spri      2  
Name: Year, dtype: int64

In [15]:
# refs by year
df_pubmed['Year'].value_counts()[350:400]

1927    2
summ    1
fall    1
Name: Year, dtype: int64

In [16]:
# fix some errors
# Fall      867
# Wint      776
# Spri      728
# Summ      625
# Autu      55
# spri      2
# fall    1
# summ    1
df_pubmed.loc[(df_pubmed['Year'] == 'Fall') | (df_pubmed['Year'] == 'Wint') | (df_pubmed['Year'] == 'Spri') | (df_pubmed['Year'] == 'Summ') | (df_pubmed['Year'] == 'Autu') | (df_pubmed['Year'] == 'spri') | (df_pubmed['Year'] == 'fall') | (df_pubmed['Year'] == 'summ')]

Unnamed: 0,PMID,Year,MedlineDate,DOI,NlmUniqueID
19705876,20376331,Wint,Winter 2006,,0323553
19792267,20467646,Summ,Summer 2008,,101473719
19792269,20467647,Summ,Summer 2008,,101473719
19792272,20467648,Summ,Summer 2008,,101473719
19792274,20467649,Summ,Summer 2008,,101473719
...,...,...,...,...,...
33780115,35278311,Spri,Spring 2022,10.55460/WUMM-Y4N3,101158402
33780116,35278312,Spri,Spring 2022,10.55460/ETZI-SI9T,101158402
33780118,35278313,Spri,Spring 2022,10.55460/8IUQ-907J,101158402
33780134,35278329,Spri,Spring 2022,10.55460/WXGA-QYX2,101158402


In [17]:
# fix Year taking the end of string
df_pubmed.loc[(df_pubmed['Year'] == 'Fall') | (df_pubmed['Year'] == 'Wint') | (df_pubmed['Year'] == 'Spri') | (df_pubmed['Year'] == 'Summ') | (df_pubmed['Year'] == 'Autu') | (df_pubmed['Year'] == 'spri') | (df_pubmed['Year'] == 'fall') | (df_pubmed['Year'] == 'summ'), 'Year'] = df_pubmed['MedlineDate'].astype(str).str[-4:]

In [18]:
# test
df_pubmed['Year'].value_counts()[300:350]

1833.0    221
1836.0    220
1820.0    213
1944      212
1834.0    209
1831      176
1797.0    69 
1781.0    69 
1786.0    60 
1868      58 
1865      48 
1790.0    45 
1787.0    43 
1785.0    43 
1871      43 
1929      38 
1839      36 
1943      35 
1840      34 
1788.0    33 
1792.0    33 
1789.0    32 
1922      32 
1784      31 
1783      26 
1923      25 
1860      24 
1841      24 
1782      22 
1933      21 
1873      21 
1876      20 
1796.0    20 
1798.0    19 
1791.0    19 
1794.0    18 
1793.0    17 
1932      17 
1928      15 
1925      13 
1795.0    13 
1935      6  
1918      5  
1924      3  
1927      2  
Name: Year, dtype: int64

In [19]:
# convert Year to int
df_pubmed.loc[df_pubmed['Year'].notna(), 'Year'] = df_pubmed['Year'].astype(int)
df_pubmed

Unnamed: 0,PMID,Year,MedlineDate,DOI,NlmUniqueID
0,1,1975,,10.1016/0006-2944(75)90147-7,0151424
1,2,1975,,10.1016/0006-291x(75)90482-9,0372516
2,4,1975,,10.1016/0006-291x(75)90506-9,0372516
3,3,1975,,10.1016/0006-291x(75)90498-2,0372516
4,5,1975,,10.1016/0006-291x(75)90508-2,0372516
...,...,...,...,...,...
34960695,36475580,2022,,10.1021/acsami.2c16255,101504991
34960696,36475581,2022,,10.1021/acsami.2c17703,101504991
34960697,36475582,2022,,10.1021/acsami.2c16205,101504991
34960698,36475583,2022,,10.1021/acsami.2c17147,101504991


In [20]:
# remove MedlineDate
del df_pubmed['MedlineDate']
df_pubmed

Unnamed: 0,PMID,Year,DOI,NlmUniqueID
0,1,1975,10.1016/0006-2944(75)90147-7,0151424
1,2,1975,10.1016/0006-291x(75)90482-9,0372516
2,4,1975,10.1016/0006-291x(75)90506-9,0372516
3,3,1975,10.1016/0006-291x(75)90498-2,0372516
4,5,1975,10.1016/0006-291x(75)90508-2,0372516
...,...,...,...,...
34960695,36475580,2022,10.1021/acsami.2c16255,101504991
34960696,36475581,2022,10.1021/acsami.2c17703,101504991
34960697,36475582,2022,10.1021/acsami.2c16205,101504991
34960698,36475583,2022,10.1021/acsami.2c17147,101504991


In [21]:
# test NAs
df_pubmed.loc[df_pubmed['DOI'].isna()]

Unnamed: 0,PMID,Year,DOI,NlmUniqueID
9,10,1975,,0101032
20,21,1975,,0372660
21,22,1975,,0372660
22,23,1975,,0372660
23,24,1975,,0372660
...,...,...,...,...
34960153,36475032,2022,,101528042
34960154,36475034,2022,,101528042
34960157,36475035,2022,,101528042
34960159,36475036,2022,,101528042


In [22]:
# simplification of DOI to 0/1 to reduce size
df_pubmed.loc[df_pubmed['DOI'].notna(), 'DOI2'] = 1
df_pubmed.loc[df_pubmed['DOI'].isna(), 'DOI2'] = 0
df_pubmed

Unnamed: 0,PMID,Year,DOI,NlmUniqueID,DOI2
0,1,1975,10.1016/0006-2944(75)90147-7,0151424,1.0
1,2,1975,10.1016/0006-291x(75)90482-9,0372516,1.0
2,4,1975,10.1016/0006-291x(75)90506-9,0372516,1.0
3,3,1975,10.1016/0006-291x(75)90498-2,0372516,1.0
4,5,1975,10.1016/0006-291x(75)90508-2,0372516,1.0
...,...,...,...,...,...
34960695,36475580,2022,10.1021/acsami.2c16255,101504991,1.0
34960696,36475581,2022,10.1021/acsami.2c17703,101504991,1.0
34960697,36475582,2022,10.1021/acsami.2c16205,101504991,1.0
34960698,36475583,2022,10.1021/acsami.2c17147,101504991,1.0


In [23]:
# test NAs
df_pubmed.loc[df_pubmed['DOI2'] == 0]

Unnamed: 0,PMID,Year,DOI,NlmUniqueID,DOI2
9,10,1975,,0101032,0.0
20,21,1975,,0372660,0.0
21,22,1975,,0372660,0.0
22,23,1975,,0372660,0.0
23,24,1975,,0372660,0.0
...,...,...,...,...,...
34960153,36475032,2022,,101528042,0.0
34960154,36475034,2022,,101528042,0.0
34960157,36475035,2022,,101528042,0.0
34960159,36475036,2022,,101528042,0.0


In [24]:
del df_pubmed['DOI']
df_pubmed = df_pubmed.rename(columns={'DOI2' : 'DOI'})
df_pubmed

Unnamed: 0,PMID,Year,NlmUniqueID,DOI
0,1,1975,0151424,1.0
1,2,1975,0372516,1.0
2,4,1975,0372516,1.0
3,3,1975,0372516,1.0
4,5,1975,0372516,1.0
...,...,...,...,...
34960695,36475580,2022,101504991,1.0
34960696,36475581,2022,101504991,1.0
34960697,36475582,2022,101504991,1.0
34960698,36475583,2022,101504991,1.0


In [25]:
# csv export
df_pubmed.to_csv('data/temp/2023/pubmed_simplified.tsv', sep='\t', index=False, header=True)

## MEDLINE Journals enriched

In [1]:
import pandas as pd
# display the full content of rows (non truncated)
pd.set_option('display.max_colwidth', -1)
# display all the columns
pd.set_option('display.max_columns', None)

In [2]:
# Open MEDLINE journals data merged with Sherpa/Romeo, DOAJ and reveres flipped data
df_journals = pd.read_csv('results/2023/MedlineJournals.tsv', delimiter='\t', header=0)
df_journals

Unnamed: 0,NlmUniqueID,Title,MedlineTA,Country,Place,Publisher,PublicationFirstYear,PublicationEndYear,Frequency,ISSN-Electronic,ISSN-Print,ISSN-Linking,Language,TitleContinuationYN,IndexingStartDate,CurrentlyIndexedYN,IndexOnlineYN,IndexingSubset,IndexingSelectedURL,ReportedMedlineYN,ISSN,sherpa_has_oa_path,embargo,embargo_published_version,embargo_accepted_version,sherpa_oa_green,additional_oa_fee,title_sherpa,issne_sherpa,issnp_sherpa,url,publisher_id,publisher_country,publisher_type,publisher_url,publisher_name,sherpa_uri,sherpa_created,sherpa_last_modified,sherpa_id,DOAJ_pissn,ISSN_old,DOAJ_title,DOAJ_eissn,DOAJ_Seal,DOAJ_date,DOAJ_year,DOAJ_license,DOAJ_author_holds_copyright_without_restrictions,DOAJ_last_updated,DOAJ_APC,DOAJ,year_founded,year_reverse_flipped,journal_location,society_affiliation,other_sci_affiliation,flipped_journal_name,post_flip_publisher,flipped_access_model,apc_pre,apc_post,apc_now,discipline,flipped_oa,flipped_born_oa,flipped_embargo,Reverse_Flip
0,9015384,20 century British history,20 Century Br Hist,England,"Eynsham, Oxford",Oxford University Press,1990,,"4 no. a year,",1477-4674,0955-2359,0955-2359,eng,N,1990.0,Y,N,QIS,,Y,1477-4674,yes,24.0,,24.0,yes,yes,Twentieth Century British History,1477-4674,0955-2359,https://academic.oup.com/tcbh,55.0,gb,university_publisher,https://academic.oup.com/journals/,Oxford University Press,https://v2.sherpa.ac.uk/id/publisher_policy/1112,2010-07-15 16:04:39,2022-07-26 10:25:23,1406.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,101714112,A&A practice,A A Pract,United States,"[Philadelphia, PA]","Wolters Kluwer Health, Inc.",2018,,Biweekly,2575-3126,,2575-3126,eng,Y,2018.0,Y,Y,IM,https://ovidsp.ovid.com/ovidweb.cgi?T=JS&MODE=ovid&PAGE=toc&D=ovft&AN=02054229-000000000-00000,Y,2575-3126,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,101269322,AACN advanced critical care,AACN Adv Crit Care,United States,"Aliso Viejo, CA",American Association of Critical-Care Nurses (AACN),2006,,Quarterly,1559-7776,1559-7768,1559-7768,eng,Y,2006.0,Y,Y,N,https://aacnjournals.org/aacnacconline,Y,1559-7776,yes,,,,,,AACN Advanced Critical Care,1559-7776,1559-7768,http://acc.aacnjournals.org/,663.0,us,society_publisher,https://www.aacn.org/,American Association of Critical Care Nurses,https://v2.sherpa.ac.uk/id/publisher_policy/663,2010-08-24 15:05:09,2022-07-08 08:42:33,10921.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,0431420,AANA journal,AANA J,United States,"Park Ridge, Ill.",American Association of Nurse Anesthetists,1974,,Bimonthly,2162-5239,0094-6354,0094-6354,eng,N,1974.0,Y,Y,N,https://www.aana.com/publications/aana-journal,Y,2162-5239,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,101223209,The AAPS journal,AAPS J,United States,"Arlington, Va., USA",American Association of Pharmaceutical Scientists,2004,,Four no. a year,1550-7416,,1550-7416,eng,Y,2004.0,Y,Y,IM,https://link.springer.com/journal/12248,Y,1550-7416,yes,12.0,,12.0,yes,yes,AAPS Journal,,1550-7416,http://link.springer.com/journal/12248,313.0,us,client_organisation,https://www.aaps.org/home,American Association of Pharmaceutical Scientists,https://v2.sherpa.ac.uk/id/publisher_policy/3291,2010-09-15 13:16:19,2023-01-05 14:55:40,16180.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5274,8702287,Zoological science,Zoolog Sci,Japan,"Tokyo, Japan",Zoological Society of Japan,1984,,"Monthly,",,0289-0003,0289-0003,eng,N,2002.0,Y,Y,IM,http://www.bioone.org/loi/jzoo,Y,0289-0003,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1984.0,2005.0,Japan,Zoological Society of Japan,0,Zoological Science,Zoological Society of Japan,Subscription,,-,-,LS,1.0,0.0,,yes
5275,9435608,"Zoology (Jena, Germany)",Zoology (Jena),Germany,"Jena, Germany",Urban & Fischer,1994,,"Six no. a year,",1873-2720,0944-2006,0944-2006,eng,N,2005.0,Y,Y,IM,https://www.sciencedirect.com/journal/zoology,Y,1873-2720,yes,12.0,,12.0,yes,yes,,,,,,,,,,,,,15919.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,
5276,101300786,Zoonoses and public health,Zoonoses Public Health,Germany,"Berlin, Germany",Blackwell Verlag,2007,,Ten no. a year,1863-2378,1863-1959,1863-1959,eng,Y,2007.0,Y,Y,IM,http://onlinelibrary.wiley.com/journal/10.1111/(ISSN)1863-2378,Y,1863-2378,yes,12.0,,12.0,yes,yes,Zoonoses and Public Health,1863-2378,1863-1959,https://onlinelibrary.wiley.com/journal/18632378,580.0,us,commercial_publisher,https://www.wiley.com/en-gb,Wiley,https://v2.sherpa.ac.uk/id/publisher_policy/2050,2010-07-20 14:58:33,2022-07-27 12:37:58,2555.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,
5277,101179386,Zootaxa,Zootaxa,New Zealand,"Auckland, N.Z.",Magnolia Press,2001,,Irregular,1175-5334,1175-5326,1175-5326,eng,N,2013.0,Y,Y,IM,http://www.mapress.com/j/zt/,Y,1175-5334,yes,,,,,yes,Zootaxa,1175-5334,1175-5326,https://www.mapress.com/zt/,284.0,nz,commercial_publisher,https://www.mapress.com/,Magnolia Press,https://v2.sherpa.ac.uk/id/publisher_policy/284,2010-06-30 17:47:50,2022-07-15 08:53:17,6.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [3]:
# Open Pubmed data
df_pubmed = pd.read_csv('data/temp/2023/pubmed_simplified.tsv', delimiter='\t', header=0)
df_pubmed

Unnamed: 0,PMID,Year,NlmUniqueID,DOI
0,1,1975,0151424,1.0
1,2,1975,0372516,1.0
2,4,1975,0372516,1.0
3,3,1975,0372516,1.0
4,5,1975,0372516,1.0
...,...,...,...,...
34960695,36475580,2022,101504991,1.0
34960696,36475581,2022,101504991,1.0
34960697,36475582,2022,101504991,1.0
34960698,36475583,2022,101504991,1.0


In [4]:
# test DOIs
df_pubmed.loc[df_pubmed['DOI'] == 0]

Unnamed: 0,PMID,Year,NlmUniqueID,DOI
9,10,1975,0101032,0.0
20,21,1975,0372660,0.0
21,22,1975,0372660,0.0
22,23,1975,0372660,0.0
23,24,1975,0372660,0.0
...,...,...,...,...
34960153,36475032,2022,101528042,0.0
34960154,36475034,2022,101528042,0.0
34960157,36475035,2022,101528042,0.0
34960159,36475036,2022,101528042,0.0


In [5]:
# test DOIs
df_pubmed.loc[df_pubmed['DOI'].isna()]

Unnamed: 0,PMID,Year,NlmUniqueID,DOI


In [6]:
# Open MEDLINE journals unconquerables
df_unconquerables = pd.read_csv('results/2023/MedlineJournals_not_sherpa_and_not_doaj_or_not_oa.tsv', delimiter='\t', header=0)
df_unconquerables

Unnamed: 0,NlmUniqueID,Title,MedlineTA,Country,Place,Publisher,PublicationFirstYear,PublicationEndYear,Frequency,ISSN-Electronic,ISSN-Print,ISSN-Linking,Language,TitleContinuationYN,IndexingStartDate,CurrentlyIndexedYN,IndexOnlineYN,IndexingSubset,IndexingSelectedURL,ReportedMedlineYN,ISSN,sherpa_has_oa_path,embargo,embargo_published_version,embargo_accepted_version,sherpa_oa_green,additional_oa_fee,title_sherpa,issne_sherpa,issnp_sherpa,url,publisher_id,publisher_country,publisher_type,publisher_url,publisher_name,sherpa_uri,sherpa_created,sherpa_last_modified,sherpa_id,DOAJ_pissn,ISSN_old,DOAJ_title,DOAJ_eissn,DOAJ_Seal,DOAJ_date,DOAJ_year,DOAJ_license,DOAJ_author_holds_copyright_without_restrictions,DOAJ_last_updated,DOAJ_APC,DOAJ,year_founded,year_reverse_flipped,journal_location,society_affiliation,other_sci_affiliation,flipped_journal_name,post_flip_publisher,flipped_access_model,apc_pre,apc_post,apc_now,discipline,flipped_oa,flipped_born_oa,flipped_embargo,Reverse_Flip,not_sherpa_and_not_doaj_or_not_oa
0,101714112,A&A practice,A A Pract,United States,"[Philadelphia, PA]","Wolters Kluwer Health, Inc.",2018,,Biweekly,2575-3126,,2575-3126,eng,Y,2018.0,Y,Y,IM,https://ovidsp.ovid.com/ovidweb.cgi?T=JS&MODE=ovid&PAGE=toc&D=ovft&AN=02054229-000000000-00000,Y,2575-3126,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,yes
1,101269322,AACN advanced critical care,AACN Adv Crit Care,United States,"Aliso Viejo, CA",American Association of Critical-Care Nurses (AACN),2006,,Quarterly,1559-7776,1559-7768,1559-7768,eng,Y,2006.0,Y,Y,N,https://aacnjournals.org/aacnacconline,Y,1559-7776,yes,,,,,,AACN Advanced Critical Care,1559-7776,1559-7768,http://acc.aacnjournals.org/,663.0,us,society_publisher,https://www.aacn.org/,American Association of Critical Care Nurses,https://v2.sherpa.ac.uk/id/publisher_policy/663,2010-08-24 15:05:09,2022-07-08 08:42:33,10921.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,yes
2,0431420,AANA journal,AANA J,United States,"Park Ridge, Ill.",American Association of Nurse Anesthetists,1974,,Bimonthly,2162-5239,0094-6354,0094-6354,eng,N,1974.0,Y,Y,N,https://www.aana.com/publications/aana-journal,Y,2162-5239,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,yes
3,101206716,Acta biochimica et biophysica Sinica,Acta Biochim Biophys Sin (Shanghai),China,Shanghai,China Science Publishing & Media Ltd.,2004,,Monthly,1745-7270,1672-9145,1672-9145,eng,Y,2004.0,Y,Y,IM,https://academic.oup.com/abbs,Y,1745-7270,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,yes
4,101194794,Acta of bioengineering and biomechanics,Acta Bioeng Biomech,Poland,Wrocław,Oficyna Wydawnicza Politechniki Wrocławskiej,1999,,Irregular,,1509-409X,1509-409X,eng,N,2007.0,Y,Y,IM,http://www.actabio.pwr.wroc.pl/archive.php,Y,1509-409X,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
778,9425197,Zhonghua yi xue yi chuan xue za zhi = Zhonghua yixue yichuanxue zazhi = Chinese journal of medical genetics,Zhonghua Yi Xue Yi Chuan Xue Za Zhi,China,"Chengdu, Sichuan, P.R. China",Sichuan University,1992,,Bimonthly,,1003-9406,1003-9406,chi,N,1998.0,Y,N,IM,,Y,1003-9406,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,yes
779,7511141,Zhonghua yi xue za zhi,Zhonghua Yi Xue Za Zhi,China,Beijing,Zhonghua yi xue hui,1960,,"Semimonthly,",,0376-2491,0376-2491,chi,N,1973.0,Y,N,IM,,Y,0376-2491,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,yes
780,7904962,Zhonghua yu fang yi xue za zhi [Chinese journal of preventive medicine],Zhonghua Yu Fang Yi Xue Za Zhi,China,Beijing,Zhonghua yi xue hui,1967,,Bimonthly,,0253-9624,0253-9624,chi,N,1979.0,Y,N,IM,,Y,0253-9624,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,yes
781,7910681,Zhonghua zhong liu za zhi [Chinese journal of oncology],Zhonghua Zhong Liu Za Zhi,China,Peking,Chinese Medical Association,1979,,Bimonthly,,0253-3766,0253-3766,chi,N,1979.0,Y,N,IM,,Y,0253-3766,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,yes


In [7]:
df_unconquerables.loc[df_unconquerables['NlmUniqueID'].notna(), 'unconquerables'] = 1
df_unconquerables

Unnamed: 0,NlmUniqueID,Title,MedlineTA,Country,Place,Publisher,PublicationFirstYear,PublicationEndYear,Frequency,ISSN-Electronic,ISSN-Print,ISSN-Linking,Language,TitleContinuationYN,IndexingStartDate,CurrentlyIndexedYN,IndexOnlineYN,IndexingSubset,IndexingSelectedURL,ReportedMedlineYN,ISSN,sherpa_has_oa_path,embargo,embargo_published_version,embargo_accepted_version,sherpa_oa_green,additional_oa_fee,title_sherpa,issne_sherpa,issnp_sherpa,url,publisher_id,publisher_country,publisher_type,publisher_url,publisher_name,sherpa_uri,sherpa_created,sherpa_last_modified,sherpa_id,DOAJ_pissn,ISSN_old,DOAJ_title,DOAJ_eissn,DOAJ_Seal,DOAJ_date,DOAJ_year,DOAJ_license,DOAJ_author_holds_copyright_without_restrictions,DOAJ_last_updated,DOAJ_APC,DOAJ,year_founded,year_reverse_flipped,journal_location,society_affiliation,other_sci_affiliation,flipped_journal_name,post_flip_publisher,flipped_access_model,apc_pre,apc_post,apc_now,discipline,flipped_oa,flipped_born_oa,flipped_embargo,Reverse_Flip,not_sherpa_and_not_doaj_or_not_oa,unconquerables
0,101714112,A&A practice,A A Pract,United States,"[Philadelphia, PA]","Wolters Kluwer Health, Inc.",2018,,Biweekly,2575-3126,,2575-3126,eng,Y,2018.0,Y,Y,IM,https://ovidsp.ovid.com/ovidweb.cgi?T=JS&MODE=ovid&PAGE=toc&D=ovft&AN=02054229-000000000-00000,Y,2575-3126,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,yes,1.0
1,101269322,AACN advanced critical care,AACN Adv Crit Care,United States,"Aliso Viejo, CA",American Association of Critical-Care Nurses (AACN),2006,,Quarterly,1559-7776,1559-7768,1559-7768,eng,Y,2006.0,Y,Y,N,https://aacnjournals.org/aacnacconline,Y,1559-7776,yes,,,,,,AACN Advanced Critical Care,1559-7776,1559-7768,http://acc.aacnjournals.org/,663.0,us,society_publisher,https://www.aacn.org/,American Association of Critical Care Nurses,https://v2.sherpa.ac.uk/id/publisher_policy/663,2010-08-24 15:05:09,2022-07-08 08:42:33,10921.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,yes,1.0
2,0431420,AANA journal,AANA J,United States,"Park Ridge, Ill.",American Association of Nurse Anesthetists,1974,,Bimonthly,2162-5239,0094-6354,0094-6354,eng,N,1974.0,Y,Y,N,https://www.aana.com/publications/aana-journal,Y,2162-5239,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,yes,1.0
3,101206716,Acta biochimica et biophysica Sinica,Acta Biochim Biophys Sin (Shanghai),China,Shanghai,China Science Publishing & Media Ltd.,2004,,Monthly,1745-7270,1672-9145,1672-9145,eng,Y,2004.0,Y,Y,IM,https://academic.oup.com/abbs,Y,1745-7270,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,yes,1.0
4,101194794,Acta of bioengineering and biomechanics,Acta Bioeng Biomech,Poland,Wrocław,Oficyna Wydawnicza Politechniki Wrocławskiej,1999,,Irregular,,1509-409X,1509-409X,eng,N,2007.0,Y,Y,IM,http://www.actabio.pwr.wroc.pl/archive.php,Y,1509-409X,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,yes,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
778,9425197,Zhonghua yi xue yi chuan xue za zhi = Zhonghua yixue yichuanxue zazhi = Chinese journal of medical genetics,Zhonghua Yi Xue Yi Chuan Xue Za Zhi,China,"Chengdu, Sichuan, P.R. China",Sichuan University,1992,,Bimonthly,,1003-9406,1003-9406,chi,N,1998.0,Y,N,IM,,Y,1003-9406,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,yes,1.0
779,7511141,Zhonghua yi xue za zhi,Zhonghua Yi Xue Za Zhi,China,Beijing,Zhonghua yi xue hui,1960,,"Semimonthly,",,0376-2491,0376-2491,chi,N,1973.0,Y,N,IM,,Y,0376-2491,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,yes,1.0
780,7904962,Zhonghua yu fang yi xue za zhi [Chinese journal of preventive medicine],Zhonghua Yu Fang Yi Xue Za Zhi,China,Beijing,Zhonghua yi xue hui,1967,,Bimonthly,,0253-9624,0253-9624,chi,N,1979.0,Y,N,IM,,Y,0253-9624,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,yes,1.0
781,7910681,Zhonghua zhong liu za zhi [Chinese journal of oncology],Zhonghua Zhong Liu Za Zhi,China,Peking,Chinese Medical Association,1979,,Bimonthly,,0253-3766,0253-3766,chi,N,1979.0,Y,N,IM,,Y,0253-3766,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,yes,1.0


In [8]:
# test types
df_unconquerables['NlmUniqueID'].dtypes

dtype('O')

In [9]:
# test empty IDs
df_unconquerables.loc[df_unconquerables['NlmUniqueID'].isna()]

Unnamed: 0,NlmUniqueID,Title,MedlineTA,Country,Place,Publisher,PublicationFirstYear,PublicationEndYear,Frequency,ISSN-Electronic,ISSN-Print,ISSN-Linking,Language,TitleContinuationYN,IndexingStartDate,CurrentlyIndexedYN,IndexOnlineYN,IndexingSubset,IndexingSelectedURL,ReportedMedlineYN,ISSN,sherpa_has_oa_path,embargo,embargo_published_version,embargo_accepted_version,sherpa_oa_green,additional_oa_fee,title_sherpa,issne_sherpa,issnp_sherpa,url,publisher_id,publisher_country,publisher_type,publisher_url,publisher_name,sherpa_uri,sherpa_created,sherpa_last_modified,sherpa_id,DOAJ_pissn,ISSN_old,DOAJ_title,DOAJ_eissn,DOAJ_Seal,DOAJ_date,DOAJ_year,DOAJ_license,DOAJ_author_holds_copyright_without_restrictions,DOAJ_last_updated,DOAJ_APC,DOAJ,year_founded,year_reverse_flipped,journal_location,society_affiliation,other_sci_affiliation,flipped_journal_name,post_flip_publisher,flipped_access_model,apc_pre,apc_post,apc_now,discipline,flipped_oa,flipped_born_oa,flipped_embargo,Reverse_Flip,not_sherpa_and_not_doaj_or_not_oa,unconquerables


In [10]:
# convert NlmUniqueID to int
# ERROR: some journals have alpha in NlmUniqueID: 2985165R, etc.
# df_unconquerables.loc[df_unconquerables['NlmUniqueID'].notna(), 'NlmUniqueID'] = df_unconquerables['NlmUniqueID'].astype('int64')

In [11]:
# convert to str
df_unconquerables.loc[df_unconquerables['NlmUniqueID'].notna(), 'NlmUniqueID'] = df_unconquerables['NlmUniqueID'].astype(str)

In [12]:
# convert to str
df_journals.loc[df_journals['NlmUniqueID'].notna(), 'NlmUniqueID'] = df_journals['NlmUniqueID'].astype(str)

In [13]:
# convert to str
df_pubmed.loc[df_pubmed['NlmUniqueID'].notna(), 'NlmUniqueID'] = df_pubmed['NlmUniqueID'].astype(str)

In [14]:
# merge
df_pubmed = df_pubmed.merge(df_unconquerables[['NlmUniqueID', 'unconquerables']], on='NlmUniqueID', how='left')
df_pubmed

Unnamed: 0,PMID,Year,NlmUniqueID,DOI,unconquerables
0,1,1975,0151424,1.0,
1,2,1975,0372516,1.0,
2,4,1975,0372516,1.0,
3,3,1975,0372516,1.0,
4,5,1975,0372516,1.0,
...,...,...,...,...,...
34960695,36475580,2022,101504991,1.0,
34960696,36475581,2022,101504991,1.0,
34960697,36475582,2022,101504991,1.0,
34960698,36475583,2022,101504991,1.0,


In [15]:
# test unconquerables
df_pubmed.loc[df_pubmed['unconquerables'] == 1]

Unnamed: 0,PMID,Year,NlmUniqueID,DOI,unconquerables
109,110,1975,7507032,0.0,1.0
145,146,1975,0004743,0.0,1.0
166,167,1975,0364441,0.0,1.0
276,277,1975,0374617,0.0,1.0
277,278,1975,0420550,0.0,1.0
...,...,...,...,...,...
34960666,36475547,2022,101776555,1.0,1.0
34960667,36475548,2022,101776555,1.0,1.0
34960675,36475556,2022,101777295,1.0,1.0
34960676,36475557,2022,8904033,1.0,1.0


In [16]:
df_unconquerables_years = df_unconquerables[['PublicationFirstYear', 'unconquerables']].groupby(by='PublicationFirstYear').count()

In [17]:
df_unconquerables_years.sort_values(by='PublicationFirstYear', ascending=False)

Unnamed: 0_level_0,unconquerables
PublicationFirstYear,Unnamed: 1_level_1
20uu,1
2023,2
2022,15
2021,17
2020,7
...,...
1872,1
1862,1
1859,1
1845,1


In [18]:
# attribution of 0 to other journals
df_pubmed.loc[df_pubmed['unconquerables'].isna(), 'unconquerables'] = 0
df_pubmed

Unnamed: 0,PMID,Year,NlmUniqueID,DOI,unconquerables
0,1,1975,0151424,1.0,0.0
1,2,1975,0372516,1.0,0.0
2,4,1975,0372516,1.0,0.0
3,3,1975,0372516,1.0,0.0
4,5,1975,0372516,1.0,0.0
...,...,...,...,...,...
34960695,36475580,2022,101504991,1.0,0.0
34960696,36475581,2022,101504991,1.0,0.0
34960697,36475582,2022,101504991,1.0,0.0
34960698,36475583,2022,101504991,1.0,0.0


In [19]:
# export counts by year
counts_pubmed_by_year = df_pubmed['Year'].value_counts().rename_axis('Year').reset_index(name='counts')
df_pubmed_by_year = pd.DataFrame(counts_pubmed_by_year, columns=['Year', 'counts'])
df_pubmed_by_year.sort_values(by='Year', ascending=False).to_csv('results/2023/pubmed_counts_by_year.tsv', header=True, sep='\t', index=False)
df_pubmed_by_year

Unnamed: 0,Year,counts
0,2022,1595029
1,2021,1567593
2,2020,1404250
3,2019,1228698
4,2018,1173512
...,...,...
238,1798,19
239,1791,19
240,1794,18
241,1793,17


In [20]:
# export counts by year for unconquerables
counts_pubmed_unconquerables_by_year = df_pubmed.loc[df_pubmed['unconquerables'] == 1]['Year'].value_counts().rename_axis('Year').reset_index(name='unconquerables_counts')
df_pubmed_unconquerables_by_year = pd.DataFrame(counts_pubmed_unconquerables_by_year, columns=['Year', 'unconquerables_counts'])
# df_pubmed_unconquerables_by_year.sort_values(by='Year', ascending=False).to_csv('results/2023/pubmed_unconquerables_counts_by_year.csv', header=True, sep='\t', index=False)
df_pubmed_unconquerables_by_year

Unnamed: 0,Year,unconquerables_counts
0,2021,90125
1,2020,86827
2,2019,84499
3,2018,84469
4,2022,84017
...,...,...
91,1932,45
92,1931,34
93,1929,32
94,1930,31


In [21]:
# merge
df_pubmed_by_year = df_pubmed_by_year.merge(df_pubmed_unconquerables_by_year, on='Year', how='left')
df_pubmed_by_year

Unnamed: 0,Year,counts,unconquerables_counts
0,2022,1595029,84017.0
1,2021,1567593,90125.0
2,2020,1404250,86827.0
3,2019,1228698,84499.0
4,2018,1173512,84469.0
...,...,...,...
238,1798,19,
239,1791,19,
240,1794,18,
241,1793,17,


In [22]:
# replace NaN by 0
df_pubmed_by_year.loc[df_pubmed_by_year['unconquerables_counts'].isna(), 'unconquerables_counts'] = 0
df_pubmed_by_year.loc[df_pubmed_by_year['unconquerables_counts'].notna(), 'unconquerables_counts'] = df_pubmed_by_year['unconquerables_counts'].astype(int)

In [23]:
# add ratios
df_pubmed_by_year['ratio'] = df_pubmed_by_year['unconquerables_counts'] / df_pubmed_by_year['counts']
df_pubmed_by_year

Unnamed: 0,Year,counts,unconquerables_counts,ratio
0,2022,1595029,84017,0.052674
1,2021,1567593,90125,0.057493
2,2020,1404250,86827,0.061832
3,2019,1228698,84499,0.068771
4,2018,1173512,84469,0.071980
...,...,...,...,...
238,1798,19,0,0.000000
239,1791,19,0,0.000000
240,1794,18,0,0.000000
241,1793,17,0,0.000000


In [24]:
# export
df_pubmed_by_year.sort_values(by='Year', ascending=False).to_csv('results/2023/pubmed_unconquerables_counts_by_year.tsv', sep='\t', index=False)
df_pubmed_by_year

Unnamed: 0,Year,counts,unconquerables_counts,ratio
0,2022,1595029,84017,0.052674
1,2021,1567593,90125,0.057493
2,2020,1404250,86827,0.061832
3,2019,1228698,84499,0.068771
4,2018,1173512,84469,0.071980
...,...,...,...,...
238,1798,19,0,0.000000
239,1791,19,0,0.000000
240,1794,18,0,0.000000
241,1793,17,0,0.000000


## Compare PubMed data for currently indexed journals only

In [25]:
df_journals

Unnamed: 0,NlmUniqueID,Title,MedlineTA,Country,Place,Publisher,PublicationFirstYear,PublicationEndYear,Frequency,ISSN-Electronic,ISSN-Print,ISSN-Linking,Language,TitleContinuationYN,IndexingStartDate,CurrentlyIndexedYN,IndexOnlineYN,IndexingSubset,IndexingSelectedURL,ReportedMedlineYN,ISSN,sherpa_has_oa_path,embargo,embargo_published_version,embargo_accepted_version,sherpa_oa_green,additional_oa_fee,title_sherpa,issne_sherpa,issnp_sherpa,url,publisher_id,publisher_country,publisher_type,publisher_url,publisher_name,sherpa_uri,sherpa_created,sherpa_last_modified,sherpa_id,DOAJ_pissn,ISSN_old,DOAJ_title,DOAJ_eissn,DOAJ_Seal,DOAJ_date,DOAJ_year,DOAJ_license,DOAJ_author_holds_copyright_without_restrictions,DOAJ_last_updated,DOAJ_APC,DOAJ,year_founded,year_reverse_flipped,journal_location,society_affiliation,other_sci_affiliation,flipped_journal_name,post_flip_publisher,flipped_access_model,apc_pre,apc_post,apc_now,discipline,flipped_oa,flipped_born_oa,flipped_embargo,Reverse_Flip
0,9015384,20 century British history,20 Century Br Hist,England,"Eynsham, Oxford",Oxford University Press,1990,,"4 no. a year,",1477-4674,0955-2359,0955-2359,eng,N,1990.0,Y,N,QIS,,Y,1477-4674,yes,24.0,,24.0,yes,yes,Twentieth Century British History,1477-4674,0955-2359,https://academic.oup.com/tcbh,55.0,gb,university_publisher,https://academic.oup.com/journals/,Oxford University Press,https://v2.sherpa.ac.uk/id/publisher_policy/1112,2010-07-15 16:04:39,2022-07-26 10:25:23,1406.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,101714112,A&A practice,A A Pract,United States,"[Philadelphia, PA]","Wolters Kluwer Health, Inc.",2018,,Biweekly,2575-3126,,2575-3126,eng,Y,2018.0,Y,Y,IM,https://ovidsp.ovid.com/ovidweb.cgi?T=JS&MODE=ovid&PAGE=toc&D=ovft&AN=02054229-000000000-00000,Y,2575-3126,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,101269322,AACN advanced critical care,AACN Adv Crit Care,United States,"Aliso Viejo, CA",American Association of Critical-Care Nurses (AACN),2006,,Quarterly,1559-7776,1559-7768,1559-7768,eng,Y,2006.0,Y,Y,N,https://aacnjournals.org/aacnacconline,Y,1559-7776,yes,,,,,,AACN Advanced Critical Care,1559-7776,1559-7768,http://acc.aacnjournals.org/,663.0,us,society_publisher,https://www.aacn.org/,American Association of Critical Care Nurses,https://v2.sherpa.ac.uk/id/publisher_policy/663,2010-08-24 15:05:09,2022-07-08 08:42:33,10921.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,0431420,AANA journal,AANA J,United States,"Park Ridge, Ill.",American Association of Nurse Anesthetists,1974,,Bimonthly,2162-5239,0094-6354,0094-6354,eng,N,1974.0,Y,Y,N,https://www.aana.com/publications/aana-journal,Y,2162-5239,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,101223209,The AAPS journal,AAPS J,United States,"Arlington, Va., USA",American Association of Pharmaceutical Scientists,2004,,Four no. a year,1550-7416,,1550-7416,eng,Y,2004.0,Y,Y,IM,https://link.springer.com/journal/12248,Y,1550-7416,yes,12.0,,12.0,yes,yes,AAPS Journal,,1550-7416,http://link.springer.com/journal/12248,313.0,us,client_organisation,https://www.aaps.org/home,American Association of Pharmaceutical Scientists,https://v2.sherpa.ac.uk/id/publisher_policy/3291,2010-09-15 13:16:19,2023-01-05 14:55:40,16180.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5274,8702287,Zoological science,Zoolog Sci,Japan,"Tokyo, Japan",Zoological Society of Japan,1984,,"Monthly,",,0289-0003,0289-0003,eng,N,2002.0,Y,Y,IM,http://www.bioone.org/loi/jzoo,Y,0289-0003,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1984.0,2005.0,Japan,Zoological Society of Japan,0,Zoological Science,Zoological Society of Japan,Subscription,,-,-,LS,1.0,0.0,,yes
5275,9435608,"Zoology (Jena, Germany)",Zoology (Jena),Germany,"Jena, Germany",Urban & Fischer,1994,,"Six no. a year,",1873-2720,0944-2006,0944-2006,eng,N,2005.0,Y,Y,IM,https://www.sciencedirect.com/journal/zoology,Y,1873-2720,yes,12.0,,12.0,yes,yes,,,,,,,,,,,,,15919.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,
5276,101300786,Zoonoses and public health,Zoonoses Public Health,Germany,"Berlin, Germany",Blackwell Verlag,2007,,Ten no. a year,1863-2378,1863-1959,1863-1959,eng,Y,2007.0,Y,Y,IM,http://onlinelibrary.wiley.com/journal/10.1111/(ISSN)1863-2378,Y,1863-2378,yes,12.0,,12.0,yes,yes,Zoonoses and Public Health,1863-2378,1863-1959,https://onlinelibrary.wiley.com/journal/18632378,580.0,us,commercial_publisher,https://www.wiley.com/en-gb,Wiley,https://v2.sherpa.ac.uk/id/publisher_policy/2050,2010-07-20 14:58:33,2022-07-27 12:37:58,2555.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,
5277,101179386,Zootaxa,Zootaxa,New Zealand,"Auckland, N.Z.",Magnolia Press,2001,,Irregular,1175-5334,1175-5326,1175-5326,eng,N,2013.0,Y,Y,IM,http://www.mapress.com/j/zt/,Y,1175-5334,yes,,,,,yes,Zootaxa,1175-5334,1175-5326,https://www.mapress.com/zt/,284.0,nz,commercial_publisher,https://www.mapress.com/,Magnolia Press,https://v2.sherpa.ac.uk/id/publisher_policy/284,2010-06-30 17:47:50,2022-07-15 08:53:17,6.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [26]:
# test currently indexed
df_journals.loc[df_journals['CurrentlyIndexedYN'] == 'N']

Unnamed: 0,NlmUniqueID,Title,MedlineTA,Country,Place,Publisher,PublicationFirstYear,PublicationEndYear,Frequency,ISSN-Electronic,ISSN-Print,ISSN-Linking,Language,TitleContinuationYN,IndexingStartDate,CurrentlyIndexedYN,IndexOnlineYN,IndexingSubset,IndexingSelectedURL,ReportedMedlineYN,ISSN,sherpa_has_oa_path,embargo,embargo_published_version,embargo_accepted_version,sherpa_oa_green,additional_oa_fee,title_sherpa,issne_sherpa,issnp_sherpa,url,publisher_id,publisher_country,publisher_type,publisher_url,publisher_name,sherpa_uri,sherpa_created,sherpa_last_modified,sherpa_id,DOAJ_pissn,ISSN_old,DOAJ_title,DOAJ_eissn,DOAJ_Seal,DOAJ_date,DOAJ_year,DOAJ_license,DOAJ_author_holds_copyright_without_restrictions,DOAJ_last_updated,DOAJ_APC,DOAJ,year_founded,year_reverse_flipped,journal_location,society_affiliation,other_sci_affiliation,flipped_journal_name,post_flip_publisher,flipped_access_model,apc_pre,apc_post,apc_now,discipline,flipped_oa,flipped_born_oa,flipped_embargo,Reverse_Flip


In [27]:
# test currently indexed
df_journals.loc[df_journals['CurrentlyIndexedYN'].isna()]

Unnamed: 0,NlmUniqueID,Title,MedlineTA,Country,Place,Publisher,PublicationFirstYear,PublicationEndYear,Frequency,ISSN-Electronic,ISSN-Print,ISSN-Linking,Language,TitleContinuationYN,IndexingStartDate,CurrentlyIndexedYN,IndexOnlineYN,IndexingSubset,IndexingSelectedURL,ReportedMedlineYN,ISSN,sherpa_has_oa_path,embargo,embargo_published_version,embargo_accepted_version,sherpa_oa_green,additional_oa_fee,title_sherpa,issne_sherpa,issnp_sherpa,url,publisher_id,publisher_country,publisher_type,publisher_url,publisher_name,sherpa_uri,sherpa_created,sherpa_last_modified,sherpa_id,DOAJ_pissn,ISSN_old,DOAJ_title,DOAJ_eissn,DOAJ_Seal,DOAJ_date,DOAJ_year,DOAJ_license,DOAJ_author_holds_copyright_without_restrictions,DOAJ_last_updated,DOAJ_APC,DOAJ,year_founded,year_reverse_flipped,journal_location,society_affiliation,other_sci_affiliation,flipped_journal_name,post_flip_publisher,flipped_access_model,apc_pre,apc_post,apc_now,discipline,flipped_oa,flipped_born_oa,flipped_embargo,Reverse_Flip


In [28]:
# add 1
df_journals.loc[df_journals['NlmUniqueID'].notna(), 'CurrentlyIndexed'] = 1
df_journals

Unnamed: 0,NlmUniqueID,Title,MedlineTA,Country,Place,Publisher,PublicationFirstYear,PublicationEndYear,Frequency,ISSN-Electronic,ISSN-Print,ISSN-Linking,Language,TitleContinuationYN,IndexingStartDate,CurrentlyIndexedYN,IndexOnlineYN,IndexingSubset,IndexingSelectedURL,ReportedMedlineYN,ISSN,sherpa_has_oa_path,embargo,embargo_published_version,embargo_accepted_version,sherpa_oa_green,additional_oa_fee,title_sherpa,issne_sherpa,issnp_sherpa,url,publisher_id,publisher_country,publisher_type,publisher_url,publisher_name,sherpa_uri,sherpa_created,sherpa_last_modified,sherpa_id,DOAJ_pissn,ISSN_old,DOAJ_title,DOAJ_eissn,DOAJ_Seal,DOAJ_date,DOAJ_year,DOAJ_license,DOAJ_author_holds_copyright_without_restrictions,DOAJ_last_updated,DOAJ_APC,DOAJ,year_founded,year_reverse_flipped,journal_location,society_affiliation,other_sci_affiliation,flipped_journal_name,post_flip_publisher,flipped_access_model,apc_pre,apc_post,apc_now,discipline,flipped_oa,flipped_born_oa,flipped_embargo,Reverse_Flip,CurrentlyIndexed
0,9015384,20 century British history,20 Century Br Hist,England,"Eynsham, Oxford",Oxford University Press,1990,,"4 no. a year,",1477-4674,0955-2359,0955-2359,eng,N,1990.0,Y,N,QIS,,Y,1477-4674,yes,24.0,,24.0,yes,yes,Twentieth Century British History,1477-4674,0955-2359,https://academic.oup.com/tcbh,55.0,gb,university_publisher,https://academic.oup.com/journals/,Oxford University Press,https://v2.sherpa.ac.uk/id/publisher_policy/1112,2010-07-15 16:04:39,2022-07-26 10:25:23,1406.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0
1,101714112,A&A practice,A A Pract,United States,"[Philadelphia, PA]","Wolters Kluwer Health, Inc.",2018,,Biweekly,2575-3126,,2575-3126,eng,Y,2018.0,Y,Y,IM,https://ovidsp.ovid.com/ovidweb.cgi?T=JS&MODE=ovid&PAGE=toc&D=ovft&AN=02054229-000000000-00000,Y,2575-3126,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0
2,101269322,AACN advanced critical care,AACN Adv Crit Care,United States,"Aliso Viejo, CA",American Association of Critical-Care Nurses (AACN),2006,,Quarterly,1559-7776,1559-7768,1559-7768,eng,Y,2006.0,Y,Y,N,https://aacnjournals.org/aacnacconline,Y,1559-7776,yes,,,,,,AACN Advanced Critical Care,1559-7776,1559-7768,http://acc.aacnjournals.org/,663.0,us,society_publisher,https://www.aacn.org/,American Association of Critical Care Nurses,https://v2.sherpa.ac.uk/id/publisher_policy/663,2010-08-24 15:05:09,2022-07-08 08:42:33,10921.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0
3,0431420,AANA journal,AANA J,United States,"Park Ridge, Ill.",American Association of Nurse Anesthetists,1974,,Bimonthly,2162-5239,0094-6354,0094-6354,eng,N,1974.0,Y,Y,N,https://www.aana.com/publications/aana-journal,Y,2162-5239,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0
4,101223209,The AAPS journal,AAPS J,United States,"Arlington, Va., USA",American Association of Pharmaceutical Scientists,2004,,Four no. a year,1550-7416,,1550-7416,eng,Y,2004.0,Y,Y,IM,https://link.springer.com/journal/12248,Y,1550-7416,yes,12.0,,12.0,yes,yes,AAPS Journal,,1550-7416,http://link.springer.com/journal/12248,313.0,us,client_organisation,https://www.aaps.org/home,American Association of Pharmaceutical Scientists,https://v2.sherpa.ac.uk/id/publisher_policy/3291,2010-09-15 13:16:19,2023-01-05 14:55:40,16180.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5274,8702287,Zoological science,Zoolog Sci,Japan,"Tokyo, Japan",Zoological Society of Japan,1984,,"Monthly,",,0289-0003,0289-0003,eng,N,2002.0,Y,Y,IM,http://www.bioone.org/loi/jzoo,Y,0289-0003,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1984.0,2005.0,Japan,Zoological Society of Japan,0,Zoological Science,Zoological Society of Japan,Subscription,,-,-,LS,1.0,0.0,,yes,1.0
5275,9435608,"Zoology (Jena, Germany)",Zoology (Jena),Germany,"Jena, Germany",Urban & Fischer,1994,,"Six no. a year,",1873-2720,0944-2006,0944-2006,eng,N,2005.0,Y,Y,IM,https://www.sciencedirect.com/journal/zoology,Y,1873-2720,yes,12.0,,12.0,yes,yes,,,,,,,,,,,,,15919.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0
5276,101300786,Zoonoses and public health,Zoonoses Public Health,Germany,"Berlin, Germany",Blackwell Verlag,2007,,Ten no. a year,1863-2378,1863-1959,1863-1959,eng,Y,2007.0,Y,Y,IM,http://onlinelibrary.wiley.com/journal/10.1111/(ISSN)1863-2378,Y,1863-2378,yes,12.0,,12.0,yes,yes,Zoonoses and Public Health,1863-2378,1863-1959,https://onlinelibrary.wiley.com/journal/18632378,580.0,us,commercial_publisher,https://www.wiley.com/en-gb,Wiley,https://v2.sherpa.ac.uk/id/publisher_policy/2050,2010-07-20 14:58:33,2022-07-27 12:37:58,2555.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0
5277,101179386,Zootaxa,Zootaxa,New Zealand,"Auckland, N.Z.",Magnolia Press,2001,,Irregular,1175-5334,1175-5326,1175-5326,eng,N,2013.0,Y,Y,IM,http://www.mapress.com/j/zt/,Y,1175-5334,yes,,,,,yes,Zootaxa,1175-5334,1175-5326,https://www.mapress.com/zt/,284.0,nz,commercial_publisher,https://www.mapress.com/,Magnolia Press,https://v2.sherpa.ac.uk/id/publisher_policy/284,2010-06-30 17:47:50,2022-07-15 08:53:17,6.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0


In [29]:
# merge
df_pubmed = df_pubmed.merge(df_journals[['NlmUniqueID', 'CurrentlyIndexed']], on='NlmUniqueID', how='left')
df_pubmed

Unnamed: 0,PMID,Year,NlmUniqueID,DOI,unconquerables,CurrentlyIndexed
0,1,1975,0151424,1.0,0.0,
1,2,1975,0372516,1.0,0.0,1.0
2,4,1975,0372516,1.0,0.0,1.0
3,3,1975,0372516,1.0,0.0,1.0
4,5,1975,0372516,1.0,0.0,1.0
...,...,...,...,...,...,...
34960695,36475580,2022,101504991,1.0,0.0,1.0
34960696,36475581,2022,101504991,1.0,0.0,1.0
34960697,36475582,2022,101504991,1.0,0.0,1.0
34960698,36475583,2022,101504991,1.0,0.0,1.0


In [30]:
# attribution of 0 to other journals
df_pubmed.loc[df_pubmed['CurrentlyIndexed'].isna(), 'CurrentlyIndexed'] = 0
df_pubmed

Unnamed: 0,PMID,Year,NlmUniqueID,DOI,unconquerables,CurrentlyIndexed
0,1,1975,0151424,1.0,0.0,0.0
1,2,1975,0372516,1.0,0.0,1.0
2,4,1975,0372516,1.0,0.0,1.0
3,3,1975,0372516,1.0,0.0,1.0
4,5,1975,0372516,1.0,0.0,1.0
...,...,...,...,...,...,...
34960695,36475580,2022,101504991,1.0,0.0,1.0
34960696,36475581,2022,101504991,1.0,0.0,1.0
34960697,36475582,2022,101504991,1.0,0.0,1.0
34960698,36475583,2022,101504991,1.0,0.0,1.0


In [31]:
# export counts by year for currently indexed
counts_pubmed_currently_indexed_by_year = df_pubmed.loc[df_pubmed['CurrentlyIndexed'] == 1]['Year'].value_counts().rename_axis('Year').reset_index(name='counts_currently_indexed')
df_pubmed_currently_indexed_by_year = pd.DataFrame(counts_pubmed_currently_indexed_by_year, columns=['Year', 'counts_currently_indexed'])
df_pubmed_currently_indexed_by_year.sort_values(by='Year', ascending=False).to_csv('results/2023/pubmed_currently_indexed_counts_by_year.tsv', header=True, sep='\t', index=False)
df_pubmed_currently_indexed_by_year

Unnamed: 0,Year,counts_currently_indexed
0,2022,1244839
1,2021,1205319
2,2020,1119747
3,2019,1021270
4,2018,991396
...,...,...
141,1894,333
142,1880,325
143,1879,31
144,1878,30


In [32]:
# merge
df_pubmed_by_year = df_pubmed_by_year.merge(df_pubmed_currently_indexed_by_year, on='Year', how='left')
df_pubmed_by_year

Unnamed: 0,Year,counts,unconquerables_counts,ratio,counts_currently_indexed
0,2022,1595029,84017,0.052674,1244839.0
1,2021,1567593,90125,0.057493,1205319.0
2,2020,1404250,86827,0.061832,1119747.0
3,2019,1228698,84499,0.068771,1021270.0
4,2018,1173512,84469,0.071980,991396.0
...,...,...,...,...,...
238,1798,19,0,0.000000,
239,1791,19,0,0.000000,
240,1794,18,0,0.000000,
241,1793,17,0,0.000000,


In [33]:
# replace NaN by 0
df_pubmed_by_year.loc[df_pubmed_by_year['counts_currently_indexed'].isna(), 'counts_currently_indexed'] = 0
df_pubmed_by_year.loc[df_pubmed_by_year['counts_currently_indexed'].notna(), 'counts_currently_indexed'] = df_pubmed_by_year['counts_currently_indexed'].astype(int)

In [34]:
# add ratios
df_pubmed_by_year['ratio2'] = df_pubmed_by_year['unconquerables_counts'] / df_pubmed_by_year['counts_currently_indexed']
df_pubmed_by_year

Unnamed: 0,Year,counts,unconquerables_counts,ratio,counts_currently_indexed,ratio2
0,2022,1595029,84017,0.052674,1244839,0.067492
1,2021,1567593,90125,0.057493,1205319,0.074773
2,2020,1404250,86827,0.061832,1119747,0.077542
3,2019,1228698,84499,0.068771,1021270,0.082739
4,2018,1173512,84469,0.071980,991396,0.085202
...,...,...,...,...,...,...
238,1798,19,0,0.000000,0,
239,1791,19,0,0.000000,0,
240,1794,18,0,0.000000,0,
241,1793,17,0,0.000000,0,


In [35]:
# export
df_pubmed_by_year.sort_values(by='Year', ascending=False).to_csv('results/2023/pubmed_unconquerables_counts_by_year.tsv', sep='\t', index=False)
df_pubmed_by_year

Unnamed: 0,Year,counts,unconquerables_counts,ratio,counts_currently_indexed,ratio2
0,2022,1595029,84017,0.052674,1244839,0.067492
1,2021,1567593,90125,0.057493,1205319,0.074773
2,2020,1404250,86827,0.061832,1119747,0.077542
3,2019,1228698,84499,0.068771,1021270,0.082739
4,2018,1173512,84469,0.071980,991396,0.085202
...,...,...,...,...,...,...
238,1798,19,0,0.000000,0,
239,1791,19,0,0.000000,0,
240,1794,18,0,0.000000,0,
241,1793,17,0,0.000000,0,


In [36]:
df_pubmed

Unnamed: 0,PMID,Year,NlmUniqueID,DOI,unconquerables,CurrentlyIndexed
0,1,1975,0151424,1.0,0.0,0.0
1,2,1975,0372516,1.0,0.0,1.0
2,4,1975,0372516,1.0,0.0,1.0
3,3,1975,0372516,1.0,0.0,1.0
4,5,1975,0372516,1.0,0.0,1.0
...,...,...,...,...,...,...
34960695,36475580,2022,101504991,1.0,0.0,1.0
34960696,36475581,2022,101504991,1.0,0.0,1.0
34960697,36475582,2022,101504991,1.0,0.0,1.0
34960698,36475583,2022,101504991,1.0,0.0,1.0


In [37]:
# csv export
df_pubmed.to_csv('results/2023/pubmed_merged_unconquerables_and_currently_indexed.tsv', sep='\t', index=False, header=True)

## Import PubMed publication first and last year for each journal

In [38]:
# take the first publication year for each journal
df_pubmed_journals_first_year = df_pubmed[['Year', 'NlmUniqueID']].groupby(by='NlmUniqueID').min()
df_pubmed_journals_first_year.sort_values(by='NlmUniqueID', ascending=True).to_csv('results/2023/pubmed_journals_first_year.tsv', sep='\t', index=True)
df_pubmed_journals_first_year

Unnamed: 0_level_0,Year
NlmUniqueID,Unnamed: 1_level_1
0000201,1963
0000211,1965
0000212,1965
0000213,1945
0000216,1964
...,...
9918505287106676,2022
9918505588006676,2023
9918505588106676,2022
9918505688506676,2022


In [39]:
# take the last publication year for each journal
df_pubmed_journals_last_year = df_pubmed[['Year', 'NlmUniqueID']].groupby(by='NlmUniqueID').max()
df_pubmed_journals_last_year.sort_values(by='NlmUniqueID', ascending=True).to_csv('results/2023/pubmed_journals_last_year.tsv', sep='\t', index=True)
df_pubmed_journals_last_year

Unnamed: 0_level_0,Year
NlmUniqueID,Unnamed: 1_level_1
0000201,1977
0000211,1991
0000212,1971
0000213,1992
0000216,1988
...,...
9918505287106676,2022
9918505588006676,2023
9918505588106676,2022
9918505688506676,2022


In [40]:
# reset index and rename columns
df_pubmed_journals_first_year = df_pubmed_journals_first_year.reset_index(drop=False)
df_pubmed_journals_last_year = df_pubmed_journals_last_year.reset_index(drop=False)
df_pubmed_journals_first_year = df_pubmed_journals_first_year.rename(columns={'Year' : 'first_year'})
df_pubmed_journals_last_year = df_pubmed_journals_last_year.rename(columns={'Year' : 'last_year'})

In [41]:
df_pubmed_journals_first_year

Unnamed: 0,NlmUniqueID,first_year
0,0000201,1963
1,0000211,1965
2,0000212,1965
3,0000213,1945
4,0000216,1964
...,...,...
35635,9918505287106676,2022
35636,9918505588006676,2023
35637,9918505588106676,2022
35638,9918505688506676,2022


In [42]:
df_pubmed_journals_last_year

Unnamed: 0,NlmUniqueID,last_year
0,0000201,1977
1,0000211,1991
2,0000212,1971
3,0000213,1992
4,0000216,1988
...,...,...
35635,9918505287106676,2022
35636,9918505588006676,2023
35637,9918505588106676,2022
35638,9918505688506676,2022


In [43]:
# merge first and last year infos for each journal
df_journals = df_journals.merge(df_pubmed_journals_first_year, on='NlmUniqueID', how='left')
df_journals = df_journals.merge(df_pubmed_journals_last_year, on='NlmUniqueID', how='left')
df_journals

Unnamed: 0,NlmUniqueID,Title,MedlineTA,Country,Place,Publisher,PublicationFirstYear,PublicationEndYear,Frequency,ISSN-Electronic,ISSN-Print,ISSN-Linking,Language,TitleContinuationYN,IndexingStartDate,CurrentlyIndexedYN,IndexOnlineYN,IndexingSubset,IndexingSelectedURL,ReportedMedlineYN,ISSN,sherpa_has_oa_path,embargo,embargo_published_version,embargo_accepted_version,sherpa_oa_green,additional_oa_fee,title_sherpa,issne_sherpa,issnp_sherpa,url,publisher_id,publisher_country,publisher_type,publisher_url,publisher_name,sherpa_uri,sherpa_created,sherpa_last_modified,sherpa_id,DOAJ_pissn,ISSN_old,DOAJ_title,DOAJ_eissn,DOAJ_Seal,DOAJ_date,DOAJ_year,DOAJ_license,DOAJ_author_holds_copyright_without_restrictions,DOAJ_last_updated,DOAJ_APC,DOAJ,year_founded,year_reverse_flipped,journal_location,society_affiliation,other_sci_affiliation,flipped_journal_name,post_flip_publisher,flipped_access_model,apc_pre,apc_post,apc_now,discipline,flipped_oa,flipped_born_oa,flipped_embargo,Reverse_Flip,CurrentlyIndexed,first_year,last_year
0,9015384,20 century British history,20 Century Br Hist,England,"Eynsham, Oxford",Oxford University Press,1990,,"4 no. a year,",1477-4674,0955-2359,0955-2359,eng,N,1990.0,Y,N,QIS,,Y,1477-4674,yes,24.0,,24.0,yes,yes,Twentieth Century British History,1477-4674,0955-2359,https://academic.oup.com/tcbh,55.0,gb,university_publisher,https://academic.oup.com/journals/,Oxford University Press,https://v2.sherpa.ac.uk/id/publisher_policy/1112,2010-07-15 16:04:39,2022-07-26 10:25:23,1406.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,1990.0,2019.0
1,101714112,A&A practice,A A Pract,United States,"[Philadelphia, PA]","Wolters Kluwer Health, Inc.",2018,,Biweekly,2575-3126,,2575-3126,eng,Y,2018.0,Y,Y,IM,https://ovidsp.ovid.com/ovidweb.cgi?T=JS&MODE=ovid&PAGE=toc&D=ovft&AN=02054229-000000000-00000,Y,2575-3126,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,2018.0,2022.0
2,101269322,AACN advanced critical care,AACN Adv Crit Care,United States,"Aliso Viejo, CA",American Association of Critical-Care Nurses (AACN),2006,,Quarterly,1559-7776,1559-7768,1559-7768,eng,Y,2006.0,Y,Y,N,https://aacnjournals.org/aacnacconline,Y,1559-7776,yes,,,,,,AACN Advanced Critical Care,1559-7776,1559-7768,http://acc.aacnjournals.org/,663.0,us,society_publisher,https://www.aacn.org/,American Association of Critical Care Nurses,https://v2.sherpa.ac.uk/id/publisher_policy/663,2010-08-24 15:05:09,2022-07-08 08:42:33,10921.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,2006.0,2022.0
3,0431420,AANA journal,AANA J,United States,"Park Ridge, Ill.",American Association of Nurse Anesthetists,1974,,Bimonthly,2162-5239,0094-6354,0094-6354,eng,N,1974.0,Y,Y,N,https://www.aana.com/publications/aana-journal,Y,2162-5239,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,1974.0,2022.0
4,101223209,The AAPS journal,AAPS J,United States,"Arlington, Va., USA",American Association of Pharmaceutical Scientists,2004,,Four no. a year,1550-7416,,1550-7416,eng,Y,2004.0,Y,Y,IM,https://link.springer.com/journal/12248,Y,1550-7416,yes,12.0,,12.0,yes,yes,AAPS Journal,,1550-7416,http://link.springer.com/journal/12248,313.0,us,client_organisation,https://www.aaps.org/home,American Association of Pharmaceutical Scientists,https://v2.sherpa.ac.uk/id/publisher_policy/3291,2010-09-15 13:16:19,2023-01-05 14:55:40,16180.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,2004.0,2022.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5274,8702287,Zoological science,Zoolog Sci,Japan,"Tokyo, Japan",Zoological Society of Japan,1984,,"Monthly,",,0289-0003,0289-0003,eng,N,2002.0,Y,Y,IM,http://www.bioone.org/loi/jzoo,Y,0289-0003,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1984.0,2005.0,Japan,Zoological Society of Japan,0,Zoological Science,Zoological Society of Japan,Subscription,,-,-,LS,1.0,0.0,,yes,1.0,1987.0,2022.0
5275,9435608,"Zoology (Jena, Germany)",Zoology (Jena),Germany,"Jena, Germany",Urban & Fischer,1994,,"Six no. a year,",1873-2720,0944-2006,0944-2006,eng,N,2005.0,Y,Y,IM,https://www.sciencedirect.com/journal/zoology,Y,1873-2720,yes,12.0,,12.0,yes,yes,,,,,,,,,,,,,15919.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,2001.0,2022.0
5276,101300786,Zoonoses and public health,Zoonoses Public Health,Germany,"Berlin, Germany",Blackwell Verlag,2007,,Ten no. a year,1863-2378,1863-1959,1863-1959,eng,Y,2007.0,Y,Y,IM,http://onlinelibrary.wiley.com/journal/10.1111/(ISSN)1863-2378,Y,1863-2378,yes,12.0,,12.0,yes,yes,Zoonoses and Public Health,1863-2378,1863-1959,https://onlinelibrary.wiley.com/journal/18632378,580.0,us,commercial_publisher,https://www.wiley.com/en-gb,Wiley,https://v2.sherpa.ac.uk/id/publisher_policy/2050,2010-07-20 14:58:33,2022-07-27 12:37:58,2555.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,2007.0,2022.0
5277,101179386,Zootaxa,Zootaxa,New Zealand,"Auckland, N.Z.",Magnolia Press,2001,,Irregular,1175-5334,1175-5326,1175-5326,eng,N,2013.0,Y,Y,IM,http://www.mapress.com/j/zt/,Y,1175-5334,yes,,,,,yes,Zootaxa,1175-5334,1175-5326,https://www.mapress.com/zt/,284.0,nz,commercial_publisher,https://www.mapress.com/,Magnolia Press,https://v2.sherpa.ac.uk/id/publisher_policy/284,2010-06-30 17:47:50,2022-07-15 08:53:17,6.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,2008.0,2022.0


In [44]:
# test empty values
df_journals.loc[df_journals['first_year'].isna()]

Unnamed: 0,NlmUniqueID,Title,MedlineTA,Country,Place,Publisher,PublicationFirstYear,PublicationEndYear,Frequency,ISSN-Electronic,ISSN-Print,ISSN-Linking,Language,TitleContinuationYN,IndexingStartDate,CurrentlyIndexedYN,IndexOnlineYN,IndexingSubset,IndexingSelectedURL,ReportedMedlineYN,ISSN,sherpa_has_oa_path,embargo,embargo_published_version,embargo_accepted_version,sherpa_oa_green,additional_oa_fee,title_sherpa,issne_sherpa,issnp_sherpa,url,publisher_id,publisher_country,publisher_type,publisher_url,publisher_name,sherpa_uri,sherpa_created,sherpa_last_modified,sherpa_id,DOAJ_pissn,ISSN_old,DOAJ_title,DOAJ_eissn,DOAJ_Seal,DOAJ_date,DOAJ_year,DOAJ_license,DOAJ_author_holds_copyright_without_restrictions,DOAJ_last_updated,DOAJ_APC,DOAJ,year_founded,year_reverse_flipped,journal_location,society_affiliation,other_sci_affiliation,flipped_journal_name,post_flip_publisher,flipped_access_model,apc_pre,apc_post,apc_now,discipline,flipped_oa,flipped_born_oa,flipped_embargo,Reverse_Flip,CurrentlyIndexed,first_year,last_year
134,9918523075306676,Advances in kidney disease and health,Adv Kidney Dis Health,United States,[New York],Elsevier Inc.,2023,,Bimonthly,2949-8139,2949-8147,2949-8139,eng,Y,2023.0,Y,Y,IM,https://www.sciencedirect.com/journal/advances-in-kidney-disease-and-health,Y,2949-8139,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,,
713,9918522188506676,Biomolecules and biomedicine,Biomol Biomed,Bosnia and Herzegovina,Sarajevo,Association of Basic Medical Sciences of FBIH,2023,,,2831-090X,2831-0896,2831-0896,eng,Y,2023.0,Y,Y,IM,http://bjbms.org/ojs/index.php/bjbms/issue/archive,Y,2831-090X,yes,0.0,0.0,0.0,yes,,Biomolecules and Biomedicine,2831-090X,2831-0896,https://www.bjbms.org/ojs/index.php/bjbms,2417.0,ba,society_publisher,https://www.bjbms.org/ojs/index.php/bjbms,Association of Basic Medical Sciences of Federation of Bosnia and Herzegovina,https://v2.sherpa.ac.uk/id/publisher_policy/2417,2015-07-22 13:37:57,2023-02-28 06:23:38,29978.0,2831-090X,,Biomolecules & Biomedicine,2831-090X,No,2023-02-27T13:14:38Z,2023.0,CC BY,Yes,2023-02-27T13:14:38Z,Yes,yes,,,,,,,,,,,,,,,,,1.0,,
930,9918418287206676,Canadian journal of health history = Revue canadienne d'histoire de la santé,Can J Health Hist,Canada,Toronto,University of Toronto Press,2022,,,2816-6477,2816-6469,,eng,Y,2022.0,Y,Y,QIS,https://www.utpjournals.press/loi/cjhh,Y,2816-6477,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,,
2451,9918487342606676,International journal of social determinants of health and health services,Int J Soc Determinants Health Health Serv,United States,"Thousand Oaks, CA",SAGE Publishing,2023,,Quarterly,2755-1946,2755-1938,2755-1938,eng,Y,2023.0,Y,Y,IM,https://journals.sagepub.com/home/joh,Y,2755-1946,yes,0.0,,0.0,yes,yes,International Journal of Social Determinants of Health and Health Services,2755-1946,2755-1938,https://journals.sagepub.com/home/joh,65.0,gb,commercial_publisher,https://uk.sagepub.com/en-gb/eur/home,SAGE Publications,https://v2.sherpa.ac.uk/id/publisher_policy/65,2010-08-05 10:52:45,2023-01-12 09:47:57,7349.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,,
2494,9918540787006676,Interdisciplinary cardiovascular and thoracic surgery,Interdiscip Cardiovasc Thorac Surg,England,[Oxford],Oxford University Press,2023,,Monthly,2753-670X,,2753-670X,eng,Y,2023.0,Y,Y,IM,https://academic.oup.com/icvts,Y,2753-670X,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,,


In [45]:
# test empty values
df_journals.loc[df_journals['last_year'].isna()]

Unnamed: 0,NlmUniqueID,Title,MedlineTA,Country,Place,Publisher,PublicationFirstYear,PublicationEndYear,Frequency,ISSN-Electronic,ISSN-Print,ISSN-Linking,Language,TitleContinuationYN,IndexingStartDate,CurrentlyIndexedYN,IndexOnlineYN,IndexingSubset,IndexingSelectedURL,ReportedMedlineYN,ISSN,sherpa_has_oa_path,embargo,embargo_published_version,embargo_accepted_version,sherpa_oa_green,additional_oa_fee,title_sherpa,issne_sherpa,issnp_sherpa,url,publisher_id,publisher_country,publisher_type,publisher_url,publisher_name,sherpa_uri,sherpa_created,sherpa_last_modified,sherpa_id,DOAJ_pissn,ISSN_old,DOAJ_title,DOAJ_eissn,DOAJ_Seal,DOAJ_date,DOAJ_year,DOAJ_license,DOAJ_author_holds_copyright_without_restrictions,DOAJ_last_updated,DOAJ_APC,DOAJ,year_founded,year_reverse_flipped,journal_location,society_affiliation,other_sci_affiliation,flipped_journal_name,post_flip_publisher,flipped_access_model,apc_pre,apc_post,apc_now,discipline,flipped_oa,flipped_born_oa,flipped_embargo,Reverse_Flip,CurrentlyIndexed,first_year,last_year
134,9918523075306676,Advances in kidney disease and health,Adv Kidney Dis Health,United States,[New York],Elsevier Inc.,2023,,Bimonthly,2949-8139,2949-8147,2949-8139,eng,Y,2023.0,Y,Y,IM,https://www.sciencedirect.com/journal/advances-in-kidney-disease-and-health,Y,2949-8139,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,,
713,9918522188506676,Biomolecules and biomedicine,Biomol Biomed,Bosnia and Herzegovina,Sarajevo,Association of Basic Medical Sciences of FBIH,2023,,,2831-090X,2831-0896,2831-0896,eng,Y,2023.0,Y,Y,IM,http://bjbms.org/ojs/index.php/bjbms/issue/archive,Y,2831-090X,yes,0.0,0.0,0.0,yes,,Biomolecules and Biomedicine,2831-090X,2831-0896,https://www.bjbms.org/ojs/index.php/bjbms,2417.0,ba,society_publisher,https://www.bjbms.org/ojs/index.php/bjbms,Association of Basic Medical Sciences of Federation of Bosnia and Herzegovina,https://v2.sherpa.ac.uk/id/publisher_policy/2417,2015-07-22 13:37:57,2023-02-28 06:23:38,29978.0,2831-090X,,Biomolecules & Biomedicine,2831-090X,No,2023-02-27T13:14:38Z,2023.0,CC BY,Yes,2023-02-27T13:14:38Z,Yes,yes,,,,,,,,,,,,,,,,,1.0,,
930,9918418287206676,Canadian journal of health history = Revue canadienne d'histoire de la santé,Can J Health Hist,Canada,Toronto,University of Toronto Press,2022,,,2816-6477,2816-6469,,eng,Y,2022.0,Y,Y,QIS,https://www.utpjournals.press/loi/cjhh,Y,2816-6477,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,,
2451,9918487342606676,International journal of social determinants of health and health services,Int J Soc Determinants Health Health Serv,United States,"Thousand Oaks, CA",SAGE Publishing,2023,,Quarterly,2755-1946,2755-1938,2755-1938,eng,Y,2023.0,Y,Y,IM,https://journals.sagepub.com/home/joh,Y,2755-1946,yes,0.0,,0.0,yes,yes,International Journal of Social Determinants of Health and Health Services,2755-1946,2755-1938,https://journals.sagepub.com/home/joh,65.0,gb,commercial_publisher,https://uk.sagepub.com/en-gb/eur/home,SAGE Publications,https://v2.sherpa.ac.uk/id/publisher_policy/65,2010-08-05 10:52:45,2023-01-12 09:47:57,7349.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,,
2494,9918540787006676,Interdisciplinary cardiovascular and thoracic surgery,Interdiscip Cardiovasc Thorac Surg,England,[Oxford],Oxford University Press,2023,,Monthly,2753-670X,,2753-670X,eng,Y,2023.0,Y,Y,IM,https://academic.oup.com/icvts,Y,2753-670X,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,,


## Import PubMed publication volume for each journal

In [46]:
# take the number of publications year for each journal
df_pubmed_journals_volume = df_pubmed[['PMID', 'NlmUniqueID']].groupby(by='NlmUniqueID').count()
df_pubmed_journals_volume.sort_values(by='NlmUniqueID', ascending=True).to_csv('results/2023/pubmed_journals_volume.tsv', sep='\t', index=True)
df_pubmed_journals_volume

Unnamed: 0_level_0,PMID
NlmUniqueID,Unnamed: 1_level_1
0000201,739
0000211,4181
0000212,213
0000213,1154
0000216,1058
...,...
9918505287106676,1
9918505588006676,1
9918505588106676,1
9918505688506676,1


In [47]:
# reset index and rename columns
df_pubmed_journals_volume = df_pubmed_journals_volume.reset_index(drop=False)
df_pubmed_journals_volume = df_pubmed_journals_volume.rename(columns={'PMID' : 'PMIDs'})

In [48]:
df_pubmed_journals_volume

Unnamed: 0,NlmUniqueID,PMIDs
0,0000201,739
1,0000211,4181
2,0000212,213
3,0000213,1154
4,0000216,1058
...,...,...
35635,9918505287106676,1
35636,9918505588006676,1
35637,9918505588106676,1
35638,9918505688506676,1


In [49]:
# merge PMIDs infos for each journal
df_journals = df_journals.merge(df_pubmed_journals_volume, on='NlmUniqueID', how='left')
df_journals

Unnamed: 0,NlmUniqueID,Title,MedlineTA,Country,Place,Publisher,PublicationFirstYear,PublicationEndYear,Frequency,ISSN-Electronic,ISSN-Print,ISSN-Linking,Language,TitleContinuationYN,IndexingStartDate,CurrentlyIndexedYN,IndexOnlineYN,IndexingSubset,IndexingSelectedURL,ReportedMedlineYN,ISSN,sherpa_has_oa_path,embargo,embargo_published_version,embargo_accepted_version,sherpa_oa_green,additional_oa_fee,title_sherpa,issne_sherpa,issnp_sherpa,url,publisher_id,publisher_country,publisher_type,publisher_url,publisher_name,sherpa_uri,sherpa_created,sherpa_last_modified,sherpa_id,DOAJ_pissn,ISSN_old,DOAJ_title,DOAJ_eissn,DOAJ_Seal,DOAJ_date,DOAJ_year,DOAJ_license,DOAJ_author_holds_copyright_without_restrictions,DOAJ_last_updated,DOAJ_APC,DOAJ,year_founded,year_reverse_flipped,journal_location,society_affiliation,other_sci_affiliation,flipped_journal_name,post_flip_publisher,flipped_access_model,apc_pre,apc_post,apc_now,discipline,flipped_oa,flipped_born_oa,flipped_embargo,Reverse_Flip,CurrentlyIndexed,first_year,last_year,PMIDs
0,9015384,20 century British history,20 Century Br Hist,England,"Eynsham, Oxford",Oxford University Press,1990,,"4 no. a year,",1477-4674,0955-2359,0955-2359,eng,N,1990.0,Y,N,QIS,,Y,1477-4674,yes,24.0,,24.0,yes,yes,Twentieth Century British History,1477-4674,0955-2359,https://academic.oup.com/tcbh,55.0,gb,university_publisher,https://academic.oup.com/journals/,Oxford University Press,https://v2.sherpa.ac.uk/id/publisher_policy/1112,2010-07-15 16:04:39,2022-07-26 10:25:23,1406.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,1990.0,2019.0,145.0
1,101714112,A&A practice,A A Pract,United States,"[Philadelphia, PA]","Wolters Kluwer Health, Inc.",2018,,Biweekly,2575-3126,,2575-3126,eng,Y,2018.0,Y,Y,IM,https://ovidsp.ovid.com/ovidweb.cgi?T=JS&MODE=ovid&PAGE=toc&D=ovft&AN=02054229-000000000-00000,Y,2575-3126,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,2018.0,2022.0,999.0
2,101269322,AACN advanced critical care,AACN Adv Crit Care,United States,"Aliso Viejo, CA",American Association of Critical-Care Nurses (AACN),2006,,Quarterly,1559-7776,1559-7768,1559-7768,eng,Y,2006.0,Y,Y,N,https://aacnjournals.org/aacnacconline,Y,1559-7776,yes,,,,,,AACN Advanced Critical Care,1559-7776,1559-7768,http://acc.aacnjournals.org/,663.0,us,society_publisher,https://www.aacn.org/,American Association of Critical Care Nurses,https://v2.sherpa.ac.uk/id/publisher_policy/663,2010-08-24 15:05:09,2022-07-08 08:42:33,10921.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,2006.0,2022.0,851.0
3,0431420,AANA journal,AANA J,United States,"Park Ridge, Ill.",American Association of Nurse Anesthetists,1974,,Bimonthly,2162-5239,0094-6354,0094-6354,eng,N,1974.0,Y,Y,N,https://www.aana.com/publications/aana-journal,Y,2162-5239,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,1974.0,2022.0,2547.0
4,101223209,The AAPS journal,AAPS J,United States,"Arlington, Va., USA",American Association of Pharmaceutical Scientists,2004,,Four no. a year,1550-7416,,1550-7416,eng,Y,2004.0,Y,Y,IM,https://link.springer.com/journal/12248,Y,1550-7416,yes,12.0,,12.0,yes,yes,AAPS Journal,,1550-7416,http://link.springer.com/journal/12248,313.0,us,client_organisation,https://www.aaps.org/home,American Association of Pharmaceutical Scientists,https://v2.sherpa.ac.uk/id/publisher_policy/3291,2010-09-15 13:16:19,2023-01-05 14:55:40,16180.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,2004.0,2022.0,1976.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5274,8702287,Zoological science,Zoolog Sci,Japan,"Tokyo, Japan",Zoological Society of Japan,1984,,"Monthly,",,0289-0003,0289-0003,eng,N,2002.0,Y,Y,IM,http://www.bioone.org/loi/jzoo,Y,0289-0003,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1984.0,2005.0,Japan,Zoological Society of Japan,0,Zoological Science,Zoological Society of Japan,Subscription,,-,-,LS,1.0,0.0,,yes,1.0,1987.0,2022.0,2680.0
5275,9435608,"Zoology (Jena, Germany)",Zoology (Jena),Germany,"Jena, Germany",Urban & Fischer,1994,,"Six no. a year,",1873-2720,0944-2006,0944-2006,eng,N,2005.0,Y,Y,IM,https://www.sciencedirect.com/journal/zoology,Y,1873-2720,yes,12.0,,12.0,yes,yes,,,,,,,,,,,,,15919.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,2001.0,2022.0,1019.0
5276,101300786,Zoonoses and public health,Zoonoses Public Health,Germany,"Berlin, Germany",Blackwell Verlag,2007,,Ten no. a year,1863-2378,1863-1959,1863-1959,eng,Y,2007.0,Y,Y,IM,http://onlinelibrary.wiley.com/journal/10.1111/(ISSN)1863-2378,Y,1863-2378,yes,12.0,,12.0,yes,yes,Zoonoses and Public Health,1863-2378,1863-1959,https://onlinelibrary.wiley.com/journal/18632378,580.0,us,commercial_publisher,https://www.wiley.com/en-gb,Wiley,https://v2.sherpa.ac.uk/id/publisher_policy/2050,2010-07-20 14:58:33,2022-07-27 12:37:58,2555.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,2007.0,2022.0,1445.0
5277,101179386,Zootaxa,Zootaxa,New Zealand,"Auckland, N.Z.",Magnolia Press,2001,,Irregular,1175-5334,1175-5326,1175-5326,eng,N,2013.0,Y,Y,IM,http://www.mapress.com/j/zt/,Y,1175-5334,yes,,,,,yes,Zootaxa,1175-5334,1175-5326,https://www.mapress.com/zt/,284.0,nz,commercial_publisher,https://www.mapress.com/,Magnolia Press,https://v2.sherpa.ac.uk/id/publisher_policy/284,2010-06-30 17:47:50,2022-07-15 08:53:17,6.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,2008.0,2022.0,20965.0


In [50]:
# test empty values
df_journals.loc[df_journals['PMIDs'].isna()]

Unnamed: 0,NlmUniqueID,Title,MedlineTA,Country,Place,Publisher,PublicationFirstYear,PublicationEndYear,Frequency,ISSN-Electronic,ISSN-Print,ISSN-Linking,Language,TitleContinuationYN,IndexingStartDate,CurrentlyIndexedYN,IndexOnlineYN,IndexingSubset,IndexingSelectedURL,ReportedMedlineYN,ISSN,sherpa_has_oa_path,embargo,embargo_published_version,embargo_accepted_version,sherpa_oa_green,additional_oa_fee,title_sherpa,issne_sherpa,issnp_sherpa,url,publisher_id,publisher_country,publisher_type,publisher_url,publisher_name,sherpa_uri,sherpa_created,sherpa_last_modified,sherpa_id,DOAJ_pissn,ISSN_old,DOAJ_title,DOAJ_eissn,DOAJ_Seal,DOAJ_date,DOAJ_year,DOAJ_license,DOAJ_author_holds_copyright_without_restrictions,DOAJ_last_updated,DOAJ_APC,DOAJ,year_founded,year_reverse_flipped,journal_location,society_affiliation,other_sci_affiliation,flipped_journal_name,post_flip_publisher,flipped_access_model,apc_pre,apc_post,apc_now,discipline,flipped_oa,flipped_born_oa,flipped_embargo,Reverse_Flip,CurrentlyIndexed,first_year,last_year,PMIDs
134,9918523075306676,Advances in kidney disease and health,Adv Kidney Dis Health,United States,[New York],Elsevier Inc.,2023,,Bimonthly,2949-8139,2949-8147,2949-8139,eng,Y,2023.0,Y,Y,IM,https://www.sciencedirect.com/journal/advances-in-kidney-disease-and-health,Y,2949-8139,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,,,
713,9918522188506676,Biomolecules and biomedicine,Biomol Biomed,Bosnia and Herzegovina,Sarajevo,Association of Basic Medical Sciences of FBIH,2023,,,2831-090X,2831-0896,2831-0896,eng,Y,2023.0,Y,Y,IM,http://bjbms.org/ojs/index.php/bjbms/issue/archive,Y,2831-090X,yes,0.0,0.0,0.0,yes,,Biomolecules and Biomedicine,2831-090X,2831-0896,https://www.bjbms.org/ojs/index.php/bjbms,2417.0,ba,society_publisher,https://www.bjbms.org/ojs/index.php/bjbms,Association of Basic Medical Sciences of Federation of Bosnia and Herzegovina,https://v2.sherpa.ac.uk/id/publisher_policy/2417,2015-07-22 13:37:57,2023-02-28 06:23:38,29978.0,2831-090X,,Biomolecules & Biomedicine,2831-090X,No,2023-02-27T13:14:38Z,2023.0,CC BY,Yes,2023-02-27T13:14:38Z,Yes,yes,,,,,,,,,,,,,,,,,1.0,,,
930,9918418287206676,Canadian journal of health history = Revue canadienne d'histoire de la santé,Can J Health Hist,Canada,Toronto,University of Toronto Press,2022,,,2816-6477,2816-6469,,eng,Y,2022.0,Y,Y,QIS,https://www.utpjournals.press/loi/cjhh,Y,2816-6477,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,,,
2451,9918487342606676,International journal of social determinants of health and health services,Int J Soc Determinants Health Health Serv,United States,"Thousand Oaks, CA",SAGE Publishing,2023,,Quarterly,2755-1946,2755-1938,2755-1938,eng,Y,2023.0,Y,Y,IM,https://journals.sagepub.com/home/joh,Y,2755-1946,yes,0.0,,0.0,yes,yes,International Journal of Social Determinants of Health and Health Services,2755-1946,2755-1938,https://journals.sagepub.com/home/joh,65.0,gb,commercial_publisher,https://uk.sagepub.com/en-gb/eur/home,SAGE Publications,https://v2.sherpa.ac.uk/id/publisher_policy/65,2010-08-05 10:52:45,2023-01-12 09:47:57,7349.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,,,
2494,9918540787006676,Interdisciplinary cardiovascular and thoracic surgery,Interdiscip Cardiovasc Thorac Surg,England,[Oxford],Oxford University Press,2023,,Monthly,2753-670X,,2753-670X,eng,Y,2023.0,Y,Y,IM,https://academic.oup.com/icvts,Y,2753-670X,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,,,


## Import PubMed publications number of DOIs for each journal

In [51]:
# take the number of publications year for each journal
df_pubmed_journals_dois = df_pubmed[['DOI', 'NlmUniqueID']].groupby(by='NlmUniqueID').sum()
df_pubmed_journals_dois.sort_values(by='NlmUniqueID', ascending=True).to_csv('results/2023/pubmed_journals_dois.tsv', sep='\t', index=True)
df_pubmed_journals_dois

Unnamed: 0_level_0,DOI
NlmUniqueID,Unnamed: 1_level_1
0000201,709.0
0000211,4026.0
0000212,0.0
0000213,1015.0
0000216,0.0
...,...
9918505287106676,1.0
9918505588006676,1.0
9918505588106676,1.0
9918505688506676,1.0


In [52]:
# reset index and rename columns
df_pubmed_journals_dois = df_pubmed_journals_dois.reset_index(drop=False)
df_pubmed_journals_dois = df_pubmed_journals_dois.rename(columns={'DOI' : 'DOIs'})

In [53]:
df_pubmed_journals_dois

Unnamed: 0,NlmUniqueID,DOIs
0,0000201,709.0
1,0000211,4026.0
2,0000212,0.0
3,0000213,1015.0
4,0000216,0.0
...,...,...
35635,9918505287106676,1.0
35636,9918505588006676,1.0
35637,9918505588106676,1.0
35638,9918505688506676,1.0


In [54]:
# merge PMIDs infos for each journal
df_journals = df_journals.merge(df_pubmed_journals_dois, on='NlmUniqueID', how='left')
df_journals

Unnamed: 0,NlmUniqueID,Title,MedlineTA,Country,Place,Publisher,PublicationFirstYear,PublicationEndYear,Frequency,ISSN-Electronic,ISSN-Print,ISSN-Linking,Language,TitleContinuationYN,IndexingStartDate,CurrentlyIndexedYN,IndexOnlineYN,IndexingSubset,IndexingSelectedURL,ReportedMedlineYN,ISSN,sherpa_has_oa_path,embargo,embargo_published_version,embargo_accepted_version,sherpa_oa_green,additional_oa_fee,title_sherpa,issne_sherpa,issnp_sherpa,url,publisher_id,publisher_country,publisher_type,publisher_url,publisher_name,sherpa_uri,sherpa_created,sherpa_last_modified,sherpa_id,DOAJ_pissn,ISSN_old,DOAJ_title,DOAJ_eissn,DOAJ_Seal,DOAJ_date,DOAJ_year,DOAJ_license,DOAJ_author_holds_copyright_without_restrictions,DOAJ_last_updated,DOAJ_APC,DOAJ,year_founded,year_reverse_flipped,journal_location,society_affiliation,other_sci_affiliation,flipped_journal_name,post_flip_publisher,flipped_access_model,apc_pre,apc_post,apc_now,discipline,flipped_oa,flipped_born_oa,flipped_embargo,Reverse_Flip,CurrentlyIndexed,first_year,last_year,PMIDs,DOIs
0,9015384,20 century British history,20 Century Br Hist,England,"Eynsham, Oxford",Oxford University Press,1990,,"4 no. a year,",1477-4674,0955-2359,0955-2359,eng,N,1990.0,Y,N,QIS,,Y,1477-4674,yes,24.0,,24.0,yes,yes,Twentieth Century British History,1477-4674,0955-2359,https://academic.oup.com/tcbh,55.0,gb,university_publisher,https://academic.oup.com/journals/,Oxford University Press,https://v2.sherpa.ac.uk/id/publisher_policy/1112,2010-07-15 16:04:39,2022-07-26 10:25:23,1406.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,1990.0,2019.0,145.0,144.0
1,101714112,A&A practice,A A Pract,United States,"[Philadelphia, PA]","Wolters Kluwer Health, Inc.",2018,,Biweekly,2575-3126,,2575-3126,eng,Y,2018.0,Y,Y,IM,https://ovidsp.ovid.com/ovidweb.cgi?T=JS&MODE=ovid&PAGE=toc&D=ovft&AN=02054229-000000000-00000,Y,2575-3126,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,2018.0,2022.0,999.0,999.0
2,101269322,AACN advanced critical care,AACN Adv Crit Care,United States,"Aliso Viejo, CA",American Association of Critical-Care Nurses (AACN),2006,,Quarterly,1559-7776,1559-7768,1559-7768,eng,Y,2006.0,Y,Y,N,https://aacnjournals.org/aacnacconline,Y,1559-7776,yes,,,,,,AACN Advanced Critical Care,1559-7776,1559-7768,http://acc.aacnjournals.org/,663.0,us,society_publisher,https://www.aacn.org/,American Association of Critical Care Nurses,https://v2.sherpa.ac.uk/id/publisher_policy/663,2010-08-24 15:05:09,2022-07-08 08:42:33,10921.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,2006.0,2022.0,851.0,837.0
3,0431420,AANA journal,AANA J,United States,"Park Ridge, Ill.",American Association of Nurse Anesthetists,1974,,Bimonthly,2162-5239,0094-6354,0094-6354,eng,N,1974.0,Y,Y,N,https://www.aana.com/publications/aana-journal,Y,2162-5239,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,1974.0,2022.0,2547.0,0.0
4,101223209,The AAPS journal,AAPS J,United States,"Arlington, Va., USA",American Association of Pharmaceutical Scientists,2004,,Four no. a year,1550-7416,,1550-7416,eng,Y,2004.0,Y,Y,IM,https://link.springer.com/journal/12248,Y,1550-7416,yes,12.0,,12.0,yes,yes,AAPS Journal,,1550-7416,http://link.springer.com/journal/12248,313.0,us,client_organisation,https://www.aaps.org/home,American Association of Pharmaceutical Scientists,https://v2.sherpa.ac.uk/id/publisher_policy/3291,2010-09-15 13:16:19,2023-01-05 14:55:40,16180.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,2004.0,2022.0,1976.0,1976.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5274,8702287,Zoological science,Zoolog Sci,Japan,"Tokyo, Japan",Zoological Society of Japan,1984,,"Monthly,",,0289-0003,0289-0003,eng,N,2002.0,Y,Y,IM,http://www.bioone.org/loi/jzoo,Y,0289-0003,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1984.0,2005.0,Japan,Zoological Society of Japan,0,Zoological Science,Zoological Society of Japan,Subscription,,-,-,LS,1.0,0.0,,yes,1.0,1987.0,2022.0,2680.0,2635.0
5275,9435608,"Zoology (Jena, Germany)",Zoology (Jena),Germany,"Jena, Germany",Urban & Fischer,1994,,"Six no. a year,",1873-2720,0944-2006,0944-2006,eng,N,2005.0,Y,Y,IM,https://www.sciencedirect.com/journal/zoology,Y,1873-2720,yes,12.0,,12.0,yes,yes,,,,,,,,,,,,,15919.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,2001.0,2022.0,1019.0,1017.0
5276,101300786,Zoonoses and public health,Zoonoses Public Health,Germany,"Berlin, Germany",Blackwell Verlag,2007,,Ten no. a year,1863-2378,1863-1959,1863-1959,eng,Y,2007.0,Y,Y,IM,http://onlinelibrary.wiley.com/journal/10.1111/(ISSN)1863-2378,Y,1863-2378,yes,12.0,,12.0,yes,yes,Zoonoses and Public Health,1863-2378,1863-1959,https://onlinelibrary.wiley.com/journal/18632378,580.0,us,commercial_publisher,https://www.wiley.com/en-gb,Wiley,https://v2.sherpa.ac.uk/id/publisher_policy/2050,2010-07-20 14:58:33,2022-07-27 12:37:58,2555.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,2007.0,2022.0,1445.0,1444.0
5277,101179386,Zootaxa,Zootaxa,New Zealand,"Auckland, N.Z.",Magnolia Press,2001,,Irregular,1175-5334,1175-5326,1175-5326,eng,N,2013.0,Y,Y,IM,http://www.mapress.com/j/zt/,Y,1175-5334,yes,,,,,yes,Zootaxa,1175-5334,1175-5326,https://www.mapress.com/zt/,284.0,nz,commercial_publisher,https://www.mapress.com/,Magnolia Press,https://v2.sherpa.ac.uk/id/publisher_policy/284,2010-06-30 17:47:50,2022-07-15 08:53:17,6.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,2008.0,2022.0,20965.0,20886.0


In [55]:
# test empty values
df_journals.loc[df_journals['DOIs'].isna()]

Unnamed: 0,NlmUniqueID,Title,MedlineTA,Country,Place,Publisher,PublicationFirstYear,PublicationEndYear,Frequency,ISSN-Electronic,ISSN-Print,ISSN-Linking,Language,TitleContinuationYN,IndexingStartDate,CurrentlyIndexedYN,IndexOnlineYN,IndexingSubset,IndexingSelectedURL,ReportedMedlineYN,ISSN,sherpa_has_oa_path,embargo,embargo_published_version,embargo_accepted_version,sherpa_oa_green,additional_oa_fee,title_sherpa,issne_sherpa,issnp_sherpa,url,publisher_id,publisher_country,publisher_type,publisher_url,publisher_name,sherpa_uri,sherpa_created,sherpa_last_modified,sherpa_id,DOAJ_pissn,ISSN_old,DOAJ_title,DOAJ_eissn,DOAJ_Seal,DOAJ_date,DOAJ_year,DOAJ_license,DOAJ_author_holds_copyright_without_restrictions,DOAJ_last_updated,DOAJ_APC,DOAJ,year_founded,year_reverse_flipped,journal_location,society_affiliation,other_sci_affiliation,flipped_journal_name,post_flip_publisher,flipped_access_model,apc_pre,apc_post,apc_now,discipline,flipped_oa,flipped_born_oa,flipped_embargo,Reverse_Flip,CurrentlyIndexed,first_year,last_year,PMIDs,DOIs
134,9918523075306676,Advances in kidney disease and health,Adv Kidney Dis Health,United States,[New York],Elsevier Inc.,2023,,Bimonthly,2949-8139,2949-8147,2949-8139,eng,Y,2023.0,Y,Y,IM,https://www.sciencedirect.com/journal/advances-in-kidney-disease-and-health,Y,2949-8139,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,,,,
713,9918522188506676,Biomolecules and biomedicine,Biomol Biomed,Bosnia and Herzegovina,Sarajevo,Association of Basic Medical Sciences of FBIH,2023,,,2831-090X,2831-0896,2831-0896,eng,Y,2023.0,Y,Y,IM,http://bjbms.org/ojs/index.php/bjbms/issue/archive,Y,2831-090X,yes,0.0,0.0,0.0,yes,,Biomolecules and Biomedicine,2831-090X,2831-0896,https://www.bjbms.org/ojs/index.php/bjbms,2417.0,ba,society_publisher,https://www.bjbms.org/ojs/index.php/bjbms,Association of Basic Medical Sciences of Federation of Bosnia and Herzegovina,https://v2.sherpa.ac.uk/id/publisher_policy/2417,2015-07-22 13:37:57,2023-02-28 06:23:38,29978.0,2831-090X,,Biomolecules & Biomedicine,2831-090X,No,2023-02-27T13:14:38Z,2023.0,CC BY,Yes,2023-02-27T13:14:38Z,Yes,yes,,,,,,,,,,,,,,,,,1.0,,,,
930,9918418287206676,Canadian journal of health history = Revue canadienne d'histoire de la santé,Can J Health Hist,Canada,Toronto,University of Toronto Press,2022,,,2816-6477,2816-6469,,eng,Y,2022.0,Y,Y,QIS,https://www.utpjournals.press/loi/cjhh,Y,2816-6477,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,,,,
2451,9918487342606676,International journal of social determinants of health and health services,Int J Soc Determinants Health Health Serv,United States,"Thousand Oaks, CA",SAGE Publishing,2023,,Quarterly,2755-1946,2755-1938,2755-1938,eng,Y,2023.0,Y,Y,IM,https://journals.sagepub.com/home/joh,Y,2755-1946,yes,0.0,,0.0,yes,yes,International Journal of Social Determinants of Health and Health Services,2755-1946,2755-1938,https://journals.sagepub.com/home/joh,65.0,gb,commercial_publisher,https://uk.sagepub.com/en-gb/eur/home,SAGE Publications,https://v2.sherpa.ac.uk/id/publisher_policy/65,2010-08-05 10:52:45,2023-01-12 09:47:57,7349.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,,,,
2494,9918540787006676,Interdisciplinary cardiovascular and thoracic surgery,Interdiscip Cardiovasc Thorac Surg,England,[Oxford],Oxford University Press,2023,,Monthly,2753-670X,,2753-670X,eng,Y,2023.0,Y,Y,IM,https://academic.oup.com/icvts,Y,2753-670X,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,,,,


## Import number of PubMed refs for 2021 and 2022

In [56]:
# take the number of publications year for each journal
df_pubmed_journals_pmids_2021 = df_pubmed.loc[df_pubmed['Year'] == 2021][['PMID', 'NlmUniqueID']].groupby(by='NlmUniqueID').count()
df_pubmed_journals_pmids_2021.sort_values(by='NlmUniqueID', ascending=True).to_csv('results/2023/pubmed_journals_pmids_2021.tsv', sep='\t', index=True)
df_pubmed_journals_pmids_2021

Unnamed: 0_level_0,PMID
NlmUniqueID,Unnamed: 1_level_1
0001027,161
0004041,627
0004047,34
0004653,189
0004743,37
...,...
9918486785306676,37
9918486785406676,1
9918487388606676,1
9918487580106676,1


In [57]:
# take the number of publications year for each journal
df_pubmed_journals_pmids_2022 = df_pubmed.loc[df_pubmed['Year'] == 2022][['PMID', 'NlmUniqueID']].groupby(by='NlmUniqueID').count()
df_pubmed_journals_pmids_2022.sort_values(by='NlmUniqueID', ascending=True).to_csv('results/2023/pubmed_journals_pmids_2022.tsv', sep='\t', index=True)
df_pubmed_journals_pmids_2022

Unnamed: 0_level_0,PMID
NlmUniqueID,Unnamed: 1_level_1
0001027,186
0004041,521
0004047,43
0004653,187
0004743,35
...,...
9918505286106676,2
9918505287106676,1
9918505588106676,1
9918505688506676,1


In [58]:
# reset index and rename columns
df_pubmed_journals_pmids_2021 = df_pubmed_journals_pmids_2021.reset_index(drop=False)
df_pubmed_journals_pmids_2021 = df_pubmed_journals_pmids_2021.rename(columns={'PMID' : 'PMIDs_2021'})
df_pubmed_journals_pmids_2022 = df_pubmed_journals_pmids_2022.reset_index(drop=False)
df_pubmed_journals_pmids_2022 = df_pubmed_journals_pmids_2022.rename(columns={'PMID' : 'PMIDs_2022'})

In [59]:
df_pubmed_journals_pmids_2022

Unnamed: 0,NlmUniqueID,PMIDs_2022
0,0001027,186
1,0004041,521
2,0004047,43
3,0004653,187
4,0004743,35
...,...,...
10525,9918505286106676,2
10526,9918505287106676,1
10527,9918505588106676,1
10528,9918505688506676,1


In [60]:
# merge PMIDs infos for each journal
df_journals = df_journals.merge(df_pubmed_journals_pmids_2021, on='NlmUniqueID', how='left')
df_journals = df_journals.merge(df_pubmed_journals_pmids_2022, on='NlmUniqueID', how='left')
df_journals

Unnamed: 0,NlmUniqueID,Title,MedlineTA,Country,Place,Publisher,PublicationFirstYear,PublicationEndYear,Frequency,ISSN-Electronic,ISSN-Print,ISSN-Linking,Language,TitleContinuationYN,IndexingStartDate,CurrentlyIndexedYN,IndexOnlineYN,IndexingSubset,IndexingSelectedURL,ReportedMedlineYN,ISSN,sherpa_has_oa_path,embargo,embargo_published_version,embargo_accepted_version,sherpa_oa_green,additional_oa_fee,title_sherpa,issne_sherpa,issnp_sherpa,url,publisher_id,publisher_country,publisher_type,publisher_url,publisher_name,sherpa_uri,sherpa_created,sherpa_last_modified,sherpa_id,DOAJ_pissn,ISSN_old,DOAJ_title,DOAJ_eissn,DOAJ_Seal,DOAJ_date,DOAJ_year,DOAJ_license,DOAJ_author_holds_copyright_without_restrictions,DOAJ_last_updated,DOAJ_APC,DOAJ,year_founded,year_reverse_flipped,journal_location,society_affiliation,other_sci_affiliation,flipped_journal_name,post_flip_publisher,flipped_access_model,apc_pre,apc_post,apc_now,discipline,flipped_oa,flipped_born_oa,flipped_embargo,Reverse_Flip,CurrentlyIndexed,first_year,last_year,PMIDs,DOIs,PMIDs_2021,PMIDs_2022
0,9015384,20 century British history,20 Century Br Hist,England,"Eynsham, Oxford",Oxford University Press,1990,,"4 no. a year,",1477-4674,0955-2359,0955-2359,eng,N,1990.0,Y,N,QIS,,Y,1477-4674,yes,24.0,,24.0,yes,yes,Twentieth Century British History,1477-4674,0955-2359,https://academic.oup.com/tcbh,55.0,gb,university_publisher,https://academic.oup.com/journals/,Oxford University Press,https://v2.sherpa.ac.uk/id/publisher_policy/1112,2010-07-15 16:04:39,2022-07-26 10:25:23,1406.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,1990.0,2019.0,145.0,144.0,,
1,101714112,A&A practice,A A Pract,United States,"[Philadelphia, PA]","Wolters Kluwer Health, Inc.",2018,,Biweekly,2575-3126,,2575-3126,eng,Y,2018.0,Y,Y,IM,https://ovidsp.ovid.com/ovidweb.cgi?T=JS&MODE=ovid&PAGE=toc&D=ovft&AN=02054229-000000000-00000,Y,2575-3126,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,2018.0,2022.0,999.0,999.0,187.0,73.0
2,101269322,AACN advanced critical care,AACN Adv Crit Care,United States,"Aliso Viejo, CA",American Association of Critical-Care Nurses (AACN),2006,,Quarterly,1559-7776,1559-7768,1559-7768,eng,Y,2006.0,Y,Y,N,https://aacnjournals.org/aacnacconline,Y,1559-7776,yes,,,,,,AACN Advanced Critical Care,1559-7776,1559-7768,http://acc.aacnjournals.org/,663.0,us,society_publisher,https://www.aacn.org/,American Association of Critical Care Nurses,https://v2.sherpa.ac.uk/id/publisher_policy/663,2010-08-24 15:05:09,2022-07-08 08:42:33,10921.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,2006.0,2022.0,851.0,837.0,59.0,37.0
3,0431420,AANA journal,AANA J,United States,"Park Ridge, Ill.",American Association of Nurse Anesthetists,1974,,Bimonthly,2162-5239,0094-6354,0094-6354,eng,N,1974.0,Y,Y,N,https://www.aana.com/publications/aana-journal,Y,2162-5239,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,1974.0,2022.0,2547.0,0.0,75.0,60.0
4,101223209,The AAPS journal,AAPS J,United States,"Arlington, Va., USA",American Association of Pharmaceutical Scientists,2004,,Four no. a year,1550-7416,,1550-7416,eng,Y,2004.0,Y,Y,IM,https://link.springer.com/journal/12248,Y,1550-7416,yes,12.0,,12.0,yes,yes,AAPS Journal,,1550-7416,http://link.springer.com/journal/12248,313.0,us,client_organisation,https://www.aaps.org/home,American Association of Pharmaceutical Scientists,https://v2.sherpa.ac.uk/id/publisher_policy/3291,2010-09-15 13:16:19,2023-01-05 14:55:40,16180.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,2004.0,2022.0,1976.0,1976.0,123.0,109.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5274,8702287,Zoological science,Zoolog Sci,Japan,"Tokyo, Japan",Zoological Society of Japan,1984,,"Monthly,",,0289-0003,0289-0003,eng,N,2002.0,Y,Y,IM,http://www.bioone.org/loi/jzoo,Y,0289-0003,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1984.0,2005.0,Japan,Zoological Society of Japan,0,Zoological Science,Zoological Society of Japan,Subscription,,-,-,LS,1.0,0.0,,yes,1.0,1987.0,2022.0,2680.0,2635.0,62.0,53.0
5275,9435608,"Zoology (Jena, Germany)",Zoology (Jena),Germany,"Jena, Germany",Urban & Fischer,1994,,"Six no. a year,",1873-2720,0944-2006,0944-2006,eng,N,2005.0,Y,Y,IM,https://www.sciencedirect.com/journal/zoology,Y,1873-2720,yes,12.0,,12.0,yes,yes,,,,,,,,,,,,,15919.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,2001.0,2022.0,1019.0,1017.0,74.0,41.0
5276,101300786,Zoonoses and public health,Zoonoses Public Health,Germany,"Berlin, Germany",Blackwell Verlag,2007,,Ten no. a year,1863-2378,1863-1959,1863-1959,eng,Y,2007.0,Y,Y,IM,http://onlinelibrary.wiley.com/journal/10.1111/(ISSN)1863-2378,Y,1863-2378,yes,12.0,,12.0,yes,yes,Zoonoses and Public Health,1863-2378,1863-1959,https://onlinelibrary.wiley.com/journal/18632378,580.0,us,commercial_publisher,https://www.wiley.com/en-gb,Wiley,https://v2.sherpa.ac.uk/id/publisher_policy/2050,2010-07-20 14:58:33,2022-07-27 12:37:58,2555.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,2007.0,2022.0,1445.0,1444.0,102.0,117.0
5277,101179386,Zootaxa,Zootaxa,New Zealand,"Auckland, N.Z.",Magnolia Press,2001,,Irregular,1175-5334,1175-5326,1175-5326,eng,N,2013.0,Y,Y,IM,http://www.mapress.com/j/zt/,Y,1175-5334,yes,,,,,yes,Zootaxa,1175-5334,1175-5326,https://www.mapress.com/zt/,284.0,nz,commercial_publisher,https://www.mapress.com/,Magnolia Press,https://v2.sherpa.ac.uk/id/publisher_policy/284,2010-06-30 17:47:50,2022-07-15 08:53:17,6.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,2008.0,2022.0,20965.0,20886.0,2316.0,1315.0


In [61]:
# test empty values
df_journals.loc[df_journals['PMIDs_2022'].isna()]

Unnamed: 0,NlmUniqueID,Title,MedlineTA,Country,Place,Publisher,PublicationFirstYear,PublicationEndYear,Frequency,ISSN-Electronic,ISSN-Print,ISSN-Linking,Language,TitleContinuationYN,IndexingStartDate,CurrentlyIndexedYN,IndexOnlineYN,IndexingSubset,IndexingSelectedURL,ReportedMedlineYN,ISSN,sherpa_has_oa_path,embargo,embargo_published_version,embargo_accepted_version,sherpa_oa_green,additional_oa_fee,title_sherpa,issne_sherpa,issnp_sherpa,url,publisher_id,publisher_country,publisher_type,publisher_url,publisher_name,sherpa_uri,sherpa_created,sherpa_last_modified,sherpa_id,DOAJ_pissn,ISSN_old,DOAJ_title,DOAJ_eissn,DOAJ_Seal,DOAJ_date,DOAJ_year,DOAJ_license,DOAJ_author_holds_copyright_without_restrictions,DOAJ_last_updated,DOAJ_APC,DOAJ,year_founded,year_reverse_flipped,journal_location,society_affiliation,other_sci_affiliation,flipped_journal_name,post_flip_publisher,flipped_access_model,apc_pre,apc_post,apc_now,discipline,flipped_oa,flipped_born_oa,flipped_embargo,Reverse_Flip,CurrentlyIndexed,first_year,last_year,PMIDs,DOIs,PMIDs_2021,PMIDs_2022
0,9015384,20 century British history,20 Century Br Hist,England,"Eynsham, Oxford",Oxford University Press,1990,,"4 no. a year,",1477-4674,0955-2359,0955-2359,eng,N,1990.0,Y,N,QIS,,Y,1477-4674,yes,24.0,,24.0,yes,yes,Twentieth Century British History,1477-4674,0955-2359,https://academic.oup.com/tcbh,55.0,gb,university_publisher,https://academic.oup.com/journals/,Oxford University Press,https://v2.sherpa.ac.uk/id/publisher_policy/1112,2010-07-15 16:04:39,2022-07-26 10:25:23,1406.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,1990.0,2019.0,145.0,144.0,,
28,101194794,Acta of bioengineering and biomechanics,Acta Bioeng Biomech,Poland,Wrocław,Oficyna Wydawnicza Politechniki Wrocławskiej,1999,,Irregular,,1509-409X,1509-409X,eng,N,2007.0,Y,Y,IM,http://www.actabio.pwr.wroc.pl/archive.php,Y,1509-409X,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,2007.0,2021.0,847.0,0.0,54.0,
84,9303678,"Acta pharmaceutica (Zagreb, Croatia)",Acta Pharm,Poland,"Warsaw, Poland",Sciendo,1992,,Quarterly,1846-9558,1330-0075,1330-0075,eng,N,2003.0,Y,Y,IM,https://sciendo.com/journal/ACPH,Y,1846-9558,yes,0.0,0.0,,yes,,Acta Pharmaceutica,1846-9558,,https://sciendo.com/journal/ACPH,62066.0,pl,commercial_publisher,https://sciendo.com/,Sciendo,https://v2.sherpa.ac.uk/id/publisher_policy/3523,2014-04-10 10:12:04,2022-12-23 09:58:33,26385.0,1846-9558,,Acta Pharmaceutica,1846-9558,No,2007-11-23T10:28:44Z,2007.0,CC BY-NC-ND,No,2022-02-28T14:12:39Z,Yes,yes,,,,,,,,,,,,,,,,,1.0,2003.0,2021.0,735.0,590.0,20.0,
88,0370365,Acta psychiatrica Scandinavica. Supplementum,Acta Psychiatr Scand Suppl,Denmark,"Malden, MA",Wiley-Blackwell,1961,,Irregular,1600-5473,0065-1591,0065-1591,eng,N,1965.0,Y,Y,IM,http://onlinelibrary.wiley.com/journal/10.1111/(ISSN)1600-0447,Y,1600-5473,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,1959.0,2015.0,1664.0,1292.0,,
109,0407712,"Advances in anatomy, embryology, and cell biology",Adv Anat Embryol Cell Biol,Germany,Berlin,Springer Verlag,1973,,Irregular,,0301-5556,0301-5556,eng,N,1965.0,Y,Y,IM,https://link.springer.com/bookseries/102,Y,0301-5556,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,1973.0,2021.0,345.0,249.0,18.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4951,0051012,Texas medicine,Tex Med,United States,Austin,Texas Medical Assn.,1966,,Monthly,1938-3223,0040-4470,0040-4470,eng,N,1965.0,Y,Y,IM,http://www.texmed.org/Template.aspx?id=552,Y,1938-3223,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,1965.0,2021.0,5615.0,0.0,50.0,
5081,0417475,Uirusu,Uirusu,Japan,Kyoto,Society Of Japanese Virologists,1958,,Semiannual,,0042-6857,0042-6857,jpn,N,1972.0,Y,Y,IM,https://www.jstage.jst.go.jp/browse/jsv,Y,0042-6857,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,1972.0,2021.0,989.0,756.0,11.0,
5161,0330122,"Vital and health statistics. Series 2, Data evaluation and methods research",Vital Health Stat 2,United States,"Hyattsville, Md.",U.S. National Center for Health Statistics,1963,,Irregular,2333-0872,0083-2057,0083-2057,eng,N,1977.0,Y,Y,IM,http://www.cdc.gov/nchs/products/series/series02.htm,Y,2333-0872,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,1965.0,2021.0,185.0,9.0,1.0,
5187,9607541,Windows in time,Windows Time,United States,"Charlottesville, Va.","Center for Nursing Historical Inquiry, University of Virginia, School of Nursing",1993,,Semiannual,2576-5221,2576-5213,2576-5213,eng,N,,Y,Y,QIS,https://www.nursing.virginia.edu/nursing-history/nursing-history-newsletter/,Y,2576-5221,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,1999.0,2018.0,38.0,0.0,,


## Add unconquerable tag to journals


In [62]:
df_unconquerables

Unnamed: 0,NlmUniqueID,Title,MedlineTA,Country,Place,Publisher,PublicationFirstYear,PublicationEndYear,Frequency,ISSN-Electronic,ISSN-Print,ISSN-Linking,Language,TitleContinuationYN,IndexingStartDate,CurrentlyIndexedYN,IndexOnlineYN,IndexingSubset,IndexingSelectedURL,ReportedMedlineYN,ISSN,sherpa_has_oa_path,embargo,embargo_published_version,embargo_accepted_version,sherpa_oa_green,additional_oa_fee,title_sherpa,issne_sherpa,issnp_sherpa,url,publisher_id,publisher_country,publisher_type,publisher_url,publisher_name,sherpa_uri,sherpa_created,sherpa_last_modified,sherpa_id,DOAJ_pissn,ISSN_old,DOAJ_title,DOAJ_eissn,DOAJ_Seal,DOAJ_date,DOAJ_year,DOAJ_license,DOAJ_author_holds_copyright_without_restrictions,DOAJ_last_updated,DOAJ_APC,DOAJ,year_founded,year_reverse_flipped,journal_location,society_affiliation,other_sci_affiliation,flipped_journal_name,post_flip_publisher,flipped_access_model,apc_pre,apc_post,apc_now,discipline,flipped_oa,flipped_born_oa,flipped_embargo,Reverse_Flip,not_sherpa_and_not_doaj_or_not_oa,unconquerables
0,101714112,A&A practice,A A Pract,United States,"[Philadelphia, PA]","Wolters Kluwer Health, Inc.",2018,,Biweekly,2575-3126,,2575-3126,eng,Y,2018.0,Y,Y,IM,https://ovidsp.ovid.com/ovidweb.cgi?T=JS&MODE=ovid&PAGE=toc&D=ovft&AN=02054229-000000000-00000,Y,2575-3126,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,yes,1.0
1,101269322,AACN advanced critical care,AACN Adv Crit Care,United States,"Aliso Viejo, CA",American Association of Critical-Care Nurses (AACN),2006,,Quarterly,1559-7776,1559-7768,1559-7768,eng,Y,2006.0,Y,Y,N,https://aacnjournals.org/aacnacconline,Y,1559-7776,yes,,,,,,AACN Advanced Critical Care,1559-7776,1559-7768,http://acc.aacnjournals.org/,663.0,us,society_publisher,https://www.aacn.org/,American Association of Critical Care Nurses,https://v2.sherpa.ac.uk/id/publisher_policy/663,2010-08-24 15:05:09,2022-07-08 08:42:33,10921.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,yes,1.0
2,0431420,AANA journal,AANA J,United States,"Park Ridge, Ill.",American Association of Nurse Anesthetists,1974,,Bimonthly,2162-5239,0094-6354,0094-6354,eng,N,1974.0,Y,Y,N,https://www.aana.com/publications/aana-journal,Y,2162-5239,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,yes,1.0
3,101206716,Acta biochimica et biophysica Sinica,Acta Biochim Biophys Sin (Shanghai),China,Shanghai,China Science Publishing & Media Ltd.,2004,,Monthly,1745-7270,1672-9145,1672-9145,eng,Y,2004.0,Y,Y,IM,https://academic.oup.com/abbs,Y,1745-7270,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,yes,1.0
4,101194794,Acta of bioengineering and biomechanics,Acta Bioeng Biomech,Poland,Wrocław,Oficyna Wydawnicza Politechniki Wrocławskiej,1999,,Irregular,,1509-409X,1509-409X,eng,N,2007.0,Y,Y,IM,http://www.actabio.pwr.wroc.pl/archive.php,Y,1509-409X,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,yes,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
778,9425197,Zhonghua yi xue yi chuan xue za zhi = Zhonghua yixue yichuanxue zazhi = Chinese journal of medical genetics,Zhonghua Yi Xue Yi Chuan Xue Za Zhi,China,"Chengdu, Sichuan, P.R. China",Sichuan University,1992,,Bimonthly,,1003-9406,1003-9406,chi,N,1998.0,Y,N,IM,,Y,1003-9406,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,yes,1.0
779,7511141,Zhonghua yi xue za zhi,Zhonghua Yi Xue Za Zhi,China,Beijing,Zhonghua yi xue hui,1960,,"Semimonthly,",,0376-2491,0376-2491,chi,N,1973.0,Y,N,IM,,Y,0376-2491,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,yes,1.0
780,7904962,Zhonghua yu fang yi xue za zhi [Chinese journal of preventive medicine],Zhonghua Yu Fang Yi Xue Za Zhi,China,Beijing,Zhonghua yi xue hui,1967,,Bimonthly,,0253-9624,0253-9624,chi,N,1979.0,Y,N,IM,,Y,0253-9624,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,yes,1.0
781,7910681,Zhonghua zhong liu za zhi [Chinese journal of oncology],Zhonghua Zhong Liu Za Zhi,China,Peking,Chinese Medical Association,1979,,Bimonthly,,0253-3766,0253-3766,chi,N,1979.0,Y,N,IM,,Y,0253-3766,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,yes,1.0


In [63]:
# merge
df_journals = df_journals.merge(df_unconquerables[['NlmUniqueID', 'unconquerables']], on='NlmUniqueID', how='left')
df_journals

Unnamed: 0,NlmUniqueID,Title,MedlineTA,Country,Place,Publisher,PublicationFirstYear,PublicationEndYear,Frequency,ISSN-Electronic,ISSN-Print,ISSN-Linking,Language,TitleContinuationYN,IndexingStartDate,CurrentlyIndexedYN,IndexOnlineYN,IndexingSubset,IndexingSelectedURL,ReportedMedlineYN,ISSN,sherpa_has_oa_path,embargo,embargo_published_version,embargo_accepted_version,sherpa_oa_green,additional_oa_fee,title_sherpa,issne_sherpa,issnp_sherpa,url,publisher_id,publisher_country,publisher_type,publisher_url,publisher_name,sherpa_uri,sherpa_created,sherpa_last_modified,sherpa_id,DOAJ_pissn,ISSN_old,DOAJ_title,DOAJ_eissn,DOAJ_Seal,DOAJ_date,DOAJ_year,DOAJ_license,DOAJ_author_holds_copyright_without_restrictions,DOAJ_last_updated,DOAJ_APC,DOAJ,year_founded,year_reverse_flipped,journal_location,society_affiliation,other_sci_affiliation,flipped_journal_name,post_flip_publisher,flipped_access_model,apc_pre,apc_post,apc_now,discipline,flipped_oa,flipped_born_oa,flipped_embargo,Reverse_Flip,CurrentlyIndexed,first_year,last_year,PMIDs,DOIs,PMIDs_2021,PMIDs_2022,unconquerables
0,9015384,20 century British history,20 Century Br Hist,England,"Eynsham, Oxford",Oxford University Press,1990,,"4 no. a year,",1477-4674,0955-2359,0955-2359,eng,N,1990.0,Y,N,QIS,,Y,1477-4674,yes,24.0,,24.0,yes,yes,Twentieth Century British History,1477-4674,0955-2359,https://academic.oup.com/tcbh,55.0,gb,university_publisher,https://academic.oup.com/journals/,Oxford University Press,https://v2.sherpa.ac.uk/id/publisher_policy/1112,2010-07-15 16:04:39,2022-07-26 10:25:23,1406.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,1990.0,2019.0,145.0,144.0,,,
1,101714112,A&A practice,A A Pract,United States,"[Philadelphia, PA]","Wolters Kluwer Health, Inc.",2018,,Biweekly,2575-3126,,2575-3126,eng,Y,2018.0,Y,Y,IM,https://ovidsp.ovid.com/ovidweb.cgi?T=JS&MODE=ovid&PAGE=toc&D=ovft&AN=02054229-000000000-00000,Y,2575-3126,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,2018.0,2022.0,999.0,999.0,187.0,73.0,1.0
2,101269322,AACN advanced critical care,AACN Adv Crit Care,United States,"Aliso Viejo, CA",American Association of Critical-Care Nurses (AACN),2006,,Quarterly,1559-7776,1559-7768,1559-7768,eng,Y,2006.0,Y,Y,N,https://aacnjournals.org/aacnacconline,Y,1559-7776,yes,,,,,,AACN Advanced Critical Care,1559-7776,1559-7768,http://acc.aacnjournals.org/,663.0,us,society_publisher,https://www.aacn.org/,American Association of Critical Care Nurses,https://v2.sherpa.ac.uk/id/publisher_policy/663,2010-08-24 15:05:09,2022-07-08 08:42:33,10921.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,2006.0,2022.0,851.0,837.0,59.0,37.0,1.0
3,0431420,AANA journal,AANA J,United States,"Park Ridge, Ill.",American Association of Nurse Anesthetists,1974,,Bimonthly,2162-5239,0094-6354,0094-6354,eng,N,1974.0,Y,Y,N,https://www.aana.com/publications/aana-journal,Y,2162-5239,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,1974.0,2022.0,2547.0,0.0,75.0,60.0,1.0
4,101223209,The AAPS journal,AAPS J,United States,"Arlington, Va., USA",American Association of Pharmaceutical Scientists,2004,,Four no. a year,1550-7416,,1550-7416,eng,Y,2004.0,Y,Y,IM,https://link.springer.com/journal/12248,Y,1550-7416,yes,12.0,,12.0,yes,yes,AAPS Journal,,1550-7416,http://link.springer.com/journal/12248,313.0,us,client_organisation,https://www.aaps.org/home,American Association of Pharmaceutical Scientists,https://v2.sherpa.ac.uk/id/publisher_policy/3291,2010-09-15 13:16:19,2023-01-05 14:55:40,16180.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,2004.0,2022.0,1976.0,1976.0,123.0,109.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5274,8702287,Zoological science,Zoolog Sci,Japan,"Tokyo, Japan",Zoological Society of Japan,1984,,"Monthly,",,0289-0003,0289-0003,eng,N,2002.0,Y,Y,IM,http://www.bioone.org/loi/jzoo,Y,0289-0003,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1984.0,2005.0,Japan,Zoological Society of Japan,0,Zoological Science,Zoological Society of Japan,Subscription,,-,-,LS,1.0,0.0,,yes,1.0,1987.0,2022.0,2680.0,2635.0,62.0,53.0,1.0
5275,9435608,"Zoology (Jena, Germany)",Zoology (Jena),Germany,"Jena, Germany",Urban & Fischer,1994,,"Six no. a year,",1873-2720,0944-2006,0944-2006,eng,N,2005.0,Y,Y,IM,https://www.sciencedirect.com/journal/zoology,Y,1873-2720,yes,12.0,,12.0,yes,yes,,,,,,,,,,,,,15919.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,2001.0,2022.0,1019.0,1017.0,74.0,41.0,
5276,101300786,Zoonoses and public health,Zoonoses Public Health,Germany,"Berlin, Germany",Blackwell Verlag,2007,,Ten no. a year,1863-2378,1863-1959,1863-1959,eng,Y,2007.0,Y,Y,IM,http://onlinelibrary.wiley.com/journal/10.1111/(ISSN)1863-2378,Y,1863-2378,yes,12.0,,12.0,yes,yes,Zoonoses and Public Health,1863-2378,1863-1959,https://onlinelibrary.wiley.com/journal/18632378,580.0,us,commercial_publisher,https://www.wiley.com/en-gb,Wiley,https://v2.sherpa.ac.uk/id/publisher_policy/2050,2010-07-20 14:58:33,2022-07-27 12:37:58,2555.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,2007.0,2022.0,1445.0,1444.0,102.0,117.0,
5277,101179386,Zootaxa,Zootaxa,New Zealand,"Auckland, N.Z.",Magnolia Press,2001,,Irregular,1175-5334,1175-5326,1175-5326,eng,N,2013.0,Y,Y,IM,http://www.mapress.com/j/zt/,Y,1175-5334,yes,,,,,yes,Zootaxa,1175-5334,1175-5326,https://www.mapress.com/zt/,284.0,nz,commercial_publisher,https://www.mapress.com/,Magnolia Press,https://v2.sherpa.ac.uk/id/publisher_policy/284,2010-06-30 17:47:50,2022-07-15 08:53:17,6.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,2008.0,2022.0,20965.0,20886.0,2316.0,1315.0,


In [64]:
df_journals.loc[df_journals['unconquerables'] == 1].shape[0]

783

## Export journals data

Choice of fileds to keep:
 * NlmUniqueID
 * Country
 * Publisher
 * PublicationFirstYear
 * PublicationEndYear
 * Frequency
 * Language
 * first_year
 * last_year
 * PMIDs
 * PMIDs 2021
 * PMIDs 2022
 * DOIs
 * unconquerables

In [65]:
# csv export
df_journals[['NlmUniqueID',
             'Title',
             'Country',
             'Publisher',
             'PublicationFirstYear',
             'PublicationEndYear',
             'Frequency',
             'Language',
             'first_year',
             'last_year',
             'PMIDs',
             'PMIDs_2021',
             'PMIDs_2022',
             'DOIs',
             'unconquerables'            
            ]].to_csv('results/2023/journals_final_data.tsv', sep='\t', index=False, header=True)

## Add unconquerable tag to MeSH


In [66]:
# Open MeSH data
df_mesh = pd.read_csv('results/2023/MeshHeadings.tsv', delimiter='\t', header=0)
df_mesh

Unnamed: 0,NlmUniqueID,MeshHeading,sherpa_id_x,sherpa_has_oa_path,embargo,sherpa_id_y,embargo_published_version,embargo_accepted_version,sherpa_oa_green,sherpa_id,additional_oa_fee,sherpa_oa_hybrid,DOAJ_Seal,DOAJ,Reverse_Flip,flipped_embargo,flipped_access_model,flipped_born_oa
0,9015384,History,1406.0,yes,24.0,1406.0,,24.0,yes,1406.0,yes,hybrid or gold,,,,,,
1,9015384,United Kingdom,1406.0,yes,24.0,1406.0,,24.0,yes,1406.0,yes,hybrid or gold,,,,,,
2,101637720,Anesthesiology,,,,,,,,,,,,,,,,
3,101714112,Anesthesiology,,,,,,,,,,,,,,,,
4,101269322,Critical Care,10921.0,yes,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26785,21830020R,History of Medicine,,,,,,,,,,,,,,,,
26786,0233767,Dentistry,,,,,,,,,,,,,,,,
26787,9309124,Embryonic Development,2242.0,yes,6.0,2242.0,,6.0,yes,2242.0,yes,hybrid or gold,,,,,,
26788,9309124,Fetal Development,2242.0,yes,6.0,2242.0,,6.0,yes,2242.0,yes,hybrid or gold,,,,,,


In [67]:
# convert to str
df_mesh.loc[df_mesh['NlmUniqueID'].notna(), 'NlmUniqueID'] = df_mesh['NlmUniqueID'].astype(str)

In [68]:
# merge
df_mesh = df_mesh.merge(df_journals[['NlmUniqueID',
             'Country',
             'Publisher',
             'PublicationFirstYear',
             'PublicationEndYear',
             'Frequency',
             'Language',
             'first_year',
             'last_year',
             'PMIDs',
             'DOIs',
             'unconquerables'            
            ]], on='NlmUniqueID', how='left')
df_mesh

Unnamed: 0,NlmUniqueID,MeshHeading,sherpa_id_x,sherpa_has_oa_path,embargo,sherpa_id_y,embargo_published_version,embargo_accepted_version,sherpa_oa_green,sherpa_id,additional_oa_fee,sherpa_oa_hybrid,DOAJ_Seal,DOAJ,Reverse_Flip,flipped_embargo,flipped_access_model,flipped_born_oa,Country,Publisher,PublicationFirstYear,PublicationEndYear,Frequency,Language,first_year,last_year,PMIDs,DOIs,unconquerables
0,9015384,History,1406.0,yes,24.0,1406.0,,24.0,yes,1406.0,yes,hybrid or gold,,,,,,,England,Oxford University Press,1990,,"4 no. a year,",eng,1990.0,2019.0,145.0,144.0,
1,9015384,United Kingdom,1406.0,yes,24.0,1406.0,,24.0,yes,1406.0,yes,hybrid or gold,,,,,,,England,Oxford University Press,1990,,"4 no. a year,",eng,1990.0,2019.0,145.0,144.0,
2,101637720,Anesthesiology,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,101714112,Anesthesiology,,,,,,,,,,,,,,,,,United States,"Wolters Kluwer Health, Inc.",2018,,Biweekly,eng,2018.0,2022.0,999.0,999.0,1.0
4,101269322,Critical Care,10921.0,yes,,,,,,,,,,,,,,,United States,American Association of Critical-Care Nurses (AACN),2006,,Quarterly,eng,2006.0,2022.0,851.0,837.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26785,21830020R,History of Medicine,,,,,,,,,,,,,,,,,,,,,,,,,,,
26786,0233767,Dentistry,,,,,,,,,,,,,,,,,,,,,,,,,,,
26787,9309124,Embryonic Development,2242.0,yes,6.0,2242.0,,6.0,yes,2242.0,yes,hybrid or gold,,,,,,,England,Cambridge University Press,1993,,"Quarterly,",eng,1993.0,2022.0,1709.0,1641.0,
26788,9309124,Fetal Development,2242.0,yes,6.0,2242.0,,6.0,yes,2242.0,yes,hybrid or gold,,,,,,,England,Cambridge University Press,1993,,"Quarterly,",eng,1993.0,2022.0,1709.0,1641.0,


In [69]:
# take the number of journals for each MeSH
df_mesh_counts = df_mesh[['MeshHeading', 'NlmUniqueID']].groupby(by='MeshHeading').count()
# reset index and rename columns
df_mesh_counts = df_mesh_counts.reset_index(drop=False)
df_mesh_counts = df_mesh_counts.rename(columns={'NlmUniqueID' : 'count'})
df_mesh_counts

Unnamed: 0,MeshHeading,count
0,AIDS Dementia Complex,1
1,AIDS Vaccines,2
2,AIDS-Related Opportunistic Infections,1
3,Abdomen,3
4,"Abnormalities, Drug-Induced",5
...,...,...
2658,Zimbabwe,3
2659,Zoology,33
2660,Zoonoses,5
2661,Zygote,1


In [70]:
# take the number of journals for each MeSH for unconquerables
df_mesh_unconquerables_counts = df_mesh.loc[df_mesh['unconquerables'] == 1][['MeshHeading', 'NlmUniqueID']].groupby(by='MeshHeading').count()
# reset index and rename columns
df_mesh_unconquerables_counts = df_mesh_unconquerables_counts.reset_index(drop=False)
df_mesh_unconquerables_counts = df_mesh_unconquerables_counts.rename(columns={'NlmUniqueID' : 'unconquerables_count'})
df_mesh_unconquerables_counts

Unnamed: 0,MeshHeading,unconquerables_count
0,Accidents,1
1,"Accidents, Occupational",1
2,"Accidents, Traffic",1
3,Acquired Immunodeficiency Syndrome,2
4,Actuarial Analysis,1
...,...,...
608,Weights and Measures,1
609,Wound Healing,1
610,Wounds and Injuries,9
611,Yoga,1


In [71]:
# merge
df_mesh_counts = df_mesh_counts.merge(df_mesh_unconquerables_counts, on='MeshHeading', how='left')
df_mesh_counts

Unnamed: 0,MeshHeading,count,unconquerables_count
0,AIDS Dementia Complex,1,
1,AIDS Vaccines,2,
2,AIDS-Related Opportunistic Infections,1,
3,Abdomen,3,
4,"Abnormalities, Drug-Induced",5,
...,...,...,...
2658,Zimbabwe,3,
2659,Zoology,33,2.0
2660,Zoonoses,5,
2661,Zygote,1,


In [72]:
# replace NaN by 0
df_mesh_counts.loc[df_mesh_counts['unconquerables_count'].isna(), 'unconquerables_count'] = 0
df_mesh_counts.loc[df_mesh_counts['unconquerables_count'].notna(), 'unconquerables_count'] = df_mesh_counts['unconquerables_count'].astype(int)
df_mesh_counts

Unnamed: 0,MeshHeading,count,unconquerables_count
0,AIDS Dementia Complex,1,0
1,AIDS Vaccines,2,0
2,AIDS-Related Opportunistic Infections,1,0
3,Abdomen,3,0
4,"Abnormalities, Drug-Induced",5,0
...,...,...,...
2658,Zimbabwe,3,0
2659,Zoology,33,2
2660,Zoonoses,5,0
2661,Zygote,1,0


In [73]:
# add ratios
df_mesh_counts['ratio'] = df_mesh_counts['unconquerables_count'] / df_mesh_counts['count']
df_mesh_counts

Unnamed: 0,MeshHeading,count,unconquerables_count,ratio
0,AIDS Dementia Complex,1,0,0.000000
1,AIDS Vaccines,2,0,0.000000
2,AIDS-Related Opportunistic Infections,1,0,0.000000
3,Abdomen,3,0,0.000000
4,"Abnormalities, Drug-Induced",5,0,0.000000
...,...,...,...,...
2658,Zimbabwe,3,0,0.000000
2659,Zoology,33,2,0.060606
2660,Zoonoses,5,0,0.000000
2661,Zygote,1,0,0.000000


In [74]:
# test high ratios (> 0.15)
df_mesh_counts.loc[df_mesh_counts['ratio'] > 0.15].sort_values(by='ratio', ascending=False)

Unnamed: 0,MeshHeading,count,unconquerables_count,ratio
22,Actuarial Analysis,1,1,1.000000
887,Excipients,1,1,1.000000
1515,Maternal-Child Health Services,1,1,1.000000
1511,Maternal Health,1,1,1.000000
1487,Mali,1,1,1.000000
...,...,...,...,...
1185,History of Nursing,6,1,0.166667
914,Family Practice,38,6,0.157895
1494,Managed Care Programs,26,4,0.153846
633,Dental Hygienists,13,2,0.153846


In [75]:
# test highest ratios (= 1)
df_mesh_counts.loc[df_mesh_counts['ratio'] == 1].sort_values(by='count', ascending=False)

Unnamed: 0,MeshHeading,count,unconquerables_count,ratio
708,Diving,2,2,1.0
2123,Psychoanalytic Therapy,2,2,1.0
1241,Hyperbaric Oxygenation,2,2,1.0
1651,Moxibustion,1,1,1.0
1951,"Peritoneal Dialysis, Continuous Ambulatory",1,1,1.0
...,...,...,...,...
993,Gastrointestinal Motility,1,1,1.0
1099,Health Equity,1,1,1.0
1106,Health Inequities,1,1,1.0
1175,Hispanic Americans,1,1,1.0


In [76]:
# export CSV
df_mesh_counts.sort_values(by='MeshHeading', ascending=True).to_csv('results/2023/mesh_counts.tsv', sep='\t', index=False)

## Add unconquerable tag to Broad Journal Headings


In [77]:
import pandas as pd
# Open journals data
df_journals = pd.read_csv('results/2023/journals_final_data.tsv', delimiter='\t', header=0)
df_journals

Unnamed: 0,NlmUniqueID,Title,Country,Publisher,PublicationFirstYear,PublicationEndYear,Frequency,Language,first_year,last_year,PMIDs,PMIDs_2021,PMIDs_2022,DOIs,unconquerables
0,9015384,20 century British history,England,Oxford University Press,1990,,"4 no. a year,",eng,1990.0,2019.0,145.0,,,144.0,
1,101714112,A&A practice,United States,"Wolters Kluwer Health, Inc.",2018,,Biweekly,eng,2018.0,2022.0,999.0,187.0,73.0,999.0,1.0
2,101269322,AACN advanced critical care,United States,American Association of Critical-Care Nurses (AACN),2006,,Quarterly,eng,2006.0,2022.0,851.0,59.0,37.0,837.0,1.0
3,0431420,AANA journal,United States,American Association of Nurse Anesthetists,1974,,Bimonthly,eng,1974.0,2022.0,2547.0,75.0,60.0,0.0,1.0
4,101223209,The AAPS journal,United States,American Association of Pharmaceutical Scientists,2004,,Four no. a year,eng,2004.0,2022.0,1976.0,123.0,109.0,1976.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5274,8702287,Zoological science,Japan,Zoological Society of Japan,1984,,"Monthly,",eng,1987.0,2022.0,2680.0,62.0,53.0,2635.0,1.0
5275,9435608,"Zoology (Jena, Germany)",Germany,Urban & Fischer,1994,,"Six no. a year,",eng,2001.0,2022.0,1019.0,74.0,41.0,1017.0,
5276,101300786,Zoonoses and public health,Germany,Blackwell Verlag,2007,,Ten no. a year,eng,2007.0,2022.0,1445.0,102.0,117.0,1444.0,
5277,101179386,Zootaxa,New Zealand,Magnolia Press,2001,,Irregular,eng,2008.0,2022.0,20965.0,2316.0,1315.0,20886.0,


In [78]:
# Open Broad Journal Headings
df_bjh = pd.read_csv('results/2023/BroadJournalHeadings.tsv', delimiter='\t', header=0)
df_bjh

Unnamed: 0,NlmUniqueID,BroadJournalHeading,sherpa_id_x,sherpa_has_oa_path,embargo,sherpa_id_y,embargo_published_version,embargo_accepted_version,sherpa_oa_green,sherpa_id,additional_oa_fee,sherpa_oa_hybrid,DOAJ_Seal,DOAJ,Reverse_Flip,flipped_embargo,flipped_access_model,flipped_born_oa
0,9015384,History of Medicine,1406.0,yes,24.0,1406.0,,24.0,yes,1406.0,yes,hybrid or gold,,,,,,
1,101637720,Anesthesiology,,,,,,,,,,,,,,,,
2,101714112,Anesthesiology,,,,,,,,,,,,,,,,
3,101269322,Critical Care,10921.0,yes,,,,,,,,,,,,,,
4,101269322,Nursing,10921.0,yes,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16386,0056272,Reproductive Medicine,,,,,,,,,,,,,,,,
16387,0056272,Veterinary Medicine,,,,,,,,,,,,,,,,
16388,21830020R,History of Medicine,,,,,,,,,,,,,,,,
16389,0233767,Dentistry,,,,,,,,,,,,,,,,


In [79]:
# convert to str
df_bjh.loc[df_bjh['NlmUniqueID'].notna(), 'NlmUniqueID'] = df_bjh['NlmUniqueID'].astype(str)

In [80]:
# merge
df_bjh = df_bjh.merge(df_journals[['NlmUniqueID',
             'Country',
             'Publisher',
             'PublicationFirstYear',
             'PublicationEndYear',
             'Frequency',
             'Language',
             'first_year',
             'last_year',
             'PMIDs',
             'DOIs',
             'unconquerables'            
            ]], on='NlmUniqueID', how='left')
df_bjh

Unnamed: 0,NlmUniqueID,BroadJournalHeading,sherpa_id_x,sherpa_has_oa_path,embargo,sherpa_id_y,embargo_published_version,embargo_accepted_version,sherpa_oa_green,sherpa_id,additional_oa_fee,sherpa_oa_hybrid,DOAJ_Seal,DOAJ,Reverse_Flip,flipped_embargo,flipped_access_model,flipped_born_oa,Country,Publisher,PublicationFirstYear,PublicationEndYear,Frequency,Language,first_year,last_year,PMIDs,DOIs,unconquerables
0,9015384,History of Medicine,1406.0,yes,24.0,1406.0,,24.0,yes,1406.0,yes,hybrid or gold,,,,,,,England,Oxford University Press,1990,,"4 no. a year,",eng,1990.0,2019.0,145.0,144.0,
1,101637720,Anesthesiology,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,101714112,Anesthesiology,,,,,,,,,,,,,,,,,United States,"Wolters Kluwer Health, Inc.",2018,,Biweekly,eng,2018.0,2022.0,999.0,999.0,1.0
3,101269322,Critical Care,10921.0,yes,,,,,,,,,,,,,,,United States,American Association of Critical-Care Nurses (AACN),2006,,Quarterly,eng,2006.0,2022.0,851.0,837.0,1.0
4,101269322,Nursing,10921.0,yes,,,,,,,,,,,,,,,United States,American Association of Critical-Care Nurses (AACN),2006,,Quarterly,eng,2006.0,2022.0,851.0,837.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16386,0056272,Reproductive Medicine,,,,,,,,,,,,,,,,,,,,,,,,,,,
16387,0056272,Veterinary Medicine,,,,,,,,,,,,,,,,,,,,,,,,,,,
16388,21830020R,History of Medicine,,,,,,,,,,,,,,,,,,,,,,,,,,,
16389,0233767,Dentistry,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [81]:
# take the number of journals for each Broad Journal Headings
df_bjh_counts = df_bjh[['BroadJournalHeading', 'NlmUniqueID']].groupby(by='BroadJournalHeading').count()
# reset index and rename columns
df_bjh_counts = df_bjh_counts.reset_index(drop=False)
df_bjh_counts = df_bjh_counts.rename(columns={'NlmUniqueID' : 'count'})
df_bjh_counts

Unnamed: 0,BroadJournalHeading,count
0,Acquired Immunodeficiency Syndrome,91
1,Aerospace Medicine,21
2,Allergy and Immunology,302
3,Anatomy,84
4,Anesthesiology,88
...,...,...
120,Veterinary Medicine,184
121,Virology,62
122,Vital Statistics,28
123,Women's Health,25


In [82]:
# take the number of journals for each Broad Journal Headings for unconquerables
df_bjh_unconquerables_counts = df_bjh.loc[df_bjh['unconquerables'] == 1][['BroadJournalHeading', 'NlmUniqueID']].groupby(by='BroadJournalHeading').count()
# reset index and rename columns
df_bjh_unconquerables_counts = df_bjh_unconquerables_counts.reset_index(drop=False)
df_bjh_unconquerables_counts = df_bjh_unconquerables_counts.rename(columns={'NlmUniqueID' : 'unconquerables_count'})
df_bjh_unconquerables_counts

Unnamed: 0,BroadJournalHeading,unconquerables_count
0,Acquired Immunodeficiency Syndrome,3
1,Aerospace Medicine,2
2,Allergy and Immunology,17
3,Anatomy,5
4,Anesthesiology,6
...,...,...
108,Vascular Diseases,8
109,Veterinary Medicine,11
110,Virology,4
111,Vital Statistics,8


In [83]:
# merge
df_bjh_counts = df_bjh_counts.merge(df_bjh_unconquerables_counts, on='BroadJournalHeading', how='left')
df_bjh_counts

Unnamed: 0,BroadJournalHeading,count,unconquerables_count
0,Acquired Immunodeficiency Syndrome,91,3.0
1,Aerospace Medicine,21,2.0
2,Allergy and Immunology,302,17.0
3,Anatomy,84,5.0
4,Anesthesiology,88,6.0
...,...,...,...
120,Veterinary Medicine,184,11.0
121,Virology,62,4.0
122,Vital Statistics,28,8.0
123,Women's Health,25,


In [84]:
# replace NaN by 0
df_bjh_counts.loc[df_bjh_counts['unconquerables_count'].isna(), 'unconquerables_count'] = 0
df_bjh_counts.loc[df_bjh_counts['unconquerables_count'].notna(), 'unconquerables_count'] = df_bjh_counts['unconquerables_count'].astype(int)
df_bjh_counts

Unnamed: 0,BroadJournalHeading,count,unconquerables_count
0,Acquired Immunodeficiency Syndrome,91,3
1,Aerospace Medicine,21,2
2,Allergy and Immunology,302,17
3,Anatomy,84,5
4,Anesthesiology,88,6
...,...,...,...
120,Veterinary Medicine,184,11
121,Virology,62,4
122,Vital Statistics,28,8
123,Women's Health,25,0


In [85]:
# add ratios
df_bjh_counts['ratio'] = df_bjh_counts['unconquerables_count'] / df_bjh_counts['count']
df_bjh_counts

Unnamed: 0,BroadJournalHeading,count,unconquerables_count,ratio
0,Acquired Immunodeficiency Syndrome,91,3,0.032967
1,Aerospace Medicine,21,2,0.095238
2,Allergy and Immunology,302,17,0.056291
3,Anatomy,84,5,0.059524
4,Anesthesiology,88,6,0.068182
...,...,...,...,...
120,Veterinary Medicine,184,11,0.059783
121,Virology,62,4,0.064516
122,Vital Statistics,28,8,0.285714
123,Women's Health,25,0,0.000000


In [86]:
# test high ratios (> 0.15)
df_bjh_counts.loc[df_bjh_counts['ratio'] > 0.15].sort_values(by='ratio', ascending=False)

Unnamed: 0,BroadJournalHeading,count,unconquerables_count,ratio
32,Disaster Medicine,9,3,0.333333
122,Vital Statistics,28,8,0.285714
58,Laboratory Animal Science,11,3,0.272727
56,Internal Medicine,40,9,0.225
70,Neurosurgery,55,12,0.218182
93,Primary Health Care,51,10,0.196078
64,Military Medicine,28,5,0.178571
92,Podiatry,6,1,0.166667


In [87]:
# test highest ratios (= 1)
df_bjh_counts.loc[df_bjh_counts['ratio'] == 1].sort_values(by='count', ascending=False)

Unnamed: 0,BroadJournalHeading,count,unconquerables_count,ratio


In [88]:
# export CSV
df_bjh_counts.sort_values(by='BroadJournalHeading', ascending=True).to_csv('results/2023/broad_journal_headings_counts.tsv', sep='\t', index=False)