In [1]:
import os
import pandas as pd

In [2]:
data_path = 'data'

# these files had been generated using BigQuery
cited_paper = pd.read_csv(os.path.join(data_path, 'citation_counts.000000000000.csv.gz'))
cited_paper.merge(pd.read_csv(os.path.join(data_path, 'citation_counts.000000000001.csv.gz')))
cited_paper.merge(pd.read_csv(os.path.join(data_path, 'citation_counts.000000000002.csv.gz')))
cited_papers = cited_paper.sort_values(
    'cited_cited_count', axis= 0, ascending=True, inplace=False, kind='quicksort', na_position='last'
)

In [3]:
cited_papers.head()

Unnamed: 0,doi,cited_cited_count,citing_citing_count
10417543,10.1007/978-94-007-7019-5_2,1,
2347276,10.1016/j.bushor.2015.06.002,1,
6104328,10.4095/126606,1,
6104324,10.1149/1.2427762,1,
6104323,10.1109/SSCI.2015.16,1,


In [4]:
citation_count_bounds = [
    (1000, 1100),
    (2000, 2300),
    (3000, 3500),
    (4000, 5000),
    (7000, 8000),
    (9000, None)
]

for lower_citation_count, upper_citation_count in citation_count_bounds:
    mask = (cited_paper['cited_cited_count'] > lower_citation_count)
    if upper_citation_count:
        mask &= (cited_paper['cited_cited_count'] <= upper_citation_count)
    cited_x_times = cited_paper[mask]['doi'].head(30)
    out_csv = os.path.join(data_path, 'DOI_cited_%d.tsv' % lower_citation_count)
    cited_x_times.to_csv(out_csv, index=False, header=True, sep='\t', encoding='utf-8')
    print(
        'within citation count range %s, %s, found %d manuscripts (max 30), written to %s' %
        (lower_citation_count, upper_citation_count, len(cited_x_times), out_csv)
    )

within citation count range 1000, 1100, found 30 manuscripts (max 30), written to data/DOI_cited_1000.tsv
within citation count range 2000, 2300, found 30 manuscripts (max 30), written to data/DOI_cited_2000.tsv
within citation count range 3000, 3500, found 30 manuscripts (max 30), written to data/DOI_cited_3000.tsv
within citation count range 4000, 5000, found 18 manuscripts (max 30), written to data/DOI_cited_4000.tsv
within citation count range 7000, 8000, found 6 manuscripts (max 30), written to data/DOI_cited_7000.tsv
within citation count range 9000, None, found 9 manuscripts (max 30), written to data/DOI_cited_9000.tsv


###  Check files

In [5]:
file_csv = os.path.join(data_path, 'DOI_cited_1000.tsv')
df1000 = pd.read_csv(file_csv, sep='\t', encoding='utf-8')
df1000.head()

Unnamed: 0,doi
0,10.1126/science.1235122
1,10.7326/0003-4819-121-12-199412150-00009
2,10.1111/j.1538-7836.2006.01753.x
3,10.1056/NEJMoa021641
4,10.1007/978-3-642-69746-3_2
