# Process Sci-Hub download logs to count downloads per article per month

More information regarding Sci-Hub's logs is available at https://doi.org/10.7554/eLife.32822.

In [1]:
import collections
import csv
import itertools
import pathlib
import logging
import lzma
import urllib.request

import pandas

## Download Sci-Hub download logs

In [2]:
# Ensure downloads/scihub exists, creating the parent downloads directory if needed.
# parents=True is required because the path is nested; exist_ok=True makes reruns a no-op.
scihub_directory = pathlib.Path('downloads/scihub')
scihub_directory.mkdir(parents=True, exist_ok=True)

In [3]:
# Download the Sci-Hub log releases (2015-09 through 2016-02, and 2017).
# Each file is saved in scihub_directory under the final component of its URL
# and is only fetched when no file exists at that path yet.
urls = [
    'https://github.com/greenelab/scihub/raw/9280e4479fbe32a48d7c0f836b9292b0e4a5319c/download/scihub-logs/scihub-logs_2015-09_2016-02.tsv.xz',
    'https://github.com/greenelab/scihub/raw/9280e4479fbe32a48d7c0f836b9292b0e4a5319c/download/scihub-logs-2017/2017.statistics.tab.xz',
    'https://github.com/greenelab/scihub/raw/9280e4479fbe32a48d7c0f836b9292b0e4a5319c/download/scihub-logs-2017/2017.statistics.1016.j.tab.xz',
]
log_paths = []
for url in urls:
    path = scihub_directory.joinpath(pathlib.Path(url).name)
    log_paths.append(path)
    if path.exists():
        # Already downloaded on a previous run
        continue
    urllib.request.urlretrieve(url, path)

In [4]:
def get_2017_log_rows(path):
    """
    Return a generator of rows as dicts in 2017.statistics.tab. Note that column names are not
    part of the dataset and are thus inferred.

    Fields whose value is the literal string 'N/A' are replaced with None and
    the DOI is lowercased. Rows without a DOI (a truncated line, or a DOI of
    'N/A') cannot be processed: they are logged as a warning and skipped.

    path: location of the xz-compressed, tab-delimited log file.

    This function was derived from:
    https://github.com/greenelab/scihub/blob/9280e4479fbe32a48d7c0f836b9292b0e4a5319c/download/scihub-logs-2017/01.summarize-combined-logs.ipynb
    """
    columns = 'date', 'doi', 'IP_code', 'user_code', 'country', 'city', 'latitude', 'longitude'
    with lzma.open(path, 'rt') as read_file:
        rows = csv.DictReader(read_file, fieldnames=columns, delimiter='\t')
        for row in rows:
            # Null out the field itself: assign under the key, not under the
            # value (which would add a spurious 'N/A' column to the row).
            for key, value in row.items():
                if value == 'N/A':
                    row[key] = None
            doi = row['doi']
            if doi is None:
                # csv.DictReader fills absent trailing fields with None
                logging.warning(f'Could not process row:\n{row}')
                continue
            row['doi'] = doi.lower()
            yield row

def get_2015_log_rows(path):
    """
    Return a generator of rows as dicts in scihub-logs_2015-09_2016-02.tsv.xz.

    Column names are read from the file's header line. Fields whose value is
    the empty string are replaced with None and the DOI is lowercased. Rows
    without a DOI (a truncated line, or an empty DOI) cannot be processed:
    they are logged as a warning and skipped.

    path: location of the xz-compressed, tab-delimited log file.

    This function was derived from:
    https://github.com/greenelab/scihub/blob/9280e4479fbe32a48d7c0f836b9292b0e4a5319c/download/scihub-logs-2017/01.summarize-combined-logs.ipynb
    """
    with lzma.open(path, 'rt') as read_file:
        rows = csv.DictReader(read_file, delimiter='\t')
        for row in rows:
            # Null out the field itself: assign under the key, not under the
            # value (which would add a spurious '' column to the row).
            for key, value in row.items():
                if value == '':
                    row[key] = None
            doi = row['doi']
            if doi is None:
                # csv.DictReader fills absent trailing fields with None
                logging.warning(f'Could not process row:\n{row}')
                continue
            row['doi'] = doi.lower()
            yield row

In [5]:
def year_month_range(start, end):
    """
    Yield consecutive 'YYYY-MM' strings from start through end.
    Both start and end are 'YYYY-MM' strings and both are included in the output.
    Nothing is yielded when end precedes start.
    Based on https://stackoverflow.com/a/5734564/4651668
    """
    first_year, first_month = (int(part) for part in start.split('-'))
    last_year, last_month = (int(part) for part in end.split('-'))
    # Number each month consecutively (zero-based) so the span is a plain integer range
    first_index = first_year * 12 + (first_month - 1)
    last_index = last_year * 12 + (last_month - 1)
    for month_index in range(first_index, last_index + 1):
        year, zero_based_month = divmod(month_index, 12)
        yield f'{year}-{zero_based_month + 1:02d}'

# Months to retain: the 2015-09 to 2016-02 span followed by all of 2017
keep_year_months = [
    *year_month_range('2015-09', '2016-02'),
    *year_month_range('2017-01', '2017-12'),
]
print('Months in list:', len(keep_year_months))

Months in list: 18


## Count downloads on a per article/month basis

In [6]:
# List the local path of each log file, one per line
for path in log_paths:
    print(str(path))

downloads/scihub/scihub-logs_2015-09_2016-02.tsv.xz
downloads/scihub/2017.statistics.tab.xz
downloads/scihub/2017.statistics.1016.j.tab.xz


In [7]:
# Lazily stream rows from the three Sci-Hub log files, in log_paths order
rows = itertools.chain.from_iterable([
    get_2015_log_rows(log_paths[0]),
    get_2017_log_rows(log_paths[1]),
    get_2017_log_rows(log_paths[2]),
])
# For development, uncomment the following line to limit the number of rows
# rows = itertools.islice(rows, 10_000)

# Tally downloads per doi per month, structured like {doi: {year_month: count}},
# while recording every month encountered among the counted rows
doi_to_downloads = {}
observed_year_months = set()
for row in rows:
    doi = row['doi']
    # Only rows whose doi begins with '10.' are counted
    if doi.startswith('10.'):
        try:
            counter = doi_to_downloads[doi]
        except KeyError:
            counter = doi_to_downloads[doi] = collections.Counter()
        # The first 7 characters of the date field serve as the month key
        year_month = row['date'][:7]
        observed_year_months.add(year_month)
        counter[year_month] += 1

In [8]:
# Show months that appeared in the logs but are not retained as per keep_year_months.
# Some months, like 2016-03, have very few downloads and were not intended to be part of the log releases.
discarded_months = observed_year_months.difference(keep_year_months)
sorted(discarded_months)

['2016-03']

In [9]:
# # Create a tidy dataset rather than a matrix
# count_rows = list()
# for doi, counter in doi_to_downloads.items():
#     for year_month, count in counter.items():
#         count_rows.append((doi, year_month, count))
# download_df = pandas.DataFrame(count_rows, columns=['doi', 'year_month', 'scihub_downloads'])
# download_df.sort_values(['doi', 'year_month'], inplace=True)
# download_df.head()

In [10]:
# Build a doi × year_month matrix of download counts.
# Each row holds one doi's counts for the months in keep_year_months, in that order;
# a month with no recorded downloads for a doi yields 0 from its Counter.
count_rows = (
    [monthly_counts[month] for month in keep_year_months]
    for monthly_counts in doi_to_downloads.values()
)
download_df = pandas.DataFrame(
    data=count_rows,
    index=pandas.Index(list(doi_to_downloads), name='doi'),
    columns=keep_year_months,
)
download_df = download_df.sort_index()
download_df.head(3)

Unnamed: 0_level_0,2015-09,2015-10,2015-11,2015-12,2016-01,2016-02,2017-01,2017-02,2017-03,2017-04,2017-05,2017-06,2017-07,2017-08,2017-09,2017-10,2017-11,2017-12
doi,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
10.0000/aac.asm.org/aac/40/8/1914,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
10.0000/aac.asm.org/aac/42/1/53,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
10.0000/aac.asm.org/aac/43/6/1523,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0


In [11]:
# Total downloads for each month (summed over all dois), shown with thousands separators
download_df.sum(axis=0).apply(lambda total: f'{total:,}')

2015-09     4,904,314
2015-10     6,072,144
2015-11     1,849,380
2015-12     3,879,506
2016-01     4,901,508
2016-02     6,213,053
2017-01    12,015,777
2017-02    12,944,710
2017-03    16,994,058
2017-04    11,681,866
2017-05    17,919,468
2017-06    17,602,135
2017-07    23,145,854
2017-08    22,021,949
2017-09    19,625,247
2017-10     5,321,363
2017-11    21,021,397
2017-12    14,205,903
dtype: object

In [12]:
# Save the doi × year_month download matrix as a tab-separated file
download_df.to_csv(path_or_buf='data/03.scihub-traffic.tsv.xz', sep='\t')