In [1]:
# Core data-analysis stack used throughout the notebook.
import pandas as pd, numpy as np
from matplotlib import pyplot as plt
# Raise pandas' display limits to 200 rows / 100 columns so wide or long
# frames are shown with less truncation.
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 100)

# Serialization, text normalization, file globbing and small stdlib helpers.
import json, pickle as pkl
from unidecode import unidecode
from glob import glob
from collections import Counter, defaultdict
from itertools import product, combinations
from copy import deepcopy

# Progress bars with notebook-wide defaults (100-column bar, refresh at most
# once per second).
from functools import partial
from tqdm import tqdm, trange
# Hook tqdm into pandas with the same bar settings. This call MUST stay before
# the rebinding on the next line: afterwards `tqdm` names a functools.partial
# object, which has no `.pandas` attribute.
tqdm.pandas(ncols=100, mininterval=1)
# Shadow `tqdm` / `trange` with partials so every later call picks up the
# defaults above without repeating them.
tqdm, trange = partial(tqdm, ncols=100, mininterval=1), partial(trange, ncols=100, mininterval=1)

In [None]:
from pathlib import Path

import polars as pl

# Citation edge table. This cell reads only two of its columns:
#   - cited_paperid: ID of the paper receiving the citation
#   - year_diff: presumably (citing year - cited year) -- TODO confirm upstream
citation_by_time_pl = pl.read_parquet("../parquet/processed/pub_citing_cited_years.parquet")

# Cumulative citation windows, in years. Each entry produces one
# `citation_<N>y` column.
CITATION_WINDOWS_YEARS = (1, 2, 3, 5, 10)

# citation_counts columns:
#   - paper_id: cited paper ID (the paper receiving citations)
#   - citation_<N>y: number of citations with year_diff <= N, for N in
#     CITATION_WINDOWS_YEARS (cumulative, so citation_1y <= citation_2y <= ...)
#   - citation_inf: number of citations with a non-null year_diff (no time limit)
#
# NOTE(review): `year_diff <= N` also counts rows with a negative year_diff
# (citing record dated before the cited one), and `count()` skips null
# year_diff rows, so citation_inf can be smaller than the raw number of
# citation edges. Confirm both are intended for this dataset.
citation_counts = (
    citation_by_time_pl
    .group_by("cited_paperid")
    .agg(
        [
            (pl.col("year_diff") <= years).sum().alias(f"citation_{years}y")
            for years in CITATION_WINDOWS_YEARS
        ]
        + [pl.col("year_diff").count().alias("citation_inf")]
    )
    .rename({"cited_paperid": "paper_id"})
    # group_by does not guarantee row order; sort so the exported files are
    # identical from run to run.
    .sort("paper_id")
)

print(f"Citation counts shape: {citation_counts.shape}")

# Create the output directory if needed so a fresh checkout can run this cell.
OUTPUT_DIR = Path("data")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Same table in two formats: TSV for inspection/interchange, Feather (Arrow IPC)
# for fast typed reloads.
citation_counts.write_csv(OUTPUT_DIR / "citation-counts-within-time-window.tsv", separator="\t")
citation_counts.write_ipc(OUTPUT_DIR / "citation-counts-within-time-window.feather")

# Last expression of the cell -> rendered with the notebook's rich table display.
citation_counts.head()