### **Commonness**

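Commonness is a simplified measure of how typical or conventional a paper's combination of cited journals is. Unlike the atypical combination measure (which uses permutation tests to calculate z-scores), commonness is computed directly from co-citation counts: for each journal pair, the observed number of co-citations in a given year is divided by the number expected from how often each journal appears in pairs that year.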

For each paper, we:
1. Identify all pairs of journals cited together
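2. Look up the commonness of each pair among papers published in the same year, $\text{commonness}_{ijt} = \frac{N_{ijt} \cdot N_t}{N_{it} \cdot N_{jt}}$, i.e. the observed co-citation count of the two journals relative to the count expected from their individual frequencies
3. Take the negative log of these commonness values (lower commonness = higher novelty)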
4. Aggregate across all pairs (e.g., median, mean, percentiles)

This approach is computationally simpler than atypical combination because it doesn't require shuffling/permutation tests, but captures a similar concept: papers that cite unusual combinations of journals together.

**Reference:**
 - Lee, You-Na, John P. Walsh, and Jian Wang. "Creativity in scientific teams: Unpacking novelty and impact." Research Policy 44.3 (2015): 684-697.

In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm, trange
tqdm.pandas()
from matplotlib import pyplot as plt
import json, os, shutil
from operator import add
from glob import glob
import pickle as pkl

In [None]:
# Lookup table: paper_id -> journal_id.
dict_paper_to_journal_id = (
    pd.read_feather('intermediate/dict_paper_to_journal_id.feather')
    .set_index("paper_id")["journal_id"]
    .to_dict()
)
print(f'len(dict_paper_to_journal_id) = {len(dict_paper_to_journal_id)}')

# Lookup table: year -> the `paper_ids` entry stored for that year.
dict_year_to_paper_ids = (
    pd.read_feather('intermediate/dict_year_to_paper_ids.feather')
    .set_index("year")["paper_ids"]
    .to_dict()
)
print(f'len(dict_year_to_paper_ids) = {len(dict_year_to_paper_ids)}')

# Lookup table: citing paper id -> list holding the first element of every
# entry in `cited_list` (presumably the cited paper id of an (id, year) pair,
# judging by the file name -- confirm against the upstream export).
dict_citing_to_cited = (
    pd.read_feather('data/dict_citing_to_cited_id_and_year.feather')
    .set_index("citing_paperid")["cited_list"]
    .progress_map(lambda entries: [entry[0] for entry in entries])
    .to_dict()
)
print(f'len(dict_citing_to_cited) = {len(dict_citing_to_cited)}')

In [None]:
from collections import Counter
from itertools import combinations
from tqdm.contrib.concurrent import process_map

def process_subset(args):
    """Count co-cited journal pairs for one batch of citing papers.

    Reads the module-level lookups ``dict_citing_to_cited`` and
    ``dict_paper_to_journal_id``.

    Parameters
    ----------
    args : tuple
        ``(citing_paper_ids, year)``: an iterable of citing paper ids and the
        year that labels every pair counted from them.

    Returns
    -------
    tuple of collections.Counter
        ``(N_i_j_t, N_i_t, N_j_t, N_t)`` where

        * ``N_i_j_t[(i, j, t)]`` is how often journals ``i`` and ``j``
          (``i <= j`` in sort order) occur together in a reference list;
          repeated citations to the same journal yield repeated pairs.
        * ``N_i_t[(i, t)]`` is the number of pair occurrences whose first
          (smaller) element is ``i``.
        * ``N_j_t[(j, t)]`` is the number of pair occurrences whose second
          (larger) element is ``j``.
        * ``N_t[t]`` is the total number of pair occurrences.
    """
    citing_paper_ids, year = args
    N_i_j_t, N_i_t, N_j_t, N_t = Counter(), Counter(), Counter(), Counter()

    for citing_paper_id in tqdm(citing_paper_ids, ncols=100, mininterval=1, desc=f'Processing year {year}'):
        # Map every cited paper to its journal; papers without a known
        # journal are skipped.
        cited_journals = []
        for cited_paper_id in dict_citing_to_cited.get(citing_paper_id, []):
            cited_journal_id = dict_paper_to_journal_id.get(cited_paper_id)
            if cited_journal_id is not None:
                cited_journals.append(cited_journal_id)

        # Sorting first makes (i, j) canonical, so (A, B) and (B, A) share a
        # key. Pairs are consumed lazily instead of being stored in a list.
        for i, j in combinations(sorted(cited_journals), r=2):
            N_i_j_t[(i, j, year)] += 1

    # NOTE(review): the marginals below are position-specific (i only as the
    # first element, j only as the second). The reference cited at the top of
    # this notebook appears to define N_it as the number of pairs that
    # *include* journal i, regardless of position -- confirm which is intended.
    for (i, j, t), n_i_j_t in N_i_j_t.items():
        N_i_t[(i, t)] += n_i_j_t
        N_j_t[(j, t)] += n_i_j_t
        N_t[t] += n_i_j_t
    return N_i_j_t, N_i_t, N_j_t, N_t

# One task per year: (paper ids of that year, year).
tasks = [(dict_year_to_paper_ids[year], year) for year in sorted(dict_year_to_paper_ids.keys())]
# chunksize must be 1 here: tasks are grouped into chunks of `chunksize`
# before being handed to workers, and with only one task per year a chunk size
# of 1000 would pack every year into a single chunk processed by one worker.
results = process_map(process_subset, tasks, max_workers=40, chunksize=1)


# Merge the per-task counters into global ones.
N_i_j_t, N_i_t, N_j_t, N_t = Counter(), Counter(), Counter(), Counter()
for local_i_j_t, local_i_t, local_j_t, local_t in tqdm(results, ncols=100, mininterval=1):
    N_i_j_t += local_i_j_t
    N_i_t   += local_i_t
    N_j_t   += local_j_t
    N_t     += local_t

# commonness = observed pair count relative to the count expected from the
# two marginals: N_ijt * N_t / (N_it * N_jt).
commonness_i_j_t = {}
for (i, j, t) in tqdm(N_i_j_t.keys(), ncols=100, mininterval=1):
    n_i_j_t, n_i_t, n_j_t, n_t = N_i_j_t[(i, j, t)], N_i_t[(i, t)], N_j_t[(j, t)], N_t[t]
    commonness_i_j_t[(i, j, t)] = {
        'N_i_j_t': n_i_j_t, 'N_i_t': n_i_t, 'N_j_t': n_j_t, 'N_t': n_t,
        'commonness': n_i_j_t * n_t / (n_i_t * n_j_t)
    }

print("Transforming to Dataframe")
# The (i, j, t) tuple keys become three index levels, which reset_index()
# turns into the first three columns.
df_commonness = pd.DataFrame.from_dict(commonness_i_j_t, orient='index').reset_index()
df_commonness.columns = ['journal_id_1', 'journal_id_2', 'year', 'N_i_j_t', 'N_i_t', 'N_j_t', 'N_t', 'commonness']

print("Saving to disk")
df_commonness.to_csv('data/commonness_journal_pairs.tsv', sep='\t', index=False)
df_commonness.to_feather('data/commonness_journal_pairs.feather')

In [None]:
# Load the journal-pair table from disk and turn it into a lookup of the form
# {(journal_id_1, journal_id_2, year): commonness}.
commonness_i_j_t = pd.read_csv('data/commonness_journal_pairs.tsv', sep='\t')
commonness_i_j_t = commonness_i_j_t.set_index(['journal_id_1', 'journal_id_2', 'year'])
commonness_i_j_t = commonness_i_j_t['commonness'].to_dict()

from collections import Counter
from itertools import combinations
def calculate_commonness(args):
    """Summarise the commonness of the journal pairs cited by one paper.

    Reads the module-level lookups ``dict_citing_to_cited``,
    ``dict_paper_to_journal_id`` and ``commonness_i_j_t``.

    Parameters
    ----------
    args : tuple
        ``(citing_paper_id, year)``.

    Returns
    -------
    dict or None
        ``None`` when the paper yields no journal pair, or when none of its
        pairs has an entry in ``commonness_i_j_t``. Otherwise a record with
        the negative log of the 1/5/10/25/50 % quantiles of the pair
        commonness values, the mean and standard deviation of the negative
        log values, and ``valid_pairs``.
    """
    citing_paper_id, year = args

    # Map every cited paper to its journal; papers without a known journal
    # are skipped.
    cited_journals = []
    for cited_paper_id in dict_citing_to_cited.get(citing_paper_id, []):
        cited_journal_id = dict_paper_to_journal_id.get(cited_paper_id)
        if cited_journal_id is not None:
            cited_journals.append(cited_journal_id)

    # Number of unordered pairs, C(n, 2), computed without materialising them.
    n_journals = len(cited_journals)
    n_pairs = n_journals * (n_journals - 1) // 2
    if n_pairs == 0:
        return None

    # One dictionary lookup per pair; pairs missing from the table are dropped.
    looked_up = (
        commonness_i_j_t.get((i, j, year))
        for i, j in combinations(sorted(cited_journals), r=2)
    )
    commonness_values = np.array([value for value in looked_up if value is not None])

    if len(commonness_values) == 0:
        return None

    # Low commonness quantiles correspond to the most unusual pairs; all
    # quantiles are taken in a single call and the log transform is shared.
    q01, q05, q10, q25, q50 = -np.log(
        np.quantile(commonness_values, [0.01, 0.05, 0.10, 0.25, 0.50])
    )
    neg_log_values = -np.log(commonness_values)

    return {
        "year": year,
        "paper_id": f"pub.{citing_paper_id}",
        '1_pct_commonness': q01,
        '5_pct_commonness': q05,
        '10_pct_commonness': q10,
        '25_pct_commonness': q25,
        'median_commonness': q50,
        'mean_commonness_mean': np.mean(neg_log_values),
        'std_commonness_std': np.std(neg_log_values),
        # NOTE(review): counts every journal pair of the paper, including
        # pairs that had no entry in commonness_i_j_t -- confirm that this is
        # the intended meaning of "valid".
        "valid_pairs": n_pairs,
    }

In [None]:
# Flatten the year -> paper ids mapping into one (paper id, year) task per paper.
tasks = [
    (paper_id, pub_year)
    for pub_year, paper_ids in dict_year_to_paper_ids.items()
    for paper_id in paper_ids
]

# Score every paper in parallel and keep only the papers that produced a record.
paper_commonness = [
    record
    for record in process_map(calculate_commonness, tasks, max_workers=40, chunksize=1000)
    if record is not None
]

# Persist the per-paper records as TSV and as feather.
df_paper_commonness = pd.DataFrame(paper_commonness)
df_paper_commonness.to_csv('data/commonness.tsv', sep='\t', index=False)
df_paper_commonness.to_feather('data/commonness.feather')