### **Atypical Combination**

Atypical combination measures how unconventional or novel a paper's combination of cited journals is. This approach uses permutation tests to establish a baseline expectation and calculate z-scores for journal pair co-citations.

For each paper, we:
1. Identify all pairs of journals cited together
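2. Generate a null distribution by repeatedly shuffling cited journals among references published in the same year (each paper keeps its number of references and their publication years)
3. Calculate a z-score for each journal pair by comparing its observed co-citation frequency to the shuffled baseline: z = (observed − mean of shuffled) / standard deviation of shuffled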
4. Aggregate z-scores across all pairs (e.g., median, 10th percentile)

Papers with lower z-scores (especially at the 10th percentile) cite journal combinations that are statistically unusual compared to what would be expected by chance. This captures the novelty of knowledge integration.

**Reference:**
 - Uzzi, Brian, et al. "Atypical combinations and scientific impact." Science 342.6157 (2013): 468-472.

In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm, trange
tqdm.pandas()
import json, os, shutil
from operator import add
from glob import glob
import pickle as pkl
from itertools import combinations
from collections import defaultdict
from tqdm.contrib.concurrent import process_map

from itertools import combinations
from collections import defaultdict

# Lookup from paper id to journal id. Rows missing either id are dropped so
# the journal ids can be stored as integers.
# NOTE(review): Series.map with a Series mapper requires a unique index, so
# this presumably relies on paper_id being unique in the feather file — confirm.
dict_paper_to_journal_id = (
    pd.read_feather('intermediate/dict_paper_to_journal_id.feather')
    .dropna(subset=["paper_id", "journal_id"])
    .set_index("paper_id")
    .journal_id
    .astype(int)
)

# Citation edges, keeping only those where the citing year is not earlier
# than the cited year.
pub_citing_cited_years = (
    pd.read_feather("intermediate/citing_cited_paper_id_year.feather")
    .query("citing_year >= cited_year")
)

# Resolve the journal of every cited paper; references whose cited paper has
# no known journal come back as NaN and are filtered out below.
cited_journal_id = pub_citing_cited_years["cited_paperid"].map(dict_paper_to_journal_id)
has_journal = cited_journal_id.notna()

# Build the column with `assign` on the filtered frame instead of assigning
# into a slice, which avoids pandas' SettingWithCopyWarning while producing
# the same table (original columns plus an integer `cited_journal_id`).
ref_df = pub_citing_cited_years[has_journal].assign(
    cited_journal_id=cited_journal_id[has_journal].astype(int)
)

# Free the large intermediates; only `ref_df` is used from here on.
del dict_paper_to_journal_id, pub_citing_cited_years, cited_journal_id, has_journal


def observe(df: pd.DataFrame, max_refs: int = 1000) -> pd.Series:
	"""Count how often each pair of journals is cited together.

	References are grouped by ``citing_paperid``; for every paper, all
	2-combinations of its sorted cited journal ids are generated and the
	occurrences of each pair are tallied over all papers. Because the
	combinations are taken over the full reference list (not a set), a paper
	citing the same journal several times contributes ``(j, j)`` pairs and
	may contribute the same pair more than once.

	Args:
		df: Reference table with ``citing_paperid`` and ``cited_journal_id``
			columns, one row per reference.
		max_refs: Papers with this many references or more contribute no
			pairs at all. Defaults to 1000, the previously hard-coded cutoff
			(presumably there to bound the quadratic number of pairs).

	Returns:
		A Series of pair counts indexed by ``(journal_a, journal_b)`` with
		``journal_a <= journal_b``, ordered by decreasing count.
	"""
	journals_per_paper = df.groupby("citing_paperid").cited_journal_id.apply(list)
	pair_counts = (
		journals_per_paper
		.map(lambda journals: journals if len(journals) < max_refs else [])
		# Sorting first makes (a, b) and (b, a) collapse onto the same key.
		.map(lambda journals: list(combinations(sorted(journals), 2)))
		.explode()
		# Papers that yield no pair explode to a single NaN row.
		.dropna()
		.value_counts()
	)
	return pair_counts

def shuffle(df: pd.DataFrame) -> pd.DataFrame:
	"""Return a copy of ``df`` with cited journals permuted within each cited year.

	Every other column is left as is, so each row keeps its citing paper and
	cited year; only ``cited_journal_id`` values are randomly reassigned among
	the rows sharing the same ``cited_year``. The input frame is not modified.
	Randomness comes from NumPy's global random state.
	"""
	permuted_journals = (
		df.groupby("cited_year", sort=False)['cited_journal_id']
		.transform(np.random.permutation)
	)
	return df.assign(cited_journal_id=permuted_journals)

def calculate_z_score(save_path: str, year: int, n_shuffles: int = 10) -> None:
	"""Compute per-paper atypicality scores for one citing year and save them.

	Reads the module-level ``ref_df`` and keeps the references whose
	``citing_year`` equals ``year``. Journal-pair counts from ``n_shuffles``
	shuffled copies of those references give a mean and standard deviation
	per pair; the observed count of each pair is turned into a z-score
	against them. For every citing paper, the median and the 10th percentile
	of the z-scores of its distinct journal pairs are written to
	``{save_path}/z_median_{year}.feather`` and
	``{save_path}/z_10_pct_{year}.feather``.

	Pairs that never occur in any shuffle, or whose shuffled counts have zero
	standard deviation, receive no z-score and are ignored. Papers left with
	no scored pair are omitted from the output.

	Args:
		save_path: Directory for the output files; created if missing.
		year: Citing year to process.
		n_shuffles: Number of shuffled replicates used for the baseline.
			Defaults to 10, the previously hard-coded value.

	Raises:
		ValueError: If ``n_shuffles`` is below 2, since a sample standard
			deviation cannot be computed from fewer replicates.
	"""
	if n_shuffles < 2:
		raise ValueError("n_shuffles must be at least 2 to estimate a standard deviation")

	ref_df_year = ref_df[ref_df.citing_year == year].reset_index(drop=True)

	# Null model: pair counts from independently shuffled reference tables.
	shuffled_counts = [
		observe(shuffle(ref_df_year))
		for _ in trange(n_shuffles, ncols=100, mininterval=1, desc="Shuffling")
	]
	# One column per replicate; a pair absent from a replicate counts as 0.
	expected_freq_df = pd.DataFrame(shuffled_counts).T.fillna(0)
	expected_freq_mean = expected_freq_df.mean(axis=1)
	expected_freq_std = expected_freq_df.std(axis=1)

	observed_freq = observe(ref_df_year)
	# Only pairs seen both in the data and in the shuffles, with non-zero
	# spread, can be standardized.
	index = pd.Index(observed_freq.index).intersection(expected_freq_std[expected_freq_std != 0].index)
	z_score_for_year = (observed_freq[index] - expected_freq_mean[index]) / expected_freq_std[index]
	z_score_for_year = z_score_for_year.dropna().to_dict()

	# Distinct journal pairs per citing paper (a set, so each pair is used once).
	# NOTE(review): unlike observe(), no reference-count cutoff is applied
	# here — confirm whether that asymmetry is intended.
	paper_pairs = (
		ref_df_year
		.groupby("citing_paperid")
		.cited_journal_id.apply(list)
		.map(lambda x: set(combinations(sorted(x), 2)))
		.reset_index()
	)

	z_median_list = []
	z_10_pct_list = []

	for row in tqdm(paper_pairs.itertuples(), total=len(paper_pairs), ncols=100, mininterval=1, desc=f"Calculating z-score for year {year}"):
		paper_id = row.citing_paperid
		z_for_paper = [
			z_score_for_year[pair]
			for pair in row.cited_journal_id
			if pair in z_score_for_year
		]
		if z_for_paper:
			z_median, z_10_pct = np.quantile(z_for_paper, [0.5, 0.1])
			z_median_list.append([paper_id, z_median, len(z_for_paper)])
			z_10_pct_list.append([paper_id, z_10_pct, len(z_for_paper)])

	z_10_pct_df = pd.DataFrame(z_10_pct_list, columns=["paper_id", "z_10_pct", "valid_pairs"])
	z_median_df = pd.DataFrame(z_median_list, columns=["paper_id", "z_median", "valid_pairs"])

	# Output ids are the citing paper ids prefixed with "pub.".
	z_10_pct_df["paper_id"] = "pub." + z_10_pct_df["paper_id"].astype(str)
	z_median_df["paper_id"] = "pub." + z_median_df["paper_id"].astype(str)

	os.makedirs(save_path, exist_ok=True)
	z_10_pct_df.to_feather(f"{save_path}/z_10_pct_{year}.feather")
	z_median_df.to_feather(f"{save_path}/z_median_{year}.feather")


# Run the z-score computation once per citing year from 1979 through 2010;
# each call writes that year's feather files under `save_path`.
save_path = 'intermediate/atypical-combination'
os.makedirs(save_path, exist_ok=True)

years = [*range(1979, 2011)]
for year in tqdm(
	years,
	ncols=100,
	mininterval=1,
	desc="Z-score Calculation",
):
	calculate_z_score(save_path, year)