# Reformatting .pkl to panel data

Expected inputs (in .pkl format):
- combined jstor + scopus metadata
- author names and affiliations data
- references data
- tables, figures and equations data


Output:
- flattened versions


In [1]:
import pandas as pd
# from unidecode import unidecode
import re
from datetime import date
import json
import numpy as np
import string
import time
from itertools import combinations
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx
import numpy as np
from itertools import combinations
import pickle

# Allow up to 100 characters per cell before pandas truncates the text.
pd.set_option('display.max_colwidth', 100)

# None removes the row limit, so DataFrames are printed in full.
pd.set_option('display.max_rows', None)

In [2]:
# Absolute, machine-specific data directories. Both end with a slash because
# file names are appended to them by plain string concatenation.
base_path="/Users/sijiawu/Work/Thesis/Data/Affiliations/"
data_base_path="/Users/sijiawu/Work/Thesis/Data/"

In [3]:
# Pickled inputs, read from the absolute data directories.
# NOTE(review): unpickling executes arbitrary code; only load files you trust.
proc_auths_all = pd.read_pickle(base_path+"proc_auth_aff_flat.pkl")
aff_sub=pd.read_pickle(base_path+"affiliations_combined_sub.pkl")
j_data=pd.read_pickle(data_base_path+"Combined/011_merged_proc_scopus_inception_2020.pkl")
# Reference tables, read from Excel via paths relative to the working directory.
all_refs=pd.read_excel('../031_proc_refs_full_set/refs_1940_2020.xlsx')
relevant=pd.read_excel('../031_proc_refs_full_set/refs_1940_2020_top5.xlsx')

In [4]:
def compute_jaccard_similarity(set1, set2):
    """Return the Jaccard index of two sets: |intersection| / |union|.

    When both sets are empty the union is empty as well; 0 is returned in
    that case instead of dividing by zero.
    """
    combined = set1 | set2
    if len(combined) == 0:
        return 0
    shared = set1 & set2
    return len(shared) / len(combined)

In [5]:
# The article identifier is the last "/"-separated segment of the URL.
j_data["id"]=j_data["URL"].str.split("/").str[-1]

In [6]:
# Normalise key dtypes: ids as strings, years as integers.
relevant["id_o"]=relevant["id_o"].astype(str)
relevant["year_o"]=relevant["year_o"].astype(int)
# Derive the same kind of id for the author table: last segment of the URL.
proc_auths_all["id_o"]=proc_auths_all["url"].str.split("/").str[-1]
# Keep only the columns used by the similarity computations further down.
relevant_sub=relevant[["ref_ord", "id_o", "year_o","match_id"]]

In [7]:
# Print every distinct id that contains a dot.
# NOTE(review): presumably a sanity check that the URL split produced clean
# ids - confirm the intent with the author.
for i in proc_auths_all['id_o'].unique():
    if "." in i:
        print(i)

In [8]:
# Preview the first rows of the author table, including the new id_o column.
proc_auths_all.head()

Unnamed: 0,auth_ord,a1,a2,a3,last,affs,year,content_type,jid,url,a1_order,a2_order,a3_order,fl,a1_tk_count,id_o
0,0,benjamin enke,b. enke,b. enke,enke,"{national bureau of economic research - nber, harvard university}",2020,Article,qje,https://doi.org/10.1093/qje/qjaa012,12220,4712,4304,benjamin enke,2,qjaa012
1,0,lauren f. bergquist,l. f. bergquist,l. bergquist,bergquist,{university of michigan},2020,Article,aer,https://www.jstor.org/stable/26966478,4049,5985,13392,lauren bergquist,3,26966478
2,1,michael dinerstein,m. dinerstein,m. dinerstein,dinerstein,{university of chicago},2020,Article,aer,https://www.jstor.org/stable/26966478,1884,15221,13547,michael dinerstein,2,26966478
3,0,dominic coey,d. coey,d. coey,coey,{facebook},2020,Article,aer,https://www.jstor.org/stable/26966479,3032,7322,6575,dominic coey,2,26966479
4,1,bradley j. larsen,b. j. larsen,b. larsen,larsen,"{stanford university, national bureau of economic research - nber}",2020,Article,aer,https://www.jstor.org/stable/26966479,1624,1330,4346,bradley larsen,3,26966479


In [19]:
def build_coauthorship_network(collabs):
    """Build an undirected, weighted co-authorship graph.

    Parameters
    ----------
    collabs : pandas.DataFrame
        Square matrix of collaboration counts. Every index label must also be
        a column label; entry ``(a, b)`` is the weight between ``a`` and ``b``.

    Returns
    -------
    networkx.Graph
        One node per index label (isolated authors included) and one edge for
        every unordered pair ``(a, b)``, with ``a`` before ``b`` in the index,
        whose entry ``collabs.loc[a, b]`` is non-zero. The entry is stored as
        the edge attribute ``"weight"``.
    """
    labels = list(collabs.index)
    G = nx.Graph()
    G.add_nodes_from(labels)
    if len(labels) < 2:
        # No pair to connect.
        return G

    # Align rows and columns by label, then locate the non-zero cells with
    # NumPy. Visiting each pair in Python with scalar .loc lookups dominates
    # the run time once there are a few thousand authors.
    weights = collabs.loc[labels, labels].values
    # k=1 keeps only the cells strictly above the diagonal: each unordered
    # pair is considered once and self-pairs are ignored.
    rows, cols = np.nonzero(np.triu(weights, k=1))
    G.add_weighted_edges_from(
        (labels[r], labels[c], weights[r, c]) for r, c in zip(rows, cols)
    )
    return G

def get_network_features(G, author1, author2):
    """Return the hop distance and the number of shortest paths between two nodes.

    Parameters
    ----------
    G : networkx.Graph
        Graph containing both nodes.
    author1, author2 : hashable
        Source and target node.

    Returns
    -------
    (distance, num_paths)
        ``distance`` is the number of edges on a shortest path (edge weights
        are ignored) and ``num_paths`` is how many distinct shortest paths
        exist. When the nodes are not connected the result is ``(np.inf, 0)``.
    """
    try:
        # A single enumeration provides both features; a separate
        # shortest_path_length call would traverse the graph a second time.
        shortest_paths = list(
            nx.all_shortest_paths(G, source=author1, target=author2)
        )
    except nx.NetworkXNoPath:
        return np.inf, 0  # No path exists

    # All shortest paths have the same length; a path listing k nodes spans
    # k - 1 edges.
    return len(shortest_paths[0]) - 1, len(shortest_paths)

def compute_cosine_similarity(matrix):
    """Return the cosine similarity between every pair of rows of ``matrix``.

    The result is a square DataFrame whose index and columns are both the
    index of ``matrix``.
    """
    labels = matrix.index
    return pd.DataFrame(
        cosine_similarity(matrix.values),
        index=labels,
        columns=labels,
    )

def cit_matrix(aff_auths, order, cit_data):
    """Build the author-by-reference count matrix and its cosine similarity.

    ``cit_data`` is joined to the ``order`` and ``"id_o"`` columns of
    ``aff_auths`` on ``"id_o"``, so that every citation row is repeated for
    each author of the citing paper. The shape of the citation table is
    printed before and after the join.

    Returns a dict with:
      "matrix"     - rows are ``order`` values, columns are ``match_id``
                     values, cells are the number of joined rows (0 if none)
      "sim_matrix" - cosine similarity between the rows of "matrix"
    """
    print(cit_data.shape)
    author_keys = aff_auths[[order, "id_o"]]
    author_citations = cit_data.merge(author_keys, on="id_o")
    print(author_citations.shape)

    counts = author_citations.pivot_table(
        index=order,
        columns="match_id",
        aggfunc="size",
        fill_value=0,
    )
    similarity = compute_cosine_similarity(counts)
    return {"matrix": counts, "sim_matrix": similarity}

def collab_matrix(aff_auths, order):
    """Count, for every pair of authors, the papers they wrote together.

    Rows of ``aff_auths`` are grouped by ``"url"`` (one group per paper) and
    every pair of ``order`` values inside a group adds one to both symmetric
    cells of the matrix.

    Returns a square integer DataFrame indexed (rows and columns) by the
    distinct ``order`` values in order of first appearance.
    """
    labels = aff_auths[order].unique()
    position = dict(zip(labels, range(len(labels))))
    counts = np.zeros((len(labels), len(labels)), dtype=int)

    for paper_authors in aff_auths.groupby("url")[order].apply(list):
        for first, second in combinations(paper_authors, 2):
            row, col = position[first], position[second]
            counts[row, col] += 1
            counts[col, row] += 1

    return pd.DataFrame(counts, index=labels, columns=labels)
    
def reduce_affs_jacc_sim(aff_auths, order, cits):
    """Compute pairwise similarity features for all authors in ``aff_auths``.

    Parameters
    ----------
    aff_auths : pandas.DataFrame
        One row per author of a paper. The columns read here and in the
        helpers are ``order``, ``"affs"`` (a set of affiliations), ``"url"``
        and ``"id_o"``.
    order : str
        Name of the column that identifies an author.
    cits : pandas.DataFrame
        Citation rows, handed to ``cit_matrix`` unchanged.

    Returns
    -------
    dict
        "sims_pairs"    - DataFrame with one row per unordered author pair
                          and the columns pair_1, pair_2, aff_jacc_sim,
                          collabs, distance, network_proximity,
                          log_num_paths, cit_cos_sim
        "aff_sets"      - dict mapping each author to the union of the
                          affiliation sets on that author's rows
        "aff_flat_sets" - the same mapping as a DataFrame (order, affs)
        "collabs"       - co-authorship count matrix from ``collab_matrix``
        "cit"           - result of ``cit_matrix``
        "auth_net"      - co-authorship graph built from "collabs"
    """
    collabs = collab_matrix(aff_auths, order)

    # Union of affiliations per author. The first set seen for an author is
    # copied: storing the original object and calling update() on it later
    # would silently modify the sets held by the caller's DataFrame.
    o = {}
    for author, affs in zip(aff_auths[order], aff_auths["affs"]):
        if author in o:
            o[author].update(affs)  # Merge sets
        else:
            o[author] = set(affs)

    cit_mat = cit_matrix(aff_auths, order, cits)
    sim_matrix = cit_mat["sim_matrix"]
    # Authors that have no row in the citation matrix get similarity 0.
    sim_authors = sim_matrix.index

    G_t = build_coauthorship_network(collabs)

    o_proc = []
    for first, second in combinations(o, 2):
        distance, num_paths = get_network_features(G_t, first, second)
        if first in sim_authors and second in sim_authors:
            cit_cos_sim = sim_matrix.loc[first, second]
        else:
            cit_cos_sim = 0
        o_proc.append({
            "pair_1": first,
            "pair_2": second,
            "aff_jacc_sim": compute_jaccard_similarity(o[first], o[second]),
            "collabs": collabs.loc[first, second],
            "distance": distance,
            # Unconnected authors (infinite distance) get proximity 0.
            "network_proximity": 1 / distance if distance != np.inf else 0,
            "log_num_paths": np.log(num_paths + 1),  # Avoid log(0)
            "cit_cos_sim": cit_cos_sim,
        })

    o_flat = [{"order": author, "affs": affs} for author, affs in o.items()]

    return {
        "sims_pairs": pd.DataFrame(o_proc),
        "aff_sets": o,
        "aff_flat_sets": pd.DataFrame(o_flat),
        "collabs": collabs,
        "cit": cit_mat,
        "auth_net": G_t,
    }

import pandas as pd
import numpy as np
from itertools import combinations

def reduce_affs_jacc_sim_optim(aff_auths, order, cits):
    """Compute pairwise similarity features for all authors in ``aff_auths``.

    Parameters
    ----------
    aff_auths : pandas.DataFrame
        One row per author of a paper. The columns read here and in the
        helpers are ``order``, ``"affs"`` (an iterable of affiliations),
        ``"url"`` and ``"id_o"``.
    order : str
        Name of the column that identifies an author.
    cits : pandas.DataFrame
        Citation rows, handed to ``cit_matrix`` unchanged.

    Returns
    -------
    dict
        "sims_pairs"    - DataFrame with one row per unordered author pair
                          and the columns pair_1, pair_2, aff_jacc_sim,
                          collabs, distance, network_proximity,
                          log_num_paths, cit_cos_sim
        "aff_sets"      - dict mapping each author to the union of the
                          affiliations on that author's rows
        "aff_flat_sets" - the same mapping as a DataFrame (order, affs)
        "collabs"       - co-authorship count matrix from ``collab_matrix``
        "cit"           - result of ``cit_matrix``
        "auth_net"      - co-authorship graph built from "collabs"
    """
    collabs = collab_matrix(aff_auths, order)

    # Union of affiliations per author; set() copies the first value so the
    # input rows are never modified.
    o = {}
    for key, affs in zip(aff_auths[order], aff_auths["affs"]):
        if key in o:
            o[key].update(affs)  # Merge sets directly
        else:
            o[key] = set(affs)  # Ensure it's a set

    cit_mat = cit_matrix(aff_auths, order, cits)
    sim_matrix = cit_mat["sim_matrix"]

    # Build coauthorship network once
    G_t = build_coauthorship_network(collabs)

    o_proc = []
    for a, b in combinations(o, 2):
        # One call per pair returns both the distance and the path count;
        # calling it once for each value would repeat the graph search.
        dist, num_paths = get_network_features(G_t, a, b)

        if a in collabs.index and b in collabs.columns:
            n_collabs = collabs.loc[a, b]
        else:
            n_collabs = 0

        if a in sim_matrix.index and b in sim_matrix.columns:
            cit_cos_sim = sim_matrix.loc[a, b]
        else:
            cit_cos_sim = 0

        o_proc.append({
            "pair_1": a,
            "pair_2": b,
            "aff_jacc_sim": compute_jaccard_similarity(o[a], o[b]),
            "collabs": n_collabs,
            "distance": dist,
            # Unconnected authors (infinite distance) get proximity 0.
            "network_proximity": (1 / dist) if dist != np.inf else 0,
            "log_num_paths": np.log(num_paths + 1),  # Avoid log(0)
            "cit_cos_sim": cit_cos_sim,
        })

    # Flatten the affiliation dictionary to DataFrame
    o_flat = [{"order": k, "affs": v} for k, v in o.items()]

    return {
        "sims_pairs": pd.DataFrame(o_proc),
        "aff_sets": o,
        "aff_flat_sets": pd.DataFrame(o_flat),
        "collabs": collabs,
        "cit": cit_mat,
        "auth_net": G_t
    }



def data_prep(t, duration, order):
    """Compute the similarity features for the window [t, t + duration).

    Reads the module-level tables ``proc_auths_all``, ``relevant_sub`` and
    ``j_data``, keeps the rows whose year falls inside the window and passes
    the author and citation subsets to ``reduce_affs_jacc_sim``. The window
    start and the elapsed seconds are printed.

    Returns the dict from ``reduce_affs_jacc_sim`` with one extra key,
    "j_data_t", holding the article metadata of the window.
    """
    print(t)
    window_end = t + duration

    auth_year = proc_auths_all['year']
    aff_t = proc_auths_all[(auth_year < window_end) & (auth_year >= t)].reset_index(drop=True)

    cit_year = relevant_sub["year_o"]
    cit_t = relevant_sub[(cit_year >= t) & (cit_year < window_end)].reset_index(drop=True)

    article_year = j_data["year"]
    article_cols = ['id', "title", "jid", "year", "content_type"]
    j_t = j_data[(article_year >= t) & (article_year < window_end)][article_cols].reset_index(drop=True)

    started = time.time()
    result = reduce_affs_jacc_sim(aff_t, order, cit_t)
    result["j_data_t"] = j_t
    finished = time.time()
    print(finished - started)
    return result

def data_prep_optim(t, duration, order):
    """Compute the similarity features for the window [t, t + duration).

    Works like ``data_prep`` but delegates to ``reduce_affs_jacc_sim_optim``.
    Reads the module-level tables ``proc_auths_all``, ``relevant_sub`` and
    ``j_data``; prints the window start and the elapsed seconds.

    Returns the dict from ``reduce_affs_jacc_sim_optim`` with one extra key,
    "j_data_t", holding the article metadata of the window.
    """
    print(t)
    window_end = t + duration

    auth_year = proc_auths_all['year']
    aff_t = proc_auths_all[(auth_year < window_end) & (auth_year >= t)].reset_index(drop=True)

    cit_year = relevant_sub["year_o"]
    cit_t = relevant_sub[(cit_year >= t) & (cit_year < window_end)].reset_index(drop=True)

    article_year = j_data["year"]
    article_cols = ['id', "title", "jid", "year", "content_type"]
    j_t = j_data[(article_year >= t) & (article_year < window_end)][article_cols].reset_index(drop=True)

    started = time.time()
    result = reduce_affs_jacc_sim_optim(aff_t, order, cit_t)
    result["j_data_t"] = j_t
    finished = time.time()
    print(finished - started)
    return result


In [16]:
# jacc_sims_10={}

# for i in range(1940,2020,10):
#     jacc_sims_10[i]=data_prep(i,10, 'a1_order')

# jacc_sims_20={}

# for i in range(1940,2020,20):
#     jacc_sims_20[i]=data_prep(i,20, 'a1_order')

In [17]:
# Results per decade, keyed by the first year of the decade.
jacc_sims_10={}

# range() excludes its end value, so this covers the decades 1940 to 1980.
for i in range(1940,1990,10):
    jacc_sims_10[i]=data_prep(i,10, 'a1_order')

1940
(2447, 4)
(2597, 5)
Index(['pair_1', 'pair_2', 'distance'], dtype='object')
9.258679866790771
1950
(3171, 4)
(3420, 5)
Index(['pair_1', 'pair_2', 'distance'], dtype='object')
12.634435892105103
1960
(3677, 4)
(4442, 5)
Index(['pair_1', 'pair_2', 'distance'], dtype='object')
24.34161686897278
1970
(10147, 4)
(13368, 5)
Index(['pair_1', 'pair_2', 'distance'], dtype='object')
75.93183279037476
1980
(15578, 4)
(23216, 5)
Index(['pair_1', 'pair_2', 'distance'], dtype='object')
157.0410521030426


In [None]:
# Pickle the per-decade results into the working directory. The current Unix
# timestamp is part of the file name (presumably so that repeated runs do not
# overwrite earlier output).
with open("flattened_data_10_dec_red"+str(time.time())+".pkl", "wb") as f:
        pickle.dump(jacc_sims_10, f)