# Reformatting .pkl to panel data

Expected inputs (`.pkl` files):
- combined jstor + scopus metadata
- author names and affiliations data
- references data
- tables, figures and equations data


Output:
- flattened versions


In [2]:
import pandas as pd
# from unidecode import unidecode
import re
from datetime import date
import json
import numpy as np
import string
import time
from itertools import combinations
from sklearn.metrics.pairwise import cosine_similarity


# Show up to 100 characters of each cell value when a frame is displayed.
pd.set_option('display.max_colwidth', 100)

# None removes the cap on displayed rows, so always slice (e.g. .head()) before displaying big frames.
pd.set_option('display.max_rows', None)

In [3]:
# Folders the pickled inputs are read from.
# NOTE(review): absolute, machine-specific paths -- adjust before running elsewhere.
data_base_path = "/Users/sijiawu/Work/Thesis/Data/"
base_path = "/Users/sijiawu/Work/Thesis/Data/Affiliations/"

In [4]:
# Pickled frames read from the configured data folders.
# NOTE(review): pd.read_pickle can execute arbitrary code -- only load trusted files.
proc_auths_all = pd.read_pickle(base_path + "proc_auth_aff_flat.pkl")
aff_sub = pd.read_pickle(base_path + "affiliations_combined_sub.pkl")
j_data = pd.read_pickle(data_base_path + "Combined/011_merged_proc_scopus_inception_2020.pkl")

# Reference spreadsheets, read relative to the notebook's working directory.
all_refs = pd.read_excel('../031_proc_refs_full_set/refs_1940_2020.xlsx')
relevant = pd.read_excel('../031_proc_refs_full_set/refs_1940_2020_top5.xlsx')

In [5]:
def compute_jaccard_similarity(set1, set2):
    """Return the Jaccard similarity |set1 & set2| / |set1 | set2| of two sets.

    When both sets are empty the union has no elements and 0 is returned
    instead of dividing by zero.
    """
    shared = len(set1 & set2)
    combined = len(set1 | set2)
    if combined == 0:
        return 0
    return shared / combined

In [20]:
# Inspect the column names available in the merged metadata frame.
j_data.columns

Index(['issue_url', 'author', 'title', 'journal', 'volume', 'number', 'pages',
       'year', 'ISSN', 'abstract', 'URL', 'publisher', 'content_type', 'type',
       'jid', 'author_split', 'urldate', 'reviewed-author', 'uploaded',
       'title_10', 'URL_og', 'number_og', 'title_og', 'author_og', 'pages_og',
       'j_fix', 'scopus_jid', 'scopus_id', 'scopus_authorgroup',
       'scopus_authors', 'scopus_affiliations', 'scopus_references',
       'scopus_author_full_names', 'scopus_title', 'scopus_year',
       'scopus_source_title', 'scopus_volume', 'scopus_issue', 'scopus_art_no',
       'scopus_page_start', 'scopus_page_end', 'scopus_page_count',
       'scopus_cited_by', 'scopus_doi', 'scopus_abstract', 'scopus_publisher',
       'scopus_document_type', 'scopus_publication_stage',
       'scopus_open_access', 'scopus_source', 'scopus_eid', 'scopus_title_og',
       'scopus_volume_og', 'scopus_issue_og', 'scopus_page_start_og',
       'scopus_page_end_og', 'scopus_year_og', 's_fix', 

In [23]:
# The article id is the last "/"-separated segment of the URL.
url_segments = j_data["URL"].str.split("/")
j_data["id"] = url_segments.str[-1]

In [6]:
# Cast the citing-article key to str and its year to int before filtering and merging on them.
relevant["id_o"] = relevant["id_o"].astype(str)
relevant["year_o"] = relevant["year_o"].astype(int)

# Same key on the author side: the last "/"-separated segment of the article url.
author_url_segments = proc_auths_all["url"].str.split("/")
proc_auths_all["id_o"] = author_url_segments.str[-1]

In [7]:
# Keep only the columns needed for the citation matrices.
relevant_sub = relevant.loc[:, ["ref_ord", "id_o", "year_o", "match_id"]]

In [8]:
# Sanity check: print every distinct article id that contains a "." character.
for dotted_id in (pid for pid in proc_auths_all['id_o'].unique() if "." in pid):
    print(dotted_id)

In [9]:
# Preview the author-level frame, including the id_o column derived from url.
proc_auths_all.head()

Unnamed: 0,auth_ord,a1,a2,a3,last,affs,year,content_type,jid,url,a1_order,a2_order,a3_order,fl,a1_tk_count,id_o
0,0,benjamin enke,b. enke,b. enke,enke,"{national bureau of economic research - nber, harvard university}",2020,Article,qje,https://doi.org/10.1093/qje/qjaa012,12220,4712,4304,benjamin enke,2,qjaa012
1,0,lauren f. bergquist,l. f. bergquist,l. bergquist,bergquist,{university of michigan},2020,Article,aer,https://www.jstor.org/stable/26966478,4049,5985,13392,lauren bergquist,3,26966478
2,1,michael dinerstein,m. dinerstein,m. dinerstein,dinerstein,{university of chicago},2020,Article,aer,https://www.jstor.org/stable/26966478,1884,15221,13547,michael dinerstein,2,26966478
3,0,dominic coey,d. coey,d. coey,coey,{facebook},2020,Article,aer,https://www.jstor.org/stable/26966479,3032,7322,6575,dominic coey,2,26966479
4,1,bradley j. larsen,b. j. larsen,b. larsen,larsen,"{national bureau of economic research - nber, stanford university}",2020,Article,aer,https://www.jstor.org/stable/26966479,1624,1330,4346,bradley larsen,3,26966479


In [24]:

def compute_cosine_similarity(matrix):
    """Return the pairwise cosine similarity between the rows of ``matrix``.

    Parameters
    ----------
    matrix : pandas.DataFrame
        One row per entity; the row labels are reused for the result.

    Returns
    -------
    pandas.DataFrame
        Square frame whose index and columns are both ``matrix.index``.
    """
    row_labels = matrix.index
    similarities = cosine_similarity(matrix.values)
    return pd.DataFrame(similarities, index=row_labels, columns=row_labels)

def cit_matrix(aff_auths, order, cit_data):
    """Build an author-by-cited-work count matrix and its cosine similarities.

    Parameters
    ----------
    aff_auths : pandas.DataFrame
        Author rows; must contain the ``order`` column and ``"id_o"``.
    order : str
        Name of the column identifying an author.
    cit_data : pandas.DataFrame
        Citation rows; must contain ``"id_o"`` and ``"match_id"``.

    Returns
    -------
    dict
        ``"matrix"``: counts of citation rows per (author, ``match_id``);
        ``"sim_matrix"``: cosine similarity between the author rows.

    The shapes of the citation frame before and after the merge are printed
    as a quick diagnostic.
    """
    print(cit_data.shape)

    # Attach the author identifier to every citation row of the same article.
    author_keys = aff_auths[[order, "id_o"]]
    author_citations = cit_data.merge(author_keys, on="id_o")
    print(author_citations.shape)

    # aggfunc="size" counts rows per cell; absent combinations become 0.
    counts = author_citations.pivot_table(
        index=order,
        columns="match_id",
        aggfunc="size",
        fill_value=0,
    )
    return {"matrix": counts, "sim_matrix": compute_cosine_similarity(counts)}

def collab_matrix(aff_auths, order):
    """Count, for every pair of authors, how many papers they share.

    Parameters
    ----------
    aff_auths : pandas.DataFrame
        Author rows; must contain ``"url"`` (one value per paper) and the
        ``order`` column.
    order : str
        Name of the column identifying an author.

    Returns
    -------
    pandas.DataFrame
        Symmetric integer matrix indexed (rows and columns) by the distinct
        values of ``order`` in order of first appearance.
    """
    distinct_authors = aff_auths[order].unique()
    position = dict(zip(distinct_authors, range(len(distinct_authors))))
    n_authors = len(distinct_authors)
    counts = np.zeros((n_authors, n_authors), dtype=int)

    # Every unordered pair of author rows on the same paper adds one
    # collaboration in both directions.
    for _, paper_authors in aff_auths.groupby("url")[order]:
        for left, right in combinations(list(paper_authors), 2):
            row, col = position[left], position[right]
            counts[row, col] += 1
            counts[col, row] += 1

    return pd.DataFrame(counts, index=distinct_authors, columns=distinct_authors)
    
def reduce_affs_jacc_sim(aff_auths, order, cits):
    """Collect per-author affiliation sets and pairwise similarity measures.

    Parameters
    ----------
    aff_auths : pandas.DataFrame
        Author rows; must contain the ``order`` column, ``"affs"``, ``"url"``
        and ``"id_o"``. Each ``"affs"`` value is expected to be a set of
        affiliations (set union/intersection is applied to them).
    order : str
        Name of the column identifying an author.
    cits : pandas.DataFrame
        Citation rows passed through to ``cit_matrix``.

    Returns
    -------
    dict
        ``"sims_pairs"``: one row per unordered author pair with the columns
        ``pair_1``, ``pair_2``, ``aff_jacc_sim``, ``collabs``, ``cit_cos_sim``;
        ``"aff_sets"``: dict mapping author -> union of their affiliations;
        ``"aff_flat_sets"``: the same mapping as a two-column frame;
        ``"collabs"``: the co-authorship count matrix;
        ``"cit"``: the dict returned by ``cit_matrix``.
    """
    collabs = collab_matrix(aff_auths, order)

    # Union each author's affiliations over all of their rows. The first set
    # is COPIED: storing the frame's own set object and then calling .update()
    # on it would rewrite the "affs" cell inside aff_auths -- and inside every
    # frame that shares those set objects, such as the unfiltered source frame.
    aff_sets = {}
    for author, affs in zip(aff_auths[order], aff_auths["affs"]):
        if author in aff_sets:
            aff_sets[author].update(affs)
        else:
            aff_sets[author] = set(affs)

    cit_mat = cit_matrix(aff_auths, order, cits)
    sim_matrix = cit_mat["sim_matrix"]
    sim_index = sim_matrix.index

    # Pairs are generated lazily; materialising them all first would hold
    # O(n^2) tuples in memory for no benefit.
    pair_rows = []
    for first, second in combinations(aff_sets, 2):
        # Authors with no matched citations are absent from the similarity
        # matrix; their citation similarity is recorded as 0.
        both_cited = first in sim_index and second in sim_index
        pair_rows.append({
            "pair_1": first,
            "pair_2": second,
            "aff_jacc_sim": compute_jaccard_similarity(aff_sets[first], aff_sets[second]),
            "collabs": collabs.loc[first, second],
            "cit_cos_sim": sim_matrix.loc[first, second] if both_cited else 0,
        })

    aff_flat = pd.DataFrame(
        [{"order": author, "affs": affs} for author, affs in aff_sets.items()]
    )

    return {
        "sims_pairs": pd.DataFrame(pair_rows),
        "aff_sets": aff_sets,
        "aff_flat_sets": aff_flat,
        "collabs": collabs,
        "cit": cit_mat,
    }

def data_prep(t, duration, order):
    """Compute the similarity bundle for the year window ``[t, t + duration)``.

    Reads the notebook-level frames ``proc_auths_all``, ``relevant_sub`` and
    ``j_data``, restricts each to the window, and runs
    ``reduce_affs_jacc_sim`` on the author and citation subsets.

    Parameters
    ----------
    t : int
        First year of the window (inclusive).
    duration : int
        Window length in years; ``t + duration`` is excluded.
    order : str
        Name of the author-identifier column to group by.

    Returns
    -------
    dict
        The dict from ``reduce_affs_jacc_sim`` plus ``"j_data_t"``, the
        article metadata rows that fall in the window.

    Prints ``t`` and the seconds spent in the similarity computation.
    """
    print(t)
    window_end = t + duration

    author_years = proc_auths_all['year']
    aff_t = proc_auths_all[(author_years >= t) & (author_years < window_end)].reset_index(drop=True)

    citing_years = relevant_sub["year_o"]
    cit_t = relevant_sub[(citing_years >= t) & (citing_years < window_end)].reset_index(drop=True)

    article_years = j_data["year"]
    in_window = (article_years >= t) & (article_years < window_end)
    j_t = j_data.loc[in_window, ['id', "title", "jid", "year", "content_type"]].reset_index(drop=True)

    started = time.time()
    result = reduce_affs_jacc_sim(aff_t, order, cit_t)
    result["j_data_t"] = j_t
    print(time.time() - started)
    return result

# Similarity bundles keyed by window start year, for 10- and 20-year windows.
# Results are stored one window at a time so an interrupted run keeps the
# windows that already finished.
# NOTE(review): the ranges stop before 2020 and data_prep excludes the window
# end, so rows dated 2020 fall in no window -- confirm this is intended.
jacc_sims_10 = {}
jacc_sims_20 = {}

for window_years, results in ((10, jacc_sims_10), (20, jacc_sims_20)):
    for start_year in range(1940, 2020, window_years):
        results[start_year] = data_prep(start_year, window_years, 'a1_order')


1940
(2447, 4)
(2597, 5)
4.696178197860718
1950
(3171, 4)
(3420, 5)
6.286514759063721
1960
(3677, 4)
(4442, 5)
13.083240985870361
1970
(10147, 4)
(13368, 5)
39.83624887466431
1980
(15578, 4)
(23216, 5)
44.419708013534546
1990
(15799, 4)
(27211, 5)
39.65738606452942
2000
(19478, 4)
(38275, 5)
51.52166509628296
2010
(32981, 4)
(76793, 5)
105.8645601272583
1940
(5618, 4)
(6017, 5)
16.39908790588379
1960
(13824, 4)
(17810, 5)
73.8054780960083
1980
(31377, 4)
(50427, 5)
129.78195929527283
2000
(52459, 4)
(115068, 5)
212.62368726730347


# Models