# Matching References pre-1970s
The purpose of this notebook is to clean results obtained from mturk for journal articles that are from, on average, before 1970. There are two data sources which has resulted in slight differences in the raw data input structure.

1. AWS MTURK - a service offered by AWS, output returns as a csv file
2. fMTURK - a clone of AWS MTURK specific to scholarly publishing where the output returns as a json file

Note that naming conventions for variables vary even though they are both structured data sets so combining them will require some trivial manipulation.

Expected output: 
1. json files of reference matches 
2. json and csv files of references collected via manual interfaces
3. csv file of all input data
4. reconciliation of all input files vs output files to see which pages and files have been digitized

## Initial setup

In [1]:
# libraries required, please install pandas
import pandas as pd
from unidecode import unidecode
import re
from datetime import date
import json
import numpy as np
from os import listdir
from os.path import isfile, join
# set column options
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', None)

In [2]:
# change base path to point to the results of the mturk data
# the expectation is that this was directly downloaded from the respective results interface
mturk_files_out="/Users/sijiawu/Downloads/thesis_docs/mturk_process/output_files/"
mturk_files_in="/Users/sijiawu/Downloads/thesis_docs/mturk_process/input_files/"
fmturk_files_out="/Users/sijiawu/Downloads/thesis_docs/fmturk/"

In [3]:
#remove leading and trailing non-ascii characters
def strip_leading(_str):
    k=0
    l=len(_str)
    while k!=len(_str):
        if re.search('[,*" \'.:]',_str[k]) is not None:
            k=k+1
        else:
            break
    while l>0:
        if re.search('[,*" \'.:]',_str[l-1]) is not None:
            l=l-1
        else:
            break
    return _str[k:l]

In [4]:
# load in journal metadata
JOURNALS= ['AER', 'JPE', 'ECTA', 'RES', 'QJE']
#read in all processed masterlists
j_data=pd.DataFrame()
for i in JOURNALS:
    j_data=pd.concat([pd.read_excel('/Users/sijiawu/Work/Thesis/Data/Combined/'+i+'_M_sco_du.xlsx'), j_data], ignore_index=True)
#Create a batch file

j_data=j_data[j_data.duplicated()==False].reset_index().drop('index', axis=1)

# Replace the journal names with Acronyms
j_data.loc[j_data['journal']=="Econometrica",'journal']='econometrica'
j_data.loc[j_data['journal']=='The Quarterly Journal of Economics','journal']='quarterly journal of economics'
j_data.loc[j_data['journal']=='The Review of Economic Studies','journal']='review of economic studies'
j_data.loc[j_data['journal']=='Journal of Political Economy','journal']='journal of political economy'
j_data.loc[j_data['journal']=='The American Economic Review','journal']='american economic review'

#some corrections to the issue
j_data.loc[j_data["number"]=="2023-03-04 00:00:00","number"]="3--4"
j_data.loc[j_data["number"]=="4-5","number"]="4--5"
j_data.loc[j_data["number"]=="1-2","number"]="1--2"

j_data.journal.unique()

j_data["id"]=j_data["URL"].str.split("/").str[-1]
j_data["title_proc"]=j_data["title"].fillna("none").astype(str).str.lower()
j_data["title_proc"]=j_data["title_proc"].apply(strip_leading,1)

  j_data=pd.concat([pd.read_excel('/Users/sijiawu/Work/Thesis/Data/Combined/'+i+'_M_sco_du.xlsx'), j_data], ignore_index=True)


## Matching Algorithm

In [5]:
fullset=pd.read_pickle("pre_1970s")

In [6]:
fullset.type.value_counts()
interest=["american economic review","econometrica", "journal of political economy", "quarterly journal of economics", "review of economic studies"]

In [7]:
from difflib import SequenceMatcher

# Utility function to compute similarity
# Utility function to compute similarity
def similar(str1, str2):
    return SequenceMatcher(None, str1, str2).ratio()

def construct(j_data, journal,year,title, year_latest):
    # print(year)
    # print(journal)
    if journal in interest:
        temp=j_data[(j_data["journal"]==journal)&(j_data["year"]<=(year+2))&(j_data["year"]>=(year_latest-10))]["title_proc"].apply(lambda y: similar(y, title))
        # print(temp)
        if len(temp)>0:
            # print(list(temp))
            # print(max(list(temp)))
            o=temp[temp>=(max(temp)-0.15)]
            return {"index": list(o.index), "m_val":list(o)}
        else:
            return {'index':[],"m_val":[]}
    else:
        return None

In [8]:
fullset_jstor=fullset[fullset["type"]==8].reset_index(drop=True)
fullset_article=fullset[fullset["type"]==2].reset_index(drop=True)
fullset_others=fullset[(fullset["type"]!=2)&(fullset["type"]!=8)].reset_index(drop=True)

In [9]:
fullset_article["outcome"]=fullset_article.apply(lambda x:construct(j_data, x['journal_proc'],int(x['year_o']),x['title_proc'], x["year_latest"]), axis=1)


In [10]:
def construct_j(df, jstor_id):
    try:
        t=int(jstor_id)
        # print(list(j_data.loc[j_data["id"]==str(t), 'URL'])[0])
        s=list(df.loc[j_data["id"]==str(t), 'URL'])
        return s[0]
    except:
        return "CHECKREQ"
    


In [11]:
fullset_jstor['match_url']=fullset_jstor.apply(lambda x: construct_j(j_data, x["jstor"]), axis=1)

In [15]:
fullset_article[fullset_article['journal'].isin(interest)]["year_latest"].unique()

array([0.000e+00, 1.939e+03, 1.931e+03, 1.943e+03, 1.944e+03, 1.942e+03,
       1.945e+03, 1.941e+03, 1.954e+03, 1.946e+03, 1.950e+03, 1.959e+03,
       1.940e+03, 1.948e+03, 1.937e+03, 1.951e+03, 1.953e+03, 1.935e+03,
       1.933e+03, 1.936e+03, 1.929e+03, 1.930e+03, 1.952e+03, 1.938e+03,
       1.949e+03, 1.912e+03, 1.957e+03, 1.961e+03, 1.962e+03, 1.960e+03,
       1.958e+03, 1.956e+03, 1.963e+03, 1.965e+03, 1.964e+03, 1.947e+03,
       1.932e+03, 1.917e+03, 1.000e+00, 1.955e+03, 1.934e+03, 1.800e+03,
       1.919e+03, 1.918e+03, 1.925e+03, 1.928e+03, 1.914e+03, 1.926e+03,
       1.916e+03, 1.906e+03, 1.894e+03,       nan])

In [16]:
fullset_article.columns

Index(['id', 'tasknum', 'id_o', 'page_o', 'year_o', 'journal_o', 'authors_o',
       'title_o', 'volume_o', 'issue_o', 'completer', 'pdf_url', 'type',
       'author', 'title', 'journal', 'year', 'volume', 'issue', 'pages',
       'chapter_title', 'location', 'publisher', 'text_full', 'jstor',
       'journal_proc', 'year_proc', 'year_proc_split', 'year_latest',
       'volume_proc', 'issue_proc', 'title_proc', 'outcome'],
      dtype='object')

In [17]:
j_data.columns

Index(['issue_url', 'ISSN', 'URL', 'journal', 'number', 'publisher', 'title',
       'urldate', 'volume', 'year', 'abstract', 'author', 'pages',
       'reviewed-author', 'uploaded', 'content_type', 'author_split',
       'title_10', 'type', 'authorsSCO', 'titleSCO', 'journalSCO', 'DOI',
       'affiliations', 'abstractSCO', 'citations', 'document type',
       'index keywords', 'author keywords', 'document_type', 'id',
       'title_proc'],
      dtype='object')

In [18]:
fullset_article['match_url']=None
found=[]
confirmed={
    1:[0,0],
    2:[0,0],
    3:[0,0],
    4:[0,0],
    5:[0,0],
}
vary=0
vary2=0
bad=0
re=0
for i in fullset_article.index:
    if fullset_article.loc[i,"journal_proc"] in interest:
        count=len(fullset_article.loc[i,"outcome"]['index'])
        found.append(count)
        # print(fullset_article.loc[i,'author'] + " ### "+fullset_article.loc[i,'title_proc'] + " ## " + str(fullset_article.loc[i,'year_proc'] ))
        test=pd.concat([j_data.loc[fullset_article.loc[i,"outcome"]['index'], ["title",'author','URL','year']].reset_index(drop=True),pd.DataFrame(fullset_article.loc[i,"outcome"])], axis=1)
        if count==1:
            # print(test[['author','title','year']])
            a=similar(str(list(test['author'])[0]), str(fullset_article.loc[i,"author"]))
            b=similar(str(list(test['author'])[0]).split(' ')[-1], str(fullset_article.loc[i,"author"]).split(' ')[-1])
            if (a>0.8) or (b>0.8):
                confirmed[1][0]+=1
                fullset_article.loc[i,'match_url']=list(test['URL'])[0]
            else:
                fullset_article.loc[i,'match_url']="CHECKREQ"

        elif count>1:
            
            if sum(test["m_val"]==1)==1:
                # print(test[test["m_val"]==1][['author','title','year']])
                # print(similar(str(list(test[test["m_val"]==1]['author'])[0]), str(fullset_article.loc[i,"author"])))
                a=similar(str(list(test[test["m_val"]==1]['author'])[0]), str(fullset_article.loc[i,"author"]))
                b=similar(str(list(test[test["m_val"]==1]['author'])[0]).split(' ')[-1], str(fullset_article.loc[i,"author"]).split(' ')[-1])
                if (a>0.8) or (b>0.8):
                    # confirmed[count][0]+=1
                    fullset_article.loc[i,'match_url']=list(test['URL'])[0]
                else:
                    fullset_article.loc[i,'match_url']="CHECKREQ"
            elif sum(test["m_val"]>0.65)>0:
                if sum(test["year"]==fullset_article.loc[i,"year_proc"])==1:
                    # print("year match")
                    a=similar(str(list(test[test["year"]==fullset_article.loc[i,"year_proc"]]['author'])[0]), str(fullset_article.loc[i,"author"]))
                    b=similar(str(list(test[test["year"]==fullset_article.loc[i,"year_proc"]]['author'])[0]).split(' ')[-1], str(fullset_article.loc[i,"author"]).split(' ')[-1])
                    if (a>0.8) or (b>0.8):
                        fullset_article.loc[i,'match_url']=list(test[test["year"]==fullset_article.loc[i,"year_proc"]]["URL"])[0]
                        vary2+=1
                    else:
                        fullset_article.loc[i,'match_url']="CHECKREQ"
                vary+=1
                # print("test for case of multiple author, multiple legs")
                # print(fullset_article.loc[i,"outcome"]['index'])
                # print(fullset_article.loc[i,"title_proc"]+ " "+fullset_article.loc[i,"author"]+" ##### "+str(fullset_article.loc[i,"year_proc"]))
                # print(test)
            else:
                # print("unlikely match")
                fullset_article.loc[i,'match_url']="CHECKREQ"
                bad+=1
        else:
            print("no_match")
            fullset_article.loc[i,'match_url']="CHECKREQ"
            bad+=1
    else:
        fullset_article.loc[i,"outcome"]=None

no_match
no_match
no_match
no_match
no_match
no_match
no_match
no_match
no_match
no_match
no_match
no_match
no_match
no_match
no_match
no_match
no_match
no_match
no_match
no_match
no_match
no_match
no_match
no_match
no_match
no_match
no_match
no_match
no_match
no_match
no_match
no_match
no_match
no_match
no_match
no_match
no_match
no_match
no_match
no_match
no_match
no_match
no_match
no_match
no_match
no_match
no_match
no_match
no_match
no_match
no_match
no_match
no_match
no_match
no_match
no_match
no_match
no_match
no_match
no_match
no_match
no_match
no_match
no_match
no_match
no_match
no_match
no_match
no_match
no_match
no_match
no_match
no_match
no_match
no_match
no_match
no_match
no_match
no_match
no_match
no_match
no_match
no_match
no_match
no_match
no_match
no_match
no_match
no_match
no_match
no_match
no_match
no_match
no_match
no_match
no_match
no_match
no_match
no_match
no_match
no_match
no_match
no_match
no_match
no_match
no_match
no_match
no_match
no_match
no_match
no_match
n

In [19]:
fullset_article[fullset_article["journal_proc"].isin(interest)].shape

(5211, 34)

In [20]:
def match_id_allocate(x):
    if pd.isna(x):
        return None
    elif x=="CHECKREQ":
        return None
    else:
        return x.split('/')[-1]

In [22]:
all_70=pd.concat([fullset_jstor, fullset_article, fullset_others], axis=0).reset_index(drop=True)
all_70['match_id']=all_70.apply(lambda x: match_id_allocate(x['match_url']), axis=1)
all_70['f_key']='mturk_'+all_70.index.astype(str)


In [23]:
fullset_jstor_match=all_70[(all_70['type']==8)&(all_70["match_url"]!="CHECKREQ")].reset_index(drop=True)
fullset_article_match=all_70[(all_70['type']==2)&(all_70["match_url"]!="CHECKREQ")&(all_70["journal_proc"].isin(interest))].reset_index(drop=True)



In [26]:
all_70.columns 

Index(['id', 'tasknum', 'id_o', 'page_o', 'year_o', 'journal_o', 'authors_o',
       'title_o', 'volume_o', 'issue_o', 'completer', 'pdf_url', 'type',
       'author', 'title', 'journal', 'year', 'volume', 'issue', 'pages',
       'chapter_title', 'location', 'publisher', 'text_full', 'jstor',
       'journal_proc', 'year_proc', 'year_proc_split', 'year_latest',
       'volume_proc', 'issue_proc', 'title_proc', 'match_url', 'outcome',
       'match_id', 'f_key'],
      dtype='object')

In [28]:
full_article_sub=all_70[(all_70['id_o']!=all_70['match_id'])&(all_70['match_id'].isna()==False)].reset_index(drop=True).drop_duplicates(subset = ['id_o', 'match_id'], keep='first').reset_index(drop=True)[['id_o', 'match_id','f_key']]

In [29]:
full_article_sub.to_excel('network_cit_pre.xlsx', index=False)

In [30]:
all_70.to_excel("refs_pre_1970.xlsx", index=False)

In [35]:
summary={
    0: {'type': "jstor_trivial",'total': fullset_jstor.shape[0], "matched":fullset_jstor_match.shape[0],"error" : fullset_jstor[fullset_jstor['match_url']=="CHECKREQ"].shape[0]},
    1: {'type':'jstor_article','total': fullset_article[fullset_article["journal_proc"].isin(interest)].shape[0], 'matched': fullset_article_match.shape[0], "error": fullset_article[fullset_article["match_url"]=="CHECKREQ"].shape[0]},
    2: {'type':'other_articles','total': fullset_article.shape[0]-fullset_article[fullset_article["journal_proc"].isin(interest)].shape[0], 'matched':0, 'error':0},
    3: {'type': 'other_refs', 'total': fullset_others.shape[0], 'matched':0, 'error':0}
}

summary_df=pd.DataFrame(summary).transpose()
print(summary_df)
print(sum(summary_df['total']))
print(sum(summary_df['matched']))
print(sum(summary_df['error']))

             type  total matched error
0   jstor_trivial   3563    3555     8
1   jstor_article   5211    4620   591
2  other_articles   8599       0     0
3      other_refs  36107       0     0
53480
8175
599
