# Cleaning References pre-1970s
The purpose of this notebook is to clean results obtained from mturk for journal articles that are from, on average, before 1970. 

## Initial setup

In [37]:
# libraries required, please install pandas
import pandas as pd
from unidecode import unidecode
import re
from datetime import date
import json
import numpy as np
# set column options
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', None)

In [38]:
# change base path to point to the results of the mturk data
# the expectation is that this was directly downloaded from the mturk results interface
base_path2="/Users/sijiawu/Downloads/mturk_process/"



In [48]:
# reading in the mturk processed data, first round
file_names=[
"progress_ecta_501_results.csv",
"progress_ecta_1001_results.csv",
"progress_ecta_1501_results.csv",
"progress_ecta_2001_results.csv",
"progress_ecta_2501_results.csv",
"progress_ecta_2651_results.csv",
"progress_res_501_results.csv",
"progress_res_1001_results.csv",
"progress_res_1501_results.csv",
"progress_res_2001_results.csv",
"progress_res_2501_results.csv",
"progress_jpe_501_results.csv",
"progress_jpe_1001_results.csv",
"progress_jpe_1501_results.csv",
"progress_jpe_2001_results.csv",
"progress_jpe_2501_results.csv",
"progress_jpe_3001_results.csv",
"progress_jpe_3501_results.csv",
"progress_jpe_4001_results.csv",
"progress_jpe_4501_results.csv",
"progress_jpe_5001_results.csv",
"progress_jpe_5501_results.csv",
"progress_jpe_6001_results.csv",
"progress_jpe_6501_results.csv",
"progress_jpe_7001_results.csv",
"progress_jpe_7501_results.csv",
"progress_jpe_8001_results.csv",
"progress_jpe_8101_results.csv",
"progress_jpe_8201_results.csv",
"progress_jpe_8301_results.csv",
"progress_jpe_8401_results.csv",
"progress_jpe_8501_results.csv",
"progress_jpe_8601_results.csv",
"progress_jpe_8701_results.csv",
"progress_jpe_8801_results.csv",
"progress_jpe_8901_results.csv",
"progress_jpe_9001_results.csv",
"progress_jpe_9501_results.csv",
"progress_jpe_10001_results.csv",
"progress_jpe_10168_results.csv",
"progress_qje_wo_nones_501_results.csv",
"progress_qje_wo_nones_1001_results.csv",

    "progress_qje_wo_nones_1501_results.csv",
    "progress_qje_wo_nones_2001_results.csv",
    "progress_qje_wo_nones_5501_results.csv",
    
    "progress_qje_wo_nones_2501_results.csv",
    
    "progress_qje_wo_nones_4001_results.csv",
    "progress_qje_wo_nones_4501_results.csv",
    "progress_qje_wo_nones_5001_results.csv",
    
    "progress_qje_wo_nones_keep_487_r_results.csv",
    "progress_qje_wo_nones_keep_501_r_results_partial.csv",
    "progress_qje_wo_nones_keep_719_r_results.csv"
    
]

#in reference to naming conventions for different reference categories in mturk
ref_type={"none":[1, "None"], 
          "reference":[5,"Reference list found"], 
          "article":[2, "Article"], 
          "book":[3, "Book"], 
          "other": [4, "Other"], 
          "news":[6, "News"], 
          "laws_hearings": [7, "Laws and Hearings"], 
         }

In [49]:
# some column renaming. I changed naming conventions in the original mturk file to make it consistent 
# resulting in later files having a different naming convention
dict = {'Answer.ref0.1':'Answer.0.1',
'Answer.ref0.2':'Answer.0.2',
'Answer.ref0.3':'Answer.0.3',
'Answer.ref0.4':'Answer.0.4',
'Answer.ref0.5':'Answer.0.5',
'Answer.ref0.6':'Answer.0.6',
'Answer.ref0.7':'Answer.0.7',
'Answer.ref1.1':'Answer.1.1',
'Answer.ref1.2':'Answer.1.2',
'Answer.ref1.3':'Answer.1.3',
'Answer.ref1.4':'Answer.1.4',
'Answer.ref1.5':'Answer.1.5',
'Answer.ref1.6':'Answer.1.6',
'Answer.ref1.7':'Answer.1.7',
'Answer.ref2.1':'Answer.2.1',
'Answer.ref2.2':'Answer.2.2',
'Answer.ref2.3':'Answer.2.3',
'Answer.ref2.4':'Answer.2.4',
'Answer.ref2.5':'Answer.2.5',
'Answer.ref2.6':'Answer.2.6',
'Answer.ref2.7':'Answer.2.7',
'Answer.ref3.1':'Answer.3.1',
'Answer.ref3.2':'Answer.3.2',
'Answer.ref3.3':'Answer.3.3',
'Answer.ref3.4':'Answer.3.4',
'Answer.ref3.5':'Answer.3.5',
'Answer.ref3.6':'Answer.3.6',
'Answer.ref3.7':'Answer.3.7',
'Answer.ref4.1':'Answer.4.1',
'Answer.ref4.2':'Answer.4.2',
'Answer.ref4.3':'Answer.4.3',
'Answer.ref4.4':'Answer.4.4',
'Answer.ref4.5':'Answer.4.5',
'Answer.ref4.6':'Answer.4.6',
'Answer.ref4.7':'Answer.4.7',
'Answer.ref5.1':'Answer.5.1',
'Answer.ref5.2':'Answer.5.2',
'Answer.ref5.3':'Answer.5.3',
'Answer.ref5.4':'Answer.5.4',
'Answer.ref5.5':'Answer.5.5',
'Answer.ref5.6':'Answer.5.6',
'Answer.ref5.7':'Answer.5.7',
'Answer.ref6.1':'Answer.6.1',
'Answer.ref6.2':'Answer.6.2',
'Answer.ref6.3':'Answer.6.3',
'Answer.ref6.4':'Answer.6.4',
'Answer.ref6.5':'Answer.6.5',
'Answer.ref6.6':'Answer.6.6',
'Answer.ref6.7':'Answer.6.7',
'Answer.ref7.1':'Answer.7.1',
'Answer.ref7.2':'Answer.7.2',
'Answer.ref7.3':'Answer.7.3',
'Answer.ref7.4':'Answer.7.4',
'Answer.ref7.5':'Answer.7.5',
'Answer.ref7.6':'Answer.7.6',
'Answer.ref7.7':'Answer.7.7',
'Answer.ref8.1':'Answer.8.1',
'Answer.ref8.2':'Answer.8.2',
'Answer.ref8.3':'Answer.8.3',
'Answer.ref8.4':'Answer.8.4',
'Answer.ref8.5':'Answer.8.5',
'Answer.ref8.6':'Answer.8.6',
'Answer.ref8.7':'Answer.8.7',
'Answer.ref9.1':'Answer.9.1',
'Answer.ref9.2':'Answer.9.2',
'Answer.ref9.3':'Answer.9.3',
'Answer.ref9.4':'Answer.9.4',
'Answer.ref9.5':'Answer.9.5',
'Answer.ref9.6':'Answer.9.6',
'Answer.ref9.7':'Answer.9.7',
'Answer.ref10.1':'Answer.10.1',
'Answer.ref10.2':'Answer.10.2',
'Answer.ref10.3':'Answer.10.3',
'Answer.ref10.4':'Answer.10.4',
'Answer.ref10.5':'Answer.10.5',
'Answer.ref10.6':'Answer.10.6',
'Answer.ref10.7':'Answer.10.7',
'Answer.ref11.1':'Answer.11.1',
'Answer.ref11.2':'Answer.11.2',
'Answer.ref11.3':'Answer.11.3',
'Answer.ref11.4':'Answer.11.4',
'Answer.ref11.5':'Answer.11.5',
'Answer.ref11.6':'Answer.11.6',
'Answer.ref11.7':'Answer.11.7',
'Answer.ref12.1':'Answer.12.1',
'Answer.ref12.2':'Answer.12.2',
'Answer.ref12.3':'Answer.12.3',
'Answer.ref12.4':'Answer.12.4',
'Answer.ref12.5':'Answer.12.5',
'Answer.ref12.6':'Answer.12.6',
'Answer.ref12.7':'Answer.12.7',
'Answer.ref13.1':'Answer.13.1',
'Answer.ref13.2':'Answer.13.2',
'Answer.ref13.3':'Answer.13.3',
'Answer.ref13.4':'Answer.13.4',
'Answer.ref13.5':'Answer.13.5',
'Answer.ref13.6':'Answer.13.6',
'Answer.ref13.7':'Answer.13.7',
'Answer.ref14.1':'Answer.14.1',
'Answer.ref14.2':'Answer.14.2',
'Answer.ref14.3':'Answer.14.3',
'Answer.ref14.4':'Answer.14.4',
'Answer.ref14.5':'Answer.14.5',
'Answer.ref14.6':'Answer.14.6',
'Answer.ref14.7':'Answer.14.7',
'Answer.ref15.1':'Answer.15.1',
'Answer.ref15.2':'Answer.15.2',
'Answer.ref15.3':'Answer.15.3',
'Answer.ref15.4':'Answer.15.4',
'Answer.ref15.5':'Answer.15.5',
'Answer.ref15.6':'Answer.15.6',
'Answer.ref15.7':'Answer.15.7',
'Answer.ref16.1':'Answer.16.1',
'Answer.ref16.2':'Answer.16.2',
'Answer.ref16.3':'Answer.16.3',
'Answer.ref16.4':'Answer.16.4',
'Answer.ref16.5':'Answer.16.5',
'Answer.ref16.6':'Answer.16.6',
'Answer.ref16.7':'Answer.16.7',
'Answer.ref17.1':'Answer.17.1',
'Answer.ref17.2':'Answer.17.2',
'Answer.ref17.3':'Answer.17.3',
'Answer.ref17.4':'Answer.17.4',
'Answer.ref17.5':'Answer.17.5',
'Answer.ref17.6':'Answer.17.6',
'Answer.ref17.7':'Answer.17.7',
'Answer.ref18.1':'Answer.18.1',
'Answer.ref18.2':'Answer.18.2',
'Answer.ref18.3':'Answer.18.3',
'Answer.ref18.4':'Answer.18.4',
'Answer.ref18.5':'Answer.18.5',
'Answer.ref18.6':'Answer.18.6',
'Answer.ref18.7':'Answer.18.7',
'Answer.ref19.1':'Answer.19.1',
'Answer.ref19.2':'Answer.19.2',
'Answer.ref19.3':'Answer.19.3',
'Answer.ref19.4':'Answer.19.4',
'Answer.ref19.5':'Answer.19.5',
'Answer.ref19.6':'Answer.19.6',
'Answer.ref19.7':'Answer.19.7',
'Answer.ref20.1':'Answer.20.1',
'Answer.ref20.2':'Answer.20.2',
'Answer.ref20.3':'Answer.20.3',
'Answer.ref20.4':'Answer.20.4',
'Answer.ref20.5':'Answer.20.5',
'Answer.ref20.6':'Answer.20.6',
'Answer.ref20.7':'Answer.20.7',
'Answer.ref21.1':'Answer.21.1',
'Answer.ref21.2':'Answer.21.2',
'Answer.ref21.3':'Answer.21.3',
'Answer.ref21.4':'Answer.21.4',
'Answer.ref21.5':'Answer.21.5',
'Answer.ref21.6':'Answer.21.6',
'Answer.ref21.7':'Answer.21.7',
'Answer.ref22.1':'Answer.22.1',
'Answer.ref22.2':'Answer.22.2',
'Answer.ref22.3':'Answer.22.3',
'Answer.ref22.4':'Answer.22.4',
'Answer.ref22.5':'Answer.22.5',
'Answer.ref22.6':'Answer.22.6',
'Answer.ref22.7':'Answer.22.7',
'Answer.ref23.1':'Answer.23.1',
'Answer.ref23.2':'Answer.23.2',
'Answer.ref23.3':'Answer.23.3',
'Answer.ref23.4':'Answer.23.4',
'Answer.ref23.5':'Answer.23.5',
'Answer.ref23.6':'Answer.23.6',
'Answer.ref23.7':'Answer.23.7',
'Answer.ref24.1':'Answer.24.1',
'Answer.ref24.2':'Answer.24.2',
'Answer.ref24.3':'Answer.24.3',
'Answer.ref24.4':'Answer.24.4',
'Answer.ref24.5':'Answer.24.5',
'Answer.ref24.6':'Answer.24.6',
'Answer.ref24.7':'Answer.24.7',
'Answer.ref25.1':'Answer.25.1',
'Answer.ref25.2':'Answer.25.2',
'Answer.ref25.3':'Answer.25.3',
'Answer.ref25.4':'Answer.25.4',
'Answer.ref25.5':'Answer.25.5',
'Answer.ref25.6':'Answer.25.6',
'Answer.ref25.7':'Answer.25.7',
'Answer.ref26.1':'Answer.26.1',
'Answer.ref26.2':'Answer.26.2',
'Answer.ref26.3':'Answer.26.3',
'Answer.ref26.4':'Answer.26.4',
'Answer.ref26.5':'Answer.26.5',
'Answer.ref26.6':'Answer.26.6',
'Answer.ref26.7':'Answer.26.7',
'Answer.ref27.1':'Answer.27.1',
'Answer.ref27.2':'Answer.27.2',
'Answer.ref27.3':'Answer.27.3',
'Answer.ref27.4':'Answer.27.4',
'Answer.ref27.5':'Answer.27.5',
'Answer.ref27.6':'Answer.27.6',
'Answer.ref27.7':'Answer.27.7',
'Answer.ref28.1':'Answer.28.1',
'Answer.ref28.2':'Answer.28.2',
'Answer.ref28.3':'Answer.28.3',
'Answer.ref28.4':'Answer.28.4',
'Answer.ref28.5':'Answer.28.5',
'Answer.ref28.6':'Answer.28.6',
'Answer.ref28.7':'Answer.28.7',
'Answer.ref29.1':'Answer.29.1',
'Answer.ref29.2':'Answer.29.2',
'Answer.ref29.3':'Answer.29.3',
'Answer.ref29.4':'Answer.29.4',
'Answer.ref29.5':'Answer.29.5',
'Answer.ref29.6':'Answer.29.6',
'Answer.ref29.7':'Answer.29.7',
'Answer.ref0.8':'Answer.0.8',
'Answer.ref1.8': 'Answer.1.8',
'Answer.ref10.8':'Answer.10.8',
'Answer.ref11.8':'Answer.11.8',
'Answer.ref12.8':'Answer.12.8',
'Answer.ref13.8':'Answer.13.8',
'Answer.ref14.8':'Answer.14.8',
'Answer.ref15.8':'Answer.15.8',
'Answer.ref16.8':'Answer.16.8',
'Answer.ref17.8':'Answer.17.8',
'Answer.ref18.8':'Answer.18.8',
'Answer.ref19.8':'Answer.19.8',
'Answer.ref2.8':'Answer.2.8',
'Answer.ref20.8':'Answer.20.8',
'Answer.ref21.8':'Answer.21.8',
'Answer.ref22.8':'Answer.22.8',
'Answer.ref23.8':'Answer.23.8',
'Answer.ref24.8':'Answer.24.8',
'Answer.ref25.8':'Answer.25.8',
'Answer.ref26.8':'Answer.26.8',
'Answer.ref27.8':'Answer.27.8',
'Answer.ref28.8':'Answer.28.8',
'Answer.ref3.8':'Answer.3.8',
'Answer.ref4.8':'Answer.4.8',
'Answer.ref5.8':'Answer.5.8',
'Answer.ref6.8':'Answer.6.8',
'Answer.ref7.8':'Answer.7.8',
'Answer.ref8.8':'Answer.8.8',
'Answer.ref9.8':'Answer.9.8',

       }

In [50]:
#read in each mturk file and rename the columns
All=[]
for i in file_names:
    All.append(pd.read_csv(base_path2+i).rename(columns=dict))
All=pd.concat(All, ignore_index=True)
All.sort_index(axis=1, inplace=True)
All.shape


(19707, 574)

In [51]:
list(All.columns)

['AcceptTime',
 'Answer.0.1',
 'Answer.0.2',
 'Answer.0.3',
 'Answer.0.4',
 'Answer.0.5',
 'Answer.0.6',
 'Answer.0.7',
 'Answer.0.8',
 'Answer.1.1',
 'Answer.1.2',
 'Answer.1.3',
 'Answer.1.4',
 'Answer.1.5',
 'Answer.1.6',
 'Answer.1.7',
 'Answer.1.8',
 'Answer.10.1',
 'Answer.10.2',
 'Answer.10.3',
 'Answer.10.4',
 'Answer.10.5',
 'Answer.10.6',
 'Answer.10.7',
 'Answer.10.8',
 'Answer.11.1',
 'Answer.11.2',
 'Answer.11.3',
 'Answer.11.4',
 'Answer.11.5',
 'Answer.11.6',
 'Answer.11.7',
 'Answer.11.8',
 'Answer.12.1',
 'Answer.12.2',
 'Answer.12.3',
 'Answer.12.4',
 'Answer.12.5',
 'Answer.12.6',
 'Answer.12.7',
 'Answer.12.8',
 'Answer.13.1',
 'Answer.13.2',
 'Answer.13.3',
 'Answer.13.4',
 'Answer.13.5',
 'Answer.13.6',
 'Answer.13.7',
 'Answer.13.8',
 'Answer.14.1',
 'Answer.14.2',
 'Answer.14.3',
 'Answer.14.4',
 'Answer.14.5',
 'Answer.14.6',
 'Answer.14.7',
 'Answer.14.8',
 'Answer.15.1',
 'Answer.15.2',
 'Answer.15.3',
 'Answer.15.4',
 'Answer.15.5',
 'Answer.15.6',
 'Answer.

In [18]:
#check for duplicates by checking that dataframe size is the same after dropping duplicates
check=All.drop_duplicates()
check.shape

(19707, 574)

In [19]:
sum(check["RejectionTime"].isna()==False)


120

In [9]:
input_files=[
    "progress_ecta_501.csv",
    "progress_ecta_1001.csv",
    "progress_ecta_1501.csv",
    "progress_ecta_2001.csv",
    "progress_ecta_2501.csv",
    "progress_ecta_2651.csv",
    "progress_res_501.csv",
    "progress_res_1001.csv",
    "progress_res_1501.csv",
    "progress_res_2001.csv",
    "progress_res_2501.csv",
    "progress_jpe_501.csv",
    "progress_jpe_1001.csv",
    "progress_jpe_1501.csv",
    "progress_jpe_2001.csv",
    "progress_jpe_2501.csv",
    "progress_jpe_3001.csv",
    "progress_jpe_3501.csv",
    "progress_jpe_4001.csv",
    "progress_jpe_4501.csv",
    "progress_jpe_5001.csv",
    "progress_jpe_5501.csv",
    "progress_jpe_6001.csv",
    "progress_jpe_6501.csv",
    "progress_jpe_7001.csv",
    "progress_jpe_7501.csv",
    "progress_jpe_8001.csv",
    "progress_jpe_8101.csv",
    "progress_jpe_8201.csv",
    "progress_jpe_8301.csv",
    "progress_jpe_8401.csv",
    "progress_jpe_8501.csv",
    "progress_jpe_8601.csv",
    "progress_jpe_8701.csv",
    "progress_jpe_8801.csv",
    "progress_jpe_8901.csv",
    "progress_jpe_9001.csv",
    "progress_jpe_9501.csv",
    "progress_jpe_10001.csv",
    "progress_jpe_10168.csv",
    "progress_qje_wo_nones_501.csv",
    "progress_qje_wo_nones_1001.csv",
    "progress_qje_wo_nones_1501.csv",
    "progress_qje_wo_nones_2001.csv",
    "progress_qje_wo_nones_2501.csv",
    "progress_qje_wo_nones_4001.csv",
    "progress_qje_wo_nones_4501.csv",
    "progress_qje_wo_nones_5001.csv",
    "progress_qje_wo_nones_5501.csv",
    
    "progress_qje_wo_nones_3001.csv",
    "progress_qje_wo_nones_3501.csv",
    
    "progress_qje_wo_nones_6001.csv",
    "progress_qje_wo_nones_6501.csv",
    "progress_qje_wo_nones_7001.csv",
    "progress_qje_wo_nones_7501.csv",
    "progress_qje_wo_nones_8001.csv",
    "progress_qje_wo_nones_8501.csv",
    "progress_qje_wo_nones_8759.csv",
    "progress_qje_wo_nones_keep_501.csv",
    "progress_qje_wo_nones_keep_719.csv",
    
    "progress_aer_wo_nones_all.csv"
]

input_files=[
#     "progress_qje_wo_nones_501.csv",
#     "progress_qje_wo_nones_1001.csv",
#     "progress_qje_wo_nones_1501.csv",
#     "progress_qje_wo_nones_2001.csv",
#     "progress_qje_wo_nones_2501.csv",
#     "progress_qje_wo_nones_4001.csv",
#     "progress_qje_wo_nones_4501.csv",
#     "progress_qje_wo_nones_5001.csv",
#     "progress_qje_wo_nones_5501.csv",
    
#     "progress_qje_wo_nones_3001.csv",
#     "progress_qje_wo_nones_3501.csv",
    
    "progress_qje_wo_nones_6001.csv",
#     "progress_qje_wo_nones_6501.csv",
#     "progress_qje_wo_nones_7001.csv",
#     "progress_qje_wo_nones_7501.csv",
#     "progress_qje_wo_nones_8001.csv",
#     "progress_qje_wo_nones_8501.csv",
#     "progress_qje_wo_nones_8759.csv",
#     "progress_qje_wo_nones_keep_501.csv",
#     "progress_qje_wo_nones_keep_719.csv",
]

Allin=[]
for i in input_files:
    Allin.append(pd.read_csv(base_path2+'/input_files/'+i))
Allin=pd.concat(Allin, ignore_index=True)
Allin.shape[0]-All.shape[0]


-19207

In [52]:
remaining=[]
p_url=list(All["Input.pdf_url"])
for i in Allin.index:
    if Allin.loc[i,"pdf_url"] not in p_url:
        remaining.append(Allin.iloc[i])

In [11]:
tmp=pd.DataFrame(remaining).reset_index(drop=True)


In [153]:
import csv
tmp.to_csv("qje_wo_none_remnant.csv" ,encoding="UTF-8", quotechar='"', quoting=csv.QUOTE_NONNUMERIC, index= None)


In [154]:
tmp.year.unique()

array([1970, 1941, 1945, 1947, 1948, 1949, 1950, 1951, 1952, 1953, 1954,
       1955, 1956, 1957, 1958, 1959, 1960, 1961, 1962, 1964, 1965, 1966,
       1967, 1968, 1969, 1971, 1972, 1973, 1974])

In [202]:
filt=pd.read_csv("qje_wo_none_tofilter1970_2010.csv")

In [203]:
filt=filt.fillna(0)

In [204]:
filt["id"]=filt["pdf_url"].str.split('/').str[-1].str.split("_").str[0]

In [205]:
filt["id"].head()

0    1883008
1    1883008
2    1883008
3    1883008
4    1883008
Name: id, dtype: object

In [206]:
filt=filt.join(j_data[["author", "number", "volume", "title", "id"]].set_index("id"),on="id")

In [157]:
remaining=[]
p_url=list(Allin["pdf_url"])
for i in filt.index:
    if filt.loc[i,"pdf_url"] not in p_url:
        remaining.append(filt.iloc[i])

In [34]:
tmp=pd.DataFrame(remaining).reset_index(drop=True)

In [35]:
tmp.Answer.value_counts()

AttributeError: 'DataFrame' object has no attribute 'Answer'

In [207]:
filt[(filt.Answer==0)&(filt.year>=1972)].to_csv("qje_wo_none_remnant_72_b.csv" ,encoding="UTF-8", quotechar='"', quoting=csv.QUOTE_NONNUMERIC, index= None)

In [208]:
filt[(filt.Answer==0)&(filt.year<1972)].to_csv("qje_wo_none_remnant_72_a.csv" ,encoding="UTF-8", quotechar='"', quoting=csv.QUOTE_NONNUMERIC, index= None)

In [209]:
filt[(filt.Answer!=0)&(filt.year<1972)&(filt.Answer<4)].to_csv("qje_wo_none_remnant_72_c.csv" ,encoding="UTF-8", quotechar='"', quoting=csv.QUOTE_NONNUMERIC, index= None)

In [210]:
filt[(filt.Answer!=0)&(filt.year>=1972)&(filt.Answer<4)].to_csv("qje_wo_none_remnant_72_d.csv" ,encoding="UTF-8", quotechar='"', quoting=csv.QUOTE_NONNUMERIC, index= None)

In [95]:
Allin.year.unique()

array([1947, 1948, 1949, 1954, 1955, 1956, 1957, 1958, 1959, 1960, 1961,
       1962])

In [94]:
Allin[(Allin["journal"]=='AER')&(Allin["year"]<1950)].shape

(0, 9)

In [42]:
ref_type={"none":[1, "None"], 
          "reference":[5,"Reference list found"], 
          "article":[2, "Article"], 
          "book":[3, "Book"], 
          "other": [4, "Other"], 
          "news":[6, "News"], 
          "laws_hearings": [7, "Laws and Hearings"], 
          "jstor":[8, "Top 5 econ article"]
         }

In [43]:
JOURNALS= ['AER', 'JPE', 'ECTA', 'RES', 'QJE']
#read in all processed masterlists
j_data=pd.DataFrame()
for i in JOURNALS:
    j_data=pd.concat([pd.read_excel('/Users/sijiawu/Work/Refs Danae/Thesis/Data/Combined/'+i+'_M_sco_du.xlsx'), j_data], ignore_index=True)
#Create a batch file

j_data=j_data[j_data.duplicated()==False].reset_index().drop('index', axis=1)

FileNotFoundError: [Errno 2] No such file or directory: '/Users/sijiawu/Work/Refs Danae/Thesis/Data/Combined/AER_M_sco_du.xlsx'

In [170]:
# Replace the journal names with Acronyms
j_data.loc[j_data['journal']=="Econometrica",'journal']='econometrica'
j_data.loc[j_data['journal']=='The Quarterly Journal of Economics','journal']='quarterly journal of economics'
j_data.loc[j_data['journal']=='The Review of Economic Studies','journal']='review of economic studies'
j_data.loc[j_data['journal']=='Journal of Political Economy','journal']='journal of political economy'
j_data.loc[j_data['journal']=='The American Economic Review','journal']='american economic review'

#some corrections to the issue
j_data.loc[j_data["number"]=="2023-03-04 00:00:00","number"]="3--4"
j_data.loc[j_data["number"]=="4-5","number"]="4--5"
j_data.loc[j_data["number"]=="1-2","number"]="1--2"

j_data.journal.unique()

array(['quarterly journal of economics', 'review of economic studies',
       'econometrica', 'journal of political economy',
       'american economic review'], dtype=object)

In [177]:
j_data["id"]=j_data["URL"].str.split("/").str[-1]

In [175]:
j_data.columns

Index(['issue_url', 'ISSN', 'URL', 'journal', 'number', 'publisher', 'title',
       'urldate', 'volume', 'year', 'abstract', 'author', 'pages',
       'reviewed-author', 'uploaded', 'content_type', 'author_split',
       'title_10', 'type', 'authorsSCO', 'titleSCO', 'journalSCO', 'DOI',
       'affiliations', 'abstractSCO', 'citations', 'document type',
       'index keywords', 'author keywords', 'document_type', 'ID'],
      dtype='object')

## Pre-processing
In this section, the mturk data is restructured such that each line has a single entered reference. Some unnecessary fields from the mturk data are also dropped. We keep the HITId to keep a foreign key linking back to the raw data.

In [30]:
# list all the column names
list(All.columns)


['AcceptTime',
 'Answer.0.1',
 'Answer.0.2',
 'Answer.0.3',
 'Answer.0.4',
 'Answer.0.5',
 'Answer.0.6',
 'Answer.0.7',
 'Answer.0.8',
 'Answer.1.1',
 'Answer.1.2',
 'Answer.1.3',
 'Answer.1.4',
 'Answer.1.5',
 'Answer.1.6',
 'Answer.1.7',
 'Answer.1.8',
 'Answer.10.1',
 'Answer.10.2',
 'Answer.10.3',
 'Answer.10.4',
 'Answer.10.5',
 'Answer.10.6',
 'Answer.10.7',
 'Answer.10.8',
 'Answer.11.1',
 'Answer.11.2',
 'Answer.11.3',
 'Answer.11.4',
 'Answer.11.5',
 'Answer.11.6',
 'Answer.11.7',
 'Answer.11.8',
 'Answer.12.1',
 'Answer.12.2',
 'Answer.12.3',
 'Answer.12.4',
 'Answer.12.5',
 'Answer.12.6',
 'Answer.12.7',
 'Answer.12.8',
 'Answer.13.1',
 'Answer.13.2',
 'Answer.13.3',
 'Answer.13.4',
 'Answer.13.5',
 'Answer.13.6',
 'Answer.13.7',
 'Answer.13.8',
 'Answer.14.1',
 'Answer.14.2',
 'Answer.14.3',
 'Answer.14.4',
 'Answer.14.5',
 'Answer.14.6',
 'Answer.14.7',
 'Answer.14.8',
 'Answer.15.1',
 'Answer.15.2',
 'Answer.15.3',
 'Answer.15.4',
 'Answer.15.5',
 'Answer.15.6',
 'Answer.

In [53]:
# list of columns to drop
lst=['HITTypeId',
 'Title',
 'Description',
 'Keywords',
 'Reward',
 'CreationTime',
 'MaxAssignments',
 'RequesterAnnotation',
 'AssignmentDurationInSeconds',
 'AutoApprovalDelayInSeconds',
 'Expiration',
 'NumberOfSimilarHITs',
 'LifetimeInSeconds',
 'AssignmentId',
 'AssignmentStatus',
 'AcceptTime',
 'AutoApprovalTime',
 'ApprovalTime',
 'RejectionTime',
 'LifetimeApprovalRate',
 'Last30DaysApprovalRate',
 'Last7DaysApprovalRate',
 'SubmitTime']

All=All.drop(lst, axis=1)

# add for number of referencing counts
All["num_refs"] = None 

  All["num_refs"] = None


### Functions used to clean individual reference entries

In [54]:

# this function:
# - removes all leading and trailing non-alphabet characters, in case there is some sort of punctuation copied
# - remove + signs
# - strips leading "the" if the second field is set to True
def strip_and_convert(str_, strip_the):
    #print(str_)
    if pd.isna(str_)==True:
        return "None"
    temp=unidecode(str_)
    try:
        l = [x.isalpha() for x in temp].index(True)
        m = [x.isalpha() for x in temp[::-1]].index(True)
        temp=temp[l:len(str_)-m]
    except:
        print(temp)
        temp="none"
    #temp=re.sub('^"(.*)"$','(.*)', temp)
    if (temp[0:4].lower()=="the ")&(strip_the==True):
        temp=temp[4:]
    temp=re.sub(' +', ' ', temp)
    temp=re.sub(' ,', ', ', temp)
    temp=temp.strip()
    return temp

# the following three functions expect to recieve a row of the data from mturk and the number of the reference on the page.
# after which it will return that reference as a json dictionary.
# Please see sample entry from from Mturk to see the fields expected for each type of data entry.
def process_article(x, num):
    article_dict={
        "type": "article",
        "author": x["Answer.ref."+str(num)+"_author"],
        "title": x["Answer.ref."+str(num)+"_title"],
        "journal": strip_and_convert(x["Answer.ref."+str(num)+"_journal"], True),
        "year": x["Answer.ref."+str(num)+"_year"],
        "volume": x["Answer.ref."+str(num)+"_vol"],
        "issue": x["Answer.ref."+str(num)+"_issue"],
        "pages": x["Answer.ref."+str(num)+"_pages"],  
    }
    '''
    Journals.append({"journal":strip_and_convert(x["Answer.ref."+str(num)+"_journal"]),
                     "author": x["Answer.ref."+str(num)+"_author"],
                     "title": x["Answer.ref."+str(num)+"_title"],
                     "m":len(Journals)})
    '''
    
    Journals.append(strip_and_convert(x["Answer.ref."+str(num)+"_journal"], True))
    return article_dict

def process_book(x, num):
    book_dict={
        "type": "book",
        "author": x["Answer.ref."+str(num)+"_author"],
        "title": x["Answer.ref."+str(num)+"_title"],
        "chapter_title": x["Answer.ref."+str(num)+"_chapter_title"],
        "year": x["Answer.ref."+str(num)+"_year"],
        "volume": x["Answer.ref."+str(num)+"_vol"],
        "location": x["Answer.ref."+str(num)+"_location"],
        "publisher": x["Answer.ref."+str(num)+"_publisher"],
        "pages": x["Answer.ref."+str(num)+"_pages"],  
    }
    Book.append(strip_and_convert(x["Answer.ref."+str(num)+"_title"], False))
    return book_dict

def process_other(x, num):
    other_dict={
        "type": "other",
        "author": x["Answer.ref."+str(num)+"_author"],
        "title": x["Answer.ref."+str(num)+"_title"],
        "year": x["Answer.ref."+str(num)+"_year"],
        "publisher": x["Answer.ref."+str(num)+"_publisher"],
        "textfull": x["Answer.ref."+str(num)+"_textfull"],  
    }
    return other_dict

def process_news(x, num):
    news_dict={
        "type": "news",
        "year": x["Answer.ref."+str(num)+"_year"],
        "publisher": x["Answer.ref."+str(num)+"_publisher"],
        "textfull": x["Answer.ref."+str(num)+"_textfull"],  
    }
    return news_dict

def process_laws(x, num):
    law_dict={
        "type": "laws",
        "year": x["Answer.ref."+str(num)+"_year"],
        "textfull": x["Answer.ref."+str(num)+"_textfull"],  
    }
    return law_dict

def process_jstor(x, num):
    law_dict={
        "type": "jstor",
        "year": x["Answer.ref."+str(num)+"_year"],
        "jstorId": x["Answer.jstor_"+str(num)],  
    }
    return law_dict


# function to merge dictionaries
def Merge(dict1, dict2):
    return(dict2|dict1)

In [55]:
nA=0
ref=0
article=0
book=0
other=0
jstor=0
a=0
all_ref={}
num_refs={}
# these fields are used by the previous block of functions
Journals=[]
Book=[]

# Clean out each reference
for j in All.index:
    count=0
    
    if(pd.isna(All.iloc[j]["RequesterFeedback"])==False):
        All.at[j, "rej_flag"]=True
    else:
        All.at[j, "rej_flag"]=False
        
    for i in range(0,30,1):
        item=None
        temp={
            "HITId": All.iloc[j]["HITId"],
            "ID": All.iloc[j]["Input.ID"],
            "page_o": All.iloc[j]["Input.page"],
            "year_o": All.iloc[j]["Input.year"],
            "journal_o": All.iloc[j]["Input.journal"],
            "authors_o": All.iloc[j]["Input.authors"],
            "title_o": All.iloc[j]["Input.title"],
            "volume_o": All.iloc[j]["Input.vol"],
            "issue_o": All.iloc[j]["Input.issue"],
        }
        
        if pd.isna(All.iloc[j]["Answer."+str(i)+".1"]):
            continue
        elif All.iloc[j]["Answer."+str(i)+".1"]==True:
            nA=nA+1
        elif All.iloc[j]["Answer."+str(i)+".2"]==True:
            article=article+1
            count=count+1
            item=process_article(All.iloc[j],i)
        elif All.iloc[j]["Answer."+str(i)+".3"]==True:
            book=book+1
            count=count+1
            item=process_book(All.iloc[j],i)
        elif All.iloc[j]["Answer."+str(i)+".4"]==True:
            other=other+1
            count=count+1
            item=process_other(All.iloc[j],i)
        elif All.iloc[j]["Answer."+str(i)+".5"]==True:
            ref=ref+1
        elif All.iloc[j]["Answer."+str(i)+".6"]==True:
            item=process_news(All.iloc[j],i)
            count=count+1
        elif All.iloc[j]["Answer."+str(i)+".7"]==True:
            item=process_laws(All.iloc[j],i)
            count=count+1
        elif All.iloc[j]["Answer."+str(i)+".8"]==True:
            item=process_jstor(All.iloc[j],i)
            count=count+1
            jstor=jstor+1
            
        if(pd.isna(All.iloc[j]["RequesterFeedback"])==False)&(item != None):
            temp["rejected"]=True
            print(j)
            print(temp|item)
        if item != None:
            a=a+1
            all_ref[a]=temp|item
    
        #print(i)
        #if item!=None:
            #print(Merge(item, temp))
            #print(temp|item)
    num_refs[j]=count

#NB change this into switch statement, horrible if ladder is good enough for now

  All.at[j, "rej_flag"]=False


48
{'HITId': '3JMNNNO3C8ZFGFCNDZ9VCLYHFR42WV', 'ID': 1909101, 'page_o': 4, 'year_o': 1940, 'journal_o': 'ECONOMETRICA', 'authors_o': 'H. Neisser', 'title_o': "A Note on Pareto's Theory of Production", 'volume_o': 8, 'issue_o': '3', 'rejected': True, 'type': 'book', 'author': 'Wicksell', 'title': 'Lectures on the political economy', 'chapter_title': nan, 'year': '1934', 'volume': '1', 'location': nan, 'publisher': nan, 'pages': '129-130'}
83
{'HITId': '31YWE12TF771RGEJTGMCAVTWZ9W7X9', 'ID': 1905717, 'page_o': 12, 'year_o': 1943, 'journal_o': 'ECONOMETRICA', 'authors_o': 'Martin Bronfenbrenner', 'title_o': 'The Role of Money in Equilibrium Capital Theory', 'volume_o': 11, 'issue_o': '1', 'rejected': True, 'type': 'book', 'author': 'V. Pareto', 'title': "Cours d'economie politique", 'chapter_title': nan, 'year': '1896', 'volume': '1', 'location': 'Lausanne', 'publisher': nan, 'pages': '60'}
83
{'HITId': '31YWE12TF771RGEJTGMCAVTWZ9W7X9', 'ID': 1905717, 'page_o': 12, 'year_o': 1943, 'journa

699
{'HITId': '3CMIQF80HULYEJZ2OSMSA9X77W2Q6B', 'ID': 1907188, 'page_o': 7, 'year_o': 1945, 'journal_o': 'ECONOMETRICA', 'authors_o': 'Victor E. Smith', 'title_o': 'Nonlinearity in the Relation between Input and Output: The Canadian Automobile Industry, 1918-1930', 'volume_o': 13, 'issue_o': '3', 'rejected': True, 'type': 'book', 'author': 'Henry Schultz', 'title': 'The Theory and Measurement of Demand', 'chapter_title': 'none', 'year': '1938', 'volume': 'none', 'location': 'Chicago', 'publisher': 'The University of Chicago Press', 'pages': '757-764'}
709
{'HITId': '3N5YJ55YYNYEYOYDYDBNMK4OY45NAB', 'ID': 1905368, 'page_o': 0, 'year_o': 1946, 'journal_o': 'ECONOMETRICA', 'authors_o': 'Trygve Haavelmo', 'title_o': 'Multiplier Effects of a Balanced Budget: Reply', 'volume_o': 14, 'issue_o': '2', 'rejected': True, 'type': 'article', 'author': 'G. Harberler', 'title': "Multiplier Effects of a Balanced Budget: Some Monetary Implications of Mr. Haavelmo's Paper", 'journal': 'Econometrica', 'y

2286
{'HITId': '3TD33TP5ESXCXF9GVN6EVP4UHJRABH', 'ID': 1909442, 'page_o': 16, 'year_o': 1959, 'journal_o': 'ECONOMETRICA', 'authors_o': 'Edwin Kuh', 'title_o': 'The Validity of Cross-Sectionally Estimated Behavior Equations in Time Series Applications', 'volume_o': 27, 'issue_o': 2, 'rejected': True, 'type': 'other', 'author': 'The president', 'title': '4 National Income Supplement to the Survey of Current Business', 'year': '1954', 'publisher': 'Department of Commerce, Office of Business Economics', 'textfull': '1954 National Income Supplement to the Survey of Current Business, U.S. Department of Commerce, Office of Business Economics, p. 216. Indexes for the subsequent  two years were derived from Economic Report of the President, 1956, Tables DI -D3,  pp.165-8.'}
2300
{'HITId': '36818Z1KWA87UK53GUR8D66ITREA3Q', 'ID': 1907588, 'page_o': 13, 'year_o': 1958, 'journal_o': 'ECONOMETRICA', 'authors_o': 'Robert J. Wolfson', 'title_o': 'An Econometric Investigation of Regional Differentials

2898
{'HITId': '3DQYSJDTZS6M0VTTGO0IG5LDL1GEXS', 'ID': 2296178, 'page_o': 2, 'year_o': 1961, 'journal_o': 'Review of Economic Studies', 'authors_o': 'G. C. Archibald', 'title_o': 'Chamberlin Versus Chicago', 'volume_o': 29, 'issue_o': 1.0, 'rejected': True, 'type': 'article', 'author': 'Piero Sraffa', 'title': 'The Laws of Returns under Competitive Conditions', 'journal': 'Economic journal', 'year': '1926', 'volume': '36', 'issue': '144', 'pages': '535-550'}
2962
{'HITId': '3XUY87HIWW99I7M2JFMSIWAONZNMM5', 'ID': 2296332, 'page_o': 0, 'year_o': 1965, 'journal_o': 'Review of Economic Studies', 'authors_o': 'Pan A. Yotopoulos', 'title_o': 'The "Wage-Productivity" Theory of Underemployment: A Refinement', 'volume_o': 32, 'issue_o': 1.0, 'rejected': True, 'type': 'article', 'author': 'W. Arthur Lewis', 'title': 'Economic Development with Unlimited Supplies of Labour', 'journal': 'Manchester School of Economic and Social Studies', 'year': '1954', 'volume': 'XXII', 'issue': '2', 'pages': '139

6255
{'HITId': '3R4QIDVOKW7DUV2CLJAQSG0659VEE1', 'ID': 1826591, 'page_o': 10, 'year_o': 1943, 'journal_o': 'JPE', 'authors_o': 'Paul Geren', 'title_o': 'The Contribution of Life Insurance to the Savings Stream', 'volume_o': 51, 'issue_o': 1, 'rejected': True, 'type': 'book', 'author': 'Life Insurance Sales Research Bureau', 'title': 'Trends  in Policy Types', 'chapter_title': nan, 'year': '1940', 'volume': nan, 'location': 'Hartford', 'publisher': nan, 'pages': nan}
6255
{'HITId': '3R4QIDVOKW7DUV2CLJAQSG0659VEE1', 'ID': 1826591, 'page_o': 10, 'year_o': 1943, 'journal_o': 'JPE', 'authors_o': 'Paul Geren', 'title_o': 'The Contribution of Life Insurance to the Savings Stream', 'volume_o': 51, 'issue_o': 1, 'rejected': True, 'type': 'book', 'author': 'Simon Kuznets', 'title': 'National  Income and Capital Formation, 1919-1935', 'chapter_title': nan, 'year': '1937', 'volume': nan, 'location': 'New York', 'publisher': nan, 'pages': '48'}
8202
{'HITId': '3ZQA3IO32IM0MLLP39I7Q8BQK5G1OY', 'ID':

72
72
19181
{'HITId': '3J06WJ78IUFBYHAKBZOASOCE9OKVVV', 'ID': 1883079, 'page_o': 4, 'year_o': 1969, 'journal_o': 'QJE', 'authors_o': 'M. I. Nadiri', 'title_o': 'The Determinants of Real Cash Balances in the U.S. Total Manufacturing Sector', 'volume_o': 83, 'issue_o': 2, 'rejected': True, 'type': 'jstor', 'year': nan, 'jstorId': 'investment behavio'}


## Reconciliation of Metadata Fields

This is to increase the percentage match on a metadata dield in the masterlist. The specific fields going through reconciliation are:
- journal name
- pages
- volume
- issue
- year
- title

The output from above is a dictionary of the separated out responses in json format.

In [56]:
# transpose the data into a dataframe
ar=pd.DataFrame.from_dict(all_ref).transpose()


In [57]:
ar["type"].unique()

array(['other', 'book', 'article', 'laws', 'news', 'jstor'], dtype=object)

In [58]:
ar.columns

Index(['HITId', 'ID', 'page_o', 'year_o', 'journal_o', 'authors_o', 'title_o',
       'volume_o', 'issue_o', 'type', 'author', 'title', 'year', 'publisher',
       'textfull', 'chapter_title', 'volume', 'location', 'pages', 'journal',
       'issue', 'rejected', 'jstorId'],
      dtype='object')

In [60]:
list(All.columns)

['Answer.0.1',
 'Answer.0.2',
 'Answer.0.3',
 'Answer.0.4',
 'Answer.0.5',
 'Answer.0.6',
 'Answer.0.7',
 'Answer.0.8',
 'Answer.1.1',
 'Answer.1.2',
 'Answer.1.3',
 'Answer.1.4',
 'Answer.1.5',
 'Answer.1.6',
 'Answer.1.7',
 'Answer.1.8',
 'Answer.10.1',
 'Answer.10.2',
 'Answer.10.3',
 'Answer.10.4',
 'Answer.10.5',
 'Answer.10.6',
 'Answer.10.7',
 'Answer.10.8',
 'Answer.11.1',
 'Answer.11.2',
 'Answer.11.3',
 'Answer.11.4',
 'Answer.11.5',
 'Answer.11.6',
 'Answer.11.7',
 'Answer.11.8',
 'Answer.12.1',
 'Answer.12.2',
 'Answer.12.3',
 'Answer.12.4',
 'Answer.12.5',
 'Answer.12.6',
 'Answer.12.7',
 'Answer.12.8',
 'Answer.13.1',
 'Answer.13.2',
 'Answer.13.3',
 'Answer.13.4',
 'Answer.13.5',
 'Answer.13.6',
 'Answer.13.7',
 'Answer.13.8',
 'Answer.14.1',
 'Answer.14.2',
 'Answer.14.3',
 'Answer.14.4',
 'Answer.14.5',
 'Answer.14.6',
 'Answer.14.7',
 'Answer.14.8',
 'Answer.15.1',
 'Answer.15.2',
 'Answer.15.3',
 'Answer.15.4',
 'Answer.15.5',
 'Answer.15.6',
 'Answer.15.7',
 'Answer

## Reconciliating Journal names

There are many misspellings of journal names, this section is correct them. Process:
- Strip leading and trailing white spaces
- Ensure "the" has been removed from the beginning of the journal, this was done during the previous step.
- Lowercase the list, get a unique list and sort alphabetically
- Go through the list and copy duplicates into an array with the key as the corrected spelling: eg "AER":["AERS","AERB" ...],
- Format each misspelling into a dictionary in the form {misspelling: correction, ....,misspelling: correction} in preparation of replacing it in the dataset.
- Create a separate column in the data that is a copy of the journal column, journal_proc, caste it to string type and replace the errors in this column.

The result is a json file where each key is a journal name and an array of errors. And a json file where each error name maps to the corrected journal name including journal names that are not in error.

In [76]:
unique_journals=list(ar[ar["type"]=="article"]["journal"].str.strip().str.lower().unique())
unique_journals.sort()
len(unique_journals)

1555

In [77]:
unique_journals #[:10] #show journal names, I'm restricting it to 10 here. Unrestrict it to show all journal names

['a merican economic review, proceedings',
 'a survey of contemporary economics',
 'a. e. r',
 'a.e.r',
 'aamerican economic review',
 'accountant',
 'accounting review',
 'activity analysis of production and allocation',
 'actuarial society of america',
 'administrative science quarter',
 'admnistrative science quarterly',
 'aer',
 'aeronautical engineering review',
 'af of l weekly news service',
 'agenda',
 'agricultural economics research',
 'agricultural finance review',
 'agricultural history',
 'agricultural information bulletin',
 'air transportation',
 'ajp',
 'al of law and economics',
 'allgemeines statistisches archiv',
 'almanac issue',
 'am econ rev',
 'amercian economic review',
 'america economic review',
 'american academy of political and social science',
 'american agriculturist',
 'american banker',
 'american city',
 'american eco- nomic review',
 'american econ. rev',
 'american econkomic review',
 'american economci review',
 'american economic association',
 'am

In [78]:
len(unique_journals)

1555

In [79]:
# load in the journal names from file
sort_info=None
with open("/Users/sijiawu/Work/Refs Danae/journal_name_recon.json", 'r') as f:
    sort_info = json.load(f)
    
# def convert(o):
#     if isinstance(o, np.int64): return int(o)  
#     raise TypeError
    
# sorted_list = sorted(sort_info.items())

# sorted_dict = {}
# for key, value in sorted_list:
#     sorted_dict[key] = value

# print(sorted_dict)
# with open("/Users/sijiawu/Work/Refs Danae/journal_name_recon.json", 'w') as f:
#     json.dump(sorted_dict, f, indent = 6,default=convert) 

### Format each journal name error

In [80]:
split_out={}

for key in sort_info.keys():
    for i in sort_info[key]:
        split_out[i]=key

for i in range(len(unique_journals)):
    if unique_journals[i] in split_out.keys():
        unique_journals[i]=split_out[unique_journals[i]]
        
for i in unique_journals:
    if i not in split_out.keys():
        split_out[i]=i

### Replace the journal names for the ones we care about

In [81]:
ar["journal_proc"]=ar['journal'].fillna("none").astype(str).str.lower().str.strip()
joi=['american economic review', 'journal of political economy','econometrica','quarterly journal of economics', 'review of economic studies']
for i in ar[ar["journal_proc"].isna()==False].index:
    if ar.loc[i,'journal_proc'] in split_out.keys():
#         if split_out[ar.loc[i,'journal_proc']] in joi:
        ar.loc[i,'journal_proc']=split_out[ar.loc[i,'journal_proc']]

In [82]:
unique_journals=list(ar[ar["type"]=="article"]["journal_proc"].str.strip().str.lower().unique())
unique_journals.sort()
len(unique_journals)

971

## Reconcile the years
1. fillna as "none" and convert the year column to type string, store it in a new column called year_proc
2. some entered a month or season followed by a space and then the year, split by space or comma and take the last year value entered.
3. for each value, try caste the string year to an int, either directly or first converted to a float
4. append all other cases to a list called year_corr, get a unique set and reconcile manually via a list called year_rec
5. Format of errors is {correct_year: [year_error1, year_error2 ...], ...}
6. Save it to a file called year_recon.json. this is the file that is being read in in the next code block.

In [83]:
year_rec=None
with open("/Users/sijiawu/Work/Refs Danae/year_recon.json", 'r') as f:
    year_rec = json.load(f)

In [84]:
list(year_rec.keys())[:10]

['1693-1795',
 '1778-1791',
 '1788-1789',
 '1789-1794',
 '1791-1792',
 '1792-1793',
 '1811-1812',
 '1813-1814',
 '1814-1815',
 '1815-1816']

In [85]:
year_rec_split_out={}
for key in year_rec.keys():
    for i in year_rec[key]:
        year_rec_split_out[i]=key
# format the year_rec dictionary into one such that it is "error":"correction" form for each error.

In [86]:
ar["year_proc"]=ar['year'].fillna("none").astype(str)
ar["year_proc"]=ar["year_proc"].str.strip().str.split(',| ').str[-1]

year_corr=[] #this contains anything that isn't resolved in year_rec

for i in ar.index:
    proc_year="none"
    year_temp=re.sub("I", "1", ar.loc[i,"year_proc"]) # sub for I issue
    try:
        proc_year=int(year_temp) #convert from str to int
        if (proc_year<1000)|(proc_year>2000):
            #year_corr.append(year_temp)
            raise Exception("year below 1000")
        ar.loc[i, 'year_proc']=proc_year
    except:
        try:
            proc_year=int(float(year_temp))  #convert from float to int
            if (proc_year<1000)|(proc_year>2000):
                raise Exception("year below 1000")
            ar.loc[i, 'year_proc']=proc_year
        except:
            pst=0
            if (year_temp in year_rec_split_out.keys()):
                ar.loc[i, 'year_proc']=year_rec_split_out[year_temp]
                pst=1
            elif ("-" in year_temp) or ("/" in year_temp):
                split_year=re.sub("/", "-", year_temp).split('-')
                if (len(split_year[0])==4) and (len(split_year[1])==4):
                    ar.loc[i, 'year_proc']=re.sub("/", "-", year_temp)
                    pst=1
            if (ar.loc[i, "year_proc"]!="none") & (pst==0):
                year_corr.append(year_temp) #append to list if could not directly convert
                #print("could not "+df.loc[i, "year_proc"])


year_corr_u=list(set(year_corr)) #get unique list
year_corr_u.sort()

In [87]:
year_corr #because all years have been resolved the year_corr is empty

[]

Some year fields are multiple years, eg:1954-1955 in which case I take the latest year to occur. in the masterlist data does not have any entries with multiple years.

In [88]:
def year_split(x):
    if x[0]=="none":
        return 0
    if x[0]=="wrong":
        return 1
    if len(x)==1:
        return int(x[0])
    if len(x)==2:
        return int(x[1])
    else:
        return 0
    
ar["year_proc_split"]=ar["year_proc"].astype(str)
ar["year_proc_split"]=ar["year_proc_split"].str.split('-')
ar["year_latest"]=ar["year_proc_split"].apply(lambda x: year_split(x))
ar["year_latest"].unique()

array([1904,    0, 1933, 1932, 1939, 1935, 1914, 1937, 1924, 1940, 1929,
       1938, 1942, 1936, 1920, 1926, 1927, 1923, 1911, 1896, 1934, 1941,
       1931, 1901, 1912, 1928, 1930, 1902, 1897, 1943, 1949, 1884, 1925,
       1915,    1, 1895, 1962, 1848, 1890, 1891, 1993, 1906, 1849, 1888,
       1876, 1945, 1946, 1944, 1947, 1819, 1921, 1894, 1950, 1881, 1948,
       1951, 1922, 1917, 1918, 1908, 1892, 1919, 1899, 1875, 1909, 1910,
       1903, 1913, 1907, 1952, 1886, 1953, 1955, 1954, 1893, 1965, 1872,
       1879, 1905, 1878, 1971, 1845, 1855, 1858, 1728, 1753, 1829, 1838,
       1850, 1840, 1868, 1887, 1870, 1877, 1854, 1856, 1859, 1833, 1822,
       1861, 1863, 1957, 1959, 1961, 1956, 1960, 1958, 1885, 1874, 1963,
       1900, 1964, 1966, 1817, 1994, 1867, 1820, 1883, 1869, 1889, 1776,
       1832, 1826, 1974, 1844, 1422, 1987, 1720, 1765, 1880, 1857, 1746,
       1684, 1721, 1726, 1691, 1750, 1798, 1755, 1752, 1729, 1770, 1739,
       1761, 1758, 1754, 1763, 1768, 1762, 1769, 19

## Volume reconcilliation

1. fillna as "none" and convert the year column to type string, store it in a new column called volume_proc
2. I expect volume to be in either roman numerals or an integer, functions below convert roman numerals to decimals, detect roman numerals and detect that a piece of text is only numbers.
3. for each value, try caste the string to an int, either directly or first converted to a float
4. append all other cases to a list called oc_u, get a unique set and reconcile manually via a list called vol_rec
5. Format of errors is {correct_volume: [volume_error1, volume_error2 ...], ...}
6. Save it to a file called volume_recon.json. this is the file that is being read after the next code block.

In [89]:
roman_numerals = {"I" : 1,
                  "V" : 5,
                  "X" : 10,
                  "L" : 50,
                  "C" : 100,
                  "D" : 500,
                  "M" : 1000
                  }

def rom_to_dec(user_input):
    int_value = 0
    for i in range(len(user_input)):
        if user_input[i] in roman_numerals:
            if i + 1 < len(user_input) and roman_numerals[user_input[i]] < roman_numerals[user_input[i + 1]]:
                int_value -= roman_numerals[user_input[i]]
            else:
                int_value += roman_numerals[user_input[i]]
        else:
            print("Invalid input.")
            return "none"

    return int_value

def rom_match(strg, search=re.compile(r'[^IVXLCDM]').search):
     return not bool(search(strg))
    
def number_match(strg, search=re.compile(r'[^0-9.]').search):
     return not bool(search(strg))

In [90]:
vol_rec=None
with open("/Users/sijiawu/Work/Refs Danae/volume_recon.json", 'r') as f:
    vol_rec = json.load(f) 

In [91]:
vol_rec_split_out={}
for key in vol_rec.keys():
    for k in vol_rec[key]:
        vol_rec_split_out[k]=key

In [92]:
ar["volume_proc"]=ar["volume"].fillna("none").astype(str).str.upper()
vol_out=list(ar["volume_proc"].unique())
vol_out.sort()

w_count=0
n_count=0
vol_corr=[]

for i in ar.index:
    val=ar.loc[i,'volume_proc']
    if val in vol_rec_split_out.keys():
        val=vol_rec_split_out[val]
    if number_match(val):
        ar.loc[i,'volume_proc']=int(float(val))
    elif rom_match(val):
        ar.loc[i,'volume_proc']=rom_to_dec(val)
    elif val=="NONE":
        n_count=n_count+1
    elif val in vol_rec_split_out.keys():
        ar.loc[i,'volume_proc']=vol_rec_split_out[val]
        #print(val+" "+vol_rec_split_out[val])
    else:
        w_count=w_count+1
        vol_corr.append(val)
        print(val)


In [93]:
# NOTE THE output is empty because all issues have been resolved
vol_corr_u=list(set(vol_corr))
vol_corr_u.sort()
for a in vol_corr_u:
    print("'"+a+"':[\""+a+"\"],")

In [94]:
# with open("/Users/sijiawu/Work/Refs Danae/volume_recon.json", 'w') as f:
#     json.dump(vol_rec, f, indent = 6) 

## Issue reconcilliation

1. fillna as "none" and convert the year column to type string, store it in a new column called volume_proc
2. I expect issue to be in either roman numerals or an integer, or a float. This part uses the same functions as the previous section to for converting roman numerals to decimals, detect roman numerals and detect that a piece of text is only numbers.
3. for each value, I fist check if the value is to be corrected against the compiled file. Then try caste the string value of the issue to an int if it is a number, either directly or first converted to a float. if not a number then check for a roman numeral and then convert that to an integer. If it fails all the previous conditions check if None or if it is a string value designated to be so in the corrections file.
4. append all other cases to a list called issue_corr, get a unique set and reconcile manually via a list called issue_rec which is saved in the file issue_recon.json
5. Format of errors is {correct_issue: [issue_error1, issue_error2 ...], ...}
6. Iteratively perform the above until all errors are resolved. Save it to a file called issue_recon.json. this is the file that is being read in the next code block.


In [95]:
issue_rec=None
with open("/Users/sijiawu/Work/Refs Danae/issue_recon.json", 'r') as f:
    issue_rec=json.load(f) 

In [96]:
# make key-value pairs that can easily replace things.
issue_rec_split_out={}
for key in issue_rec.keys():
    for k in issue_rec[key]:
        issue_rec_split_out[k]=key 

In [97]:
ar["issue_proc"]=ar["issue"].fillna("none").astype(str).str.upper()
issue_out=list(ar["issue_proc"].unique())
issue_out.sort()

w_count=0
n_count=0
issue_corr=[]

for i in ar.index:
    val=ar.loc[i,'issue_proc']
    ind=0
    if val in issue_rec_split_out.keys():
        val=issue_rec_split_out[val]
    if number_match(val):
        ar.loc[i,'issue_proc']=int(float(val))
    elif rom_match(val):
        ar.loc[i,'issue_proc']=rom_to_dec(val)
    elif val=="NONE":
        n_count=n_count+1
        ar.loc[i,'issue_proc']=val
    elif val in issue_rec_split_out.keys():
        ar.loc[i,'issue_proc']=val
#         print(val)
    else:
        w_count=w_count+1
        issue_corr.append(val)
        print('"'+str(val)+'"')
  


"1,2,54"
"FASC. 4A"
"FASC. 2A"
"SONDERHEFT 41"
"FASC. 1"
"166-69"


In [98]:
issue_corr_u=list(issue_corr)
issue_corr_u.sort()
for a in issue_corr_u:
    print("'"+a+"',")

'1,2,54',
'166-69',
'FASC. 1',
'FASC. 2A',
'FASC. 4A',
'SONDERHEFT 41',


## Titles

For titles, I only compile corrections for those of journal articles. 
1. lower the test and fill the na values with "none". Strip leading and trailing characters that don't belong in titles. Assign these titles to a new column: 
2. replace americanized spelling, replace characters that don't belong in titles
3. 30332 total references with 18029 unique titles. There are 9751 journal article references of which 6580 are unique. Since I only care for those in the top 5 econ journals. There are 4174 top 5 journal references of which 2628 are unique titles. After cleaning for duplicates and spelling mistakes, unique top 5 article titles reduce to 2278 unique titles. 

In [99]:
ar.shape

(30332, 29)

In [100]:
#remove leading and trailing non-ascii characters
def strip_leading(_str):
    k=0
    l=len(_str)
    while k!=len(_str):
        if re.search('[,*" \'.:]',_str[k]) is not None:
            k=k+1
        else:
            break
    while l>0:
        if re.search('[,*" \'.:]',_str[l-1]) is not None:
            l=l-1
        else:
            break
    return _str[k:l]

#reformat some spelling
def replace_d_space(_str):
    temp=re.sub("[ ]+"," ",_str)
    temp=re.sub("ze ","se ",temp)
    temp=re.sub("zation","sation",temp)
    temp=re.sub("- | -","-", temp)
    temp=re.sub("- | -","-", temp)
    return temp


In [103]:
#pre-process the title with previous two functions
ar["title_proc"]=ar["title"].fillna("none").astype(str).str.lower()
ar["title_proc"]=ar["title_proc"].apply(strip_leading,1)
ar["title_proc"]=ar["title_proc"].apply(replace_d_space,1)
title_sort=list(ar.title_proc.unique())
title_sort.sort()

j_data["title_proc"]=j_data["title"].fillna("none").astype(str).str.lower()
j_data["title_proc"]=j_data["title_proc"].apply(strip_leading,1)

len(title_sort) #total titles


18029

In [64]:
#read in the title recon file
title_rec=None
with open("/Users/sijiawu/Work/Refs Danae/title_recon.json", 'r') as f:
    title_rec=json.load(f) 

#reformat and expand the title recon file
title_rec_split_out={}
for key in title_rec.keys():
    for k in title_rec[key]:
        title_rec_split_out[k]=key  

In [65]:
#get number of unique titles of articles
len(list(ar[ar.type=="article"].title_proc.unique()))

6580

In [66]:
#separate out top 5 article references
interest=['review of economic studies',
          'american economic review',
          'journal of political economy',
          'econometrica',
          "quarterly journal of economics"]

journal_data=ar[ar["journal_proc"].isin(interest)]

In [67]:
#get unique list of top 5 journal titles
journal_titles=list(journal_data["title_proc"].unique())
journal_titles.sort()
len(journal_titles)

2628

In [68]:
for i in ar.index:
    val=ar.loc[i,'title_proc']
    ind=0
    if val in title_rec_split_out.keys():
        ar.loc[i,'title_proc']=title_rec_split_out[val]

In [69]:
interest=['review of economic studies',
          'american economic review',
          'journal of political economy',
          'econometrica',
          "quarterly journal of economics"]

journal_data=ar[ar["journal_proc"].isin(interest)]
journal_titles=list(journal_data["title_proc"].unique())
journal_titles.sort()

In [70]:
len(journal_titles)

2278

In [126]:
from difflib import SequenceMatcher

# Utility function to compute similarity
def similar(str1, str2):
    return SequenceMatcher(None, str1, str2).ratio()

def construct(match_title):
#     print(match_title.index)
    print(1)
    temp=j_data[(j_data["journal"]==match_title["journal_proc"])&(j_data["year"]<=(match_title["year_o"]+2))]["title_proc"].apply(lambda y: similar(y, match_title["title_proc"]))
    o=temp[temp>=(max(temp)-0.1)]
    ar_match[match_title.name]={"index": list(o.index), "m_val":list(o)}

In [127]:
ar_match={}
journal_data.apply(lambda x: construct(x), axis=1)

1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1


1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1


15       None
23       None
30       None
31       None
34       None
40       None
41       None
45       None
46       None
49       None
54       None
56       None
57       None
58       None
59       None
60       None
62       None
65       None
70       None
71       None
73       None
75       None
78       None
82       None
83       None
84       None
89       None
94       None
103      None
106      None
107      None
109      None
112      None
113      None
122      None
124      None
125      None
128      None
129      None
132      None
148      None
149      None
150      None
151      None
166      None
170      None
181      None
190      None
191      None
192      None
193      None
197      None
205      None
210      None
213      None
216      None
218      None
220      None
221      None
222      None
223      None
227      None
233      None
238      None
245      None
246      None
247      None
248      None
258      None
262      None
263      None
266   

In [87]:
for i in ar_match.keys():
    ar_match[i]=list(ar_match[i])

In [109]:
len(ar_match)

4174

In [125]:
ar_match['15']

[20211]

In [116]:
te=0
for i in ar_match.keys():
    if len(ar_match[i])>1:
        te+=1
    

In [117]:
te

157

In [112]:
with open("/Users/sijiawu/Work/Refs Danaetitle_match.json", 'w') as f:
    json.dump(ar_match, f, indent = 6) 

In [124]:
ar_match={}
with open("/Users/sijiawu/Work/Refs Danae/title_match.json", 'r') as f:
    ar_match=json.load(f) 

In [298]:
f_data

{'qjaa024': {'authorIDs': "['He, Guojun', 'Wang, Shaoda', 'Zhang, Bing']",
  'references': [],
  'table_count': None,
  'tg_count': None,
  'eq_count': None,
  'year': 2020,
  'issue': {'issue_url': 'https://academic.oup.com//qje/issue/135/4',
   'issue_number': '4',
   'type': 'N'},
  'volume': 135,
  'journal': 'quarterly journal of economics',
  'abstract': "This article estimates the effect of environmental regulation on firm productivity using a spatial regression discontinuity design implicit in China's water quality monitoring system. Because water quality readings are important for political evaluations and the monitoring stations only capture emissions from their upstream regions, local government officials are incentivized to enforce tighter environmental standards on firms immediately upstream of a monitoring station, rather than those immediately downstream. Exploiting this discontinuity in regulation stringency with novel firm-level geocoded emission and production data se

In [658]:

s_tmp="a dynamic aggregative model"
temp=j_data["title_proc"].apply(lambda x: similar(x, s_tmp))

In [239]:
j_data[temp>0.8][['issue_url', 'ISSN', 'URL', 'journal', 'number', 'publisher', 'title',
       'urldate', 'volume', 'year', 'abstract', 'author', 'pages',
       'reviewed-author', 'uploaded', 'content_type', 'author_split',
        'title_proc']]

Unnamed: 0,issue_url,ISSN,URL,journal,number,publisher,title,urldate,volume,year,abstract,author,pages,reviewed-author,uploaded,content_type,author_split,title_proc
26834,https://www.jstor.org/stable/10.2307/i304771,"00223808, 1537534X",https://www.jstor.org/stable/1827046,JPE,2,University of Chicago Press,A Dynamic Aggregative Model,2023-09-06 00:00:00,63,1955,,James Tobin,103-115,,,Article,['James Tobin'],a dynamic aggregative model


In [102]:
j_data.columns

Index(['issue_url', 'ISSN', 'URL', 'journal', 'number', 'publisher', 'title',
       'urldate', 'volume', 'year', 'abstract', 'author', 'pages',
       'reviewed-author', 'uploaded', 'content_type', 'author_split',
       'title_10', 'type', 'authorsSCO', 'titleSCO', 'journalSCO', 'DOI',
       'affiliations', 'abstractSCO', 'citations', 'document type',
       'index keywords', 'author keywords', 'document_type'],
      dtype='object')

In [243]:
journal_data.columns

Index(['HITId', 'ID', 'page_o', 'year_o', 'journal_o', 'authors_o', 'title_o',
       'volume_o', 'issue_o', 'type', 'author', 'title', 'year', 'publisher',
       'textfull', 'chapter_title', 'volume', 'location', 'pages', 'journal',
       'issue', 'rejected', 'journal_proc', 'year_proc', 'volume_proc',
       'issue_proc', 'title_proc'],
      dtype='object')

In [120]:
pd.DataFrame(it).value_counts()

NameError: name 'it' is not defined

In [119]:
pd.DataFrame(it).value_counts()

NameError: name 'it' is not defined

In [282]:
def get_article_template(a_names, 
                         year, 
                         issue_url, 
                         journal_name, 
                         abstract, 
                         jstor_url, 
                         jstor_id,
                         issue_number, 
                         volume,
                         content_type,
                         issue_type):
    author_template={ 
        "authorIDs" : a_names,
        "references" :  [],
        "table_count" : None,
        "tg_count" : None,
        "eq_count" : None,
        "year" : year,
        "issue": {
            "issue_url": issue_url,
            "issue_number": issue_number,
            "type":issue_type
        },
        "volume":volume,
        "journal" : journal_name,
        "abstract" : abstract,
        "JSTOR_URL" : jstor_url,
        "ID": jstor_id,
        "content_type":content_type,
        "cited_by" : {}
    }
    for i in range(year-2,2024,1):
        author_template["cited_by"][i]=[]
    return author_template

def get_ref_template(authors, 
                     title, 
                     year, 
                     journal_name, 
                     vol, 
                     issue, 
                     raw_ref, 
                     article_ID,
                     pages):
    ref_template={
                    "authors":authors,
                    "title" :title,
                    "year": year,
                    "journal_name":journal_name,
                    "volume":vol,
                    "issue":issue,
                    "raw_reference": raw_ref,
                    "article_ID": article_ID,
                    "pages":pages,
                    }
    return ref_template

In [283]:
f_data={}
for i in j_data.index:
#     if i==1:
    art=j_data.loc[i]
    ret=get_article_template(art["author_split"], 
                         art["year"], 
                         art["issue_url"], 
                         art["journal"], 
                         art["abstract"], 
                         art["URL"],
                         art["URL"].split('/')[-1],
                         art["number"], 
                         art["volume"],
                         art['content_type'],
                         art['type'])
    f_data[art["URL"].split('/')[-1]]=ret

In [297]:
sum(ar.type=="article")

7943

In [246]:
ar_match

{15: {'index': [20211], 'm_val': [0.9397590361445783]},
 23: {'index': [20068], 'm_val': [1.0]},
 30: {'index': [20099, 20186],
  'm_val': [0.6666666666666666, 0.6829268292682927]},
 31: {'index': [20042], 'm_val': [1.0]},
 34: {'index': [20010], 'm_val': [1.0]},
 40: {'index': [19891], 'm_val': [1.0]},
 41: {'index': [19950], 'm_val': [1.0]},
 45: {'index': [4499], 'm_val': [1.0]},
 46: {'index': [4399], 'm_val': [1.0]},
 49: {'index': [4428], 'm_val': [1.0]},
 54: {'index': [4654, 4655], 'm_val': [1.0, 0.9538461538461539]},
 56: {'index': [20069], 'm_val': [1.0]},
 57: {'index': [20004, 20013], 'm_val': [1.0, 1.0]},
 58: {'index': [19982], 'm_val': [1.0]},
 59: {'index': [19941], 'm_val': [1.0]},
 60: {'index': [19991, 19994], 'm_val': [1.0, 1.0]},
 62: {'index': [19926], 'm_val': [0.926829268292683]},
 65: {'index': [19964], 'm_val': [1.0]},
 70: {'index': [54885, 60313],
  'm_val': [0.7428571428571429, 0.7017543859649122]},
 71: {'index': [4664], 'm_val': [1.0]},
 73: {'index': [20

In [121]:
it=[]
alt_res=[]
discard=[]
uncertain=[]
for i in ar_match.keys():
    print('\n')
    temp=ar_match[i]
    tempa=journal_data.loc[int(i)]
    
    it.append(len(temp["index"]))
    
    if len(temp["index"])!=1:
        print(tempa["title_proc"])
        if tempa["title_proc"]=="none":
            alt_res.append(i)
            continue
        print(tempa["journal_proc"])
        print(tempa["author"])
        print(tempa["year_proc"])
        print(temp["index"])
        print(len(temp["index"]))
        print(list(temp["m_val"]))
        for j in range(len(temp["index"])):
           
            temp2=j_data.loc[temp["index"][j]]
#             temp3=temp["m_val"][j]
#             print(temp3)
            print(temp2["title_proc"])
            print(temp2["journal"])
            print(temp2["author"])
            print(temp2["year"])
    else:
        if temp2["URL"].split('/')[-1]==str(tempa['ID']):
            discard.append(i)
        temp2=j_data.loc[temp["index"][0]]
        temp3=list(temp["m_val"])[0]
        s_ref=tempa[['type', 'author', 'title', 'year', 'publisher',
       'textfull', 'chapter_title', 'volume', 'location', 'pages', 'journal',
       'issue', 'rejected', 'journal_proc', 'year_proc', 'volume_proc',
       'issue_proc', 'title_proc']].to_dict()
        
        mo=get_ref_template(temp2["author_split"], 
                     temp2["title_proc"], 
                     temp2["year"], 
                     temp2["journal"], 
                     temp2["volume"], 
                     temp2["number"], 
                     s_ref, 
                     temp2["URL"].split('/')[-1],
                     temp2["pages"])
        mo["certainty"]=temp3
        
        if temp3<0.85:
            if tempa["year_proc"]!=temp2["year"]:
                uncertain.append(i)
            print(tempa["title_proc"])
            print(tempa["journal_proc"])
            print(tempa["author"])
            print(tempa["year_proc"])

            print(temp2["title_proc"])
            print(temp2["journal"])
            print(temp2["author"])
            print(temp2["year"])
            print(temp3)
            
        f_data[str(tempa['ID'])]["references"].append(mo)

        citby=str(tempa['ID'])
        if citby not in f_data[temp2["URL"].split('/')[-1]]["cited_by"][tempa["year_o"]]:
            f_data[temp2["URL"].split('/')[-1]]["cited_by"][tempa["year_o"]].append(citby)

    





TypeError: list indices must be integers or slices, not str

In [218]:
119+98+296

513

In [285]:
def convert(o):
    if isinstance(o, np.int64): return int(o)  
    raise TypeError

with open("/Users/sijiawu/Work/Refs Danae/data_18_11_23.json", 'w') as f:
    json.dump(f_data, f, indent = 6,default=convert) 

In [304]:
3107-164

2943

In [8]:
j_data.shape

(62262, 30)

In [9]:
temp=j_data[j_data["content_type"]!="MISC"]

In [10]:
temp.content_type.unique()

array(['Article', 'Comment', 'Reply', 'Rejoinder', 'Review', 'Discussion',
       'Review2'], dtype=object)

In [11]:
temp.columns

Index(['issue_url', 'ISSN', 'URL', 'journal', 'number', 'publisher', 'title',
       'urldate', 'volume', 'year', 'abstract', 'author', 'pages',
       'reviewed-author', 'uploaded', 'content_type', 'author_split',
       'title_10', 'type', 'authorsSCO', 'titleSCO', 'journalSCO', 'DOI',
       'affiliations', 'abstractSCO', 'citations', 'document type',
       'index keywords', 'author keywords', 'document_type'],
      dtype='object')

In [15]:
temp2=temp.author+ "("+str(temp.year)+")"

In [109]:
def process(x):
    return str(x["author"])+" ("+str(x["year"])+"). \""+str(strip_leading(x["title"]))+"\". \n"+str(x["journal"])+', '+str(x["volume"])+"("+str(x['number'])+"): "+str(x["pages"])
                                                                                                                       
                                                                                                                       

In [110]:
j_data["full_ref"]=j_data.apply(lambda x: process(x), axis=1)
j_data["ID"]=j_data['URL'].str.split('/').str[-1]

In [111]:
temp.head()

Unnamed: 0_level_0,full_ref
ID,Unnamed: 1_level_1
1882040,"Abram Bergson (1972). ""Optimal Pricing for a Public Enterprise"". quarterly journal of economics, 86(4): 519-544"
1882041,"Sidney L. Carroll (1972). ""Profits in the Airframe Industry"". quarterly journal of economics, 86(4): 545-562"
1882042,"Robert F. Hebert (1972). ""A Note on the Historical Development of the Economic Law of Market Areas"". quarterly journal of economics, 86(4): 563-571"
1882043,"Herbert Gintis (1972). ""A Radical Analysis of Welfare Economics and Individual Development"". quarterly journal of economics, 86(4): 572-599"
1882044,"Vernon L. Smith (1972). ""Dynamics of Waste Accumulation: Disposal Versus Recycling"". quarterly journal of economics, 86(4): 600-616"


In [112]:
temp=j_data[(j_data["content_type"]!="MISC")&(j_data['year']<1973)][["ID", "full_ref", "year", "journal"]].reset_index(drop=True)
temp=j_data[(j_data["content_type"]!="MISC")&(j_data['year']<1973)][["ID", "full_ref"]].reset_index(drop=True)

In [113]:
temp=temp.set_index("ID")

In [114]:
len(temp.index)

29125

In [115]:
temp=temp.drop_duplicates()

In [116]:
a=temp.to_dict('index')

In [117]:
b=temp.to_dict()

In [94]:
def convert(o):
    if isinstance(o, np.int64): return int(o)  
    raise TypeError

with open("/Users/sijiawu/Work/Refs Danae/pre-1973_formatted.json", 'w') as f:
    json.dump(a, f, indent = 6,default=convert) 

In [118]:
with open("/Users/sijiawu/Work/Refs Danae/pre-1973_formatted.json", 'w') as f:
    json.dump(b, f, indent = 6,default=convert) 

In [211]:
temp=ar[ar["year_latest"]==1]
temp2=ar[ar["journal_proc"]=="checking"]
temp3=ar[ar["journal_proc"]=="unlikely_journals"]
temp4=ar[ar["journal_proc"]=="bullshit"]
temp5=ar[ar["volume_proc"]=="WRONG"]

In [217]:

len(list(temp["HITId"].unique()))+len(list(temp2["HITId"].unique()))+len(list(temp3["HITId"].unique()))
+len(list(temp4["HITId"].unique()))+len(list(temp5["HITId"].unique()))

98

In [177]:
list(ar.columns)


['HITId',
 'ID',
 'page_o',
 'year_o',
 'journal_o',
 'authors_o',
 'title_o',
 'volume_o',
 'issue_o',
 'type',
 'author',
 'title',
 'year',
 'publisher',
 'textfull',
 'chapter_title',
 'volume',
 'location',
 'pages',
 'journal',
 'issue',
 'rejected',
 'journal_proc',
 'year_proc',
 'year_proc_split',
 'year_latest',
 'volume_proc',
 'issue_proc',
 'title_proc']

In [64]:
list(All.columns)

['Answer.0.1',
 'Answer.0.2',
 'Answer.0.3',
 'Answer.0.4',
 'Answer.0.5',
 'Answer.0.6',
 'Answer.0.7',
 'Answer.0.8',
 'Answer.1.1',
 'Answer.1.2',
 'Answer.1.3',
 'Answer.1.4',
 'Answer.1.5',
 'Answer.1.6',
 'Answer.1.7',
 'Answer.1.8',
 'Answer.10.1',
 'Answer.10.2',
 'Answer.10.3',
 'Answer.10.4',
 'Answer.10.5',
 'Answer.10.6',
 'Answer.10.7',
 'Answer.10.8',
 'Answer.11.1',
 'Answer.11.2',
 'Answer.11.3',
 'Answer.11.4',
 'Answer.11.5',
 'Answer.11.6',
 'Answer.11.7',
 'Answer.11.8',
 'Answer.12.1',
 'Answer.12.2',
 'Answer.12.3',
 'Answer.12.4',
 'Answer.12.5',
 'Answer.12.6',
 'Answer.12.7',
 'Answer.12.8',
 'Answer.13.1',
 'Answer.13.2',
 'Answer.13.3',
 'Answer.13.4',
 'Answer.13.5',
 'Answer.13.6',
 'Answer.13.7',
 'Answer.13.8',
 'Answer.14.1',
 'Answer.14.2',
 'Answer.14.3',
 'Answer.14.4',
 'Answer.14.5',
 'Answer.14.6',
 'Answer.14.7',
 'Answer.14.8',
 'Answer.15.1',
 'Answer.15.2',
 'Answer.15.3',
 'Answer.15.4',
 'Answer.15.5',
 'Answer.15.6',
 'Answer.15.7',
 'Answer

In [80]:
All["WorkerId"].value_counts()


WorkerId
A35VR87P41DVRM    4606
A306J22T34G5NA    4284
A3DBGA6NGIUQ2J    2273
A26N3QDWD4UZOO    2045
A2ZTHO3A0JXRPM    1459
A3TMO46POHUQ5R    1303
A10K87TA2GMTG2     852
A1P79XGTU8M781     686
A1UHWIR8PV1OBZ     661
A30URQE2HQRV40     508
A33JOSIAU7PK06     297
A2Q7PMOLEJB998     296
A31SARQEMO7L70     256
A36ON2RGTLJ28P      94
A1P0TMRQXQRZVB      35
A5RU26VU9HFWL       30
A2F6LWQV8U5CLD      15
A35FUNQ2C69NPD       7
Name: count, dtype: int64

In [79]:
All[(All["Input.journal"]=="QJE")&(All["num_refs"]>3)]["WorkerId"].value_counts()

WorkerId
A26N3QDWD4UZOO    363
A2ZTHO3A0JXRPM    179
A35VR87P41DVRM    135
A33JOSIAU7PK06     92
A306J22T34G5NA     79
A1P0TMRQXQRZVB     17
A5RU26VU9HFWL      16
A2F6LWQV8U5CLD      3
A10K87TA2GMTG2      1
Name: count, dtype: int64

In [76]:
for i in num_refs.keys():
    All.loc[i,"num_refs"]=num_refs[i]

In [62]:
All.WorkerId.value_counts()


WorkerId
A35VR87P41DVRM    4606
A306J22T34G5NA    4284
A3DBGA6NGIUQ2J    2273
A26N3QDWD4UZOO    2045
A2ZTHO3A0JXRPM    1459
A3TMO46POHUQ5R    1303
A10K87TA2GMTG2     852
A1P79XGTU8M781     686
A1UHWIR8PV1OBZ     661
A30URQE2HQRV40     508
A33JOSIAU7PK06     297
A2Q7PMOLEJB998     296
A31SARQEMO7L70     256
A36ON2RGTLJ28P      94
A1P0TMRQXQRZVB      35
A5RU26VU9HFWL       30
A2F6LWQV8U5CLD      15
A35FUNQ2C69NPD       7
Name: count, dtype: int64

In [204]:
All[(All["RejectionTime"].isna()==False) & (All["WorkerId"]=="A2Q7PMOLEJB998")].WorkerId.value_counts()

WorkerId
A2Q7PMOLEJB998    9
Name: count, dtype: int64

In [189]:
list(All.columns)

['AcceptTime',
 'Answer.0.1',
 'Answer.0.2',
 'Answer.0.3',
 'Answer.0.4',
 'Answer.0.5',
 'Answer.0.6',
 'Answer.0.7',
 'Answer.1.1',
 'Answer.1.2',
 'Answer.1.3',
 'Answer.1.4',
 'Answer.1.5',
 'Answer.1.6',
 'Answer.1.7',
 'Answer.10.1',
 'Answer.10.2',
 'Answer.10.3',
 'Answer.10.4',
 'Answer.10.5',
 'Answer.10.6',
 'Answer.10.7',
 'Answer.11.1',
 'Answer.11.2',
 'Answer.11.3',
 'Answer.11.4',
 'Answer.11.5',
 'Answer.11.6',
 'Answer.11.7',
 'Answer.12.1',
 'Answer.12.2',
 'Answer.12.3',
 'Answer.12.4',
 'Answer.12.5',
 'Answer.12.6',
 'Answer.12.7',
 'Answer.13.1',
 'Answer.13.2',
 'Answer.13.3',
 'Answer.13.4',
 'Answer.13.5',
 'Answer.13.6',
 'Answer.13.7',
 'Answer.14.1',
 'Answer.14.2',
 'Answer.14.3',
 'Answer.14.4',
 'Answer.14.5',
 'Answer.14.6',
 'Answer.14.7',
 'Answer.15.1',
 'Answer.15.2',
 'Answer.15.3',
 'Answer.15.4',
 'Answer.15.5',
 'Answer.15.6',
 'Answer.15.7',
 'Answer.16.1',
 'Answer.16.2',
 'Answer.16.3',
 'Answer.16.4',
 'Answer.16.5',
 'Answer.16.6',
 'Answe