# Combining Affiliations
Affiliations were collected in two parts: 1940 to 2010 and 2011 to 2020. Although the two sets are individually consolidated, they are not fully consolidated against each other. Before this can happen:

- The 1940 to 2010 file (pre_2010) needs to be restructured
- Both files also contain data fields that are not necessary

The variation due to the different individuals who consolidated the two data files. The following code simply combines the two data sets, checks for logical errors (eg: row contains data for 6 authors when there are only 5) and simplies the available fields to the bare minimum.

Input files:
- affiliations file pre-2010
- affiliations file post-2010

Output files:
- pre-2010 file with all fields and corrections
- post-2010 file with all fields and corrections
- combined file with all fields and filled corrections
- combined file of subset of fields

In [690]:
import pandas as pd
import time
pd.set_option('display.max_colwidth', None)

In [691]:
base_path="/Users/sijiawu/Work/Thesis/Data/"
content_ex=['MISC',  'Discussion', 'Review', 'Review2']
content=['Article', 'Comment', 'Reply', 'Rejoinder',"Errata"]
jid=["aer","ecta","jpe","qje","res"]

In [692]:
pre_2010=pd.read_excel(base_path+"Affiliations/Affiliations_cleaned_1940_2010.xlsx")
pre_2010_remnant=pd.read_excel(base_path+"Affiliations/affs_remnant_mixed_final.xlsx")
post_2010_init=pd.read_excel(base_path+"Affiliations/Affiliations_cleaned_2011_2020_alt.xlsx")
post_2010_remnant=pd.read_excel(base_path+"Affiliations/affs_remnants_alt.xlsx")
j_data=pd.read_pickle(base_path+"Combined/020_merged_proc_scopus_inception_with_auth_split_2020.pkl")


In [693]:
pre_2010_remnant.shape

(5127, 18)

In [694]:
pre_2010.shape

(22789, 40)

In [695]:
post_2010_init.shape

(11507, 20)

In [696]:
post_2010_remnant.shape

(3038, 25)

In [697]:
rnames=pre_2010.columns
rnames

Index(['Journal_Name', 'Authors_count', 'Authors_list', 'authors',
       'Article_ID', 'content_type', 'stable_url', 'year', 'Ref_type',
       'ref_string', 'ref_string_clean1', 'Screened', 'Manual',
       '0_Auth_affs_0', '0_Auth_affs_1', '0_Auth_affs_2', '0_Auth_affs_3',
       '0_Auth_affs_4', '1_Auth_affs_0', '1_Auth_affs_1', '1_Auth_affs_2',
       '1_Auth_affs_3', '1_Auth_affs_4', '2_Auth_affs_0', '2_Auth_affs_1',
       '2_Auth_affs_2', '2_Auth_affs_3', '2_Auth_affs_4', '3_Auth_affs_0',
       '3_Auth_affs_1', '3_Auth_affs_2', '3_Auth_affs_3', '3_Auth_affs_4',
       '4_Auth_affs_0', '4_Auth_affs_1', '4_Auth_affs_2', '4_Auth_affs_3',
       '5_Auth_affs_0', '6_Auth_affs_0', '7_Auth_affs_0'],
      dtype='object')

In [698]:
dict= {}
issues=[]
missing_auth=[]
a=1

for i in pre_2010.index:
    stable_cols=["Journal_Name", "Authors_count", "Authors_list", "authors", "Article_ID", "content_type", "stable_url", "year", "Ref_type", "ref_string", "ref_string_clean1", 'Screened', 'Manual']
    auth_list= pre_2010.loc[i, ["Authors_list"]]
    authors= pre_2010.loc[i, ["authors"]]
    count= pre_2010.loc[i, "Authors_count"]
    temp=pre_2010.loc[i,stable_cols].to_dict()

    m=None    
    if pd.isna(auth_list.values[0]):
        if (pd.isna(authors.values)==False):
            # print(i)
            # print(authors)
            x=authors.values[0].split(",")
            m=[]
            for k in x:
                y=k.split(" And ")
                m=m+y
            missing_auth.append(i)
            # print(auth_list.values[0])
        else:
            issues.append(i)
            continue

    proc_auth=-1
    if m==None:
        proc_auth=auth_list.values[0][2:-2].split(",")
    else:
        proc_auth=m

    if len(proc_auth)!=int(pre_2010.loc[i, "Authors_count"]):
        print(i)
        print(authors)

    temp_dict={}
    for k in range(len(proc_auth)):
        temp_dict[k]=proc_auth[k]
    # print(i)

    for j in rnames:

        if (pd.isna(pre_2010.loc[i,j])==False) and (j not in stable_cols):
            entry={}
            # print('"'+j+'": '+pre_2010.loc[i,j])
            label=j.split("_")
            entry["auth_num"]=int(label[0])
            entry["aff_num"]=(int(label[-1])+1)
            entry=entry|temp
            entry["aff_main_final"]=pre_2010.loc[i,j]
            entry["author"]=temp_dict[int(label[0])]
            # print(entry)

            dict[a]=entry
            a=a+1
            

In [699]:
pre_2010_init=pd.DataFrame(dict).transpose()
pre_2010_init.rename(columns={'Journal_Name': "journal", 
                            'Article_ID': "id", 
                            'Authors_count':'auth_count', 
                            'Authors_list':'auth_list',  
                            'Ref_type': "ref_type", 
                            'ref_string_clean1':'ref_string_cleaner' , 
                            'Screened': 'screened', 
                            'Manual':'manual'}, inplace=True)


pre_2010_remnant["manual"]="MTURK"
pre_2010_remnant['stable_url']="https://www.jstor.org/stable/"+pre_2010_remnant.pdf_url.str.split('/').str[-1].str.split('_').str[0]
pre_2010_remnant['screened']='y'

pre_2010_df=pd.concat([pre_2010_init, pre_2010_remnant], axis=0).reset_index(drop=True)

In [700]:
pre_2010_df.to_excel(base_path+"Affiliations/aff_pre_2010.xlsx", index=False)
pre_2010_df.to_pickle(base_path+"Affiliations/aff_pre_2010.pkl")

In [701]:
post_2010_remnant=post_2010_remnant[['id', 'year', 'title', 'journal', 'pages', 'vol', 'number', 'author',
       'affiliation', 'auth_num', 'aff_num', 'aff_cleaner',
       'aff_cleaner_final', 'aff_main_final', 'aff_subunit_final',
       'aff_department_final', 'alt_final', 'country', 'is_business_school',
       'is_institute']]

post_2010=pd.concat([post_2010_remnant,post_2010_init])

post_2010['manual']="MTURK"
post_2010['screened']="y"
post_2010=post_2010[post_2010["aff_main_final"].isna()==False].reset_index(drop=True)
post_2010.to_excel(base_path+"Affiliations/aff_post_2010.xlsx", index=False)
post_2010.to_pickle(base_path+"Affiliations/aff_post_2010.pkl")


In [702]:
combined=pd.concat([post_2010, pre_2010_df], axis=0).reset_index(drop=True)

In [703]:
combined['manual']=combined['manual'].fillna("NA")
combined['screened']=combined['screened'].fillna('NA')
combined.columns

Index(['id', 'year', 'title', 'journal', 'pages', 'vol', 'number', 'author',
       'affiliation', 'auth_num', 'aff_num', 'aff_cleaner',
       'aff_cleaner_final', 'aff_main_final', 'aff_subunit_final',
       'aff_department_final', 'alt_final', 'country', 'is_business_school',
       'is_institute', 'manual', 'screened', 'auth_count', 'auth_list',
       'authors', 'content_type', 'stable_url', 'ref_type', 'ref_string',
       'ref_string_cleaner', 'pdf_url', 'hitId', 'worker', 'aff_clean',
       'aff_main', 'add_sub'],
      dtype='object')

In [704]:
print(len(combined['id'].unique()))
print(len(pre_2010_df['id'].unique()))
print(len(post_2010['id'].unique()))

29839
25619
4220


In [705]:
j_data[(j_data['year']>=1940)&(j_data['content_type'].isin(content))].shape[0]-len(combined['id'].unique())

98

In [706]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [707]:
repl={
    'é':'e',
    "ä":'a',
    "à":'a',
    'è':'e',
    'á':'a',
    'ö':'o',
    'ü':'u',
    'ó':'o',
    'í':'i',
    'ğ':'g',
    "&":'and',
    "u.s.":'us',
    "suny at": "state university of new york -",
    'd.c.':'dc',
    'ç':'c',
    '\xa0':' ',
    'ń':'n',
    '–':'-',
    '’':"'",
    'ò':'o', 
    'ò':'o',
    'ã':'a',
    'ñ':'n',
    'ú':'u', 
    'ø':'o', 
    'ê':'e'
}

for i in repl.keys():
    combined["aff_main_final"]=combined["aff_main_final"].str.lower().str.replace(i,repl[i])


In [708]:
combined[combined['id']==26966479]

Unnamed: 0,id,year,title,journal,pages,vol,number,author,affiliation,auth_num,aff_num,aff_cleaner,aff_cleaner_final,aff_main_final,aff_subunit_final,aff_department_final,alt_final,country,is_business_school,is_institute,manual,screened,auth_count,auth_list,authors,content_type,stable_url,ref_type,ref_string,ref_string_cleaner,pdf_url,hitId,worker,aff_clean,aff_main,add_sub
3546,26966479,2020.0,Discounts and Deadlines in Consumer Search,american economic review,3748-3785,110.0,12,Brennan C. Platt,"Brigham Young University, Department of Economics",2.0,1,"Brigham Young University, Department of Economics",Brigham Young University,brigham young university,0,Department of Economics,Brigham Young University,,,,MTURK,y,,,,,,,,,,,,,,
5412,26966479,2020.0,Discounts and Deadlines in Consumer Search,american economic review,3748-3785,110.0,12,Dominic Coey,"Facebook, Core Data Science",0.0,1,"Core Data Science, Facebook",Facebook,facebook,0,Core Data Science,Facebook,0.0,0.0,0.0,MTURK,y,,,,,,,,,,,,,,
8303,26966479,2020.0,Discounts and Deadlines in Consumer Search,american economic review,3748-3785,110.0,12,Bradley J. Larsen,NBER,1.0,2,NBER,National Bureau of Economic Research - NBER,national bureau of economic research - nber,0,0,National Bureau of Economic Research - NBER,0.0,0.0,0.0,MTURK,y,,,,,,,,,,,,,,
10511,26966479,2020.0,Discounts and Deadlines in Consumer Search,american economic review,3748-3785,110.0,12,Bradley J. Larsen,"Stanford University, Department of Economics",1.0,1,"Department of Economics, Stanford University",Stanford University,stanford university,0,Department of Economics,Stanford University,0.0,0.0,0.0,MTURK,y,,,,,,,,,,,,,,


In [709]:
consol={
    "washington university - st. louis":["washington university - st. louis", "washington university in st louis"],
    "vienna university of economics and business":["vienna university of economics and business","vienna university of economics and business administration",],
    "university of wisconsin - madison":["university of wisconsin - madison", "university of wisconsin-madison"],
    "university of wisconsin - milwaukee":["university of wisconsin - milwaukee","university of wisconsin-milwaukee",],
    "university of wisconsin - stevens point":["university of wisconsin-stevens point"],
    "university of wisconsin - parkside":["university of wisconsin-parkside"],
    "university of texas - dallas":["university of texas at dallas"],
    "university of texas - san antonio":["university of texas at san antonio",],
    "university of lille":["university of lille i",],
    "university of texas - austin":["university of texas - austin","university of texas at austin",],
    "university of illinois - urbana-champaign":["university of illinois - urbana-champaign","university of illinois urbana-champaign",],
    "university of illinois - chicago":["university of illinois chicago"],
    "university of north carolina - chapel hill":["university of north carolina - chapel hill","university of north carolina at chapel hill"],
    "university of north carolina - greensboro":["university of north carolina, greensboro","university of north carolina - greensboro","university of north carolina at greensboro",],
    "university of north carolina - charlotte":["university of north carolina at charlotte",],
    "university of nebraska - lincoln":["university of nebraska, lincoln", "university of nebraska-lincoln",],
    "university of nebraska - omaha":["university of nebraska, omaha"],
    "university of nevada - reno":["university of nevada","university of nevada, reno"],
    "university of missouri - columbia":["university of missouri, columbia"],
    "university of missouri - kansas city":["university of missouri, kansas city"],
    "university of missouri - st. louis":["university of missouri, st. louis"],
    "university of massachusetts - amherst":["university of massachusetts amherst"],
    "university of massachusetts - boston":["university of massachusetts boston"],
    "university of massachusetts - lowell":["university of massachusetts lowell"],
    "university of maryland - baltimore county":["university of maryland at baltimore",],
    "university of louisiana - lafayette":["university of louisiana at lafayette"],
    "university of louisiana - monroe":["university of louisiana monroe"],
    "university of hawaii - manoa":["university of hawaii at manoa"],
    "university of economics - prague":["university of economics in prague"],
    "university of colorado - boulder":["university of colorado at boulder"],
    "university of colorado - denver":["university of colorado at denver"],
    "university of california - berkeley":["university of california, berkeley"],
    "university of california - davis":["university of california, davis"],
    "university of california - irvine":["university of california, irvine"],
    "university of california - los angeles":["university of california, los angeles"],
    "university of california - merced":["university of california, merced"],
    "university of california - riverside":["university of california, riverside"],
    "university of california - san diego":["university of california, san diego"],
    "university of california - santa barbara":["university of california, santa barbara"],
    "university of california - santa cruz":["university of california, santa cruz"],
    "university of california - san francisco":["university of california, san francisco"],
    "universite de tunis":["universite' de tunis",],
    "universita della svizzera italiana - usi":["universita della svizzera italiana"],
    "university carlos iii of madrid":["universidad carlos iii de madrid", "university carlos iii of madrid",],
    "technion - israel institute of technology":["technion - israel institute of technology","technion-israel institute of technology"],
    "university of rome - tor vergata":["tor vergata university of rome", "university of rome tor-vergata",],
    "texas a and m university":["texas aandm university"],
    "swedish employers' confederation":["swedish employers confederation", "swedish employers' confederation"],
    "us agricultural marketing services":["us agricultural marketing service",],
    "us bureau of the census":["us bureau of census"],
    "us department of the treasury":["us department of treasury", ],
    "tinbergen institute":["tinbergen institute amsterdam"],
    "state university of new york - old westbury":["state university of new york - old westbury", "suny old westbury"],
    "state university of new york - upstate medical university":["suny upstate medical university",],
    "state university of new york - binghamton":["state university of new york - binghampton","state university of new york - binghamton",],
    "state university of new york - geneseo":["state university of new york - genese"],
    "sao paulo school of economics - fgv eesp": ["sao paulo school of economics"],
    "resources for the future, inc.":["resources for the future"],
    "reserve bank of india":["reserve bank of india, bombay"],
    "research institute of industrial economics - ifn - stockholm":["research institute of industrial economics"],
    "public policy institute of california - ppic":["public policy institute of california"],
    "peterson institute for international economics":["peterson institute for international economics - washington dc."],
     "organisation for economic co-operation and development - oecd":["organisation for economic cooperation and development (oecd, france)"],
    "norwegian school of economics":["nhh norwegian school of economics"],
    "new york times":["the new york times",],
    "new economic school - nes":["new economic school, moscow"],
    "national bureau of economic research - nber":["national bureau of economic research",],
    "massachusetts institute of technology - mit":["massachusetts institute of technology"],
    "leibniz university hannover":["leibniz university",],
    "korea development institute":["korean development institute"],
    "kiel institute for the world economy - ifw":["kiel institute for the world economy (ifw kiel)"],
    "joint committee on taxation, us congress":["joint committee on taxation",],
    "john hopkins university":["john hopkins university school of medicine"],
    "j. p. morgan":["j. p. morgan chase - new york", "j.p. morgan united kingdom",],
    "istituto per la ricerca valutativa sulle politiche pubbliche - irvapp":["istituto per la ricerca valutativa sulle politiche pubbliche",],
    "institute of labor economics - iza":["iza - institute of labor economics"],
    "instituto nacional de matematica pura e aplicada - impa":["instituto nacional de matematica pura e aplicada"],
    "instituto technologico autonomo de mexico - itam":["instituto technologico autonomo de mexico","instituto tecnologico autonomo de mexico - itam","itam"],
    "institute for employment research - iab":["institute for employment research, germany", "institut fur arbeitsmarkt- und berufsforschung - iab","institut fur arbeitsmarkt- und berufsforschung der bundesagentur fur arbeit kdor",],
    "institut d’analisi econòmica - iae":["institut d'analisi economica (iae)","institut d'analisi econòmica (csic, spain)","institut d’analisi econòmica - csic",],
    "institute of education and research - insper":["insper - institute of education and research"],
    "ibmec rio de janeiro":["ibmec rio de janeiro","ibmec-rj",],
    "humboldt university - berlin":["humboldt university - berlin", "humboldt university of berlin",],
    "hungarian academy of sciences - mat":["hungarian academy of sciences", "hungarian academy of sciences - mat",],
    "goethe university":["goethe university frankfurt"],
    "fundacion de estudios de economia aplicada - fedea":["fundacion de estudios de economia aplicada (fedea, spain)"],
    "fund for scientific research - flanders":["fund for scientific research - flanders"],
    "free university - amsterdam":["free university amsterdam"],
    "free university - berlin":["free university of berlin"],
    "federal reserve bank of st louis":["federal reserve bank of st. louis"],
    "federal power commission":["federal power comission"],
    "european corporate governance institute - ecgi":["european corporate governance institute"],
    "european bank of reconstruction and development - ebrd":["european bank for reconstruction and development",],
    "escola de pos-graduação em economia - fgv epge":["escola de pos-graduação em economia"],
    "carnigie mellon university":["epartment of social and decision sciences, carnigie mellon university",],
    "einaudi institute for economics and finance - eief":["einaudi institute for economics and finance",],
    "economic research service":["economic research service, washington", "economic research services",],
    "economic research service":["economic research service, washington", "economic research services",],
    "department of the air force":["department of the air force", "department of the air force, washington, dc",],
    "department of state":["department of state, washington, dc", ],
    "department of the treasury":["department of the treasury, united nations conference on trade and development, switzerland","department of treasury"],
    "council of economic advisors, washington dc":["council of economic advisers, washington dc",],
    "centro de investigacion economica - cie":["centro de investigacion economica","centro de investigacion economica - cie",],
    "centro de estudios monetarios y financieros - cemfi":["centro de estudios monetarios y financieros (cemfi, france)","centro de estudios monetarios y financieros - cemfi",],
    "centro de analisis y estudios rios perez - caerp":["centro de altisimos estudios rios perez","centro de analisis y estudios rios perez - caerp",],
    "centre interuniversitaire de recherche en economie quantitative - cireq":["centre interuniversitaire de recherche en economie quantitative ( cireq, canada)","centre interuniversitaire de recherche en economie quantitative - cireq",],
    "centre interuniversitaire de recherche en analyse des organisations - cirano":["centre interuniversitaire de recherche en analyse des organisations (cirano, canada)","centre interuniversitaire de recherche en analyse des organisations - cirano",],
    "centre for microdata methods and practice - cemmap":["centre for microdata methods and practice (cemmap, england)"],
    "centre for operations research and econometrics - core":["centre for operations research and econometrics (core, france)"],
    "centre de recerca en economia internacional - crei":["centre de recerca en economia internacional (crei, barcelona)","centre de recerca en economia internacional - crei",],
    "catalan institution for research and advanced studies - icrea":["catalan institution for research and advanced studies (icrea)"],
    "canadian institute for advanced research - cifar":["canadian institute for advanced research (cifar)","canadian institute for advanced research - cifar",],
    "california state university - bakersfield":["california state university at bakersfield"],
    "california state university - fullerton":["california state university at fullerton"],
    "california state university - hayward":["california state university at hayward"],
    "california state university - long beach":["california state university at long beach"],
    "california state university - los angeles":["california state university at los angeles"],
    "california state university - northridge":["california state university at northridge"],
    "california state university - sacramento":["california state university at sacramento"],
    "bureau of the budget washington, d.c.":["bureau of the budget (united states)"],
    "institute on behavior and inequality - briq":["briq - institute on behavior and inequality"],
    "boston consulting group":["boston consulting group","boston consulting group, new york",],
    "ben-gurion university":["ben gurion university of the negev"],
    "bar-ilan university":["bar-iian university","bar-ilan university",],
    "bank for international settlements":["bank for international settlements","bank for international settlements, basle",],
    "apax partners and company":["apax partners and co."],
    "alfred p. sloan foundation":["alfred p sloan foundation"],
    "albert-ludwigs-university freiburg":["albert-ludwigs-universitat freiburg",],
    "aalto university":["aalto university","aalto university school of business",],
    "wageningen university and research":["wageningen university"],
    "aix-marseille university": ["universite d'aix-marseille","universite d'aix-marseille iii", "quantative economics aix-marseille (greqam, france)",],
    "university of milan": ["universite degli studi di milano","university of milan"],
    "university of turin": ["university of torino","university of turin","universita di torino"],
    "university of vienna": ["universitat wien","university of vienna"],
    "sorbonne university": ["universite pierre et marie curie"],
    "universite catholique de louvain": ["university of california - louvain","universite catholique de louvain"],
    "university of mannheim": ["universitat mannheim","university of mannheim"],
    "university of montreal": ["universite de montreal","university of montreal"],
    "university of granada": ["university of granada","universidad de granada"],
    "university of malaga": ["university of malaga","universidad de malaga"],
    "university of siena": ["university of siena","universita degli studi di siena"],
    "university of bologna": ["university of bologna","universita di bologna"],
    "university of verona": ["university of verona","universita di verona"],
    "university of pavia": ["university of pavia","universita di pavia"],
    "greqe":["g.r.e.q.e."],
    "us congress": ["us states senate","us house of representatives","united states congressional joint economic committee", "us congress","us states senate","us house of represenatives","united states congress","congress","congressional budget office","congressional research service",],
    "centre for population, poverty and public policy studies - ceps/instead":["centre for population, poverty and public policy studies - ceps/instead","ceps/instead"],
    "us air force academy": ["us air force academy","united states air force academy"],
    "us census bureau": ["us census bureau","us bureau of the census","united states census bureau"],
    "us department of justice": ["us department of justice","us justice department","united states department of justice"],
    "us general services administration": ["us general services administration", "united states general services administration", "united states general service administration",],
    "us international trade commission": ["us international trade commision, washington","united states international trade commission","united states international trade commission - washington dc"],
    "us military academy": ["united states military academy - west point","us military academy","united states military academy"],
    "us navy": ["us navy","us states navy","united states navy"],
    "us department of the treasury": ["us department of the treasury", "us treasury department","united states department of the treasury"],
    "us department of health and human services": ["us department of health and human services","us department of health, education and welfare","united states department of health and human services"],
    "united nations":["united nations conference on trade and development"],
    "cattolica university":["universita cattolica","universita cattolica del sacro cuore"],
    "john hopkins university":["john hopkins university","johns hopkins university",],
    "autonomous university of barcelona": ["universitat autonoma de barcelona"],
    "ca' foscari university of venice": ["universita ca' foscari venezia", "ca' foscari university"],
    "catholic university of louvain": ["universite catholique de louvain", "uclouvain"],
    "complutense university of madrid": ["universidad complutense de madrid"],
    "free university - brussels": ["universite libre de bruxelles"],
    "laval university": ["universite laval"],
    "lumiere university lyon 2": ["universite lumiere lyon 2", "lyon 2 university","lyon 2 university berges du rhone"],
    "national autonomous university of mexico": ["universidad nacional autonoma de mexico"],
    "nova university of lisbon": ["universidade nova de lisboa", "nova lisboa university"],
    "pompeu fabra university": ["universitat pompeu fabra", "dee-upf",],
    "universite paris 2 pantheon-assas": ["universite paris - pantheon-assas","pantheon-assas university", "universite paris ii pantheon - assas", "assas university","pantheon-assas university paris ii"],
    "universite paris 12 paris-est creteil":["paris 12 val de marne university","universite paris - est creteil"],
    "universite paris 3 sorbonne-nouvelle":["sorbonne nouvelle university"],
    "sorbonne university":["sorbonne university","sorbonne university pierre and marie curie",],
    "universite paris dauphine-psl":["paris dauphine university","universite paris dauphine - psl",],
    "universite paris nanterre":["universite paris nanterre","paris nanterre university",],
    "universite paris - saclay":["universite paris - paris-sud", "paris-sud university", "universite paris - saclay",],
    "universite paris cite":["paris descartes university","university paris city"],
    "universite paris diderot":["university of paris xii"],
    "sorbonne paris nord university": ["universite paris 13 paris-nord", "paris 13 university"],
    "universite paris 1 pantheon-sorbonne": ["universite paris i pantheon-sorbonne","university paris 1 pantheon-sorbonne","university of paris"],
    "catalan institution for research and advanced studies - icrea":["icrea universitat pompeu fabra","icreagse barcelona", "catalan institution for research and advanced studies (icrea)",],
    "centro de investigacion y docencia economicas - cide":["centro de investigacion y docencia economicas - cide","centro de investigacion y docencia economicas, mexico",],
    "nova university of lisbon":["nova university lisbon","nova university of lisbon","universidade nova de lisboa"],
    "saarland university": ["universitat des saorlandes"],
    "sabanci university":["sabanci universitesi","sabanci university",],
    "sapienza university of rome":["sapienza universita di roma","sapienza university of rome","universita degli studi di roma - la sapienza", "sapienza university"],
    "technical university of munich":["technical university of munich","technische universitat munchen",],
    "graduate institute of international and development studies - iheid":["graduate institute of international and development studies - iheid","graduate institute, geneva (iheid)"],
    "university of wurzburg":["universitat wurzhurg"],
    "university of brasilia":["universidade de brasilia"],
    "escola de pos-graduacao em economia - fgv epge":["epge brazilian school of economics and finance","escola de pos-graduacao em economia - fgv epge","escola de pos-graduacao em economia"],
    "escola brasileira de administracao publica e de empresas - fgv ebape":["brazilian school of public and business administration getulio vargas foundation - ebape/fgv","escola brasileira de administracao publica e de empresas - fgv ebape"],
    "inter-american development bank":["inter american development bank"],
    "norwegian school of economics and business administration - NHH":["norwegian school of economics","norwegian school of economics and business administration"],
    "bi norwegian business school":["norwegian school of management"],
    "national scientific and technical and research council - conicet":["conicet","national scientific and technical and research council - conicet",],
    "international monetary fund - imf":["international monetary fund","international monetary fund (imf)",],
    "center for economic reseach and applications - cepremap":["center for economic reseach and applications (cepremap, france)", "centre d'etudes prospectives d'economie mathematique appliquees a la planification (cepremap ,franse)",],
    "mathematical and quantative economics (gremaq, france)":["mathematical and quantative economics (gremaq, france)"],
    "university of toulouse":["universite de toulouse","university of toulouse",],
    "institut d'analisi economica - csic":["institut d'analisi economica (csic, spain)","institut d'analisi economica - csic","institut d'analisi economica - iae",],
    "santa fe institute":["santa fe institute","sante fe institute",],
    "halle institute for economic research - iwh":["halle institute for economic research- iwh"],
    "institute for fiscal studies - ifs":["institute for fiscal studies - ifs","lnstitute for fiscal studies - ifs","i.f.s.","institute for fiscal studies, london"],
    "conservatoire national des arts et metier":["conservatoire des arts et metiers, a l'ecole polytechnique","conservatoire national des arts et metier",],
    "ecole nationale superieure des mines de paris":["ecole des mines de paris","ecole nationale superieure des mines de paris",],
    "hungarian academy of sciences - mta":["hungarian academy of sciences - mat", "computing centre of the hungarian academy of sciences"],
    "ecole d'economie de paris - pse":["paris school of economics",],
    "centre national de la recherche scientifique - cnrs":["centre national de la recherche scientifique - cnrs","national centre for scientific research (cnrs, france)",],
    "institute of economics, has - mta kti":["insitute of economics, hungarian academy of sciences (iehas)","institute of economics hungarian academy of sciences - iehas"],
    "centre for monetary and financial studies - cemfi":["center for monetary and financial studies (cemfi, madrid)","centre for monetary and financial studies - cemfi","centro de estudios monetarios y financieros - cemfi",],
    "cy cergy paris universite":["cy cergy paris universite","universite de cergy-pontoise","university of cergy-pontoise",],
    "institut de mathematiques de jussieu - paris rive gauche - imj-prg":["institut de mathematiques de jussieu - paris rive gauche - imj-prg","mathematics institute of jussieu-paris rive gauche",],
    "international food policy research institute - ifpri":["international food policy research institute (ifpri)"],
    "aarhus university":["center for research in econometric analysis of time series (creates)"],
    "centre for financial studies - cfs":["center for financial studies(cfs, germany)","centre for financial studies - cfs",],
    "czech academy of sciences - economics institute - ei ":["czech academy of sciences - economics institute - ei ","economics institute of academy of sciences of czech republic",],
    "centre for economic policy research - cepr, uk":["centre for economic policy research (cepr, england)",],
    "hebrew university of jerusalem":["hebrew university of jerusalem;federmann centre for the study of rationality"]
    
    }
consol_un={}
for i in consol.keys():
    for j in consol[i]:
        consol_un[j]=i
# print(consol_un)

for i in combined.index:
    # print(combined.loc[i,'aff_main_final'])
    if pd.isna(combined.loc[i,'aff_main_final'])==False:
        if combined.loc[i,'aff_main_final'][0:4]=="the ":
            combined.loc[i,"aff_main_final"]=combined.loc[i,'aff_main_final'][4:]

for i in consol_un.keys():
    combined.loc[combined['aff_main_final']==i,"aff_main_final"]=consol_un[i]

for i in repl.keys():
    combined["aff_main_final"]=combined["aff_main_final"].str.lower().str.replace(i,repl[i])

In [710]:
s=list(combined[combined["aff_main_final"].isna()==False]["aff_main_final"].unique())
s.sort()

In [711]:
len(s)

2825

In [712]:
combined[combined['id']==26966479]

Unnamed: 0,id,year,title,journal,pages,vol,number,author,affiliation,auth_num,aff_num,aff_cleaner,aff_cleaner_final,aff_main_final,aff_subunit_final,aff_department_final,alt_final,country,is_business_school,is_institute,manual,screened,auth_count,auth_list,authors,content_type,stable_url,ref_type,ref_string,ref_string_cleaner,pdf_url,hitId,worker,aff_clean,aff_main,add_sub
3546,26966479,2020.0,Discounts and Deadlines in Consumer Search,american economic review,3748-3785,110.0,12,Brennan C. Platt,"Brigham Young University, Department of Economics",2.0,1,"Brigham Young University, Department of Economics",Brigham Young University,brigham young university,0,Department of Economics,Brigham Young University,,,,MTURK,y,,,,,,,,,,,,,,
5412,26966479,2020.0,Discounts and Deadlines in Consumer Search,american economic review,3748-3785,110.0,12,Dominic Coey,"Facebook, Core Data Science",0.0,1,"Core Data Science, Facebook",Facebook,facebook,0,Core Data Science,Facebook,0.0,0.0,0.0,MTURK,y,,,,,,,,,,,,,,
8303,26966479,2020.0,Discounts and Deadlines in Consumer Search,american economic review,3748-3785,110.0,12,Bradley J. Larsen,NBER,1.0,2,NBER,National Bureau of Economic Research - NBER,national bureau of economic research - nber,0,0,National Bureau of Economic Research - NBER,0.0,0.0,0.0,MTURK,y,,,,,,,,,,,,,,
10511,26966479,2020.0,Discounts and Deadlines in Consumer Search,american economic review,3748-3785,110.0,12,Bradley J. Larsen,"Stanford University, Department of Economics",1.0,1,"Department of Economics, Stanford University",Stanford University,stanford university,0,Department of Economics,Stanford University,0.0,0.0,0.0,MTURK,y,,,,,,,,,,,,,,


In [713]:
allowed="abcdefghijklmnopqrstuvwxyz- ,'.()1234567890"

In [714]:
exclude=[]
for i in s:
    whole=False
    for j in i:
        if j not in allowed:
            whole=True
            exclude.append(j)
            print('"'+j+'",')
    if whole==True:
        print(i)

"/",
centre for population, poverty and public policy studies - ceps/instead
"/",
ipea/inpes
"!",
yahoo! research


In [715]:
#
## exporting unique affiliations at this stage out 
# 
# sub=combined[['id', 'year','author',
#        'auth_num', 'aff_num', 'aff_cleaner',
#        'aff_cleaner_final', 'aff_main_final', 'aff_subunit_final',
#        'aff_department_final', 'alt_final', 'manual', 'screened']]

# affs_uni=pd.DataFrame({"affiliations":sub['aff_main_final'].unique()})
# affs_uni.to_csv("affiliation_list.csv",index=False)

## reading reconned country list in
# uni_path="/Users/sijiawu/Work/80YearsEconomicResearch/020_author_names_recon/"
# aff_uni_results=[]
# for i in range(0, len(affs_uni), 50):
#     # print(i)
#     temp=pd.read_csv(uni_path+"affs_uni_response/affs_uni_"+str(i)+"_"+str(i+49)+"_results.csv")
#     # print(temp.columns)
#     if 'affiliations' in temp.columns:
#         print(uni_path+"affs_uni_"+str(i)+"_"+str(i+49)+"_results.csv")
#     # if len(temp.columns)>3:
#     #     print(uni_path+"affs_uni_"+str(i)+"_"+str(i+49)+"_results.csv")
#     aff_uni_results.append(temp)

# aff_uni_results=pd.concat(aff_uni_results)
# aff_uni_results=aff_uni_results.drop(columns=['id'])
# aff_uni_results.loc[aff_uni_results['country']=='USA', 'country']='United States'
# aff_uni_results.loc[aff_uni_results['country']=='UK', 'country']='United Kingdom'
# aff_uni_results.loc[aff_uni_results['country']=='international', 'country']='International'

# # aff_uni_results.to_csv("aff_uni_results.csv")

In [716]:
aff_uni_results=pd.read_csv("aff_uni_results.csv")

In [717]:
combined['country']=None
t=[]
for i in combined.index:
    if pd.isna(combined.loc[i,'aff_main_final'])==True:
        continue
    tmp=aff_uni_results[aff_uni_results['affiliation']==combined.loc[i,'aff_main_final']]['country'].values
    if len(tmp)==0:
        tmp=aff_uni_results[aff_uni_results['affiliation']==combined.loc[i,'aff_main_final'].strip().replace(',','')]['country'].values
        if len(tmp)>0:
            combined.loc[i,'country']=tmp[0]
    else:
        combined.loc[i,'country']=tmp[0]



In [718]:
combined[combined['id']==26966479]

Unnamed: 0,id,year,title,journal,pages,vol,number,author,affiliation,auth_num,aff_num,aff_cleaner,aff_cleaner_final,aff_main_final,aff_subunit_final,aff_department_final,alt_final,country,is_business_school,is_institute,manual,screened,auth_count,auth_list,authors,content_type,stable_url,ref_type,ref_string,ref_string_cleaner,pdf_url,hitId,worker,aff_clean,aff_main,add_sub
3546,26966479,2020.0,Discounts and Deadlines in Consumer Search,american economic review,3748-3785,110.0,12,Brennan C. Platt,"Brigham Young University, Department of Economics",2.0,1,"Brigham Young University, Department of Economics",Brigham Young University,brigham young university,0,Department of Economics,Brigham Young University,United States,,,MTURK,y,,,,,,,,,,,,,,
5412,26966479,2020.0,Discounts and Deadlines in Consumer Search,american economic review,3748-3785,110.0,12,Dominic Coey,"Facebook, Core Data Science",0.0,1,"Core Data Science, Facebook",Facebook,facebook,0,Core Data Science,Facebook,United States,0.0,0.0,MTURK,y,,,,,,,,,,,,,,
8303,26966479,2020.0,Discounts and Deadlines in Consumer Search,american economic review,3748-3785,110.0,12,Bradley J. Larsen,NBER,1.0,2,NBER,National Bureau of Economic Research - NBER,national bureau of economic research - nber,0,0,National Bureau of Economic Research - NBER,United States,0.0,0.0,MTURK,y,,,,,,,,,,,,,,
10511,26966479,2020.0,Discounts and Deadlines in Consumer Search,american economic review,3748-3785,110.0,12,Bradley J. Larsen,"Stanford University, Department of Economics",1.0,1,"Department of Economics, Stanford University",Stanford University,stanford university,0,Department of Economics,Stanford University,United States,0.0,0.0,MTURK,y,,,,,,,,,,,,,,


In [719]:
combined=combined.sort_values(by=['country','aff_main_final'])
combined=combined.sort_values(by=["year",'auth_num','aff_num'])
combined['id']=combined['id'].astype(str)

In [720]:
combined[combined['id']=='26966479']

Unnamed: 0,id,year,title,journal,pages,vol,number,author,affiliation,auth_num,aff_num,aff_cleaner,aff_cleaner_final,aff_main_final,aff_subunit_final,aff_department_final,alt_final,country,is_business_school,is_institute,manual,screened,auth_count,auth_list,authors,content_type,stable_url,ref_type,ref_string,ref_string_cleaner,pdf_url,hitId,worker,aff_clean,aff_main,add_sub
5412,26966479,2020.0,Discounts and Deadlines in Consumer Search,american economic review,3748-3785,110.0,12,Dominic Coey,"Facebook, Core Data Science",0.0,1,"Core Data Science, Facebook",Facebook,facebook,0,Core Data Science,Facebook,United States,0.0,0.0,MTURK,y,,,,,,,,,,,,,,
10511,26966479,2020.0,Discounts and Deadlines in Consumer Search,american economic review,3748-3785,110.0,12,Bradley J. Larsen,"Stanford University, Department of Economics",1.0,1,"Department of Economics, Stanford University",Stanford University,stanford university,0,Department of Economics,Stanford University,United States,0.0,0.0,MTURK,y,,,,,,,,,,,,,,
8303,26966479,2020.0,Discounts and Deadlines in Consumer Search,american economic review,3748-3785,110.0,12,Bradley J. Larsen,NBER,1.0,2,NBER,National Bureau of Economic Research - NBER,national bureau of economic research - nber,0,0,National Bureau of Economic Research - NBER,United States,0.0,0.0,MTURK,y,,,,,,,,,,,,,,
3546,26966479,2020.0,Discounts and Deadlines in Consumer Search,american economic review,3748-3785,110.0,12,Brennan C. Platt,"Brigham Young University, Department of Economics",2.0,1,"Brigham Young University, Department of Economics",Brigham Young University,brigham young university,0,Department of Economics,Brigham Young University,United States,,,MTURK,y,,,,,,,,,,,,,,


In [721]:
replacements=pd.read_excel("refactor_affs_checked.xlsx")

concat_candidate=[]
for i in replacements.index:
    j=combined.index[(combined['id']==replacements.loc[i,"id"])&(combined['auth_num']==replacements.loc[i,"auth_num"])&(combined['aff_num']==replacements.loc[i,"aff_num"])]
    if len(j)==0:
        temp=combined[(combined['id']==str(replacements.loc[i,"id"]))][[
            'id', 'year', 'title', 'journal', 'pages', 'vol', 'number', 'auth_count', 'auth_list',
            'authors', 'content_type', 'stable_url']].drop_duplicates().reset_index(drop=True)
        comb=temp.iloc[0].to_dict()|replacements.iloc[i].to_dict()
        concat_candidate.append(comb)
        
        # print(pd.concat([temp,replacements.iloc[i]], axis=1).shape)
        # print(replacements.iloc[i].shape)
    else:
        combined.loc[j[0], 'aff_main_final']=replacements.loc[i,'aff_main_final']


combined=pd.concat([combined,pd.DataFrame(concat_candidate)],axis=0).reset_index(drop=True)

In [722]:
combined[combined['id']=='26966479']

Unnamed: 0,id,year,title,journal,pages,vol,number,author,affiliation,auth_num,aff_num,aff_cleaner,aff_cleaner_final,aff_main_final,aff_subunit_final,aff_department_final,alt_final,country,is_business_school,is_institute,manual,screened,auth_count,auth_list,authors,content_type,stable_url,ref_type,ref_string,ref_string_cleaner,pdf_url,hitId,worker,aff_clean,aff_main,add_sub
55692,26966479,2020.0,Discounts and Deadlines in Consumer Search,american economic review,3748-3785,110.0,12,Dominic Coey,"Facebook, Core Data Science",0.0,1,"Core Data Science, Facebook",Facebook,facebook,0,Core Data Science,Facebook,United States,0.0,0.0,MTURK,y,,,,,,,,,,,,,,
56455,26966479,2020.0,Discounts and Deadlines in Consumer Search,american economic review,3748-3785,110.0,12,Bradley J. Larsen,"Stanford University, Department of Economics",1.0,1,"Department of Economics, Stanford University",Stanford University,stanford university,0,Department of Economics,Stanford University,United States,0.0,0.0,MTURK,y,,,,,,,,,,,,,,
56672,26966479,2020.0,Discounts and Deadlines in Consumer Search,american economic review,3748-3785,110.0,12,Bradley J. Larsen,NBER,1.0,2,NBER,National Bureau of Economic Research - NBER,national bureau of economic research - nber,0,0,National Bureau of Economic Research - NBER,United States,0.0,0.0,MTURK,y,,,,,,,,,,,,,,
56858,26966479,2020.0,Discounts and Deadlines in Consumer Search,american economic review,3748-3785,110.0,12,Brennan C. Platt,"Brigham Young University, Department of Economics",2.0,1,"Brigham Young University, Department of Economics",Brigham Young University,brigham young university,0,Department of Economics,Brigham Young University,United States,,,MTURK,y,,,,,,,,,,,,,,


In [723]:
# combined[(combined['id']==str(replacements.loc[i,"id"]))]
country_assign_rem=pd.read_csv("aff_country_res.csv")

for i in country_assign_rem.index:
    combined.loc[combined['aff_main_final']==country_assign_rem.loc[i,'affiliation'], "country"]=country_assign_rem.loc[i,'country']

combined["aff_main_final"]=combined["aff_main_final"].str.lower()

In [724]:
# p=0
# for i in combined.country.unique():
#     if i==None:
#         affs=combined[combined['country'].isna()]["aff_main_final"].unique()
#         print(str(i)+ ' ' +str(len(affs)))
#     else:
#         affs=combined[combined['country']==i]["aff_main_final"].unique()
#         # print(str(i)+ ' ' +str(len(affs)))
#         if len(affs)>=163:
#             print(str(i)+ ' ' +str(len(affs)))

#             p+=1
#         if len(affs)==1295:
#             affs.sort()
#             print(str(i)+ ' ' +str(len(affs)))
#             for j in affs:
#                 print('"'+j+'",')
#             print()
# print(p)

In [725]:
consol2={

"west hempstead (city)":["west hempstead",
"west hempstead (city)",],
"w.r. grace and company":["w.r. grace and co.",
"w.r. grace and company",],
"us navy":["us naval academy",
"us naval ordnance test station",
"us naval reserve",
"us navy",],
"us state department":["us department of state","us state department","department of state", "state department", "division of financial affairs, department of state", "office of financial and development policy, department of state",],
"us department of the treasury":["us department of the treasury", "department of the treasury",
"us department of treasury", "treasury department", "us department of the treasury"],
"us department of agriculture":["office of foreign agricultural relations",
"office of foreign agricultural relations, department of agriculture"
    ,"department of agriculture, washington, dc","department of argiculture","division of program surveys in department of agriculture", "office of the secretary department of agriculture", "us department of agriculture"],
"us department of commerce":["department of commerce", "us department of commerce"],
"us bureau of the budget":["us bureau of budget","bureau of budget",
"bureau of budget, state of new york","bureau of the budget",
"bureau of the budget washington, dc",
"us bureau of the budget",],
"us air force":["us air force",
"us air force academy",],
"university of texas - arlington":["university of texas - arlington",
"university of texas at arlington",],
"university of nebraska":["university of nebraska",
"university of nebraska system",],
"university of nebraska - omaha":["university of nebraska - omaha",
"university of nebraska at omaha",],
"university of missouri - columbia":["university of missouri - columbia",
"university of missouri-columbia",],
"university of missouri - kansas city":[
"university of missouri - kansas city",
"university of missouri-kansas city",],
"university of michigan":["university of michigan",
"university of michigan health system",
"university of michigan.",],
"university of massachusetts - lowell":["university of massachusetts - lowell",
"university of massachusetts at lowell",],
"university of massachusetts - boston":["university of massachusetts - boston",
"university of massachusetts-boston",],
"university of maryland - baltimore county":["university of maryland - baltimore county",
  "university of maryland-baltimore county"],
"university of maryland - college park":[
"university of maryland - college park",
"university of maryland, college park",],
"university of colorado - denver":["university of colorado - denver",
"university of colorado-denver",],
"university of colorado - boulder":["university of colorado - boulder",
"university of colorado-boulder",],

"united states":["united states",
"united states (country)",],
"state department of public service, new york":["state department of public service  new york",
"state department of public service, new york","state department of public service new york"],
"social science research council":["social research council",
"social science research council",],
"santa monica, california (city)":["santa monica california united states (city)",
"santa monica, california (city)",],
"san francisco (city)":["san francisco (city)",
"san francisco united states (city)",],
"rochester, new york (city)":["rochester new york united states (city)",
"rochester, new york (city)",],
"ripon college":["ripon college",
"ripon college, ripon, wisconsin",],
"resources for the future, inc.":["resources for the future, inc.",
"rff",],
"population council":["population council",
"population council new york",],
"pittsburgh pennsylvania united states (city)":["pittsburgh (city)",
"pittsburgh pennsylvania united states (city)",],
"office of strategic services":["office of strategic services",
"office of strategic services in washington, dc",],
"office of business economics, washington, dc":["office of business economics,  washington, dc",
"office of business economics, washington, dc",],
"oberlin college":["oberlin college",
"oberlin college and conservatory",],
"new york university":["new york university",
"new york university tandon school of engineering",],
"new york city united states (city)":["new york",
"new york city united states (city)","new york, n.y. (city)"],
"new haven conneticut united states (city)":["new haven conneticut united states (city)",
"new haven, conn.",],
"national war labor board":["national war labor board in washington",
"national war labor board, region vi, chicago",],
"national planning association, washington, dc":["national planning association",
"national planning association, washington, dc",],
"microsoft corporation":["microsoft corporation",
"microsoft research",],
"mathematica policy research, inc.":["mathematica policy research",
"mathematica policy research, inc.",],
"louisiana tech university":["louisiana tech university",
"louisiana technical university",],
"lincoln institute of land policy":["lincoln institute of land policy",
"lincoln institute of land policy in cambridge",],
"lexington kentucky united states (city)":["lexington kentucky united states (city)",
"lexington, kentucky",],
"john hopkins university":["john hopkins university",
"john hopkins university school of medicine",],
"j. p. morgan":["j. p. morgan",
"j.p. morgan",],
"indiana university-purdue university":["indiana university purdue university indianapolis",
"indiana university-purdue university",],
"icf consulting, inc.":["icf consulting, inc.",
"icf inc.",],
"harris trust and savings bank":["harris trust and savings bank",
"harris trust and savings bank, chicago, illinois",],
"google":["google",
"google research",],
"goldman sachs":["goldman sachs",
"goldman, sachs and co.",],
"georgia state university":["george state university",
"georgia state university",],
"george mason university":["george mason university",
"george mason university school of law",],
"general motors":["general motors",
"general motors corporation",],
"foreign economic administration, washington dc":["foreign economic administration",
"foreign economic administration, washington, dc",],
"federal reserve bank of minneapolis":["federal reserse bank of minneapolis",
 "federal reserve bank of minneapolis",
],
"federal reserve bank of cleveland":["federal bank of cleveland","federal reserve bank of cleveland",],
"federal reserve bank of kansas city":["federal reserve bank of kansas city",
"federal reserve bank, kansas city",],
"federal reserve bank of new york":["federal reserve bank of new york",
 "federal reserve bank in new york",],
"federal reserve bank of philadelphia":[
"federal reserve bank of philadelphia",
"federal reserve bank of philadelphia.",
"federal reserve bank, philadelphia",],
"federal reserve board":["federal reserve",
"federal reserve bank",
"federal reserve board",],
"economic research service, usda":["economic research service",
"economic research service, usd.a.",],
"duke university":["duke university",
"duke university medical centre",],
"committee for economic development":["committee for economic development",
"committee on economic development",],
"abt associates, inc":["abt associates inc.",
"abt associates, inc",],
"american enterprise institute for public policy research":["american enterprise institute",
"american enterprise institute for public policy research",
],
"american institute for economic research - aier":["american institute for economic research",
"american institute for economic research (aier)",
],
"analysis group, inc":["analysis group",
"analysis group, inc",],
"bankers trust company":["bankers trust company",
"bankers trust company, new york",],

"california state university - east bay":["california state university - east bay",
"california state university, east bay",],
"carnegie mellon university":["carnegie mellon university",
"carnigie mellon university",],
"citigroup, inc.":["citigroup",
"citigroup, inc.",],
"city university of new york":["brooklyn college",
"city college, new york",
"city university of new york",
"hostos community college",
"hunter college",
"queens college"],
"center for economic policy and research - cepr, washington":["center for economic and policy research (cepr), washington",
"center for economic policy and research",],
"central statistical board":["central statistic board",
"central statistical board"],
    "universite clermont-auvergne":["universite clermont auvergne",
"universite clermont-auvergne",],
"national institute of statistics and economic studies (paris)":["national institute of statistics and economic studies",
"national institute of statistics and economic studies (paris)",],
"university of toulouse":["gremaq, universite de toulouse","university of toulouse",],
"institut national d'etudes demographiques - ined":["institut national d'etudes demographiques",
"institut national d'etudes demographiques - ined",],
"departement et laboratoire d'economie theoretique et appliquee - delta":["departement et laboratoire d'economie theoretique et appliquee (delta)",
"departement et laboratoire d'economie theoretique et appliquee - delta",],
"conservatoire national des arts et metiers":["conservatoire national des arts et metier",
"conservatoire national des arts et metiers",],
    "university of aberdeen":["university of aberdeen","aberdeen university",],
"university of birmingham":["birmingham university","university of birmingham"],
"university of bristol":["bristol university", "university of bristol"],
"cambridge united kingdom (city)":["cambridge", "cambridge united kingdom (city)"],
"university of cambridge":["cambridge university", "university of cambridge"],
"city university of london":["city university london", "city university of london"],
"university of edinburgh":["edinburgh university", "university of edinburgh"],
"university of essex":["essex university", "university of essex"],
"glasgow united kingdom (city)":["glasgow (city)", "glasgow united kingdom (city)"], 
"university of glasgow" :["glasgow university", "university of glasgow"],
"manchester united kingdom (city)":["manchester (city)", "manchester united kingdom (city)"],
"national institute of economic and social research - niesr":["national institute of economic and social research - niesr", "national institute of economic and social research, london"],
"university of newcastle":["newcastle university", "university of newcastle"],
"university of oxford":["oxford university", "university of oxford"],
"sheffield united kingdom (city)":["sheffield united kingdom (city)", "sheffield, united kingdom (city)"],
"university of southampton":[ "southampton university", "university of southampton"],
"university of st. andrews":["st. andrews", "university of st. andrews"],
"university of surrey":["surrey university", "university of surrey"],
"trinity college london":["trinity college", "trinity college london"],
"university of ulster":["ulster university", "university of ulster"],
"university college london":["university college", "university college london"],
"university of warwick":["warwick university", "university of warwick"],

    "wissenschaftszentrum berlin fur sozialforschung - wzb":["wissenschaftszentrum berlin fur sozialforschung",
"wissenschaftszentrum berlin fur sozialforschung (wzb)",
"wzb berlin social science centre",],
    "kiel university":["university of kiel","kiel university",],
"munich germany (city)":["munich (city)",
"munich germany (city)",],
"university of mannheim":["mannheim university","university of mannheim",],
"kiel institute for the world economy - ifw":["kiel institut fur weltwirtschaft",
"kiel institute for the world economy - ifw",],
"international institute of management berlin":["international institute of management berlin",
"international institute of management, berlin",],
"institute of labor economics - iza":["institute of labor economics",
"institute of labor economics - iza",],
"heinrich-heine-university dusseldorf":["heinrich heine university",
"heinrich-heine-university dusseldorf",],
"goethe university frankfurt":["goethe university",
"goethe-universitat frankfurt","johann wolfgang goethe universitat",
"johann wolfgang goethe university",],
"european school of management and technology - esmt":["esmt berlin",
"european school of management and technology - esmt",],
"frankfurt, germany (city)":["frankfort germany (city)",
"frankfort, germany (city)",],
"universitat munster":["der universitat munster","universitat munster",],
"toronto canada (city)":["toronoto canada (city)",],
"centre interuniversitaire de recherche en analyse des organisations - cirano":["centre interuniversitaire de recherche en analyse des organisations - cirano",
"cirano",],
    "university of venice":["university of venice","ca' foscari university of venice",],
"centre for studies in economics and finance - csef":["center for studies in economies and finance",
"centre for studies in economics and finance - csef",],
"university of pavia":["university of pavia","di pavia university",],
"dipartimento di scienze economiche, rome":["dipartimento di scienze economiche",
"dipartimento di scienze economiche, rome",],
"european university institute":["european university institute",
"european university institute, economics department, villa san paolo",],
"university of brescia":["universita degli studi di brescia",
"university of brescia",],
"university of cagliari":["university of cagliari",
"universita degli studi di cagliari",],
"university of florence":["universita degli studi di firenze",
"universita di firenze",
"university of florence",],
"university of milan":["universita degli studi di milano",
"university of milan",],
"roma tre university":["roma tre university",
"terza universita' degli studi di roma",],
"parthenope university of naples":["university of naples parthenope",
"parthenope university of naples",],

"university of cantabria":["universidad de cantabria",
"university of cantabria",],
"university of barcelona":["universitat de barcelona",
"university of barcelona",],
"institutos madrileno de estudios avanzados (imdea)":["institutos madrileno de estudios avanzados (imdea)",
"madrid institute for advanced studies (imdea)",],
"institute for economic analysis - csic":["institut d'analisi economica - csic",
"institut d'analisi economica - iae",
"institut d'analisi economica csic - iae-csic",
"institute for economic analysis",
"institute for economic analysis - csic",
],
"tokyo (city)":["tokyo (city)",
"tokyo japan (city)",],
    "food and agriculture organization of the united nations":["food and agriculture organization of the united nations",
"food and agriculture organization of united nations",],
"organisation for economic co-operation and development - oecd":["organisation for economic co-operation and development ",
"organisation for economic co-operation and development (oecd)",
"organisation for economic co-operation and development - oecd",],
    
    "shanghai tech university":["shanghaitech university","shanghai tech university",],
    "amsterdam netherlands (city)":["amsterdam (city)",
"amsterdam netherlands (city)",
],
"leiden university":["leiden university",
"leyden university",],
"netherlands bureau for economic policy analysis":["netherlands bureau for economic policy analysis",
"netherlands bureau for economic policy analysis, the netherlands",],
"netherlands central bureau of statistics":["central bureau of statistics","netherlands central bureau of statistics",],
"ministry of economic affairs, netherlands":["dutch ministry of economic affairs","ministry of economic affairs, netherlands",],
    "university of new england":["university of new england",
"university of new england, australia",],
"monash university":["monash university",
"monash university - melbourne",],
    "indian parliament":["member of parliament, india","indian parliament, new delhi",],
    "ministry of finance, new delhi, india":["ministry of finance, new delhi","indian ministry of finance",
"ministry of finance, new delhi, india",],
"university of delhi":["university of delhi","delhi university","delhi school of economics",],
"bombay, india (city)":["bombay india (city)",
"bombay, india (city)",],
"university of mumbai":["bombay university","university of bombay",],
    "lund university":["lund university","university of lund",],
"swedish central bank":["sveriges riksbank","swedish central bank",],
"stockholm university":["university in stockholm","university of stockholm","stockholm university",],
"university of gothenburg":["university of gothenburg","gothenburg university",],
"uppsala university":["uppsala university","university of uppsala",],
    "catholic university of louvain":["catholic university of leuven",
"catholic university of louvain","ku leuven",
],
    "centre for operations research and econometrics - core":["center for operations research and econometrics", "centre for operations research and econometrics - core", "ecore"],
    "fundacao getulio vargas - fgv":["fundacao getulio vargas", "fundacao getulio vargas - fgv", "getulio vargas foundation"],
"escola de economia de sao paulo da fundacao getulio vargas - fgv eesp":["escola de economia de sao paulo", "escola de economia de sao paulo da fundacao getulio vargas - fgv eesp", "sao paulo school of economics - fgv eesp"],
"pontificia universidade catolica do rio de janeiro":["pontificia universidade catolica", "pontificia universidade catolica do rio de janeiro", "catholic university of rio de janeiro"],
"instituto de pesquisa economica aplicada (ipea)":["instituto de planejamento economico e social", "instituto de pesquisa economica aplicada (ipea)", "ipea/inpes"],
    "university of bergen":["university of bergen",
"university of bergent",],
"statistics norway":["central bureau of statistics of norway",
"central bureau of statistics, oslo",
],
"university of oslo":["university of norway"],
"ragnar frisch centre for economic research":["frischsenteret for samfunnsokonomisk forskning","ragnar frisch centre for economic research",
],
    "hebrew university of jerusalem":["center for study of rationality",
],
"central bureau of statistics - jerusalem":["central bureau of statistics jerusalem",
"central bureau of statistics, jerusalem",],
"ministry of finance, israel":["consultant to the israeli ministry of finance",
],
    "aarhus university":["dale t. mortensen centre"],
"university of southern denmark":["university of odense",
"university of southern denmark","odense university",
],
"institute for advanced studies - ihs vienna":["institute for advanced studies - ihs vienna",
"institute for advanced studies vienna","vienna institute for advanced studies",
],
"university of vienna":["university of vienna",
"university of vienna austria",],
"austrian institute of economic research - wifo":["osterreichisches institut fur wirtschaftsforschung, vienna",
"wifo - austrian institute of economic research","austrian institute of economic research",
],
"vienna university of technology":["vienna university of technology","university of technology vienna",],
"johannes kepler universitat linz":["johannes kepler universitat linz","university of linz",],
    "novosibirsk state university":["novosibirsk state university",
"novosibirsk state university, russia",],
"el colegio de mexico":["colegio de mexico",
"el colegio de mexico",],
"instituto technologico autonomo de mexico - itam":["centro de investigacion economica - cie", "centro de investigacion economica, instituto tecnologico autonomo de mexico",],
    "tallinn university of technology":["tallinn university of technology",
"tallinn university of technology, estonia",],
"central bank of iran":["bank markazi iran",
"central bank of iran",],
"university of khartoum":["university of khartoum",
"university of khartoum (sudan)",],
"university of the philippines - diliman":["university of philippines",
"university of the philippines",
"university of the philippines diliman"],
"universidad de los andes":["universidad de los andes",
"university of the andes",],
"jamaica (country)":["jamaica",
"jamaica (country)",],
"university of dhaka":["dhaka university",
"university of dacca",
"university of dhaka",],
"university of the west indies":["university college of the west indies",
"university of the west indies",],
"university of singapore":["national university of singapore",
"university of singapore",
],
"duke university":["duke university"],
"central school of planning and statistics, warsaw":["central school of planning and statistics",
"central school of planning and statistics, warsaw",],
"centre for economic research and graduate education - economics institute - cerge-ei":["centre for economic research and graduate education - economics institute - cerge-ei",
"cerge-ei",],
"czech academy of sciences - economics institute - ei":["academy of sciences of the czech republic",
"czech academy of sciences",
"czech academy of sciences - economics institute - ei ",],
"banco de portugal":["banco de portugal",
"bank of portugal",],
"nova university of lisbon":["new university of lisbon",
"nova university of lisbon",
],
"bank of finland":["bank of finland",
"bank of finland, helsinki",],
"helsinki university":["university of helsinki","helsinki university",
],
"abo akademi":["swedish university of turku",],
"central bank of ireland":["central bank and financial services authority of ireland",
"central bank of ireland",],
"economic and social research institute, dublin":["economic and social research institute, dublin",
"economic research institute, dublin",],
"banco central de chile":["banco central de chile",
"central bank of chile",],
"pontificia universidad catolica de chile":["catholic university of chile",
"pontificia universidad catolica de chile",
],
"universidad de chile":["universidad de chile",
"university of chile",],
"new zealand institute of economic research":["new zealand institute of economic research",
"new zealand institute of economic research, inc. wellington, new zealand",],
"auckland, new zealand (city)":["auckland new zealand (city)",
"auckland, n.z. (city)",],
"ministry of economy, argentina":["undersecretary of macroeconomic programming",
],



}

consol_un2={}
for i in consol2.keys():
    for j in consol2[i]:
        consol_un2[j]=i
# print(consol_un2)

for i in consol_un2.keys():
    combined.loc[combined['aff_main_final']==i,"aff_main_final"]=consol_un2[i]


In [726]:
for i in combined.index:
    if combined.loc[i,'author'].strip()[0]=="'":
        combined.loc[i,'author']=combined.loc[i,'author'].strip()[1:].strip()
    if combined.loc[i,'author'].strip()[-1]=="'":
        combined.loc[i,'author']=combined.loc[i,'author'].strip()[0:-1].strip()

combined['year']=combined['year'].astype(int)

In [727]:
combined[combined['id']=='26966479']

Unnamed: 0,id,year,title,journal,pages,vol,number,author,affiliation,auth_num,aff_num,aff_cleaner,aff_cleaner_final,aff_main_final,aff_subunit_final,aff_department_final,alt_final,country,is_business_school,is_institute,manual,screened,auth_count,auth_list,authors,content_type,stable_url,ref_type,ref_string,ref_string_cleaner,pdf_url,hitId,worker,aff_clean,aff_main,add_sub
55692,26966479,2020,Discounts and Deadlines in Consumer Search,american economic review,3748-3785,110.0,12,Dominic Coey,"Facebook, Core Data Science",0.0,1,"Core Data Science, Facebook",Facebook,facebook,0,Core Data Science,Facebook,United States,0.0,0.0,MTURK,y,,,,,,,,,,,,,,
56455,26966479,2020,Discounts and Deadlines in Consumer Search,american economic review,3748-3785,110.0,12,Bradley J. Larsen,"Stanford University, Department of Economics",1.0,1,"Department of Economics, Stanford University",Stanford University,stanford university,0,Department of Economics,Stanford University,United States,0.0,0.0,MTURK,y,,,,,,,,,,,,,,
56672,26966479,2020,Discounts and Deadlines in Consumer Search,american economic review,3748-3785,110.0,12,Bradley J. Larsen,NBER,1.0,2,NBER,National Bureau of Economic Research - NBER,national bureau of economic research - nber,0,0,National Bureau of Economic Research - NBER,International,0.0,0.0,MTURK,y,,,,,,,,,,,,,,
56858,26966479,2020,Discounts and Deadlines in Consumer Search,american economic review,3748-3785,110.0,12,Brennan C. Platt,"Brigham Young University, Department of Economics",2.0,1,"Brigham Young University, Department of Economics",Brigham Young University,brigham young university,0,Department of Economics,Brigham Young University,United States,,,MTURK,y,,,,,,,,,,,,,,


In [728]:
combined.to_excel(base_path+"Affiliations/affiliations_combined.xlsx", index=False)
combined.to_pickle(base_path+"Affiliations/affiliations_combined.pkl")

sub=combined[['id', 'year','author',
       'auth_num', 'aff_num', 'aff_cleaner',
       'aff_cleaner_final', 'aff_main_final', 'aff_subunit_final',
       'aff_department_final', 'alt_final', 'manual', 'screened', 'country']]
sub.to_excel(base_path+"Affiliations/affiliations_combined_sub.xlsx", index=False)
sub.to_pickle(base_path+"Affiliations/affiliations_combined_sub.pkl")

In [729]:
combined.columns

Index(['id', 'year', 'title', 'journal', 'pages', 'vol', 'number', 'author',
       'affiliation', 'auth_num', 'aff_num', 'aff_cleaner',
       'aff_cleaner_final', 'aff_main_final', 'aff_subunit_final',
       'aff_department_final', 'alt_final', 'country', 'is_business_school',
       'is_institute', 'manual', 'screened', 'auth_count', 'auth_list',
       'authors', 'content_type', 'stable_url', 'ref_type', 'ref_string',
       'ref_string_cleaner', 'pdf_url', 'hitId', 'worker', 'aff_clean',
       'aff_main', 'add_sub'],
      dtype='object')

In [730]:
len(combined['aff_main_final'].unique())

2586

In [731]:
checks=[]
for i in combined.index:
    if pd.isna(combined.loc[i,'aff_main_final'])==False:
        if ("depart" in combined.loc[i,'aff_main_final']):# | ("bureau" in combined.loc[i,'aff_main_final']):
            checks.append(combined.loc[i,'aff_main_final'])
checks_u=list(set(checks))
checks_u.sort()
checks_u

['antitrust division of department of justice',
 'board of engineers for rivers and harbors of war department',
 'bureau of economic and business affairs, us department of state',
 'canada department of agriculture',
 'departement de sciences economiques',
 "departement et laboratoire d'economie theoretique et appliquee - delta",
 'department of agriculture and rural affairs, victoria, australia',
 'department of agriculture union of south africa',
 'department of commerce and industry, ireland',
 'department of consumer and corporate affairs',
 'department of defense',
 'department of economic affairs, united nations',
 'department of economics and human development, state of maryland',
 'department of finance canada',
 'department of housing and urban development, washington, dc',
 'department of labor and industry',
 'department of labour, ottawa',
 'department of the air force',
 'department of war organization of industry melbourne',
 'division of economic research, bureau of comm