In [121]:
import pandas as pd
import numpy as np
from fuzzywuzzy import fuzz 
from fuzzywuzzy import process

In [41]:
d = pd.read_csv('../data/arp_grp_2010_2019_references.csv')
print(d.columns)

Index(['Unnamed: 0', 'AU', 'TI', 'SO', 'JI', 'AB', 'DE', 'ID', 'LA', 'DT',
       'DT2', 'TC', 'CR', 'C1', 'DI', 'PA', 'AR', 'BE', 'FU', 'FX', 'BN', 'SN',
       'PN', 'PP', 'PU', 'VL', 'PY', 'UT', 'NR', 'SC', 'U2', 'WC', 'EM', 'SE',
       'GA', 'BO', 'RP', 'DB', 'AU_UN', 'AU1_UN', 'AU_UN_NR', 'SR_FULL', 'SR'],
      dtype='object')


In [127]:
def split_citation(line):
    d = line.split(',')
    author = d[0]
    year = d[1]
    journal = ','.join(d[2:])
    author_year = ','.join(d[:2])
    return([author, year, journal, author_year, line])

def split_citation2(line_full):    
    author = []
    year = []
    journal = []
    author_year = []
    full = []
    if line_full is np.nan:
        print('no references')
    else:
        ref = line_full.split(';')
        for  r in ref:
            d = r.split(',')
            n_entries = len(d) - 1
            author.append(d[min([0, n_entries])])
            year.append(d[min([1, n_entries])])
            journal.append(','.join(d[min([2, n_entries]):min([3, n_entries])]))
            author_year.append(','.join(d[:min([2, n_entries])]))
            full.append(','.join(d[:min([3, n_entries])]))
        
    citation_df = pd.DataFrame.from_dict({'author': author, 
                                          'year': year, 
                                          'journal': journal, 
                                          'author_year': author_year,
                                          'full': full})
    return(citation_df)

author_info = d['Unnamed: 0'].apply(split_citation)
author_info = author_info.apply(pd.Series)
author_info = author_info.rename(columns = {0: 'author', 1: 'year', 2: 'journal', 3: 'author_year', 4: 'full'})
author_info

Unnamed: 0,author,year,journal,author_year,full
0,BENAVENT E,2019,EUR J OPER RES,"BENAVENT E, 2019","BENAVENT E, 2019, EUR J OPER RES"
1,WILLEMSE EJ,2019,COMPUT OPER RES,"WILLEMSE EJ, 2019","WILLEMSE EJ, 2019, COMPUT OPER RES"
2,ARAKAKI RK,2019,COMPUT OPER RES,"ARAKAKI RK, 2019","ARAKAKI RK, 2019, COMPUT OPER RES"
3,DE ARMAS J,2019,ANN OPER RES,"DE ARMAS J, 2019","DE ARMAS J, 2019, ANN OPER RES"
4,CERRONE C,2019,EUR J OPER RES,"CERRONE C, 2019","CERRONE C, 2019, EUR J OPER RES"
5,TFAILI S,2019,RAIRO-OPER RES,"TFAILI S, 2019","TFAILI S, 2019, RAIRO-OPER RES"
6,MOFID-NAKHAEE E,2019,WASTE MANAGE RES,"MOFID-NAKHAEE E, 2019","MOFID-NAKHAEE E, 2019, WASTE MANAGE RES"
7,FENG L,2019,EVOLUTIONARY AND SWARM INTELLIGENCE ALGORITHMS,"FENG L, 2019","FENG L, 2019, EVOLUTIONARY AND SWARM INTELLIGE..."
8,WELLER M,2018,J COMB OPTIM,"WELLER M, 2018","WELLER M, 2018, J COMB OPTIM"
9,LI M,2018,COMPUT IND ENG,"LI M, 2018","LI M, 2018, COMPUT IND ENG"


In [166]:
partial_citations = []
crs_updated = []
for i, citation in enumerate(d['CR'].values):
    print(i, author_info.iloc[i]['full'])
    author_citation = split_citation2(citation)
    fully_cited = author_citation.merge(author_info, how='inner', on=['full'])
    not_fully_cited = author_citation.merge(author_info, how='left', on=['full'])
    not_fully_cited = author_citation.merge(author_info, how='left', on=['full'])
    not_fully_cited = not_fully_cited.loc[pd.isnull(not_fully_cited['author_y'])]
    not_fully_cited = not_fully_cited.drop(columns = ['author_y', 'year_y', 'journal_y', 'author_year_y'])
    author_cited = not_fully_cited.merge(author_info, how='inner', left_on=['author_year_x'], right_on=['author_year'])
    if len(author_cited):
        author_cited['orig'] = author_info.iloc[i]['full']
        partial_citations.append(author_cited)
    crs = list(fully_cited['full']) + list(author_cited['full_y'])
    crs_updated.append(';'.join(crs))

0 BENAVENT E, 2019, EUR J OPER RES
1 WILLEMSE EJ, 2019, COMPUT OPER RES
2 ARAKAKI RK, 2019, COMPUT OPER RES
3 DE ARMAS J, 2019, ANN OPER RES
4 CERRONE C, 2019, EUR J OPER RES
5 TFAILI S, 2019, RAIRO-OPER RES
6 MOFID-NAKHAEE E, 2019, WASTE MANAGE RES
7 FENG L, 2019, EVOLUTIONARY AND SWARM INTELLIGENCE ALGORITHMS
8 WELLER M, 2018, J COMB OPTIM
9 LI M, 2018, COMPUT IND ENG
10 TIRKOLAEE EB, 2018, WASTE MANAGE
11 SHANG R, 2018, NAT COMPUT
12 CIANCIO C, 2018, EUR J OPER RES
13 TIRKOLAEE EB, 2018, SUSTAINABILITY
14 FERNANDEZ E, 2018, TRANSP SCI
15 SHANG R, 2018, MEMET COMPUT
16 ARAKAKI RK, 2018, COMPUT OPER RES
17 LAI Q, 2018, 2018 IEEE 14TH INTERNATIONAL CONFERENCE ON AUTOMATION SCIENCE AND ENGINEERING (CASE)
18 WOHLK S, 2018, J OPER RES SOC
19 TING CJ, 2018, INT J IND ENG -THEORY APPL PRACT
20 STEYN LJ, 2018, 2018 INTERNATIONAL CONFERENCE ON ADVANCES IN BIG DATA, COMPUTING AND DATA COMMUNICATION SYSTEMS (ICABCD)
21 SU Z, 2018, IFAC PAPERSONLINE
22 KIILERICH L, 2018, INFOR
23 GONZALEZ-MARTIN

186 CORBERAN A, 2007, NETWORKS
187 BENAVENT E, 2007, EUR J OPER RES
188 DENG X, 2007, 2007 IEEE INTERNATIONAL CONFERENCE ON AUTOMATION AND LOGISTICS, VOLS 1-6
189 ZHU Z, 2007, 2007 IEEE INTERNATIONAL CONFERENCE ON AUTOMATION AND LOGISTICS, VOLS 1-6
190 REGHIOUI M, 2007, APPLICATIONS OF EVOLUTIONARY COMPUTING, PROCEEDINGS
191 AMAYA A, 2007, OPER RES LETT
192 BELENGUER JM, 2006, COMPUT OPER RES
193 CORBERAN A, 2006, COMPUT OPER RES
194 WOHLK S, 2006, COMPUT OPER RES
195 LACOMME P, 2006, COMPUT OPER RES
196 CORBERAN A, 2006, MATH PROGRAM
197 LONGO H, 2006, COMPUT OPER RES
198 SNIEZEK J, 2006, ANN OPER RES
199 CHU F, 2005, EUR J OPER RES
200 BALDACCI R, 2006, NETWORKS
201 MUYLDERMANS L, 2005, OPER RES
202 LACOMME P, 2005, EUR J OPER RES
203 FLEURY G, 2005, J OPER RES SOC
204 CHU F, 2005, J INTELL MANUF
205 CORBERAN A, 2005, OPER RES
206 GULERYUZ OG, 2005, 2005 IEEE/SP 13TH WORKSHOP ON STATISTICAL SIGNAL PROCESSING (SSP), VOLS 1 AND 2
207 MOURAO MC, 2005, EUR J OPER RES
208 HERTZ A, 2005, G

In [176]:
d['CR_update'] = crs_updated
crs_updated

['ARCHETTI C, 2017, EUR J OPER RES;BELENGUER JM, 2006, COMPUT OPER RES;BENAVENT E, 2011, NETWORKS;BOSCO A, 2013, OPTIM LETT;CHU F, 2005, EUR J OPER RES;CHU F, 2005, J INTELL MANUF;CHU F, 2005, EUR J OPER RES;CHU F, 2005, J INTELL MANUF;CIANCIO C, 2018, EUR J OPER RES;CORBERAN A, 2001, MATH PROGRAM;CORBERAN A, 2013, MATH PROGRAM;IRNICH S, 2015, EUR J OPER RES;LACOMME P, 2005, EUR J OPER RES;CORBERAN A, 2014, ARC ROUTING: PROBLEMS, METHODS, AND APPLICATIONS;MONROY IM, 2013, DISCRET APPL MATH',
 "BELENGUER JM, 2006, COMPUT OPER RES;BEULLENS P, 2003, EUR J OPER RES;CORBERAN A, 2010, NETWORKS;GOUVEIA L, 2010, COMPUT OPER RES;LACOMME P, 2004, ANN OPER RES;VIDAL T, 2017, OPER RES;WILLEMSE EJ, 2016, OPER RES LETT;WILLEMSE EJ, 2016, DATA BRIEF;WILLEMSE EJ, 2016, COMPUT OPER RES;WILLEMSE EJ, 2016, OPER RES LETT;WILLEMSE EJ, 2016, DATA BRIEF;WILLEMSE EJ, 2016, COMPUT OPER RES;WILLEMSE EJ, 2016, OPER RES LETT;WILLEMSE EJ, 2016, DATA BRIEF;WILLEMSE EJ, 2016, COMPUT OPER RES;DELL'AMICO M, 2016, TRAN

In [172]:
d.to_csv('../data/arp_grp_2010_2019_references_interref_update.csv', index=False)

In [173]:
partial_citations = []
crs_updated = []
for i, citation in enumerate(d['CR_update'].values):
    print(i, author_info.iloc[i]['full'])
    author_citation = split_citation2(citation)
    fully_cited = author_citation.merge(author_info, how='inner', on=['full'])
    not_fully_cited = author_citation.merge(author_info, how='left', on=['full'])
    not_fully_cited = author_citation.merge(author_info, how='left', on=['full'])
    not_fully_cited = not_fully_cited.loc[pd.isnull(not_fully_cited['author_y'])]
    not_fully_cited = not_fully_cited.drop(columns = ['author_y', 'year_y', 'journal_y', 'author_year_y'])
    author_cited = not_fully_cited.merge(author_info, how='inner', left_on=['author_year_x'], right_on=['author_year'])
    if len(author_cited):
        author_cited['orig'] = author_info.iloc[i]['full']
        partial_citations.append(author_cited)
    crs = list(fully_cited['full']) + list(author_cited['full_y'])
    crs_updated.append(';'.join(crs))

0 BENAVENT E, 2019, EUR J OPER RES
1 WILLEMSE EJ, 2019, COMPUT OPER RES
2 ARAKAKI RK, 2019, COMPUT OPER RES
3 DE ARMAS J, 2019, ANN OPER RES
4 CERRONE C, 2019, EUR J OPER RES
5 TFAILI S, 2019, RAIRO-OPER RES
6 MOFID-NAKHAEE E, 2019, WASTE MANAGE RES
7 FENG L, 2019, EVOLUTIONARY AND SWARM INTELLIGENCE ALGORITHMS
8 WELLER M, 2018, J COMB OPTIM
9 LI M, 2018, COMPUT IND ENG
10 TIRKOLAEE EB, 2018, WASTE MANAGE
11 SHANG R, 2018, NAT COMPUT
12 CIANCIO C, 2018, EUR J OPER RES
13 TIRKOLAEE EB, 2018, SUSTAINABILITY
14 FERNANDEZ E, 2018, TRANSP SCI
15 SHANG R, 2018, MEMET COMPUT
16 ARAKAKI RK, 2018, COMPUT OPER RES
17 LAI Q, 2018, 2018 IEEE 14TH INTERNATIONAL CONFERENCE ON AUTOMATION SCIENCE AND ENGINEERING (CASE)
18 WOHLK S, 2018, J OPER RES SOC
19 TING CJ, 2018, INT J IND ENG -THEORY APPL PRACT
20 STEYN LJ, 2018, 2018 INTERNATIONAL CONFERENCE ON ADVANCES IN BIG DATA, COMPUTING AND DATA COMMUNICATION SYSTEMS (ICABCD)
21 SU Z, 2018, IFAC PAPERSONLINE
22 KIILERICH L, 2018, INFOR
23 GONZALEZ-MARTIN

175 SOLER D, 2008, J OPER RES SOC
176 REINELT G, 2008, DISCRET APPL MATH
177 REINELT G, 2008, DISCRET APPL MATH-a
178 ZHU Z, 2008, 2008 7TH WORLD CONGRESS ON INTELLIGENT CONTROL AND AUTOMATION, VOLS 1-23
179 BELKHELLADI K, 2008, 2008 IEEE CONGRESS ON EVOLUTIONARY COMPUTATION, VOLS 1-8
180 LABADI N, 2008, ADVANCES IN COMPUTATIONAL INTELLIGENCE IN TRANSPORT, LOGISTICS, AND SUPPLY CHAIN MANAGEMENT
181 BELKHELLADI K, 2008, ICINCO 2008: PROCEEDINGS OF THE FIFTH INTERNATIONAL CONFERENCE ON INFORMATICS IN CONTROL, AUTOMATION AND ROBOTICS, VOL ICSO: INTELLIGENT CONTROL SYSTEMS AND OPTIMIZATION
182 MANIEZZO V, 2008, RECENT ADVANCES IN EVOLUTIONARY COMPUTATION FOR COMBINATORIAL OPTIMIZATION
183 LABADI N, 2008, RECENT ADVANCES IN EVOLUTIONARY COMPUTATION FOR COMBINATORIAL OPTIMIZATION
184 CORBERAN A, 2008, SIAM DISCRET MATH
185 WOHLK S, 2008, VEHICLE ROUTING PROBLEM: LATEST ADVANCES AND NEW CHALLENGES
186 CORBERAN A, 2007, NETWORKS
187 BENAVENT E, 2007, EUR J OPER RES
188 DENG X, 2007, 2007 IEEE 

In [175]:
pd.concat(partial_citations)

Unnamed: 0,author_x,year_x,journal_x,author_year_x,full_x,author,year,journal,author_year,full_y,orig
0,ARCHETTI C,2017,,"ARCHETTI C, 2017","ARCHETTI C, 2017",ARCHETTI C,2017,EUR J OPER RES,"ARCHETTI C, 2017","ARCHETTI C, 2017, EUR J OPER RES","BENAVENT E, 2019, EUR J OPER RES"
1,BELENGUER JM,2006,,"BELENGUER JM, 2006","BELENGUER JM, 2006",BELENGUER JM,2006,COMPUT OPER RES,"BELENGUER JM, 2006","BELENGUER JM, 2006, COMPUT OPER RES","BENAVENT E, 2019, EUR J OPER RES"
2,BENAVENT E,2011,,"BENAVENT E, 2011","BENAVENT E, 2011",BENAVENT E,2011,NETWORKS,"BENAVENT E, 2011","BENAVENT E, 2011, NETWORKS","BENAVENT E, 2019, EUR J OPER RES"
3,BOSCO A,2013,,"BOSCO A, 2013","BOSCO A, 2013",BOSCO A,2013,OPTIM LETT,"BOSCO A, 2013","BOSCO A, 2013, OPTIM LETT","BENAVENT E, 2019, EUR J OPER RES"
4,CHU F,2005,,"CHU F, 2005","CHU F, 2005",CHU F,2005,EUR J OPER RES,"CHU F, 2005","CHU F, 2005, EUR J OPER RES","BENAVENT E, 2019, EUR J OPER RES"
5,CHU F,2005,,"CHU F, 2005","CHU F, 2005",CHU F,2005,J INTELL MANUF,"CHU F, 2005","CHU F, 2005, J INTELL MANUF","BENAVENT E, 2019, EUR J OPER RES"
6,CHU F,2005,,"CHU F, 2005","CHU F, 2005",CHU F,2005,EUR J OPER RES,"CHU F, 2005","CHU F, 2005, EUR J OPER RES","BENAVENT E, 2019, EUR J OPER RES"
7,CHU F,2005,,"CHU F, 2005","CHU F, 2005",CHU F,2005,J INTELL MANUF,"CHU F, 2005","CHU F, 2005, J INTELL MANUF","BENAVENT E, 2019, EUR J OPER RES"
8,CIANCIO C,2018,,"CIANCIO C, 2018","CIANCIO C, 2018",CIANCIO C,2018,EUR J OPER RES,"CIANCIO C, 2018","CIANCIO C, 2018, EUR J OPER RES","BENAVENT E, 2019, EUR J OPER RES"
9,CORBERAN A,2001,,"CORBERAN A, 2001","CORBERAN A, 2001",CORBERAN A,2001,MATH PROGRAM,"CORBERAN A, 2001","CORBERAN A, 2001, MATH PROGRAM","BENAVENT E, 2019, EUR J OPER RES"
