In [1]:
import pandas as pd
import numpy as np

In [2]:
def levenshtein_ratio_and_distance(s, t, ratio_calc = False):
    """Compute the Levenshtein distance, or similarity ratio, of two strings.

    A dynamic-programming matrix ``distance`` is filled so that
    ``distance[i, j]`` holds the edit distance between the first ``i``
    characters of ``s`` and the first ``j`` characters of ``t``.

    Parameters
    ----------
    s, t : str
        The strings to compare. Either (or both) may be empty.
    ratio_calc : bool, default False
        If False, return the edit distance, with a substitution costing 1.
        If True, return the similarity ratio
        ``(len(s) + len(t) - d) / (len(s) + len(t))``, where ``d`` is the
        distance computed with a substitution costing 2 (this aligns the
        result with the Python Levenshtein package).

    Returns
    -------
    number
        The edit distance (an integer taken from the NumPy matrix) when
        ``ratio_calc`` is False, otherwise the similarity ratio as a float
        in ``[0, 1]``. Two empty strings give a ratio of ``1.0``.
    """
    # Keep the original equality test so non-bool arguments behave as before.
    use_ratio = ratio_calc == True
    substitution_cost = 2 if use_ratio else 1

    rows = len(s) + 1
    cols = len(t) + 1
    distance = np.zeros((rows, cols), dtype = int)

    # Border cells: turning a prefix into the empty string (or back) costs
    # one edit per character. Filled directly so that it also works when one
    # of the strings is empty.
    distance[:, 0] = np.arange(rows)
    distance[0, :] = np.arange(cols)

    # Fill the interior with the cheapest of deletion, insertion, substitution.
    for col in range(1, cols):
        for row in range(1, rows):
            cost = 0 if s[row-1] == t[col-1] else substitution_cost
            distance[row][col] = min(distance[row-1][col] + 1,      # Cost of deletions
                                     distance[row][col-1] + 1,      # Cost of insertions
                                     distance[row-1][col-1] + cost) # Cost of substitutions

    # Read the bottom-right cell by position rather than through the loop
    # variables, which are undefined when either string is empty.
    total_edits = distance[-1, -1]

    if use_ratio:
        combined_length = len(s) + len(t)
        if combined_length == 0:
            # Two empty strings are identical; avoid dividing by zero.
            return 1.0
        return (combined_length - total_edits) / combined_length

    # Minimum number of edits needed to convert s into t.
    return total_edits

In [3]:
# Load the edge table from disk and preview its first five rows.
data = pd.read_csv(filepath_or_buffer='network-data/combined-edges.csv')
data.head(5)

Unnamed: 0,paperID,author1,author2,affiliation1,affiliation2,location,year
0,0,Maureen M. Villamor,Ma. Mercedes T. Rodrigo,University of Southeastern Philippines,Ateneo de Manila University,local,2019
1,1,Deion C. Menor,Cher T. Panlilio,Ateneo de Manila University,Ateneo de Manila University,local,2019
2,1,Deion C. Menor,Lourdes J. Santiago,Ateneo de Manila University,Ateneo de Manila University,local,2019
3,1,Deion C. Menor,Andrei D. Coronel,Ateneo de Manila University,Ateneo de Manila University,local,2019
4,1,Cher T. Panlilio,Lourdes J. Santiago,Ateneo de Manila University,Ateneo de Manila University,local,2019


In [25]:
# Sorted array of every distinct name appearing in either author column.
names_per_column = [data[column].unique() for column in ('author1', 'author2')]
authors = np.unique(np.concatenate(names_per_column))

In [26]:
# Number of distinct author names (authors is the 1-D array from np.unique).
authors.size

1526

In [15]:
# Candidate duplicates: for each name, the near-identical names found for it.
# When a name has several matches, the last one found is the one kept.
replace_authors = {}

for name in authors:
    for other in authors:
        # Only compare names sharing a first character, and never a name
        # with itself.
        if not other.startswith(name[0]):
            continue
        if other == name:
            continue
        dist = levenshtein_ratio_and_distance(name, other)
        if 0 < dist < 5:
            print(dist, name, other)
            replace_authors[name] = other

1 Elmer B. Dadios Elmer P. Dadios
1 Elmer P. Dadios Elmer B. Dadios
4 J. E. Que J. L. Aves
4 J. L. Aves J. E. Que
4 Jedrick L. Chua Jenina L. Chua
4 Jenina L. Chua Jedrick L. Chua
4 John Paul S. Guzman John Paul V. Guim
4 John Paul V. Guim John Paul S. Guzman
4 Kevin M. Manalo Kevin M. Martin
4 Kevin M. Manalo Kevin S. Manalang
4 Kevin M. Martin Kevin M. Manalo
4 Kevin S. Manalang Kevin M. Manalo
3 Mark Lorenze D R Torregoza Mark Lorenze R. Torregoza
3 Mark Lorenze R. Torregoza Mark Lorenze D R Torregoza
4 Mark Lorenze R. Torregoza Mark Lorenze Torregaza
4 Mark Lorenze Torregaza Mark Lorenze R. Torregoza
4 Miguel Galace Miguel Palma
4 Miguel Palma Miguel Galace
4 Nicco Nocon Nico Laco
4 Nico Laco Nicco Nocon
4 Patrick C. Gan Patrick T. Lo
4 Patrick T. Lo Patrick C. Gan
4 Raymund C. Sison Raymund E. Dilan
4 Raymund E. Dilan Raymund C. Sison
4 Ryan Mina Ryan Sze
4 Ryan Sze Ryan Mina


In [17]:
# Hand-written mapping of author-name variants (keys) to the single spelling
# that should be kept (values). It is passed to `data.replace(...)` later in
# the notebook, so every key must match the stored name exactly, including
# spacing and punctuation.
for_replace = {
    # Three variants collapsed onto one spelling.
    'Mark Lorenze D R Torregoza': 'Mark Lorenze Torregoza',
    'Mark Lorenze R. Torregoza': 'Mark Lorenze Torregoza',
    'Mark Lorenze Torregaza': 'Mark Lorenze Torregoza',
    # Mostly missing middle initials, missing spaces, or stray/missing letters.
    'Bobby Gerardo': 'Bobby D. Gerardo',
    'Cecil Jose Delfinado': 'Cecil Jose A. Delfinado',
    'Daniel Marc Dela Torre': 'Daniel Marc G. Dela Torre',
    'Danny Cheng': 'Danny C. Cheng',
    'Eric Camilo Punzalan': 'Eric Camilo R. Punzalan',
    'Ezekiel A. Cotoco': 'Ezekiel Karl A. Cotoco',
    'Joel Paz Ilao': 'Joel P. Ilao',
    'Johnamos Tan': 'John Amos Tan',
    'John Ultra': 'John D. Ultra',
    'John Matthew B. Villaores': 'John Matthew B. Villaflores',
    'John Dela Cruz': 'John R. Dela Cruz',
    'Jun Rangie Obispo': 'Jun Rangie C. Obispo',
    'Katrina Joy Abriol-Santos': 'Katrina Joy M. Abriol-Santos',
    'Kristofer E.Delas Penas': 'Kristofer E. Delas Peñas',
    'Kristofer Delas Peñas': 'Kristofer E. Delas Peñas',
    'Laurence A.Gan Lim': 'Laurence A. Gan Lim',
    'Laurence Gan Lim': 'Laurence A. Gan Lim',
    'Maria Isabel S. Saludaresy': 'Maria Isabel S. Saludares',
    'Marie Yvette de Robles': 'Marie Yvette B. de Robles',
    'Nathaniel C. Batayan': 'Nathaniel C. Bantayan',
    'Prospero C. Naval': 'Prospero C. Naval Jr.',
    'Rex Bringula': 'Rex P. Bringula',
    'Rowel O. Atienzay': 'Rowel O. Atienza',
    'Rowel O. Atienzaz': 'Rowel O. Atienza',
    'Ruji Medina': 'Ruji P. Medina',
    'Val Randolf Madrid': 'Val Randolf M. Madrid',
    # Differs only in the middle initial.
    'Elmer B. Dadios': 'Elmer P. Dadios'
}

In [18]:
# Swap every cell that exactly equals a key of for_replace for its mapped
# value; the replacement is applied across all columns of the frame.
data = data.replace(to_replace=for_replace)

In [27]:
# Write the frame back to the same path it was loaded from (this overwrites
# the input file); index=False keeps the row index out of the CSV.
data.to_csv(path_or_buf='network-data/combined-edges.csv', index=False)