In [1]:
from scholarly import scholarly, ProxyGenerator, MaxTriesExceededException
from unidecode import unidecode
import pandas as pd
import time
from random import randint
import pylcs

# Minimum title similarity (LCS-based ratio between 0 and 1) for a Scholar
# result to be accepted as the same paper.
SIMILARITY_THRESHOLD = 0.9

# Load the list of publications; each row is expected to carry a 'title' column.
df = pd.read_excel(io="publications.xlsx")

In [2]:
# Uses proxy to avoid Google Scholar blocking the requests
# `pg` is kept at notebook level because the query loop below rotates/rebuilds it on failure.
pg = ProxyGenerator()
# Result is only reported, not acted upon (presumably a success flag: the recorded output shows True).
success = pg.FreeProxies()
# Hands the generator to scholarly (presumably so subsequent queries are routed through it).
scholarly.use_proxy(pg)
print(f'Proxy: {success}.')

Proxy: True.


In [3]:
# For every publication: query Google Scholar by title, compare the first hit's
# title with ours, and store the hit's citation count only when the titles match.
total = len(df)
for position, (index, row) in enumerate(df.iterrows(), start=1):
    raw_title = row['title']

    # Blank Excel cells are read as NaN (a float), which has no .lower();
    # there is nothing to search for, so record zero citations and move on.
    if not isinstance(raw_title, str) or not raw_title.strip():
        df.at[index, 'citations'] = 0
        print(f'[{position}/{total}] Skipped: row has no title.\n')
        continue

    original_title = unidecode(raw_title.lower())  # Removes accents and upper case letters

    # Keep retrying the query until it goes through, switching proxy on failure.
    pubs = None
    while pubs is None:
        try:  # Tries to run the query
            pubs = scholarly.search_pubs(raw_title)
        except MaxTriesExceededException:
            try:
                pg.get_next_proxy()  # Gets a new proxy if the query fails
                print('Current proxy failed. Trying a new one.')
            except StopIteration:
                # Proxy list exhausted: build a fresh generator and register it again.
                pg = ProxyGenerator()
                success = pg.FreeProxies()
                scholarly.use_proxy(pg)
                print(f'All proxies failed. New proxy list: {success}.')

    # Only the first result (the most similar one) is considered.
    # `pub` is reset per row so a previous row's hit can never leak into this one.
    pub = next(pubs, None)
    if pub is None:  # No result for this title
        scholar_title = ''
    else:
        scholar_title = unidecode(pub['bib']['title'].lower())  # Removes accents and upper case letters

    # Similarity = 2 * LCS length / (sum of both lengths), i.e. a ratio in [0, 1].
    # Guard the denominator: both titles can be empty after normalisation.
    combined_length = len(original_title) + len(scholar_title)
    if combined_length:
        similarity = 2.0 * pylcs.lcs_sequence_length(original_title, scholar_title) / combined_length
    else:
        similarity = 0.0

    # Accept the citation count only for a real hit whose title is similar enough.
    is_match = pub is not None and similarity >= SIMILARITY_THRESHOLD
    citations = int(pub['num_citations']) if is_match else 0
    df.at[index, 'citations'] = citations

    # Prints the results for auditing
    print(f'[{position}/{total}] Similarity: {similarity}. Citations: {citations}.')
    print(f'Original title: {original_title}')
    print(f'Scholar title:  {scholar_title}\n')
    time.sleep(randint(3, 7))  # Sleeps for around 5 seconds to avoid Google Scholar blocking the queries

Current proxy failed. Trying a new one.
[1/231] Similarity: 1.0. Citations: 13.
Original title: towards component-based software maintenance via software configuration management techniques
Scholar title:  towards component-based software maintenance via software configuration management techniques

[2/231] Similarity: 1.0. Citations: 8.
Original title: documentacao essencial para manutencao de software ii
Scholar title:  documentacao essencial para manutencao de software ii

[3/231] Similarity: 0.9583333333333334. Citations: 4.
Original title: evolucao orientada a aspectos de um framework oo
Scholar title:  evolucao orientada a aspectos de um framework 00

[4/231] Similarity: 1.0. Citations: 6.
Original title: an evolution process for application frameworks
Scholar title:  an evolution process for application frameworks

[5/231] Similarity: 1.0. Citations: 6.
Original title: uma experiencia no ensino de manutencao de software
Scholar title:  uma experiencia no ensino de manutencao de 

In [4]:
# Writes the frame (now including the 'citations' column) back to the same
# workbook that was read at the top of the notebook, overwriting it.
# index=False keeps the DataFrame index out of the sheet.
df.to_excel('publications.xlsx', index=False)