Extract author information and article title using journal web page.

In [116]:
import os
import glob
import re
import logging
from collections import Counter

In [3]:
logger = logging.getLogger("clq")
logger.setLevel(logging.DEBUG)

Generate file list from directory.

In [7]:
datadir = "./"
datafilelist = sorted(glob.glob(os.path.join(datadir, "*_edition.txt")))
logger.info("Working on {0} files".format(len(datafilelist)))

INFO:clq:Working on 1 files


# Loop on the files:

In [8]:
for files in datafilelist:
    logger.debug("Workin on file {0}".format(files))

DEBUG:clq:Workin on file ./36th_edition.txt


In [118]:
titlelist = []
authorlist = []
with open(files, 'r') as f:
    i = 0
    
    for lines in f:
        i += 1
        matchpage = re.search(r'Pages\s{1}\d+-(\d+)', lines)
        matchabstract = re.search(r'\s*Abstract PDF', lines)
        matchentitled = re.search(r'\s*Entitled to full text', lines)
        matchpdf = re.search(r'\s+PDF\s\(\d+ K\)', lines) 
        matchempty = re.search(r'^\s$', lines)
        matchother = re.search(r'Editorial Board|Page IFC|Research Papers', lines)
        matchtitle = re.search(r'(.+)Original Research Article$|(.+)Research Article$|(\w+ International Liège Colloquium \w+)', lines)
        if matchpage or matchabstract or matchentitled or matchpdf or matchempty or matchother:
            #print("WTF")
            pass
        elif matchtitle:
            #print(" ".join((str(i), str(len(lines)), lines)))
            titlelist.append(matchtitle.group(1))
        else:
            authorlist += lines.rstrip().split(',')

In [109]:
if (len(titlelist) != len(authorlist)):
    logger.warning("Different number of papers and author groups")
else:
    logger.debug("Same number of papers and author groups")



# Word counting
The idea is to see the most frequent words in the titles.   
It would work easily if all the authors were identified the same way.   
However we have for example the same author identified with 3 different strings:
1. J.M. Beckers
2. Jean-Marie Beckers
3. J.-M. Beckers

Another difficulty occurs when the family name is made up of two (or more) words, for example:
* Pierre-Yves Le Traon
* Z. Ben Bouallègue.

Finally, some composed family names are not always entirely written:
* R. Bolaños
* R. Bolaños-Sanchez.

All in all it means a totally automatic processing is not feasible.

## Solution
The idea is as follow:
1. For each author in the list, the compound family names will written with a '-' between the words. This will be done through a specific dictionnary that will be updated progressively.
2. The first and middle names will be abbreviated and appended using a '.' between the letters. For instance 'Harley E.' will be converted to "H.E.".
3. Finally, the different accents will be, at least temporarily, substitued by the letter without accent. 

## Substitution dictionnary

In [148]:
subDict = {"Le Provost": "Le-Provost", "Le Traon": "Le-Traon", 
           "Ben Bouallègue": "Ben-Bouallegue",
           "á": "a", "à": "a", "é": "e", "ó": "o", "ø": "o"}

Remove leading and trailing white spaces:

In [119]:
authorlist2 = [author.strip() for author in authorlist]

In [238]:
authorlist3 = []
for author in authorlist2:
    for k, v in subDict.items():
        author = author.replace(k, v)
        familyname = "".join(["".join([q[0] + '.' for q in p.split('-')]) for p in author.split(' ')[:-1]])
    authorlist3.append(' '.join((familyname, author.split(' ')[-1])))

In [239]:
Counter(authorlist3)

Counter({'A. Alvarez': 1,
         'A. Alvera-Azcarate': 3,
         'A. Barth': 3,
         'A. Fornes': 1,
         'A. Lascaratos': 1,
         'A. Murashkovsky': 1,
         'A. Orfila': 1,
         'A. Pascual': 2,
         'A. Sanchez-Arcilla': 1,
         'A. Wallcraft': 1,
         'A.J. Wallcraft': 1,
         'B. Ingleby': 1,
         'C. Fan': 1,
         'C. Le-Provost': 1,
         'C. Maes': 1,
         'C. Raick': 1,
         'D. Delmas': 1,
         'D. Gomis': 1,
         'D. Prandle': 1,
         'D.K. Mills': 1,
         'E. Comerma': 1,
         'E. Ferreira-Coelho': 1,
         'E. Mauri': 1,
         'E. Remy': 1,
         'E.P. Chassignet': 1,
         'F. Gohin': 1,
         'F.J. Gilbert': 1,
         'G. Basterretxea': 1,
         'G. Jorda': 1,
         'G. Kivman': 1,
         'G. Korres': 1,
         'G. Larnicol': 2,
         'G. Triantafyllou': 1,
         'G. Vizoso': 1,
         'G.R. Halliwell': 1,
         'H. Etienne': 1,
         'H. Hurlburt': 1,
 

In [240]:
pp = 'A.B. Pock'

In [241]:
a = "".join(["".join([q[0] + '.' for q in p.split('-')]) for p in pp.split(' ')[:-1]])

In [242]:
a

'A.'

In [245]:
[p for p in pp.split(' ')[:-1]]

['A.B.']

In [248]:
p = 'A.B.'

In [249]:
[q for q in p.split('-')]

['A.B.']