Find the most frequent participants using the lists written in tab-separated value files.

In [1]:
import os
import re
import glob
import logging
from collections import Counter
import liegecolloquium

Prepare logger:

In [2]:
# Install the default handler on the root logger explicitly. The original cell
# only got one as a side effect of calling the module-level logging.debug(),
# and without a handler the DEBUG/INFO records emitted below are not displayed.
# basicConfig() does nothing if the root logger already has a handler.
logging.basicConfig()

# Named logger used by every cell of this notebook; DEBUG lets all records through.
logger = logging.getLogger('counting-participants')
logger.setLevel(logging.DEBUG)

# Log through the named logger: the root logger keeps its default WARNING
# level, so a root-level debug message would be silently dropped.
logger.debug("Start")

# Files and directories

In [3]:
# Input: every file matching ParticipantList-*.tsv in the processed-data
# directory, kept in sorted order.
datadir = "../data/processed/"
participantfilelist = glob.glob(os.path.join(datadir, "ParticipantList-*.tsv"))
participantfilelist.sort()

# Output path. NOTE(review): `outputfile` is not used by the cells visible
# here; confirm whether it is still needed.
outputdir = "../data/"
outputfile = os.path.join(outputdir, "ParticipantCountry.geojson")

logger.info(
    "Working on {0} participant files".format(len(participantfilelist))
)

INFO:counting-participants:Working on 39 participant files


# Read data
The list of files to read determines the period of interest (full period, a single decade, ...); all participants found in those files are gathered into a single list.

## Participant list

In [25]:
# Read every participant file and build:
#   participantlist     every Participant object, in file order
#   namelist            name key (first name + name) of every participant
#   namelist_nobelgium  name keys of participants whose country is not Belgium
#   namelist_noliege    name keys of Belgian participants whose city is not Liège
participantlist = []
namelist = []
namelist_noliege = []
namelist_nobelgium = []
nptotal = 0  # never updated in this cell; kept in case another cell reads it
for participantfile in participantfilelist:
    logger.debug("Working on file {0}".format(os.path.basename(participantfile)))
    # Explicit encoding so the comparison with "Liège" below does not depend on
    # the platform default. NOTE(review): assumes the files are UTF-8 -- confirm.
    with open(participantfile, "r", encoding="utf-8") as f:
        for linenum, line in enumerate(f, start=1):
            fields = line.rstrip().split('\t')
            if fields == ['']:
                # Blank line (e.g. at the end of a file): nothing to read.
                continue
            if len(fields) < 5:
                # Fail with the location instead of a bare IndexError.
                raise ValueError(
                    "{0}, line {1}: expected at least 5 tab-separated fields, "
                    "found {2}".format(participantfile, linenum, len(fields))
                )
            participant = liegecolloquium.Participant(*fields[:5])
            participant.replace_country()
            participant.abbrev_name()
            participantlist.append(participant)
            # Build the key once so the same person gets the same key in all
            # three lists (originally only `namelist` had the name stripped).
            namekey = "".join((participant.firstname, participant.name.rstrip()))
            namelist.append(namekey)
            if participant.country != "Belgium":
                namelist_nobelgium.append(namekey)
            elif participant.city != "Liège":
                namelist_noliege.append(namekey)
logger.info("Total participants: {}".format(len(participantlist)))
logger.info("Participant outside Belgium: {}".format(len(namelist_nobelgium)))
# This list only holds Belgian participants, so say so in the label.
logger.info("Participant in Belgium outside Liège: {}".format(len(namelist_noliege)))

DEBUG:counting-participants:Working on file ParticipantList-1970.tsv
DEBUG:counting-participants:Working on file ParticipantList-1971.tsv
DEBUG:counting-participants:Working on file ParticipantList-1972.tsv
DEBUG:counting-participants:Working on file ParticipantList-1973.tsv
DEBUG:counting-participants:Working on file ParticipantList-1974.tsv
DEBUG:counting-participants:Working on file ParticipantList-1975.tsv
DEBUG:counting-participants:Working on file ParticipantList-1976.tsv
DEBUG:counting-participants:Working on file ParticipantList-1977.tsv
DEBUG:counting-participants:Working on file ParticipantList-1978.tsv
DEBUG:counting-participants:Working on file ParticipantList-1979.tsv
DEBUG:counting-participants:Working on file ParticipantList-1980.tsv
DEBUG:counting-participants:Working on file ParticipantList-1981.tsv
DEBUG:counting-participants:Working on file ParticipantList-1982.tsv
DEBUG:counting-participants:Working on file ParticipantList-1983.tsv
DEBUG:counting-participants:Workin

In [26]:
# Occurrence counts per name key:
#   c1 -> all participants
#   c2 -> participants not from Belgium
#   c3 -> participants from Belgium but not from Liège
c1, c2, c3 = (
    Counter(names)
    for names in (namelist, namelist_nobelgium, namelist_noliege)
)

## Results

The 10 most frequent participants overall:

In [27]:
# Ten (name, count) pairs with the highest counts among all participants.
c1.most_common(10)

[('J.C.J.Nihoul', 27),
 ('F.Ronday', 20),
 ('G.Lebon', 19),
 ('J.Smitz', 15),
 ('A.Disteche', 14),
 ('E.Deleersnijder', 14),
 ('Y.Adam', 12),
 ('G.Pichot', 12),
 ("G.Chabert d'Hières", 11),
 ('S.Djenidi', 10)]

10 most frequent not from Belgium:

In [28]:
# Ten (name, count) pairs with the highest counts among participants
# whose country is not Belgium.
c2.most_common(10)

[("G.Chabert d'Hières", 11),
 ('H.G.Ramming', 7),
 ('E.Stanev', 7),
 ('A.Kostianoy', 7),
 ('P.Brasseur', 7),
 ('P.C.Chu', 7),
 ('L.Goodman', 6),
 ('A.Bah', 6),
 ('J.D.Woods', 5),
 ('D.Olbers', 5)]

10 most frequent from Belgium but not from Liège:

In [29]:
# Ten (name, count) pairs with the highest counts among Belgian participants
# whose city is not Liège.
c3.most_common(10)

[('E.Deleersnijder', 8),
 ('J.L.Van Hamme', 7),
 ('G.Pichot', 5),
 ('A.Berger', 4),
 ('J.Ozer', 4),
 ('W.Bayens', 3),
 ('G.Billen', 3),
 ('P.Gaspar', 3),
 ('E.Smets', 3),
 ('F.Dehairs', 3)]