In [155]:
import os
import glob
import logging
from collections import Counter
from geopy.geocoders import Nominatim

# Configuration

In [245]:
datadir = "../data/"
outputdir = "../data/"
geolocator = Nominatim()
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

In [246]:
filelist = sorted(glob.glob(os.path.join(datadir, "ParticipantList-1972*tsv")))
logging.info("Working on {0} files".format(len(filelist)))

INFO:root:Working on 1 files


In [247]:
participantlist = []
locationlist = []

for datafiles in filelist[0:1]:
    logging.info("Working on file {0}".format(datafiles))
    outputbasename = os.path.basename(datafiles).replace(".tsv", ".dat")
    coordbasename = os.path.basename(datafiles).replace(".tsv", ".coord")
    outputfile = os.path.join(outputdir, outputbasename)
    coordfile = os.path.join(outputdir, coordbasename)
    logging.info("Writing info in file {0}".format(outputfile))
    logging.info("Writing coordinates in file {0}".format(outputfile))
    
    with open(coordfile, 'w') as cf:
        cf.write("var coords = [\n")

    with open(datafiles, 'r') as f:
        for line in f:
            l = line.rstrip().split('\t')
            
            participant = Participant(l[0], l[1], l[2], l[3], l[4])
            
            # Modify country name via dictionary
            participant.replace_country()
            
            # Find location ; check if in the list before using geopy
            loc1 = ", ".join((participant.affiliation, participant.city, participant.country))
            loc2 = ", ".join((participant.city, participant.country))
            
            if loc1 in locationlist:
                logger.debug("Already in the list")
                ind = locationlist.index(loc1)
                logger.debug(ind)
                participant.location = participantlist[ind].location
            elif loc2 in locationlist:
                logger.debug("Already in the list")
                ind = locationlist.index(loc2)
                logger.debug(ind)
                participant.location = participantlist[ind].location
            else:
                logger.debug("Not in the list, so need to use Nomatim")
                locationstring = participant.get_location()
                locationlist.append(locationstring)
                
            # Write info to a text file
            participant.write_to(outputfile)
            
            # Write coordinates (only) into another file
            participant.write_coords_to(coordfile)
            
            # Append to list
            participantlist.append(participant)
            
    with open(coordfile, 'a') as cf:
        cf.write("]")

INFO:root:Working on file ../data/ParticipantList-1972.tsv
INFO:root:Writing info in file ../data/ParticipantList-1972.dat
INFO:root:Writing coordinates in file ../data/ParticipantList-1972.dat
DEBUG:root:Not in the list, so need to use Nomatim
DEBUG:root:Not in the list, so need to use Nomatim
DEBUG:root:Already in the list
DEBUG:root:0
DEBUG:root:Already in the list
DEBUG:root:1
DEBUG:root:Not in the list, so need to use Nomatim
DEBUG:root:Already in the list
DEBUG:root:1
DEBUG:root:Not in the list, so need to use Nomatim
DEBUG:root:Not in the list, so need to use Nomatim
DEBUG:root:Not in the list, so need to use Nomatim
DEBUG:root:Not in the list, so need to use Nomatim
DEBUG:root:Already in the list
DEBUG:root:0
DEBUG:root:Already in the list
DEBUG:root:1
DEBUG:root:Already in the list
DEBUG:root:1
DEBUG:root:Not in the list, so need to use Nomatim
DEBUG:root:Already in the list
DEBUG:root:1
DEBUG:root:Not in the list, so need to use Nomatim
DEBUG:root:Not in the list, so need to 

In [242]:
class Participant(object):
    
    def __init__(self, name=None, firstname=None, affiliation=None, city=None, 
                 country=None, location=None):
        self.name = name
        self.firstname = firstname
        self.affiliation = affiliation
        self.city = city
        self.country = country
    
    def __repr__(self):
        return str(self.__dict__)
        
    def replace_country(self):
        dictcountry = {"U.S.A.": "United States of America", 
                       "The Netherlands": "Nederland"}
        for k, v in dictcountry.items():
            self.country = self.country.replace(k, v)
            
    def get_location(self):
        # Try with different combinations of affil., city, country
        locationstring = ", ".join((self.affiliation, self.city, self.country))
        location = geolocator.geocode(locationstring)
        if not(location):
            locationstring = ", ".join((self.city, self.country))
            location = geolocator.geocode(locationstring)
            
        self.location = location
        return locationstring
    
    def write_to(self, filename, sep='\t'):
        """
        Write the participants information into a file
        """
        with open(filename, 'a') as f:
            f.write(sep.join((self.name, self.firstname, 
                             self.affiliation, self.country,
                             str(self.location.latitude),
                             str(self.location.longitude), "\n"))
                   )
    
    def write_coords_to(self, filename, sep="\t"):
        """
        Write the coordinates in a file
        """
        with open(filename, 'a') as f:
            f.write("[{0}, {1}],\n".format(self.location.latitude,
                                               self.location.longitude)) 

In [None]:
with open(filename, 'a') as f:
    f.write("var coords = [\n")
    for k, v in coordCount.items():
        f.write("[{0}, {1}, {2}],\n".format(k[0], k[1], float(v)))
    f.write("]")