In [1]:
#importing the necessary modules
import nltk
import re
import pandas as pd
import os
import numpy as np
from nltk.tokenize import sent_tokenize, word_tokenize
import sys
import unidecode
import unicodedata
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter
from geopy.geocoders import GeoNames
import time

In [2]:
# Download NLTK's 'punkt' package into the local nltk_data directory.
# Presumably this is the data needed by the sent_tokenize / word_tokenize
# functions imported above -- TODO confirm they are still used in this notebook.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/charlotteout/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
#path = "/Users/charlotteout/Documents/SDS"
#os.chdir(path)

In [3]:
#define the geolocator
# The GeoNames account name can be overridden with the GEONAMES_USERNAME
# environment variable so it does not have to live in the notebook; the
# previous hard-coded value is kept as the fallback so existing runs still work.
geolocator = GeoNames(username=os.environ.get('GEONAMES_USERNAME', 'chaout'))
# Rate-limited wrapper around the geocoder: consecutive calls are spaced at
# least one second apart. Use `geocode(...)` rather than `geolocator.geocode(...)`
# when geocoding many rows.
geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)

In [5]:
# Load the tweet dataset from a local CSV file (nothing is downloaded here).
# The path is relative, so the kernel's working directory must be the folder
# that contains vegan_data.csv.
vegan_data = pd.read_csv('vegan_data.csv')

In [6]:
# Keep only the rows whose 'location' field is filled in; rows with a
# missing (NaN/None) location are dropped.
vegan_data_withloc = vegan_data.loc[vegan_data['location'].notna()]

In [9]:
def remove_accents(input_str):
    """Return ``input_str`` with accents and other combining marks removed.

    Helper for the country matching: geocoded addresses are stripped of
    accents so they can be compared with the accent-free country aliases.

    The string is decomposed with Unicode NFKD normalisation, which splits an
    accented character into its base character followed by combining marks,
    and every combining mark is then discarded.
    """
    decomposed = unicodedata.normalize('NFKD', input_str)
    # unicodedata.combining() returns 0 for characters that are not combining marks.
    base_characters = filter(lambda ch: unicodedata.combining(ch) == 0, decomposed)
    return "".join(base_characters)

In [10]:
# Alias -> canonical English country name.
# Each key is a lower-case, accent-free fragment of a country name (English or
# local spelling) that is searched for as a substring of the geocoded address.
# This single mapping replaces the former separate list and dictionary, whose
# keys had drifted apart (e.g. 'espana' was searched but only 'espagna' was
# mapped), which wrote None into the country column for those matches.
# Order matters: when several aliases occur in one address the LAST one wins
# (dicts preserve insertion order), so e.g. "Indiana, United States" ends up
# as 'united states' rather than 'india'.
# NOTE(review): plain substring matching still gives false positives (e.g.
# 'island' inside English place names, 'wales' inside "New South Wales");
# consider matching only the country part of the address.
_COUNTRY_ALIASES = {
    'netherlands': 'netherlands',
    'nederland': 'netherlands',
    'sweden': 'sweden',
    'sverige': 'sweden',
    'ital': 'italy',
    'franc': 'france',
    'norway': 'norway',
    'norge': 'norway',
    'iceland': 'iceland',
    'island': 'iceland',
    'brazil': 'brazil',
    'brasil': 'brazil',
    'russia': 'russia',
    'south africa': 'south africa',
    'suid-afrika': 'south africa',
    'india': 'india',
    'switzerland': 'switzerland',
    'schweiz': 'switzerland',
    'suisse': 'switzerland',
    'svizzer': 'switzerland',
    'australia': 'australia',
    'austria': 'austria',
    'osterreich': 'austria',
    'belgi': 'belgium',
    'canada': 'canada',
    'chile': 'chile',
    'czech': 'czech republic',      # was misspelled 'chzech' and could never match
    'ceska': 'czech republic',
    'denmark': 'denmark',
    'danmark': 'denmark',
    'estonia': 'estonia',
    'eesti': 'estonia',             # was 'esti', which also matched e.g. "palestine"
    'finland': 'finland',
    'suomi': 'finland',
    'germany': 'germany',
    'deutschland': 'germany',
    'greece': 'greece',
    'hungary': 'hungary',
    'magyarorsz': 'hungary',
    'ireland': 'ireland',
    'israel': 'israel',
    'japan': 'japan',
    'south korea': 'south korea',
    'latvia': 'latvia',             # was 'lavtiva' in the search list
    'latvija': 'latvia',            # was 'lavtija' in the mapping
    'lithuania': 'lithuania',
    'lietuva': 'lithuania',
    'luxembourg': 'luxembourg',
    'mexico': 'mexico',
    'new zealand': 'new zealand',
    'poland': 'poland',
    'polska': 'poland',
    'portugal': 'portugal',
    'slovakia': 'slovakia',
    'slovensko': 'slovakia',
    'sloveni': 'slovenia',
    'slovenia': 'slovenia',
    'spain': 'spain',
    'espana': 'spain',              # was 'espagna' in the mapping
    'turk': 'turkey',
    'kingdom': 'united kingdom',
    'states': 'united states',
    'england': 'united kingdom',
    'scotland': 'united kingdom',
    'wales': 'united kingdom',
    'great britain': 'united kingdom',
}

# Output file used when the caller does not pass `output_path` (unchanged from
# the original hard-coded value).
_DEFAULT_OUTPUT_PATH = r'/Users/charlotteout/Documents/SDS/10april_99000:100000'


def _match_country(address):
    """Return the canonical country found in a geocoded address, or None."""
    # Accent-free, lower-case text so it is comparable with the alias keys.
    normalized = remove_accents(str(address)).lower()
    matched = None
    for alias, country in _COUNTRY_ALIASES.items():
        if alias in normalized:
            # Deliberately no break: a later alias overrides an earlier one.
            matched = country
    return matched


def countryrecognition(dataset, geocode_fn=None, output_path=_DEFAULT_OUTPUT_PATH):
    """Geocode every location in ``dataset`` and record the matching country.

    For each row, the value of the 'location' column is sent to the geocoder.
    The result is converted to a string, stripped of accents and lower-cased,
    then searched for the country aliases defined in ``_COUNTRY_ALIASES``.
    When an alias is found, the corresponding English country name is stored
    in the 'country' column of that row.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Tweets with a 'location' column. The frame is modified in place: a
        'country' column is created if it does not exist yet, and existing
        values are kept for rows without a match.
    geocode_fn : callable, optional
        Function taking a location string and returning a geocoding result
        (or None when nothing was found). Defaults to the notebook-level
        rate-limited ``geocode`` wrapper.
    output_path : str or None, optional
        Where the resulting frame is written as CSV. Pass None to skip
        writing. Defaults to the path that was previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        The same ``dataset`` object, including the 'country' column.
    """
    if geocode_fn is None:
        # Use the RateLimiter wrapper instead of calling geolocator.geocode
        # directly, so the one-second delay between requests is applied.
        geocode_fn = geocode

    # Start from the current column (if any) so unmatched rows keep their value.
    if 'country' in dataset.columns:
        countries = dataset['country'].tolist()
    else:
        countries = [None] * len(dataset)

    for row, raw_location in enumerate(dataset['location']):
        if pd.isna(raw_location):
            # Nothing to geocode for an empty location field.
            continue

        location = geocode_fn(raw_location)
        if location is None:
            continue

        matched = _match_country(location)
        if matched is not None:
            countries[row] = matched

    # One whole-column assignment instead of chained `df[col].iloc[row] = ...`,
    # which fails on a missing column and is unreliable on copies.
    dataset['country'] = countries

    if output_path is not None:
        dataset.to_csv(output_path)

    return dataset
                        
    