Note: To run this notebook, you need a Google Maps API key. Please enter the API key in the cell below.

In [10]:
# Google maps client setup
import os
from getpass import getpass

from googlemaps import Client as GoogleMapsClient

############
# The key is never stored in the notebook: it is read from the
# GOOGLE_MAPS_API_KEY environment variable, or typed at the prompt that
# appears below this cell when that variable is not set.
api_key = os.environ.get("GOOGLE_MAPS_API_KEY") or getpass("Google Maps API key: ")
############
if not api_key:
    # Fail here with a clear message instead of a NameError / opaque client error.
    raise ValueError("A Google Maps API key is required to run this notebook.")
gmaps = GoogleMapsClient(api_key)

In [11]:
# import modules
import pandas as pd
import geopy
from unidecode import unidecode


from geopy.geocoders import Nominatim
from fuzzywuzzy import process
from unidecode import unidecode
from spellchecker import SpellChecker
import time


from geopy.geocoders import Nominatim
from geopy.exc import GeocoderTimedOut, GeocoderServiceError
import requests
import time
import numpy as np

In [12]:
# read input files
df = pd.read_csv("location_changes.csv")
wpi = pd.read_csv("WPI.csv")
# The ISO alpha-2 code for Namibia is the literal string "NA", which the
# default NA parsing of read_csv converts to NaN. pandas merges match NaN keys
# with each other, so a NaN alpha-2 row would attach its country name to every
# record whose country code is missing. Only empty fields are treated as
# missing here so that "NA" survives as a real code.
# NOTE(review): country codes in location_changes.csv go through the default
# parsing too, so a literal "NA" there is still read as missing -- confirm
# whether that file needs the same treatment.
iso_codes = pd.read_csv("ISO_codes.csv", keep_default_na=False, na_values=[""])

In [13]:

# Build a two-letter-prefix -> country lookup from the WPI table:
# take the first two characters of UN/LOCODE, keep one row per prefix,
# rename the country column, then discard the row labelled 0.
# NOTE(review): dropping index label 0 presumably removes a blank/invalid
# prefix row -- confirm against the WPI file.
wpi = (
    wpi.assign(locode=wpi["UN/LOCODE"].str[:2])[["locode", "Country Code"]]
    .drop_duplicates(subset="locode")
    .rename(columns={"Country Code": "country"})
    .drop(index=0)
    .reset_index(drop=True)
)

# Only the country name and its alpha-2 code are needed from the ISO table.
iso_codes = iso_codes.loc[:, ["name", "alpha-2"]]

In [14]:
# The incoming "country" columns hold codes; rename them so the country-name
# columns added below can use the plain names.
df = df.rename(columns={"previous_country": "previous_country_code", "country": "country_code"})

# Country names from the WPI lookup, first for the previous country ...
df = (
    df.merge(wpi, how="left", left_on="previous_country_code", right_on="locode")
    .rename(columns={"country": "previous_country"})
    .drop(columns="locode")
)
# ... then for the current country (this merge keeps its "locode" column).
df = df.merge(wpi, how="left", left_on="country_code", right_on="locode")

# Where the WPI lookup gave no name, fall back to the ISO table.
# Current country is filled first, then previous country.
for code_col, country_col in (
    ("country_code", "country"),
    ("previous_country_code", "previous_country"),
):
    df = df.merge(iso_codes, how="left", left_on=code_col, right_on="alpha-2")
    df = df.drop(columns="alpha-2")
    df[country_col] = df[country_col].fillna(df["name"])
    df = df.drop(columns="name")




In [15]:

# keep only the columns used by the geocoding steps
required_columns = [
    "player_id",
    "date",
    "previous_country_code",
    "previous_country",
    "previous_location",
    "country_code",
    "country",
    "location",
]
df = df[required_columns]

# keep rows where at least one of previous location / current location is present
has_any_location = df["previous_location"].notna() | df["location"].notna()
df = df[has_any_location].reset_index(drop=True)


The function below performs the following:   

1) If the country is not NA, use city + country as the search term.   
2) If there is no country available, search with the city only.
3) Look through the returned JSON payload and skip results that are partial matches.   
4) Extract the address components of the result.   
5) From the address components, extract the country and the city.   
6) Now, check if the country component matches the country in the search term. If yes, extract the location component (lat and lon). If no, return None.  
7) In case 2), i.e. no country in the search term, simply return the lat and lon.


In [16]:
# Function to geocode a city name, optionally constrained to a country
def _first_component_of_type(address_components, wanted_types):
    """Return the first address component tagged with the earliest matching type.

    ``wanted_types`` is searched in priority order: every component is checked
    for the first type before the second type is considered. Returns None when
    no component carries any of the types.
    """
    for wanted in wanted_types:
        for comp in address_components:
            if wanted in comp['types']:
                return comp
    return None


def geocode_location(city_name, country_name=None):
    """Geocode ``city_name`` through the module-level ``gmaps`` client.

    Parameters
    ----------
    city_name : str
        Free-text location. Anything that is not a non-blank string
        (None, NaN, numbers, whitespace) yields the "no match" result
        without calling the API.
    country_name : str, optional
        Country display name. When it is a non-blank string the query is
        "<city>, <country>" and only results whose country ``long_name``
        equals it (case-insensitively) are accepted. Any other value
        (None, NaN, empty string, non-string) means "no country": the query
        is the city alone and the first result that has a city-like
        component is accepted.

    Returns
    -------
    tuple
        ``(lat, lng, city_long_name, country_long_name)``; city or country
        may be None when the accepted result lacks that component.
        ``(None, None, None, None)`` when nothing is accepted or an error
        occurs (errors are printed, not raised).
    """
    no_match = (None, None, None, None)
    # Component types tried, in order, to name the "city" of a result.
    city_types = (
        'locality',
        'administrative_area_level_3',
        'administrative_area_level_2',
        'administrative_area_level_1',
    )
    try:
        # Handle missing or invalid inputs (including blank strings)
        if not isinstance(city_name, str) or not city_name.strip():
            return no_match

        # Decide ONCE whether a usable country was supplied, so the query that
        # is sent and the filter applied to its results always agree.
        has_country = isinstance(country_name, str) and bool(country_name.strip())

        if has_country:
            wanted_country = country_name.strip()
            result = gmaps.geocode(f"{city_name}, {wanted_country}")
        else:
            # If no country is provided, geocode using the city only
            result = gmaps.geocode(f"{city_name}")

        for res in result or []:
            # skip partial matches. They are usually incorrect
            if res.get('partial_match'):
                continue

            address_components = res.get('address_components', [])
            country_component = _first_component_of_type(address_components, ('country',))
            city_component = _first_component_of_type(address_components, city_types)

            if has_country:
                # NOTE(review): the comparison is on display names, so a source
                # that spells a country differently from the geocoder's
                # long_name is rejected -- confirm the input naming.
                if country_component is None:
                    continue
                if country_component['long_name'].lower() != wanted_country.lower():
                    continue
            elif city_component is None:
                # Without a country to check, require at least a city-like component.
                continue

            location = res['geometry']['location']
            return (
                location['lat'],
                location['lng'],
                city_component['long_name'] if city_component else None,
                country_component['long_name'] if country_component else None,
            )

        return no_match
    except Exception as e:
        # Deliberate best-effort: one bad row must not abort the whole run.
        print(f"Error geocoding {city_name}, {country_name}: {e}")
        return no_match

The function below calls the geocode_location() function and performs the following:   

First segment: handling the previous location:   

1) If both previous location and previous country are not NA, use them to obtain the lon and lat, the location (city) name and the country name.
2) Else, if there is no country available, use the location (city) only and set the status to "location not in country".

Second segment: handling the current location:    
Here, in addition to searching for the location result, we also try to deduce if the user forgot to update either the city or the country.    
1) Check if the country has not changed (i.e. the previous country is the same as the current country). If yes, first search using the new city and new country.   
2) If the above leads to no successful search, there is a possibility that the user forgot to change the country. Therefore we search using the city only and set the status to "country not updated". 

3) If the previous location (city) is the same as the current location (city) but the countries are different, we assume that the user did not update the location (city).   
4) If none of the above is true, do a normal search with city and country.


In [17]:
# Function to assess the status of location and determine if country or city was not updated
def _geocode_checked(city_name, country_name=None):
    """Call geocode_location and blank both coordinates unless both are present."""
    lat, lon, city, country = geocode_location(city_name, country_name)
    if lat is None or lon is None:
        return None, None, city, country
    return lat, lon, city, country


def assess_location_status(record):
    """Geocode the previous and current location of one record.

    Parameters
    ----------
    record : mapping
        Must provide 'previous_location', 'previous_country', 'location'
        and 'country' (a DataFrame row when used with ``df.apply(axis=1)``).

    Returns
    -------
    pandas.Series
        Ten values, in order: status_previous, previous lat, previous lon,
        previous city result, previous country result, status_current,
        current lat, current lon, current city result, current country result.
        Statuses are "" unless a fallback search was needed:
        "location not in country" (previous), "country not updated" or
        "city not updated" (current).
    """
    status_previous = ""
    status_current = ""
    previous = (None, None, None, None)
    current = (None, None, None, None)

    # Section 1: Previous Location
    if pd.notna(record['previous_location']):
        if pd.notna(record['previous_country']):
            # First attempt: previous location constrained to the previous country
            previous = _geocode_checked(record['previous_location'], record['previous_country'])
            if previous[0] is None:
                # Not found in that country: flag it and retry with the city alone
                status_previous = "location not in country"
                previous = _geocode_checked(record['previous_location'])
        else:
            # No previous country on record: search by city only, as the
            # current-location section does. No status is set because there
            # was no country to contradict.
            previous = _geocode_checked(record['previous_location'])

    # Section 2: Current Location
    if record['previous_country'] == record['country']:  # country unchanged (NaN never equals NaN)
        if pd.notna(record['location']):
            current = _geocode_checked(record['location'], record['country'])
            if current[0] is None:
                # City not found in the unchanged country: the user may have
                # moved without updating the country, so search by city only.
                status_current = "country not updated"
                current = _geocode_checked(record['location'])
    elif pd.notna(record['previous_location']) and record['previous_location'] == record['location']:
        # Country changed but the city text did not: assume the city is stale
        status_current = "city not updated"
        current = _geocode_checked(record['location'])
    else:
        # Normal search with location and country
        current = _geocode_checked(record['location'], record['country'])

    return pd.Series([status_previous, *previous, status_current, *current])

In [18]:
# Geocode every row; the ten values returned per row become ten new columns
result_columns = [
    'status_previous',
    'previous_location_lat',
    'previous_location_lon',
    'location_result_previous',
    'country_result_previous',
    'status_current',
    'location_lat',
    'location_lon',
    'location_result_current',
    'country_result_current',
]
df[result_columns] = df.apply(assess_location_status, axis=1)

Error geocoding 34.1443° N, 118.0019° W, None: INVALID_REQUEST
Error geocoding 42N 50W, None: INVALID_REQUEST
Error geocoding 27.4705° S, 153.0260° E, None: INVALID_REQUEST
Error geocoding 27.4705° S, 153.0260° E, None: INVALID_REQUEST
Error geocoding 89°, None: INVALID_REQUEST
Error geocoding 89°, None: INVALID_REQUEST


In [19]:
# write the complete table (inputs plus all geocoding columns) to disk
df.to_csv(path_or_buf="full_result.csv", index=False)

In [20]:
# keep the identifying columns plus names and coordinates, then save them
summary_columns = [
    "player_id",
    "date",
    "previous_country",
    "previous_location",
    "previous_location_lat",
    "previous_location_lon",
    "country",
    "location",
    "location_lat",
    "location_lon",
]
df_2 = df[summary_columns]
df_2.to_csv(path_or_buf="full_result_2.csv", index=False)