# Applying Country Codes to Main Dataset

Using the pycountry package, the country codes will be applied to the Main Dataset for graphing purposes, such as heatmaps.

In [1]:
import pycountry
import pandas as pd
import numpy as np
import pycountry_convert as pc


In [2]:
# load dataset
df = pd.read_csv('Main dataset 7-10.csv')

In [3]:
# looking at data
df.head(10)

Unnamed: 0,id,observed_on_string,observed_on,time_observed_at,time_zone,user_id,user_login,user_name,created_at,updated_at,...,place_town_name,place_county_name,place_state_name,place_country_name,species_guess,scientific_name,common_name,iconic_taxon_name,taxon_id,Source
0,293700.0,5/21/2013,5/21/2013,,Eastern Time (US & Canada),4860.0,rcurtis,Rob Curtis,2013-06-09 22:09:46 UTC,2022-12-22 17:14:07 UTC,...,,Summit,Ohio,United States,Hesperomyces harmoniae,Hesperomyces harmoniae,,Fungi,1433448.0,iNaturalist
1,803544.0,7/26/2014 10:20,7/26/2014,2014-07-26 17:20:28 UTC,Pacific Time (US & Canada),2991.0,sea-kangaroo,,2014-07-27 05:01:45 UTC,2022-12-17 21:12:01 UTC,...,Mountain View,Santa Clara,California,United States,Hesperomyces virescens,Hesperomyces virescens,Green Beetle Hanger,Fungi,1267749.0,iNaturalist
2,804805.0,7/4/2014 9:30,7/4/2014,2014-07-04 13:30:02 UTC,Eastern Time (US & Canada),4860.0,rcurtis,Rob Curtis,2014-07-28 02:29:17 UTC,2023-01-09 20:58:06 UTC,...,,Summit,Ohio,United States,Marienk√§ferpilz,Hesperomyces virescens,Green Beetle Hanger,Fungi,1267749.0,iNaturalist
3,852643.0,7/1/2014,7/1/2014,,Eastern Time (US & Canada),30601.0,calopteryx,Matthew O'Donnell,2014-08-29 05:02:52 UTC,2023-01-09 20:58:01 UTC,...,,Fairfax,Virginia,United States,Marienk√§ferpilz,Hesperomyces virescens,Green Beetle Hanger,Fungi,1267749.0,iNaturalist
4,1050649.0,11/2/2014,11/2/2014,,Auckland,15329.0,stephen_thorpe,Stephen Thorpe,2014-11-02 02:55:10 UTC,2022-08-24 22:28:06 UTC,...,,Auckland Region2,Auckland,New Zealand,Hesperomyces coccinelloides,Hesperomyces coccinelloides,,Fungi,707811.0,iNaturalist
5,1765681.0,7/11/2015,7/11/2015,,Eastern Time (US & Canada),12045.0,larry522,Larry Clarfeld,2015-07-15 13:05:04 UTC,2023-01-09 21:12:40 UTC,...,Woodstock,Windsor,Vermont,United States,Hesperomyces harmoniae,Hesperomyces harmoniae,,Fungi,1433448.0,iNaturalist
6,1814527.0,7/18/2015,7/18/2015,,Eastern Time (US & Canada),76331.0,khimmler,Kurtis Himmler,2015-07-30 00:06:27 UTC,2023-01-09 20:57:43 UTC,...,"East Brunswick, NJ",Middlesex,New Jersey,United States,Marienk√§ferpilz,Hesperomyces virescens,Green Beetle Hanger,Fungi,1267749.0,iNaturalist
7,1876714.0,8/9/2015 20:28,8/9/2015,2015-08-10 00:28:15 UTC,Eastern Time (US & Canada),3773.0,anita363,Anita,2015-08-20 17:37:17 UTC,2023-01-09 20:57:37 UTC,...,Highland Park New Jersey,Middlesex,New Jersey,United States,Green Beetle Hanger,Hesperomyces virescens,Green Beetle Hanger,Fungi,1267749.0,iNaturalist
8,2010658.0,9/18/2015 21:43,9/18/2015,2015-09-19 01:43:55 UTC,Eastern Time (US & Canada),3773.0,anita363,Anita,2015-09-29 04:00:23 UTC,2022-12-17 17:45:14 UTC,...,Highland Park New Jersey,Middlesex,New Jersey,United States,Green Beetle Hanger,Hesperomyces virescens,Green Beetle Hanger,Fungi,1267749.0,iNaturalist
9,2648522.0,2/6/2016,2/6/2016,,Auckland,15329.0,stephen_thorpe,Stephen Thorpe,2016-02-06 03:26:52 UTC,2022-08-24 22:29:09 UTC,...,,Auckland Region2,Auckland,New Zealand,Hesperomyces coccinelloides,Hesperomyces coccinelloides,,Fungi,707811.0,iNaturalist


In [4]:
df_source_not_inaturalist = df.loc[df['Source'] != 'iNaturalist']

In [5]:
# testing out pycountry package
france = pycountry.countries.get(alpha_2='FR')
france

Country(alpha_2='FR', alpha_3='FRA', flag='üá´üá∑', name='France', numeric='250', official_name='French Republic')

In [1]:
from geopy.geocoders import Nominatim
import pandas as pd

def get_country_code(latlong_series):
    """
    Given a Pandas Series of latitude-longitude coordinates, returns a list of 
    corresponding country codes obtained through reverse geocoding using the 
    Nominatim geocoding service from the geopy library. If a location cannot be 
    found for a given coordinate, the string 'needs to be checked' is returned 
    instead of a country code.

    Parameters
    ----------
    latlong_series : pandas.Series
        A Pandas Series of strings representing latitude-longitude coordinates 
        in the format "latitude,longitude". Example: ["37.7749,-122.4194", 
        "40.7128,-74.0060", ...].

    Returns
    -------
    list of str
        A list of ISO 3166-1 alpha-2 country codes corresponding to the 
        coordinates in `latlong_series`, or the string 'needs to be checked' 
        if a location cannot be found for a given coordinate.

    """
    # Create a geolocator object for the Nominatim geocoding service
    geolocator = Nominatim(user_agent="my_application")

    # Apply the reverse_geocode() method to all values in latlong_series
    locations = latlong_series.apply(lambda x: geolocator.reverse(x, language="en"))

    # Extract the country code from each location object, or append "needs to be checked"
    # if the location is None
    country_codes = locations.apply(lambda x: x.raw['address']['country_code'] if x is not None else 'needs to be checked')
    
    print(locations)
    # Convert the resulting Pandas Series back to a list and return it
    return country_codes.tolist()


In [16]:
# assuming your original dataframe is called "df"
df_source_not_inaturalist['country code'] = get_country_code(df_source_not_inaturalist['combined_lat_and_lon'])

3130    (Salon and Spa Venessa, 8516, Main Street, Dob...
3131    (12638, 14th Avenue South, Boulevard Park, Bur...
3132    (Walygator Sud-Ouest, Route d'Agen, Roquefort,...
3133    (16231, Windemeir Lane, Huntington Beach, Oran...
3134    (Huatugou, Mangnai City, Haixi Mongol and Tibe...
                              ...                        
3456    (Alte Ziegelei, August-Bebel-Stra√üe, Neukirche...
3457    (5, Fabricestra√üe, Albertstadt, Neustadt, Dres...
3458    (58, Friedrich-Rottra-Stra√üe, Efringen-Kirchen...
3459    (Station Jannowitzbr√ºcke, Br√ºcke 2, Rolandufer...
3460    (9–∞, Starostynska Street, Ostroh, Ostroh Urban...
Name: combined_lat_and_lon, Length: 331, dtype: object


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_source_not_inaturalist['country code'] = get_country_code(df_source_not_inaturalist['combined_lat_and_lon'])


In [17]:
df_source_not_inaturalist.to_csv('small dataset with country codes.csv')

In [19]:
def get_continent_name(continent_code: str) -> str:
    """
    Given a continent's code, returns the continent name as a whole string
    
    Args:
        continent_code(str): continent code
    """
    # dictionary of continent codes
    continent_dict = {
        "NA": "North America",
        "SA": "South America",
        "AS": "Asia",
        "AF": "Africa",
        "OC": "Oceania",
        "EU": "Europe",
        "AQ" : "Antarctica"
    }
    return continent_dict[continent_code]

# initialize continent list
continent_full = []


def get_continent_name(country_code):
    """
    Given an ISO 3166-1 alpha-2 country code or the string 'needs to be checked', 
    returns the corresponding continent name using the pycountry_convert library. 
    If the code is 'needs to be checked', the string 'needs to be checked' is returned 
    instead of a continent name.

    Parameters
    ----------
    country_code : str
        An ISO 3166-1 alpha-2 country code or the string 'needs to be checked'. 
        Example: 'US', 'CA', 'MX', 'needs to be checked'.

    Returns
    -------
    str
        The continent name corresponding to the code, or the string 'needs to be checked' 
        if the code is 'needs to be checked'.

    """
    # Convert the code to uppercase
    country_code = country_code.upper()

    # Check if the code is 'needs to be checked'
    if country_code == 'NEEDS TO BE CHECKED':
        return 'needs to be checked'

    # Use pycountry_convert to convert the code to a continent name
    try:
        country_continent_code = pc.country_alpha2_to_continent_code(country_code)
        continent_name = pc.convert_continent_code_to_continent_name(country_continent_code)
        return continent_name
    except KeyError:
        # If the code is not recognized, return 'needs to be checked'
        return 'needs to be checked'


In [20]:
# assuming your original dataframe is called "df"
df_source_not_inaturalist['continent name'] = df_source_not_inaturalist['country code'].apply(get_continent_name)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_source_not_inaturalist['continent name'] = df_source_not_inaturalist['country code'].apply(get_continent_name)


In [21]:
df_source_not_inaturalist

Unnamed: 0,id,observed_on_string,observed_on,time_observed_at,time_zone,user_id,user_login,user_name,created_at,updated_at,...,place_state_name,place_country_name,species_guess,scientific_name,common_name,iconic_taxon_name,taxon_id,Source,country code,continent name
3130,,"Wednesday, June 9, 2010",6/9/2010 0:00,,Eastern Time (US & Canada),,John Chulick,,,,...,,,Hesperomyces harmoniae,Hesperomyces harmoniae,,Fungi,,Flickr,us,North America
3131,,"Wednesday, October 11, 2006",########,,Pacific Daylight Time,,Cheryl Moorehead,,,,...,,,Hesperomyces harmoniae,Hesperomyces harmoniae,,Fungi,,Flickr,us,North America
3132,,"Sunday, February 8, 2015",2/8/2015 0:00,,Central European Time,,Debouvry,,,,...,,,Hesperomyces harmoniae,Hesperomyces harmoniae,,Fungi,,Flickr,fr,Europe
3133,,"Thursday, July 24, 2014",########,,Pacific Daylight Time,,Mollivan Jon,,,,...,,,Hesperomyces harmoniae,Hesperomyces harmoniae,,Fungi,,Flickr,us,North America
3134,,"Saturday, March 31, 2012",########,,Central Daylight Time,,Coastlander,,,,...,,,Hesperomyces harmoniae,Hesperomyces harmoniae,,Fungi,,Flickr,cn,Asia
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3456,,,18/10/2021,18/10/2021 00:00,Germany,,naturgucker,,,,...,,,Hesperomyces harmoniae,Hesperomyces harmoniae,,Fungi,,Naturgucker,de,Europe
3457,,,18/10/2021,18/10/2021 00:00,Germany,,naturgucker,,,,...,,,Hesperomyces harmoniae,Hesperomyces harmoniae,,Fungi,,Naturgucker,de,Europe
3458,,,18/10/2021,18/10/2021 00:00,Germany,,naturgucker,,,,...,,,Hesperomyces harmoniae,Hesperomyces harmoniae,,Fungi,,Naturgucker,de,Europe
3459,,,31/10/2021,31/10/2021 00:00,Germany,,naturgucker,,,,...,,,Hesperomyces harmoniae,Hesperomyces harmoniae,,Fungi,,Naturgucker,de,Europe


In [22]:
# create a csv with the applied country and continents
df_source_not_inaturalist.to_csv('small dataset with country codes and country names.csv')

In [27]:
from geopy.geocoders import Nominatim
from geopy.exc import GeocoderTimedOut

def get_country_name(country_code, timeout=5):
    """
    Returns the country name based on the given country code.

    Args:
        country_code (str): Country code (ISO 3166-1 alpha-2).
        timeout (int): Timeout value for the geocoding request in seconds. Default is 5 seconds.

    Returns:
        str: Country name.

    Example:
        >>> get_country_name("PL")
        'Poland'
    """
    geolocator = Nominatim(user_agent="country_info")
    location = None
    try:
        location = geolocator.geocode(country_code, exactly_one=True, timeout=timeout)
    except GeocoderTimedOut:
        return get_country_name(country_code, timeout=timeout)  # Retry the request in case of timeout error
    
    if location:
        return location.address.split(",")[-1].strip()
    else:
        return None


In [28]:
# assuming your original dataframe is called "df"
df_source_not_inaturalist['country name'] = df_source_not_inaturalist['country code'].apply(get_country_name)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_source_not_inaturalist['country name'] = df_source_not_inaturalist['country code'].apply(get_country_name)


In [29]:
# create a csv with the applied country and continents
df_source_not_inaturalist.to_csv('small dataset with country codes and country names.csv')