In [1]:
pip install geopy

Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install unidecode

Note: you may need to restart the kernel to use updated packages.


# Applying Coordinates of Regions for Published Records

To find the coordinates of regions in a dataset of published records, we can use the Nomanatim package from `geopy.geocoders`.

In [4]:
# Import the required library
from geopy.geocoders import Nominatim

# Initialize Nominatim API
geolocator = Nominatim(user_agent="MyApp")

# testing out the location
location = geolocator.geocode('Boston,MA')

# return to the user the lat and long values
print("The latitude of the location is: ", location.latitude)
print("The longitude of the location is: ", location.longitude)

The latitude of the location is:  42.3554334
The longitude of the location is:  -71.060511


### Breakdown:

Looking at the code above, we can see that given a region, tbhe lat and long values can be determined.

In [5]:
# import published records dataset
import pandas as pd

path = 'Data/Published Records Without Coordinates.xlsx'

df = pd.read_excel(path)

In [6]:
# take a look at the dataframe
df.head(15)

Unnamed: 0,Country,Region,Locality,Longitude,Latitude,Year,Month,Day,Reference(s)
0,China,Sichuan Province,Yibin,,,1930,April,10–14,"Haelewaters D, Comont RF, Zhao SY, Pfister DH...."
1,USA,Virginia,Pocahontas State Park,,,2002,May,11,"Haelewaters D, Zhao SY, Clusella-Trullas S, Co..."
2,USA,Tennessee,Great Smoky Mountains National Park,,,2002,July,1 & 9,"Haelewaters D, Zhao SY, Clusella-Trullas S, Co..."
3,USA,Ohio,"Wayne County, Wooster, The Ohio State Universi...",,,2002,July–November,,"Garces S, Williams R. 2004. First record of He..."
4,USA,Pennsylvania,"Lebanon County, Mt. Gretna",40?14.78'N,76?27.35'W,2002,November,20,"Riddick EW, Schaefer PW. 2005. Occurrence, den..."
5,USA,Pennsylvania,"Lebanon County, Mt. Gretna",40?14.78'N,76?27.35'W,2003,October–November,,"Riddick EW, Schaefer PW. 2005. Occurrence, den..."
6,USA,North Carolina,"Wake County, Weaverville",35.71Â°N,82.45Â°W,2003,October,13,"Nalepa CA, Weir A. 2007. Infection of Harmonia..."
7,USA,North Carolina,"Wake County, Cary, NCDA & CS Beneficial Insect...",35.79Â°N,78.73Â°W,2003,October,30,"Nalepa CA, Weir A. 2007. Infection of Harmonia..."
8,USA,North Carolina,"Wake County, Cary, NCDA & CS Beneficial Insect...",35.79Â°N,78.73Â°W,2003,December,3,"Nalepa CA, Weir A. 2007. Infection of Harmonia..."
9,USA,North Carolina,"Wake County, Cary, NCDA & CS Beneficial Insect...",35.79Â°N,78.73Â°W,2004,March,2,"Nalepa CA, Weir A. 2007. Infection of Harmonia..."


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 126 entries, 0 to 125
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Country       126 non-null    object
 1   Region        126 non-null    object
 2   Locality      13 non-null     object
 3   Longitude     8 non-null      object
 4   Latitude      9 non-null      object
 5   Year          126 non-null    object
 6   Month         12 non-null     object
 7   Day           8 non-null      object
 8   Reference(s)  126 non-null    object
dtypes: object(9)
memory usage: 9.0+ KB


# Analysis:
 Looking at the dataframe, we can find information on the region and country in the respective `Region` and `Country` columns. There is also a `Locality` section, but there are missing values. So we will go with a concatenation of `Region` and `Country` to provide the most accurate location with the given location.

In [8]:
from unidecode import unidecode

def remove_special_chars(string):
    """
    Removes special characters from a string using unidecode.
    
    Args:
        string (str): Input string to remove special characters from.
    
    Returns:
        str: Cleaned string with special characters removed.
    """
    return unidecode(string)


In [9]:
# concatenating region and country

def concatenate(string1, string2):
    """
    concatenates two strings together with a comma in between
    
    Args:
        string1(str): string1
        string2(str): string2
        
    Returns:
        string_final(str): string that was concatenated
    """
    
    string_final = f'{string1},{string2}'
    return string_final

In [10]:
# using concatenate function

# apply the function to the "Country" and "Region" columns using df.apply
df['Country_Region'] = df.apply(lambda row: concatenate(row['Region'], row['Country']), axis=1)

In [11]:
# Apply the function to the 'Region_Country' column and assign the cleaned values to a new column 'Cleaned Region'
df['Cleaned Region'] = df['Country_Region'].apply(remove_special_chars)

In [12]:
# looking at the new df column
df.head()

Unnamed: 0,Country,Region,Locality,Longitude,Latitude,Year,Month,Day,Reference(s),Country_Region,Cleaned Region
0,China,Sichuan Province,Yibin,,,1930,April,10–14,"Haelewaters D, Comont RF, Zhao SY, Pfister DH....","Sichuan Province,China","Sichuan Province,China"
1,USA,Virginia,Pocahontas State Park,,,2002,May,11,"Haelewaters D, Zhao SY, Clusella-Trullas S, Co...","Virginia,USA","Virginia,USA"
2,USA,Tennessee,Great Smoky Mountains National Park,,,2002,July,1 & 9,"Haelewaters D, Zhao SY, Clusella-Trullas S, Co...","Tennessee,USA","Tennessee,USA"
3,USA,Ohio,"Wayne County, Wooster, The Ohio State Universi...",,,2002,July–November,,"Garces S, Williams R. 2004. First record of He...","Ohio,USA","Ohio,USA"
4,USA,Pennsylvania,"Lebanon County, Mt. Gretna",40?14.78'N,76?27.35'W,2002,November,20,"Riddick EW, Schaefer PW. 2005. Occurrence, den...","Pennsylvania,USA","Pennsylvania,USA"


With a specified region with its country, we can move forward with the provision of coordinates using the function defined below.

In [13]:
# Initialize Nominatim API
geolocator = Nominatim(user_agent="MyApp")

def provide_coordinates(region):
    """
    Given a region, returns the region's lat and long.

    Args:
        region (str): Region to geocode.

    Returns:
        pd.Series([lat, long]): Pandas series containing latitude and longitude.
    """
    try:
        # Obtain the location, 4 seconds per try
        location = geolocator.geocode(region, timeout=4)

        if location is None:
            # will let us know and try 
            print(f"No location found for {region}. Trying with the string to the left of the comma...")

            # Try again with the string to the left of the comma in region
            left_of_comma = region.split(",")[0]
            location = geolocator.geocode(left_of_comma, timeout=10)
            
            # will let us kow if there was no location found
            if location is None:
                print(f"No location found for {left_of_comma} either.")
                return pd.Series([None, None])

        # Obtain lat and long values
        lat = location.latitude
        long = location.longitude

        return pd.Series([lat, long])

    except GeocoderTimedOut as e:
        print(f"Geocoding service timed out for {region}.")
        return pd.Series([None, None])


In [14]:
# apply the function to the "Region_Country" column using df.apply and create new "Latitude" and "Longitude" columns
df[['Latitude', 'Longitude']] = df['Cleaned Region'].apply(lambda x: provide_coordinates(x))


No location found for North Brabant Province,The Netherlands. Trying with the string to the left of the comma...
No location found for North Brabant Province,The Netherlands. Trying with the string to the left of the comma...
No location found for North Brabant Province,The Netherlands. Trying with the string to the left of the comma...
No location found for North Brabant Province,The Netherlands. Trying with the string to the left of the comma...
No location found for North Brabant Province,The Netherlands. Trying with the string to the left of the comma...
No location found for South Bohemian Region,Czech Republic. Trying with the string to the left of the comma...
No location found for South Bohemian Region either.
No location found for South-Western Region,Bulgaria. Trying with the string to the left of the comma...
No location found for Central Macedonian Region,Greece. Trying with the string to the left of the comma...
No location found for Central Macedonian Region either.
No lo

In [15]:
# look at the dataframe
df.head()

Unnamed: 0,Country,Region,Locality,Longitude,Latitude,Year,Month,Day,Reference(s),Country_Region,Cleaned Region
0,China,Sichuan Province,Yibin,102.5,30.5,1930,April,10–14,"Haelewaters D, Comont RF, Zhao SY, Pfister DH....","Sichuan Province,China","Sichuan Province,China"
1,USA,Virginia,Pocahontas State Park,-78.492772,37.123224,2002,May,11,"Haelewaters D, Zhao SY, Clusella-Trullas S, Co...","Virginia,USA","Virginia,USA"
2,USA,Tennessee,Great Smoky Mountains National Park,-86.282008,35.773008,2002,July,1 & 9,"Haelewaters D, Zhao SY, Clusella-Trullas S, Co...","Tennessee,USA","Tennessee,USA"
3,USA,Ohio,"Wayne County, Wooster, The Ohio State Universi...",-82.68814,40.225357,2002,July–November,,"Garces S, Williams R. 2004. First record of He...","Ohio,USA","Ohio,USA"
4,USA,Pennsylvania,"Lebanon County, Mt. Gretna",-77.727883,40.969989,2002,November,20,"Riddick EW, Schaefer PW. 2005. Occurrence, den...","Pennsylvania,USA","Pennsylvania,USA"


# Analysis:

There are a few regions that are problematic and are not caught by the script, since there are small in number, we can manually change and look up the coordinates. Regions such as `South Bohemian Region, Czech Republic`, `Central Macedonian Region,Greece`, and `Hunedoara County, Romania` need to be changed with a manual internet lookup.

In [16]:
# find 'South Bohemian Region'
df.loc[df['Cleaned Region'] == 'South Bohemian Region,Czech Republic']

Unnamed: 0,Country,Region,Locality,Longitude,Latitude,Year,Month,Day,Reference(s),Country_Region,Cleaned Region
64,Czech Republic,South Bohemian Region,,,,2014,,,"Fiedler & Nedv?d (2019), Haelewaters et al. (2...","South Bohemian Region,Czech Republic","South Bohemian Region,Czech Republic"


#### Lat and Long Values for 'South Bohemian Region'
48.9458° N, 14.4416° E 

In [17]:
# replace the values
# index is 64

df['Longitude'][64] = 14.4416
df['Latitude'][64] = 48.9458

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Longitude'][64] = 14.4416
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Latitude'][64] = 48.9458


In [18]:
# find 'Central Macedonian Region,Greece'
df.loc[df['Cleaned Region'] == 'Central Macedonian Region,Greece']

Unnamed: 0,Country,Region,Locality,Longitude,Latitude,Year,Month,Day,Reference(s),Country_Region,Cleaned Region
92,Greece,Central Macedonian Region,,,,2017,,,Ceryngier & Romanowski (2017),"Central Macedonian Region,Greece","Central Macedonian Region,Greece"


#### Lat and Long Values for 'Central Macedonian Region,Greece'
40.6212° N, 23.1918° E

In [19]:
# replace the values
# index is 92

df['Longitude'][92] = 23.1918
df['Latitude'][92] = 40.6212

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Longitude'][92] = 23.1918
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Latitude'][92] = 40.6212


In [20]:
# find 'Hunedoara County, Romania'
df.loc[df['Cleaned Region'] == 'Hunedoara County,Romania']

Unnamed: 0,Country,Region,Locality,Longitude,Latitude,Year,Month,Day,Reference(s),Country_Region,Cleaned Region
108,Romania,Hunedoara County,,,,2019,,,This paper: Additional material examined,"Hunedoara County,Romania","Hunedoara County,Romania"


#### Lat and Long Values for 'Hunedoara County, Romania'

45.767813° N, 22.907233° E

In [21]:
# replace the values
# index is 108

df['Longitude'][108] = 22.907233
df['Latitude'][108] = 45.767813

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Longitude'][108] = 22.907233
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Latitude'][108] = 45.767813


In [22]:
# check if there aer any nan values
df['Latitude'].unique()

array([ 30.5000001 ,  37.1232245 ,  35.7730076 ,  40.2253569 ,
        40.9699889 ,  35.6729639 ,  32.9715285 ,  37.5726028 ,
        32.6944793 ,  44.3763598 ,  40.9381735 ,  36.0721527 ,
        30.2226352 ,  29.834772  ,  42.2801822 ,  50.9341625 ,
        32.5543462 ,  50.8802265 ,  50.6080651 ,  48.3472808 ,
        51.48587835,  27.3900897 ,  50.8035441 ,  42.7269522 ,
        50.5862066 ,  52.6008538 ,  52.1343369 ,  52.0659639 ,
        52.0187505 ,  52.1847669 ,  51.58561875,  51.9685817 ,
        51.96802135,  38.2722313 ,  51.0962462 ,  52.0809856 ,
        54.7813351 ,  30.301949  ,  50.7981063 ,  39.2908816 ,
        39.9194117 ,  51.0556204 ,  42.9045675 ,  48.9747357 ,
        48.8588897 ,  49.882834  ,  52.2319581 , -33.934444  ,
        48.9458    ,  -1.3397668 ,  47.531399  ,  52.02620525,
        41.7425538 ,  42.9525022 ,  42.485452  ,  40.8832318 ,
        49.2608724 ,  47.6677606 ,  48.1516988 ,  42.2582044 ,
       -22.4984079 ,  48.2083537 ,  45.7162129 ,  44.45

In [23]:
# create the published records coordinates csv
df.to_csv('Published Records With Coordinates.csv')