# Check sheep coordinates
Check whether the GPS coordinates recorded for each sheep sample fall inside the country declared for that sample.

In [1]:
import pandas as pd
import geopandas as gpd
import geodatasets

from io import StringIO
from functools import lru_cache
from shapely.geometry import Point
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter
from tqdm.notebook import tqdm

from src.features.smarterdb import global_connection, SampleSheep
from src.features.utils import countries

In [2]:
# Open the database connection through the project helper
# (presumably required before the SampleSheep queries below can run — confirm in src.features.smarterdb).
conn = global_connection()
# Register `progress_apply` on pandas objects so long-running `apply` calls display a progress bar.
tqdm.pandas()

def fix_id(df: pd.DataFrame, field: str = "_id", set_index: bool = False):
    """Unwrap ``{"$oid": ...}`` id dicts stored in a column.

    Every dict found in ``df[field]`` is replaced by the value stored under
    its ``"$oid"`` key. Values that are not dicts (missing values, or ids
    already unwrapped by a previous call) are left untouched, so the function
    can safely be applied more than once.

    Args:
        df: frame holding the column to fix; it is not modified.
        field: name of the column containing the ``$oid`` dicts.
        set_index: when True, the unwrapped column becomes the index.

    Returns:
        A new DataFrame with ``field`` unwrapped (and used as index when
        ``set_index`` is True).

    Raises:
        KeyError: if ``field`` is not a column, or a dict lacks ``"$oid"``.
    """

    def unwrap(val):
        # only dicts carry the "$oid" wrapper; pass anything else through
        if isinstance(val, dict):
            return val["$oid"]
        return val

    # assign() builds a new frame, so the caller's DataFrame is never mutated
    df = df.assign(**{field: df[field].apply(unwrap)})
    if set_index:
        df = df.set_index(field)
    return df

def add_geometry(df: pd.DataFrame):
    """Create a ``geometry`` column out of the ``locations`` column.

    For every dict found in ``locations`` a ``Point`` is built from the first
    entry of its ``'coordinates'`` list; any other value is copied unchanged.
    The column is added to ``df`` itself and the same frame is returned.

    NOTE(review): presumably each location is a GeoJSON-like multi-point whose
    positions are (longitude, latitude) — confirm against the data model.
    """

    def to_point(location):
        # anything that is not a dict is passed through as-is
        if not isinstance(location, dict):
            return location
        # only the first position is used, even if more are present
        first_position = location['coordinates'][0]
        return Point(*first_position)

    df['geometry'] = df['locations'].apply(to_point)
    return df

Collect all sheep samples that have geographical coordinates:

In [3]:
# Keep only the sheep samples for which the `locations` field exists
sheeps = SampleSheep.objects.filter(locations__exists=1)
# Show how many samples matched the filter
sheeps.count()

9295

In [4]:
# Load the query result into pandas through its JSON dump, unwrap the two id
# columns, derive the geometry column and discard the fields that are not
# needed for the country check.
sheep_df = (
    pd.read_json(StringIO(sheeps.to_json()))
    .pipe(fix_id, "_id")
    .pipe(fix_id, "dataset_id")
    .pipe(add_geometry)
    .drop(columns=['locations', 'metadata', 'phenotype', 'father_id', 'mother_id', 'sex', 'alias'])
)
sheep_df.head()

Unnamed: 0,_id,original_id,smarter_id,country,breed,breed_code,dataset_id,type,chip_name,species,geometry
0,664f6b13fac53bfc4b0a70ff,20181210002,UYOA-TEX-000000001,Uruguay,Texel,TEX,604f75a61a08c53cebd09b67,background,IlluminaOvineSNP50,Ovis aries,POINT (-54.79980235632396 -32.85966555055839)
1,664f6b14fac53bfc4b0a7100,20181210003,UYOA-TEX-000000002,Uruguay,Texel,TEX,604f75a61a08c53cebd09b67,background,IlluminaOvineSNP50,Ovis aries,POINT (-54.79980235632396 -32.85966555055839)
2,664f6b14fac53bfc4b0a7101,20181210005,UYOA-TEX-000000003,Uruguay,Texel,TEX,604f75a61a08c53cebd09b67,background,IlluminaOvineSNP50,Ovis aries,POINT (-54.79980235632396 -32.85966555055839)
3,664f6b14fac53bfc4b0a7102,20181210006,UYOA-TEX-000000004,Uruguay,Texel,TEX,604f75a61a08c53cebd09b67,background,IlluminaOvineSNP50,Ovis aries,POINT (-54.79980235632396 -32.85966555055839)
4,664f6b15fac53bfc4b0a7103,20181210008,UYOA-TEX-000000005,Uruguay,Texel,TEX,604f75a61a08c53cebd09b67,background,IlluminaOvineSNP50,Ovis aries,POINT (-54.79980235632396 -32.85966555055839)


Next, we need to get the world boundary features:

In [5]:
# Read the 'naturalearth.land' dataset located through geodatasets;
# in the cells shown here only its CRS (`world.crs`) is used afterwards.
world = gpd.read_file(geodatasets.get_path('naturalearth.land'))

Now convert the `DataFrame` to a `GeoDataFrame`, explicitly declaring its *coordinate reference system*, which is *WGS84 (EPSG:4326)*:

In [6]:
# Build the GeoDataFrame declaring WGS84 as CRS, then label it with the CRS of
# the world layer.
# NOTE(review): set_crs() only declares a CRS, it does not reproject; since a
# CRS is already set this is a no-op when world.crs is EPSG:4326 and is
# expected to raise otherwise — confirm whether to_crs() was intended.
sheep_gdf = gpd.GeoDataFrame(sheep_df, crs="EPSG:4326").set_crs(world.crs)
sheep_gdf.head()

Unnamed: 0,_id,original_id,smarter_id,country,breed,breed_code,dataset_id,type,chip_name,species,geometry
0,664f6b13fac53bfc4b0a70ff,20181210002,UYOA-TEX-000000001,Uruguay,Texel,TEX,604f75a61a08c53cebd09b67,background,IlluminaOvineSNP50,Ovis aries,POINT (-54.79980 -32.85967)
1,664f6b14fac53bfc4b0a7100,20181210003,UYOA-TEX-000000002,Uruguay,Texel,TEX,604f75a61a08c53cebd09b67,background,IlluminaOvineSNP50,Ovis aries,POINT (-54.79980 -32.85967)
2,664f6b14fac53bfc4b0a7101,20181210005,UYOA-TEX-000000003,Uruguay,Texel,TEX,604f75a61a08c53cebd09b67,background,IlluminaOvineSNP50,Ovis aries,POINT (-54.79980 -32.85967)
3,664f6b14fac53bfc4b0a7102,20181210006,UYOA-TEX-000000004,Uruguay,Texel,TEX,604f75a61a08c53cebd09b67,background,IlluminaOvineSNP50,Ovis aries,POINT (-54.79980 -32.85967)
4,664f6b15fac53bfc4b0a7103,20181210008,UYOA-TEX-000000005,Uruguay,Texel,TEX,604f75a61a08c53cebd09b67,background,IlluminaOvineSNP50,Ovis aries,POINT (-54.79980 -32.85967)


Now try to determine the country relying on GPS coordinates:

In [7]:
# define the reverse geocoding
# NOTE(review): the Nominatim usage policy asks for a user agent identifying
# the application; "myGeocoder" is generic — consider replacing it.
locator = Nominatim(user_agent="myGeocoder", timeout=10)
# The public Nominatim service allows at most one request per second:
# a shorter delay risks having the client throttled or blocked.
rgeocode = RateLimiter(locator.reverse, min_delay_seconds=1)


@lru_cache(maxsize=None)
def _country_from_coordinates(latitude: float, longitude: float):
    """Reverse geocode a (latitude, longitude) pair into a country name.

    Results are cached on the plain float pair, so every distinct location is
    requested only once and the cache does not depend on the geometry object
    being hashable.

    Returns:
        The ``name`` of the entry returned by ``countries.get`` for the
        geocoded country code, or None when the geocoder gives no result, the
        result has no country code, or the code is unknown to ``countries``.
    """
    data = rgeocode((latitude, longitude), language="en")
    if not data:
        return None

    # some results may carry no address / country code at all
    country_code = data.raw.get('address', {}).get('country_code')
    if country_code is None:
        return None

    # presumably `countries` is an ISO 3166 database (e.g. pycountry), which
    # may not know every code returned by the geocoder — TODO confirm
    country = countries.get(alpha_2=country_code)
    return country.name if country is not None else None


def get_country(point):
    """Return the country name in which ``point`` falls, or None.

    Args:
        point: a geometry exposing ``x`` (longitude) and ``y`` (latitude).
            Missing or empty geometries yield None instead of raising.
    """
    if point is None or getattr(point, "is_empty", True):
        return None
    return _country_from_coordinates(point.y, point.x)


# find countries
sheep_gdf["geocoding_country"] = sheep_gdf["geometry"].progress_apply(get_country)

  0%|          | 0/9295 [00:00<?, ?it/s]

Now select all the records where the declared country differs from the geocoded country, and export them for review:

In [8]:
# Rows whose declared country is not the one found by reverse geocoding
# (rows without a geocoded country compare as different and are included).
country_differs = sheep_gdf["country"].ne(sheep_gdf["geocoding_country"])
mismatch = (
    sheep_gdf.loc[country_differs]
    .assign(
        latitude=lambda frame: frame["geometry"].y,
        longitude=lambda frame: frame["geometry"].x,
    )
    # the point geometry is replaced by the two plain coordinate columns
    .drop(columns=["geometry"])
)
mismatch.to_excel("mismatch.xlsx", index=False)