In [None]:
import json
import os
import sys
from functools import lru_cache
from pathlib import Path

import pandas as pd
from tqdm import tqdm

sys.path.insert(0, "..")
from twittertools.geocode import (
    load_friends_data, load_city_data, load_state_data,
    geocode_simplemaps, test,
)

## Geocode with SimpleMaps

In [None]:
# Geocode every friend's free-text profile location with the SimpleMaps helper
# and collect one record per account.
# NOTE(review): `lookup_index` is not among the names imported from
# twittertools.geocode at the top of the notebook — confirm where it is defined.
data = load_friends_data()
data_coded = []
for screen_name, location in tqdm(data):
    res = geocode_simplemaps(location)
    # lookup_index may yield (None, None) when nothing matched.
    city, country = lookup_index(res)
    data_coded.append(
        dict(screen_name=screen_name, location=location, city=city, country=country)
    )
# A `return` statement is a SyntaxError at cell (module) level. Bind the frame
# to `res_df`, the name the scoring cells read.
res_df = pd.DataFrame(data_coded)

### Compute Scores

In [16]:
SCORES = []

# Overall hit rate: rows with a resolved country over all rows.
denomiator = len(res_df)
numerator = len(res_df[res_df['country'].notna()])

# Same ratio restricted to rows whose raw location string is not empty
# (this filters on "" rather than on NaN).
res_df_dropna = res_df[res_df['location'] != ""]
denomiator2 = len(res_df_dropna)
numerator2 = len(res_df_dropna[res_df_dropna['country'].notna()])

print(f"Success: {numerator}/{denomiator} = {numerator / denomiator}")

print("\nExcluding empty:")
print(f"Success: {numerator2}/{denomiator2} = {numerator2 / denomiator2}")

# Only the score that excludes empty locations is recorded.
SCORES.append(numerator2 / denomiator2)

Success: 917/1884 = 0.4867303609341826

Excluding empty:
Success: 917/1482 = 0.6187584345479082


In [29]:
# Literal (non-regex) substring match, so the "." in "st." is not a wildcard.
st_mask = df_cities['city'].str.contains("st.", regex=False)
df_cities[st_mask]

Unnamed: 0,city,city_ascii,lat,lng,country,iso2,iso3,admin_name,capital,population,id
385,st. louis,st. louis,38.6358,-90.2451,united states,us,usa,missouri,,2024074.0,1840001651
1463,port st. lucie,port st. lucie,27.2796,-80.3883,united states,us,usa,florida,,461152.0,1840015119
1970,st. catharines,st. catharines,43.1833,-79.2333,canada,ca,can,ontario,,309319.0,1124140229
1975,st. paul,st. paul,44.9477,-93.1040,united states,us,usa,minnesota,admin,308096.0,1840008940
2194,st. petersburg,st. petersburg,27.7931,-82.6652,united states,us,usa,florida,,265351.0,1840015977
...,...,...,...,...,...,...,...,...,...,...,...
25381,west st. paul,west st. paul,50.0119,-97.1150,canada,ca,can,manitoba,,5368.0,1124001136
25393,st. paul park,st. paul park,44.8360,-92.9949,united states,us,usa,minnesota,,5362.0,1840008917
25512,st. clair,st. clair,42.8262,-82.4930,united states,us,usa,michigan,,5279.0,1840003954
25688,st. james,st. james,33.9467,-78.1132,united states,us,usa,north carolina,,5141.0,1840017954


In [12]:
# Rows of the non-empty-location subset for which no country was resolved.
country_missing = res_df_dropna['country'].isna()
missed = res_df_dropna.loc[country_missing]

In [13]:
# Spot-check a random sample of the misses. The fixed seed makes the displayed
# rows reproducible across runs, and the size is capped so the cell still
# works when fewer than 30 rows were missed.
missed.sample(min(30, len(missed)), random_state=0)

Unnamed: 0,screen_name,location,city,country
4264,Pinboard,Lonely street of broken dreams,,
2129,nowthisnews,The Internet & NYC,,
812,RacismDog,United States,,
3003,LeonYin,www,,
2481,DigitalBlkHippy,a mac around a bunch of pcs,,
3437,glasnt,Eora Country,,
3686,Esquiring,NYC // SF // 🌊,,
2407,css,"Chris in Bend, OR + Remote Team",,
1150,staceyabrams,Georgia,,
2828,salimismail,"ÜT: 37.458282,-122.164066",,


In [110]:
# with pd.option_context('display.max_rows', None):
#     display(missed)

In [111]:
# with pd.option_context('display.max_rows', None):
#     display(res_df_dropna[~res_df_dropna['country'].isna()])

## Geocoding with a Web Service (Nominatim)

In [1]:
from geopy.geocoders import Nominatim

In [None]:
# geopy Nominatim geocoder client, identifying itself as "twitter-tools".
geolocator = Nominatim(user_agent="twitter-tools")

In [None]:
# Tabulate the records collected in `data_coded`.
# NOTE(review): the cells below read address columns from this frame, so it
# presumably expects the records built by the web-geocoding loop further
# down — confirm the intended execution order.
df = pd.DataFrame(data_coded)

In [None]:
def get_country(loc):
    """Return the last comma-separated component of an address string.

    Parameters
    ----------
    loc : str or None
        Address text such as ``"Paris, Ile-de-France, France"``. Any
        non-string value (``None``, a float ``NaN`` from pandas, ...) is
        treated as missing.

    Returns
    -------
    str or None
        The text after the final comma with surrounding whitespace removed
        (the whole stripped string when there is no comma), or ``None`` when
        ``loc`` is not a string.
    """
    # An explicit type check replaces the former bare ``except:``, which hid
    # every error (including KeyboardInterrupt) instead of only the
    # missing-value case.
    if not isinstance(loc, str):
        return None
    return loc.split(',')[-1].strip()

In [None]:
# Derive the country from the resolved address. The records produced by
# `geocode` store the address under 'loc_address'; the former
# 'location_address' name does not match that key and raises KeyError.
s = df['loc_address']
df['country'] = s.apply(get_country)

In [None]:
# Write the frame to data.csv in the current working directory.
df.to_csv('data.csv')

In [None]:
# Build the lookup cache: raw location string -> geocoding result.
# Records that carry 'location_raw' come from the web-geocoding loop, whose
# results use the `loc_*` keys returned by `geocode`. Read those keys; the
# former `location_*` names do not exist on these records and raise KeyError.
geocoded = {
    item['location_raw']: dict(
        loc_address=item['loc_address'],
        loc_latitude=item['loc_latitude'],
        loc_longitude=item['loc_longitude'],
    )
    for item in data_coded
}

In [None]:
# Persist the geocoding cache as JSON next to the notebook.
Path('geo_cache.json').write_text(json.dumps(geocoded))

In [None]:
def geocode(loc):
    """Look up a free-text location with the module-level ``geolocator``.

    Parameters
    ----------
    loc : str or None
        Location query passed to ``geolocator.geocode``. ``None`` and blank
        strings are answered locally without contacting the service.

    Returns
    -------
    dict
        Always has the keys ``loc_address``, ``loc_latitude`` and
        ``loc_longitude``. All three are ``None`` when the query is blank or
        the geocoder returns no match.
    """
    no_match = dict(
        loc_address=None,
        loc_latitude=None,
        loc_longitude=None
    )
    # Empty profile locations can never match, so skip the web request.
    if loc is None or (isinstance(loc, str) and not loc.strip()):
        return no_match
    location = geolocator.geocode(loc)
    if not location:
        return no_match
    return dict(
        loc_address=location.address,
        loc_latitude=location.latitude,
        loc_longitude=location.longitude
    )

In [None]:
# Read the SimpleMaps world-cities CSV into a DataFrame.
df_cities = pd.read_csv('../data/simplemaps/worldcities.csv')

In [None]:
# One record per (screen_name, location) pair, storing whatever
# geocode_simplemaps returns for the location under "country".
data_coded = [
    dict(
        screen_name=screen_name,
        location=location,
        country=geocode_simplemaps(location),
    )
    for screen_name, location in data
]

In [None]:
# Distinct values that ended up in the "country" field.
pd.DataFrame(data_coded)['country'].unique()

In [None]:
# Run the SimpleMaps geocoder over every value of the 'location' column.
# NOTE(review): the web-geocoding records store the raw text under
# 'location_raw', not 'location' — confirm which `df` this cell expects.
df['location'].apply(geocode_simplemaps)

In [None]:
# Geocode every account's location through the web service, going through
# the `geocoded` cache (raw location string -> result dict).
data_coded = []
for screen_name, loc in tqdm(data):
    entry = dict(screen_name=screen_name, location_raw=loc)
    # Store fresh lookups in the cache as well, so a location string that
    # occurs several times in `data` triggers only one web request.
    if loc not in geocoded:
        geocoded[loc] = geocode(loc)
    geo_vals = geocoded[loc]
    data_coded.append({**entry, **geo_vals})

### GeoJSON

In [None]:
# Parse the world GeoJSON file from the data directory.
geojson = json.loads(Path('../data/worldgeo.json').read_text())

In [None]:
# The 'sovereignt' property of every feature in the GeoJSON file.
[feature['properties']['sovereignt'] for feature in geojson['features']]