In [2]:
import json
import os
import sys
import random

import pandas as pd
from tqdm import tqdm

In [3]:
# %load_ext autoreload
# %autoreload 2

# Put the parent directory first on the import path; the `twittertools`
# import below presumably resolves from there (assumes the notebook lives one
# level below the package root — TODO confirm).
sys.path.insert(0, "..")
from twittertools.geocode import (
    load_friends_data, load_city_data, load_state_data,
    geocode_simplemaps, lookup_index, test,
)

## Geocode with SimpleMaps

In [4]:
# Load the friends list; the cells below iterate it as
# (screen_name, location) pairs.
data = load_friends_data()
len(data)

1884

In [5]:
# s = pd.Series(data)
# data = list(s.sample(n=1000))
# len(data)

In [6]:
# Run the SimpleMaps geocoder over every (screen_name, location) pair and
# gather one record per account.
data_coded = []
for screen_name, location in tqdm(data):
    res = geocode_simplemaps(location)
    # lookup_index could return (None, None); those values are stored as-is so
    # unmatched accounts stay in the table.
    city, country = lookup_index(res)
    data_coded.append({
        "screen_name": screen_name,
        "location": location,
        "city": city,
        "country": country,
    })
res_df = pd.DataFrame(data_coded)

100%|██████████| 1884/1884 [00:11<00:00, 157.37it/s]


### Compute Scores

In [7]:
# Share of accounts for which the SimpleMaps lookup produced a country.
# (The `denomiator*` spelling is kept on purpose: these are notebook-level
# names that other cells may reference.)
SCORES = []

# Rows with a resolved country, counted over every account.
numerator = int(res_df['country'].notna().sum())
denomiator = len(res_df)

# Same ratio restricted to accounts whose location string is non-empty
# (rows are filtered on `location != ""`, not on missing values).
res_df_dropna = res_df[res_df['location'] != ""]
numerator2 = int(res_df_dropna['country'].notna().sum())
denomiator2 = len(res_df_dropna)

print("Including empty:")
print(f"Success: {numerator}/{denomiator} = {numerator / denomiator:.2%}")

print("\nExcluding empty:")
print(f"Success: {numerator2}/{denomiator2} = {numerator2 / denomiator2:.2%}")

# Only the empty-excluded success rate is recorded.
SCORES.append(numerator2 / denomiator2)

Including empty:
Success: 922/1884 = 48.94%

Excluding empty:
Success: 922/1482 = 62.21%


In [7]:
# df_cities[df_cities['city'].str.contains("st.", regex=False)]

In [12]:
# Accounts with a non-empty location string for which no country was resolved.
missed = res_df_dropna[res_df_dropna['country'].isna()]
len(missed)

560

In [13]:
# Inspect up to 30 unresolved locations. The fixed seed keeps the displayed
# rows stable across re-runs, and capping n avoids the ValueError that
# DataFrame.sample raises when asked for more rows than exist.
missed.sample(n=min(30, len(missed)), random_state=42)

Unnamed: 0,screen_name,location,city,country
94,eliquorice,Beach City,,
1112,jtauber,Esperance / Perth / Boston,,
308,suneelgokhale,United Arab Emirates,,
393,NASAPersevere,"Jezero Crater, Mars",,
301,EndowCapital,"Europe, Africa",,
1145,akronisticlotor,On the 72R,,
1843,gutenberg_org,University of North Carolina,,
580,LeonYin,www,,
763,warmshowers,worldwide,,
228,running_tide,"Casco Bay, Maine",,


In [110]:
# with pd.option_context('display.max_rows', None):
#     display(missed)

In [111]:
# with pd.option_context('display.max_rows', None):
#     display(res_df_dropna[~res_df_dropna['country'].isna()])

## Geocoding with the Web (Nominatim)

In [1]:
from geopy.geocoders import Nominatim

In [None]:
# Nominatim geocoder client from geopy; requests identify themselves with the
# given user_agent string.
geolocator = Nominatim(user_agent="twitter-tools")

In [None]:
# NOTE(review): later cells read a 'location_address' column from this frame,
# but no cell visible here builds `data_coded` with that key — this likely
# depends on kernel state from an earlier revision of the notebook; confirm.
df = pd.DataFrame(data_coded)

In [None]:
def get_country(loc):
    """Return the last comma-separated component of an address string.

    Args:
        loc: Address text such as ``"Paris, Ile-de-France, France"``. Missing
            values (``None``, ``NaN``, any other non-string) are accepted.

    Returns:
        The stripped text after the final comma (the whole string when there
        is no comma), or ``None`` when ``loc`` is not a string or the final
        component is empty.
    """
    # Explicit type check instead of a bare ``except``: missing values still
    # map to None, but unrelated errors are no longer silently swallowed.
    if not isinstance(loc, str):
        return None
    country = loc.rsplit(',', 1)[-1].strip()
    # An empty component carries no country; report it as missing so that
    # isna()-style checks treat it like any other unresolved row.
    return country or None

In [None]:
# Derive a country per row from the 'location_address' text via get_country.
# NOTE(review): this assigns df['country'] in place, replacing any existing
# 'country' column.
s = df['location_address']
df['country'] = s.apply(get_country)

In [None]:
# Write the frame to data.csv (path is relative to the working directory).
df.to_csv('data.csv')

In [None]:
# Build a lookup keyed by the raw location string from the coded records;
# when a location occurs more than once, the last record wins.
geocoded = {
    item['location_raw']: {
        'loc_address': item['location_address'],
        'loc_latitude': item['location_latitude'],
        'loc_longitude': item['location_longitude'],
    }
    for item in data_coded
}

In [None]:
# Write the `geocoded` lookup to geo_cache.json in the working directory.
with open('geo_cache.json', 'w') as f:
    json.dump(geocoded, f)

In [None]:
def geocode(loc):
    """Geocode a location query with the notebook-level ``geolocator``.

    Args:
        loc: Query passed to ``geolocator.geocode``. ``None`` and empty or
            whitespace-only strings are treated as "no location" and are not
            sent to the geocoder.

    Returns:
        dict with keys ``loc_address``, ``loc_latitude`` and
        ``loc_longitude``; all three are ``None`` when there is nothing to
        look up or the geocoder returns no match.
    """
    result = dict(loc_address=None, loc_latitude=None, loc_longitude=None)
    # Skip the lookup for blank input: there is nothing to resolve, and it
    # saves one request per account that has no location set.
    if loc is None or (isinstance(loc, str) and not loc.strip()):
        return result
    location = geolocator.geocode(loc)
    if location:
        result.update(
            loc_address=location.address,
            loc_latitude=location.latitude,
            loc_longitude=location.longitude,
        )
    return result

In [None]:
# Read the SimpleMaps world-cities CSV (path is relative to the working directory).
df_cities = pd.read_csv('../data/simplemaps/worldcities.csv')

In [None]:
# Re-code every account with the SimpleMaps lookup, keeping only the country.
# As in the SimpleMaps section of this notebook, the value returned by
# geocode_simplemaps has to be resolved with lookup_index, which yields a
# (city, country) pair (possibly (None, None)); storing the raw result under
# "country" would not give a country value.
data_coded = []
for screen_name, location in data:
    country = lookup_index(geocode_simplemaps(location))[1]
    data_coded.append(dict(
        screen_name=screen_name,
        location=location,
        country=country
    ))

In [None]:
# Distinct values stored under 'country' in the coded records.
pd.DataFrame(data_coded).country.unique()

In [None]:
# Apply the SimpleMaps geocoder to each value of df['location']; the result is
# only displayed, not stored.
df['location'].apply(geocode_simplemaps)

In [None]:
# Geocode every account through `geocode`, using `geocoded` as a cache keyed
# by the raw location string.
data_coded = []
for screen_name, loc in tqdm(data):
    entry = dict(screen_name=screen_name, location_raw=loc)
    if loc in geocoded:
        # Cache hit: reuse the stored answer.
        geo_vals = geocoded[loc]
    else:
        geo_vals = geocode(loc)
        # Remember the answer so a location that appears again later in
        # `data` does not trigger another lookup.
        geocoded[loc] = geo_vals
    data_coded.append({**entry, **geo_vals})

### GeoJSON

In [14]:
# Load the world GeoJSON. JSON text is UTF-8, so decode it explicitly instead
# of relying on the platform's default encoding.
with open('../data/worldgeo.json', encoding='utf-8') as f:
    geojson = json.load(f)

In [18]:
# geojson["features"][0]

{'type': 'Feature',
 'properties': {'scalerank': 1,
  'featurecla': 'Admin-0 country',
  'labelrank': 6,
  'sovereignt': 'Belize',
  'sov_a3': 'BLZ',
  'adm0_dif': 0,
  'level': 2,
  'type': 'Sovereign country',
  'admin': 'Belize',
  'adm0_a3': 'BLZ',
  'geou_dif': 0,
  'geounit': 'Belize',
  'gu_a3': 'BLZ',
  'su_dif': 0,
  'subunit': 'Belize',
  'su_a3': 'BLZ',
  'brk_diff': 0,
  'name': 'Belize',
  'name_long': 'Belize',
  'brk_a3': 'BLZ',
  'brk_name': 'Belize',
  'brk_group': None,
  'abbrev': 'Belize',
  'postal': 'BZ',
  'formal_en': 'Belize',
  'formal_fr': None,
  'note_adm0': None,
  'note_brk': None,
  'name_sort': 'Belize',
  'name_alt': None,
  'mapcolor7': 1,
  'mapcolor8': 4,
  'mapcolor9': 5,
  'mapcolor13': 7,
  'pop_est': 307899,
  'gdp_md_est': 2536,
  'pop_year': -99,
  'lastcensus': 2010,
  'gdp_year': -99,
  'economy': '6. Developing region',
  'income_grp': '4. Lower middle income',
  'wikipedia': -99,
  'fips_10': None,
  'iso_a2': 'BZ',
  'iso_a3': 'BLZ',
  'i

In [11]:
# 'sovereignt' property of every feature, in file order; there is one entry
# per feature, so the same name can appear more than once.
[x['properties']['sovereignt'] for x in geojson['features']]

['Belize',
 'Canada',
 'The Bahamas',
 'Cuba',
 'Dominican Republic',
 'Denmark',
 'Guatemala',
 'Haiti',
 'Costa Rica',
 'Honduras',
 'Mexico',
 'Nicaragua',
 'Panama',
 'Jamaica',
 'United States of America',
 'United States of America',
 'El Salvador',
 'Trinidad and Tobago',
 'Argentina',
 'Bolivia',
 'Chile',
 'Guyana',
 'Colombia',
 'Brazil',
 'Peru',
 'Ecuador',
 'Afghanistan',
 'United Arab Emirates',
 'Armenia',
 'Azerbaijan',
 'Bangladesh',
 'United Kingdom',
 'Venezuela',
 'Suriname',
 'Brunei',
 'Bhutan',
 'China',
 'Northern Cyprus',
 'Cyprus',
 'Paraguay',
 'Uruguay',
 'Georgia',
 'Indonesia',
 'India',
 'Iran',
 'Iraq',
 'Israel',
 'Jordan',
 'Japan',
 'Kazakhstan',
 'Kyrgyzstan',
 'Cambodia',
 'South Korea',
 'Kuwait',
 'Laos',
 'Lebanon',
 'Sri Lanka',
 'Myanmar',
 'Mongolia',
 'Malaysia',
 'Nepal',
 'Oman',
 'Pakistan',
 'Philippines',
 'North Korea',
 'Saudi Arabia',
 'Israel',
 'Qatar',
 'Syria',
 'Thailand',
 'Tajikistan',
 'Turkmenistan',
 'East Timor',
 'Turkey',