In [1]:
import json
import os
import sys
import random

import pandas as pd
from tqdm import tqdm

In [2]:
%load_ext autoreload
%autoreload 2

sys.path.insert(0, "..")
from twittertools.geocode import (
    load_friends_data, load_city_data, load_state_data,
    geocode_simplemaps, lookup_index, test,
)

## Geocode with SimpleMaps

In [3]:
# Load the friends list; each element is unpacked further down as a
# (screen_name, location) pair.
data = load_friends_data()
len(data)

1884

In [4]:
# s = pd.Series(data)
# data = list(s.sample(n=1000))
# len(data)

In [116]:
# NOTE(review): `city_df` is not referenced by any other cell visible in this
# notebook — confirm it is still needed.
city_df = load_city_data()

In [6]:
# Resolve every account's free-text profile location with the SimpleMaps
# helpers and collect one record per account.
data_coded = []
for screen_name, location in tqdm(data):
    # lookup_index() may yield (None, None) when nothing matched.
    city, country = lookup_index(geocode_simplemaps(location))
    record = {
        "screen_name": screen_name,
        "location": location,
        "city": city,
        "country": country,
    }
    data_coded.append(record)
res_df = pd.DataFrame(data_coded)

100%|██████████| 1884/1884 [00:05<00:00, 325.86it/s]


In [12]:
# Persist the geocoded table (without the index column); the GeoJSON section
# reads this file back with pd.read_csv("found.csv").
res_df.to_csv("found.csv", index=False)

### Compute Scores

In [8]:
def _success_counts(frame):
    """Return ``(hits, total, ratio)`` for the rows of ``frame`` that have a country.

    ``hits`` counts rows whose ``country`` is not NA, ``total`` is the number of
    rows, and ``ratio`` is ``hits / total`` (NaN for an empty frame, so an empty
    input does not raise ZeroDivisionError).
    """
    hits = int(frame['country'].notna().sum())
    total = len(frame)
    ratio = hits / total if total else float("nan")
    return hits, total, ratio


SCORES = []

# Accounts whose profile location is the empty string can never be geocoded,
# so they are excluded from the second ("Excluding empty") figure.
res_df_dropna = res_df[res_df['location'] != ""]

numerator, denominator, ratio = _success_counts(res_df)
numerator2, denominator2, ratio2 = _success_counts(res_df_dropna)

print("Including empty:")
print(f"Success: {numerator}/{denominator} = {ratio:.2%}")

print("\nExcluding empty:")
print(f"Success: {numerator2}/{denominator2} = {ratio2:.2%}")

# Only the rate over non-empty locations is tracked as the score.
SCORES.append(ratio2)

Including empty:
Success: 922/1884 = 48.94%

Excluding empty:
Success: 922/1482 = 62.21%


In [7]:
# df_cities[df_cities['city'].str.contains("st.", regex=False)]

In [9]:
# Accounts that do have a location string but could not be mapped to a country.
no_country = res_df_dropna['country'].isna()
missed = res_df_dropna.loc[no_country]
len(missed)

560

In [10]:
# Eyeball a reproducible sample of the misses. The size is capped at the number
# of available rows because DataFrame.sample raises when asked for more rows
# than exist (without replacement).
missed.sample(n=min(30, len(missed)), random_state=0)

Unnamed: 0,screen_name,location,city,country
1617,Atrios,Toon,,
1503,wkamaubell,Sheltering in Place,,
1413,djangoproject,Web,,
1002,nicolesjchung,Instagram: @nicolesjchung,,
956,FFF_goteborg,Göteborg,,
661,AdaLovelaceInst,United Kingdom,,
1024,jbenedictbrown,"Vännäs, Sverige",,
1778,brandon_rhodes,"Bluffton, Ohio",,
714,annaraven,Sillycon Valley,,
182,GailSimone,I need more dice,,


In [110]:
# with pd.option_context('display.max_rows', None):
#     display(missed)

In [111]:
# with pd.option_context('display.max_rows', None):
#     display(res_df_dropna[~res_df_dropna['country'].isna()])

## Geocoding with a Web Service (Nominatim)

In [1]:
from geopy.geocoders import Nominatim

In [None]:
# Nominatim geocoder client used by geocode() below; the user_agent string
# identifies this application to the service.
geolocator = Nominatim(user_agent="twitter-tools")

In [None]:
# NOTE(review): the following cells expect the web-geocoding records (built by
# the loop that calls geocode()), not the SimpleMaps records from the top of
# the notebook — `data_coded` is reused for both, so run order matters.
df = pd.DataFrame(data_coded)

In [None]:
def get_country(loc):
    """Return the last comma-separated component of an address string.

    Parameters
    ----------
    loc : str or any
        Address text such as ``"Paris, Île-de-France, France"``.

    Returns
    -------
    str or None
        The text after the final comma with surrounding whitespace removed
        (the whole string when it contains no comma), or ``None`` when ``loc``
        is not a string (e.g. ``None`` or NaN for rows that were not geocoded).
    """
    # Explicit type guard instead of a bare ``except:``, which would also
    # swallow KeyboardInterrupt / SystemExit.
    if not isinstance(loc, str):
        return None
    return loc.rsplit(',', 1)[-1].strip()

In [None]:
# geocode() returns the resolved address under 'loc_address', so that is the
# column the web-geocoding records carry. Fall back to 'location_address' in
# case the frame was built from records that use that spelling.
address_col = 'loc_address' if 'loc_address' in df.columns else 'location_address'
s = df[address_col]
df['country'] = s.apply(get_country)

In [None]:
# Unlike the found.csv export (index=False), this call keeps pandas' default
# and writes the index as an extra first column.
df.to_csv('data.csv')

In [None]:
# Build the location -> geocoding-result cache from the web-geocoding records.
# Records produced via geocode() use 'loc_*' keys; records that use the
# 'location_*' spelling are accepted as well so they do not raise KeyError.
geocoded = {}
for item in data_coded:
    prefix = 'loc_' if 'loc_address' in item else 'location_'
    geocoded[item['location_raw']] = {
        f'loc_{field}': item[f'{prefix}{field}']
        for field in ('address', 'latitude', 'longitude')
    }

In [None]:
# Write the location -> geocoding-result mapping to disk as JSON.
with open('geo_cache.json', 'w') as cache_file:
    json.dump(geocoded, cache_file)

In [None]:
def geocode(loc):
    """Geocode a free-text location with the module-level ``geolocator``.

    Parameters
    ----------
    loc : str
        Free-text location, passed unchanged to ``geolocator.geocode``.

    Returns
    -------
    dict
        Keys ``loc_address``, ``loc_latitude`` and ``loc_longitude``. All three
        are ``None`` when ``loc`` is blank, is not a string, or the geocoder
        returns no match.
    """
    if isinstance(loc, str) and loc.strip():
        location = geolocator.geocode(loc)
    else:
        # Blank or missing profile locations cannot match anything; skip the
        # web request entirely instead of spending one on them.
        location = None

    if not location:
        return dict(loc_address=None, loc_latitude=None, loc_longitude=None)
    return dict(
        loc_address=location.address,
        loc_latitude=location.latitude,
        loc_longitude=location.longitude,
    )

In [None]:
# NOTE(review): `df_cities` is only mentioned in a commented-out cell elsewhere
# in this notebook — confirm this load is still needed.
df_cities = pd.read_csv('../data/simplemaps/worldcities.csv')

In [None]:
# Rebuild the per-account records, keeping only the country.
# The raw geocode_simplemaps() result has to go through lookup_index() to
# become a (city, country) pair, exactly as in the SimpleMaps loop near the top
# of the notebook; storing the raw result under "country" would be wrong.
data_coded = []
for screen_name, location in data:
    _city, country = lookup_index(geocode_simplemaps(location))  # may be (None, None)
    data_coded.append(dict(
        screen_name=screen_name,
        location=location,
        country=country
    ))

In [None]:
# Distinct values found in the "country" field of the records.
pd.DataFrame(data_coded)["country"].unique()

In [None]:
# Ad-hoc check: run the SimpleMaps geocoder over the 'location' column.
# NOTE(review): requires `df` to have a 'location' column, which depends on
# which records `df` was last built from — confirm before running.
df['location'].apply(geocode_simplemaps)

In [None]:
# Geocode every account through the web service, consulting the `geocoded`
# cache first. New answers are stored back into the cache so a location string
# shared by several accounts is only requested once per run.
data_coded = []
for screen_name, loc in tqdm(data):
    if loc not in geocoded:
        geocoded[loc] = geocode(loc)
    data_coded.append({
        'screen_name': screen_name,
        'location_raw': loc,
        **geocoded[loc],
    })

### GeoJSON

In [71]:
# Read the world GeoJSON. JSON interchange files are UTF-8, so the encoding is
# stated explicitly instead of relying on the platform default.
with open('../data/worldgeo.json', encoding='utf-8') as f:
    geojson = json.load(f)

#### Add data

In [91]:
# Reload the SimpleMaps results saved earlier via res_df.to_csv("found.csv").
# Note this rebinds `df`, which the web-geocoding section also uses.
df = pd.read_csv("found.csv")

In [108]:
# Accounts per country: non-null screen_name values counted within each
# country group.
counts = df.groupby('country')['screen_name'].count()

In [111]:
# Copy each country's account count onto the matching GeoJSON feature.
# Features whose name has no entry in `counts` are left untouched.
for feature in geojson["features"]:
    properties = feature["properties"]
    country_name = properties["name"]
    if country_name not in counts.index:
        continue
    properties["count"] = int(counts.loc[country_name])

In [112]:
# Spot-check one country: the aggregated count shown here should equal the
# value stored on the corresponding GeoJSON feature (looked up in the next cell).
checkname = "Denmark"

counts[checkname]

7

In [113]:
# Find the feature named `checkname` (the last one if several match) and show
# the count that was written onto it. Raises IndexError when nothing matches.
matching_features = [
    candidate for candidate in geojson["features"]
    if candidate["properties"]["name"] == checkname
]
val = matching_features[-1]
val["properties"]["count"]

7

In [115]:
# Save the annotated GeoJSON under a separate file name, leaving the input
# file ../data/worldgeo.json unchanged.
with open('../data/worldgeo_export.json', "w") as export_file:
    json.dump(geojson, export_file)

In [28]:
# [x['properties']['sovereignt'] for x in geojson['features']]