# Data prep

#### import some modules we'll need

In [285]:
import pandas as pd
import requests
import numpy as np
from collections import Counter
import codecs

#### read in data tables

In [246]:
# Load the first dataset; its `coords` column feeds the first geocoding pass below.
df = pd.read_csv("LETTHEKIDS Cleaned CSV GEOTAGS.csv")

In [247]:
# Load the second dataset. header=None because this file has no header row,
# so pandas labels the columns 0, 1, 2, ... until they are renamed.
d = pd.read_csv("fk_comments corrected Geotags.csv",header=None)

#### clean up tables

In [248]:
# Keep only the first 12 columns (positions 0-11).
d = d.iloc[:,:12] # this removes empty columns

In [249]:
# The fk_comments file ships without a header row, so the names below were
# worked out by comparing its columns against the other dataset.
# enumerate() pairs each name with its positional label (0..11), which is
# exactly the {position: name} mapping that rename() expects.
d.rename(
    columns=dict(enumerate([
        "id",
        "guid",
        "link",
        "medialink",
        "pubdate",
        "author",
        "title",
        "description",
        "like_count",
        "filter",
        "coords",
        "not_sure_what_this_is",
    ])),
    inplace=True,
)

#### Design our API call; make a test call to make sure it works

In [250]:
import os

# Geocoding endpoint. The trailing "?" lets the query parameters be appended
# directly by string concatenation.
BASEURL = "https://maps.googleapis.com/maps/api/geocode/json?"
# "key=<API key>" query parameter. You'll need to get an API key; export it as
# the GOOGLE_MAPS_API_KEY environment variable so the secret is not saved
# inside the notebook. When the variable is unset this is just "key=".
KEY = "key=" + os.environ.get("GOOGLE_MAPS_API_KEY", "")

In [251]:
# Look at one row's coords value; the same row is used for the test API call.
df.coords.iloc[15]

'-22.5617,-47.4028'

In [252]:
# This string is our API call: endpoint, key parameter, then the sample row's
# coordinates as the latlng parameter.
qstring = "".join([BASEURL, KEY, "&latlng=", df.coords.iloc[15]])

In [253]:
# This is the actual http request. The timeout (seconds) keeps a stalled
# connection from hanging the notebook; requests waits indefinitely otherwise.
r = requests.get(qstring, timeout=10)

In [254]:
# This code takes the API response, parses the json, and extracts the locality
# and country names from the first result's address components.

ac = r.json()['results'][0]['address_components']
# Each component carries a list of 'types'; take the first one tagged as wanted.
locality = [item['long_name'] for item in ac if 'locality' in item['types']][0]
country = [item['long_name'] for item in ac if 'country' in item['types']][0]
# A single-argument print(...) gives the same output under Python 2's print
# statement and Python 3's print function.
print(locality + " " + country)

Limeira Brazil


# Make API calls

### Requests for first dataset

In [255]:
# Accumulators for the first geocoding pass: the per-row API results, and
# (index, exception) pairs for the rows that failed.
address_components_list, errors = [], []

In [256]:
# Reverse-geocode every row of the first dataset. Exactly one entry is appended
# to address_components_list per row (the API result, or 'missing'), so the
# list lines up with df's rows.
for i in df.index:
    coords = df.coords.loc[i]

    if np.random.binomial(1,0.001)==1:
        # printing roughly every thousandth index; this is to monitor progress
        # and is nonessential. print(i) behaves the same on Python 2 and 3.
        print(i)

    try:
        coords.split(",")[1] # a trick to ignore any missing coords: raises unless coords is a "lat,lng" string
        qstring = BASEURL+KEY+"&latlng="+coords
        # timeout (seconds): without one, a single stalled connection would
        # block the whole loop indefinitely
        r = requests.get(qstring, timeout=10)
        address_components = r.json()['results'][0]['address_components']
        address_components_list.append(address_components)
    except Exception as e:
        # Best effort: record which row failed and why, and keep the list aligned.
        errors.append((i,e))
        address_components_list.append('missing')

3224
3355
4592
6139
6733
7071
7517
8441
12986
13462
13864
16061
17932
18725
23169
23920


### Extracting country and locality

In [262]:
# Will hold one country name (or 'missing') per entry of address_components_list.
countries = []

In [263]:
# Pull the country name out of each row's address components; rows without a
# usable result get the 'missing' placeholder instead.
for address_components in address_components_list:
    try:
        country = [item['long_name'] for item in address_components if 'country' in item['types']][0]
        countries.append(country)
    except (IndexError, KeyError, TypeError):
        # IndexError: no component is tagged 'country'.
        # TypeError: the entry is the 'missing' placeholder string, not a list of dicts.
        # KeyError: a component lacks 'types' or 'long_name'.
        # Anything else (e.g. KeyboardInterrupt) is allowed to propagate.
        countries.append('missing')

In [264]:
# Attach the extracted country names as a new column.
df['country'] = countries

In [265]:
# Will hold one locality name (or 'missing') per entry of address_components_list.
localities = []

In [266]:
# Pull the locality name out of each row's address components; rows without a
# usable result get the 'missing' placeholder instead.
for address_components in address_components_list:
    try:
        locality = [item['long_name'] for item in address_components if 'locality' in item['types']][0]
        localities.append(locality)
    except (IndexError, KeyError, TypeError):
        # IndexError: no component is tagged 'locality'.
        # TypeError: the entry is the 'missing' placeholder string, not a list of dicts.
        # KeyError: a component lacks 'types' or 'long_name'.
        # Anything else (e.g. KeyboardInterrupt) is allowed to propagate.
        localities.append('missing')

In [267]:
# Attach the extracted locality names as a new column.
df['locality'] = localities

In [274]:
# Sanity check: how many rows received a country.
len(df[df.country!='missing'])

4060

In [275]:
# Sanity check: how many rows received a locality.
len(df[df.locality!='missing'])

3815

In [276]:
# How many rows have coordinates at all, for comparison with the counts above.
len(df[df.coords.notnull()])

4060

In [272]:
# Attach the raw API results as a column so they can be counted like the others.
df['address_components'] = address_components_list

In [277]:
# How many rows got an API result rather than the 'missing' placeholder.
len(df[df.address_components!='missing'])

4060

In [279]:
# Remove the raw-results column again.
del df['address_components']

In [287]:
# New frame in which the 'missing' placeholder is a real NaN (df itself is not
# modified), so that null-aware tools such as info() count it as missing.
dftmp = df.replace('missing',np.nan)

In [288]:
# Per-column non-null counts and dtypes.
dftmp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25332 entries, 0 to 25331
Data columns (total 16 columns):
id                    25332 non-null int64
guid                  25332 non-null object
link                  25332 non-null object
medialink             25332 non-null object
pubdate               25332 non-null object
author                25332 non-null object
title                 25071 non-null object
description           25071 non-null object
like_count            25332 non-null int64
filter                25332 non-null object
to                    0 non-null float64
coords                4060 non-null object
title_cleaned         0 non-null float64
like_count_cleaned    0 non-null float64
country               4060 non-null object
locality              3815 non-null object
dtypes: float64(3), int64(2), object(11)
memory usage: 3.1+ MB


### Dealing with encoding issues

In [464]:
# Put the 'missing' placeholder back so every locality value is a string for
# the encoding step. NOTE(review): fillna is applied to the whole frame, so
# NaNs in every other column become the string 'missing' as well.
dftmp = dftmp.fillna('missing')

In [466]:
# this is a hack, but I tried everything else and nothing would work...
# 'unicode-escape' rewrites every non-ASCII character as a backslash escape
# sequence, so the column ends up pure ASCII; the escapes are what gets saved.
# NOTE(review): only `locality` is escaped here, whereas the second dataset
# escapes both `locality` and `country` - confirm whether that is intended.
dftmp.locality = dftmp.locality.map(lambda x: x.encode('unicode-escape').decode('utf-8'))

In [468]:
# Write the enriched first dataset to disk, without the index column.
dftmp.to_csv("letthekids_country_locality.csv",index=False)

### Requests for second dataset

In [470]:
# Per-row API results for the second dataset (filled by the loop in the next cell).
address_components_list_2 = []

In [471]:
# Reverse-geocode every row of the second dataset. Exactly one entry is
# appended to address_components_list_2 per row (the API result, or 'missing'),
# so the list lines up with d's rows.
for i in d.index:
    coords = d.coords.loc[i]

    if np.random.binomial(1,0.001)==1:
        # printing roughly every thousandth index; this is to monitor progress
        # and is nonessential. print(i) behaves the same on Python 2 and 3.
        print(i)

    try:
        coords.split(",")[1] # a trick to ignore any missing coords: raises unless coords is a "lat,lng" string
        qstring = BASEURL+KEY+"&latlng="+coords
        # timeout (seconds): without one, a single stalled connection would
        # block the whole loop indefinitely
        r = requests.get(qstring, timeout=10)
        address_components = r.json()['results'][0]['address_components']
        address_components_list_2.append(address_components)

    except Exception as e:
        # Best effort: record which row failed and why, and keep the list aligned.
        # NOTE(review): `errors` is the same list the first dataset's pass
        # appended to, so its indices refer to two different tables.
        errors.append((i,e))
        address_components_list_2.append('missing')

770
1331
1555
1903
2371
3274
3448
7185
7540
8159
8299
9098
9813
11952
12630
14370
15246
15961
17618
17943
22201
23504
25044
26273
27061
27369
29451
29635
29950
30811


In [472]:
# Start a fresh list for the second dataset (the name is reused from the first pass).
countries = []

In [473]:
# Pull the country name out of each second-dataset row's address components;
# rows without a usable result get the 'missing' placeholder instead.
for address_components in address_components_list_2:
    try:
        country = [item['long_name'] for item in address_components if 'country' in item['types']][0]
        countries.append(country)
    except (IndexError, KeyError, TypeError):
        # IndexError: no component is tagged 'country'.
        # TypeError: the entry is the 'missing' placeholder string, not a list of dicts.
        # KeyError: a component lacks 'types' or 'long_name'.
        # Anything else (e.g. KeyboardInterrupt) is allowed to propagate.
        countries.append('missing')

In [474]:
# Attach the extracted country names as a new column.
d['country'] = countries

In [475]:
# Start a fresh list for the second dataset (the name is reused from the first pass).
localities = []

In [476]:
# Pull the locality name out of each second-dataset row's address components;
# rows without a usable result get the 'missing' placeholder instead.
for address_components in address_components_list_2:
    try:
        locality = [item['long_name'] for item in address_components if 'locality' in item['types']][0]
        localities.append(locality)
    except (IndexError, KeyError, TypeError):
        # IndexError: no component is tagged 'locality'.
        # TypeError: the entry is the 'missing' placeholder string, not a list of dicts.
        # KeyError: a component lacks 'types' or 'long_name'.
        # Anything else (e.g. KeyboardInterrupt) is allowed to propagate.
        localities.append('missing')

In [477]:
# Attach the extracted locality names as a new column.
d['locality'] = localities

In [478]:
# Sanity check: how many rows received a country.
len(d[d.country!='missing'])

3424

In [480]:
# How many rows have coordinates at all, for comparison with the count above.
len(d[d.coords.notnull()])

3435

In [482]:
# Replace NaN with the 'missing' placeholder. NOTE(review): this applies to
# every column of d, not only country/locality.
d = d.fillna('missing')

In [483]:
# Same workaround as for the first dataset: 'unicode-escape' rewrites non-ASCII
# characters as backslash escape sequences, leaving a pure-ASCII column.
d.locality = d.locality.map(lambda x: x.encode('unicode-escape').decode('utf-8'))

In [485]:
# Apply the same backslash-escaping to the country column.
d.country = d.country.map(lambda x: x.encode('unicode-escape').decode('utf-8'))

In [486]:
# Write the enriched second dataset to disk, without the index column.
d.to_csv("fkcomments_country_locality.csv",index=False)