# Cleaning Data for D3

This notebook uses the `reverse_geocoder` package to obtain State and County information for data points in the USA. It can be installed using:

`pip install reverse_geocoder`

## Setup

### Imports

In [55]:
import json as json
from datetime import datetime

import numpy as np
import pandas as pd
import reverse_geocoder as rg

from tqdm import tqdm

### Path to Data

In [2]:
# Input CSV (columns include Longitude, Latitude, Timestamp and ID), given as a
# path relative to the notebook's working directory.
geodata_path = 'data/parler-videos-geocoded.csv'

## Obtaining State and County Information

### Loading Data

In [3]:
# Load the raw records. The file's first column is an unnamed saved index, which
# pandas surfaces as "Unnamed: 0"; none of the cells in this notebook read it.
df_geodata = pd.read_csv(geodata_path)
df_geodata.head()

Unnamed: 0,Longitude,Latitude,Timestamp,ID
0,0.0,0.0,2010-08-08 21:44:38,PtowPIzpewhu
1,0.0,0.0,2011-03-19 16:48:35,dGOhNqNgNywF
2,-118.8878,39.5554,2011-08-01 22:22:40,RGTLwBQugFNU
3,-74.6049,39.3308,2011-11-11 21:36:54,YNBV5GZkeM2E
4,-101.8747,33.4269,2012-12-24 22:50:15,oqLyjjYghOMi


### Exploring How the Package Works, Testing Performance

In [19]:
def get_coord(index, df=None):
    """Return the ``(longitude, latitude)`` pair stored for one row.

    Parameters
    ----------
    index : label
        Row label, looked up with ``.loc``.
    df : pandas.DataFrame, optional
        Frame holding ``Longitude`` and ``Latitude`` columns. When omitted,
        the notebook-level ``df_geodata`` is used, so existing one-argument
        calls behave exactly as before.

    Returns
    -------
    tuple
        ``(longitude, latitude)`` -- longitude first, mirroring the column
        order of the input file.
    """
    if df is None:
        # Fall back to the frame loaded earlier in the notebook.
        df = df_geodata
    coords = df.loc[index, ['Longitude', 'Latitude']]
    return coords['Longitude'], coords['Latitude']

def viz_coord(index):
    """Print one row's coordinates and the geocoder's answer for them.

    Looks up the row via ``get_coord``, runs a single reverse-geocoding query
    and prints two lines: the ``longitude latitude`` pair, then the raw result
    list returned by ``rg.search``.
    """
    longitude, latitude = get_coord(index)
    # The query tuple is built latitude-first, the reverse of get_coord's order.
    matches = rg.search((latitude, longitude))
    print(longitude, latitude)
    print(matches)

In [20]:
# Row 0 has coordinates (0.0, 0.0) -- presumably a stand-in for a missing
# location -- so this shows what the geocoder returns for such rows.
viz_coord(0)

0.0 0.0
[{'lat': '4.88447', 'lon': '-1.75536', 'name': 'Takoradi', 'admin1': 'Western', 'admin2': '', 'cc': 'GH'}]


In [21]:
# Row 2 is the first previewed row with non-zero coordinates.
viz_coord(2)

-118.8878 39.5554
[{'lat': '39.47353', 'lon': '-118.77737', 'name': 'Fallon', 'admin1': 'Nevada', 'admin2': 'Churchill County', 'cc': 'US'}]


In [22]:
# A second row with non-zero coordinates, as a further check.
viz_coord(3)

-74.6049 39.3308
[{'lat': '39.31762', 'lon': '-74.5946', 'name': 'Somers Point', 'admin1': 'New Jersey', 'admin2': 'Atlantic County', 'cc': 'US'}]


### Filtering Data to Remove Missing Locations

In [33]:
# Keep only rows whose latitude AND longitude are both more than 1 degree away
# from zero. This removes entries such as the (0.0, 0.0) rows seen in the
# preview. NOTE(review): it also drops any genuine point within 1 degree of the
# equator or the prime meridian -- presumably acceptable because only 'US'
# results are kept later; confirm.
#
# The mask is computed column-wise instead of looping over iterrows(), which
# builds a Series per row and is far slower on tens of thousands of rows.
has_location = (df_geodata['Latitude'].abs() > 1) & (df_geodata['Longitude'].abs() > 1)

# (latitude, longitude) tuples in original row order.
rows = list(zip(df_geodata.loc[has_location, 'Latitude'],
                df_geodata.loc[has_location, 'Longitude']))

print(len(df_geodata))
print(len(rows))

68284
63941


### Getting Information

In [36]:
batch_size = 1000
locations = []

# Geocode the coordinates slice by slice so tqdm can report progress; results
# are appended in input order, so locations[i] corresponds to rows[i].
for start in tqdm(range(0, len(rows), batch_size)):
    batch = rows[start:start + batch_size]
    locations.extend(rg.search(batch))

100%|██████████| 64/64 [00:41<00:00,  1.53it/s]


### Cleaning Output

In [44]:
# Timestamps for exactly the rows that passed the location filter. The condition
# must stay identical to the one used to build `rows`, because the two lists are
# paired up by position afterwards. Computed column-wise rather than with
# iterrows(), which is much slower on a frame of this size.
has_location = (df_geodata['Latitude'].abs() > 1) & (df_geodata['Longitude'].abs() > 1)
row_timestamps = df_geodata.loc[has_location, 'Timestamp'].tolist()

In [45]:
# The two lists are zipped together in the next step, and zip() silently stops at
# the shorter one -- so fail loudly here if they ever get out of step.
assert len(locations) == len(row_timestamps), (
    "locations and row_timestamps differ in length; they were filtered differently"
)
print(len(locations), len(row_timestamps))

63941 63941


In [46]:
# Pair every geocoded location with its timestamp (same position in both lists)
# and keep only the pairs whose country code is 'US'.
usa_points = [
    {'location': place, 'timestamp': stamp}
    for place, stamp in zip(locations, row_timestamps)
    if place['cc'] == 'US'
]

# Wrap the list in a single-key dict; this is the structure written to JSON.
usa_videos = {'data_points': usa_points}

In [47]:
# How many geocoded points were kept as US records.
print(len(usa_videos['data_points']))

59026


### Testing Results

In [50]:
# Spot-check the first kept record: location dict plus timestamp string.
print(usa_videos['data_points'][0])

{'location': {'lat': '39.47353', 'lon': '-118.77737', 'name': 'Fallon', 'admin1': 'Nevada', 'admin2': 'Churchill County', 'cc': 'US'}, 'timestamp': '2011-08-01 22:22:40'}


In [51]:
# Spot-check the second kept record.
print(usa_videos['data_points'][1])

{'location': {'lat': '39.31762', 'lon': '-74.5946', 'name': 'Somers Point', 'admin1': 'New Jersey', 'admin2': 'Atlantic County', 'cc': 'US'}, 'timestamp': '2011-11-11 21:36:54'}


In [52]:
# Spot-check an arbitrary record further into the list.
print(usa_videos['data_points'][1234])

{'location': {'lat': '40.40372', 'lon': '-73.99153', 'name': 'Highlands', 'admin1': 'New Jersey', 'admin2': 'Monmouth County', 'cc': 'US'}, 'timestamp': '2019-10-19 15:51:45'}


In [53]:
# Spot-check another arbitrary record from the middle of the list.
print(usa_videos['data_points'][22462])

{'location': {'lat': '44.16358', 'lon': '-93.9994', 'name': 'Mankato', 'admin1': 'Minnesota', 'admin2': 'Blue Earth County', 'cc': 'US'}, 'timestamp': '2020-10-31 20:43:04'}


In [54]:
# Spot-check one more arbitrary record; this one has an empty 'admin2' field.
print(usa_videos['data_points'][35121])

{'location': {'lat': '38.89511', 'lon': '-77.03637', 'name': 'Washington, D.C.', 'admin1': 'Washington, D.C.', 'admin2': '', 'cc': 'US'}, 'timestamp': '2020-11-15 01:02:28'}


### Saving All Datapoints

In [56]:
# Write every US data point to a JSON file in the working directory,
# overwriting any previous export.
with open('usa_videos.json', 'w') as out_file:
    json.dump(usa_videos, out_file)

### Saving a Subset

In [63]:
# Seed NumPy's global generator so the same subset is drawn on every run.
np.random.seed(0)

# Draw 2500 distinct positions (no repeats) from the US data points.
n_points = len(usa_videos['data_points'])
small_videos_idxs = np.random.choice(n_points, size=2500, replace=False)

In [64]:
# Convert the sampled positions to a set of plain ints first. Testing
# `i in <ndarray>` scans the whole array on every check, which makes the
# comprehension below scale with len(data_points) * len(sample); a set lookup is
# constant-time and selects exactly the same records.
sampled_positions = set(int(i) for i in small_videos_idxs)

# Keep the sampled records in their original order.
usa_videos_small = {
    'data_points': [
        dp for i, dp in enumerate(usa_videos['data_points'])
        if i in sampled_positions
    ]
}

In [66]:
# Confirm the subset holds one record per sampled position.
print(len(usa_videos_small['data_points']))

2500


In [67]:
# Write the sampled subset to its own JSON file in the working directory,
# overwriting any previous export.
with open('usa_videos_2500.json', 'w') as out_file:
    json.dump(usa_videos_small, out_file)