# Exploration Geodata

Looking at where the videos were uploaded from using Python.

## Setup

### Imports

In [1]:
from datetime import date, datetime, timedelta, tzinfo
import json

import geopandas
import folium
import pandas as pd

from folium import plugins as folium_plugins

### Helpers

In [2]:
def load_geodata(dc_attack=False):
    """ Return DF """
    geodata_path = 'data/parler-videos-geocoded.csv'
    df_geodata = pd.read_csv(geodata_path)
    df_geodata['Timestamp'] = df_geodata['Timestamp'].map(lambda x: datetime.fromisoformat(x))
    
    if dc_attack:
        df_geodata = df_geodata[df_geodata['Timestamp'].map(lambda x: x.year) == 2021]
        df_geodata = df_geodata[df_geodata['Timestamp'].map(lambda x: x.month) == 1]
        df_geodata = df_geodata[df_geodata['Timestamp'].map(lambda x: x.day) == 6]
        df_geodata = df_geodata[
            (df_geodata['Longitude'] >= -77.6349) & \
            (df_geodata['Longitude'] <= -76.4803) & \
            (df_geodata['Latitude'] >= 38.5572) & \
            (df_geodata['Latitude'] <= 39.2300)]
    
    df_geodata = df_geodata.set_index('Timestamp')
    df_geodata = df_geodata.tz_localize('UTC').tz_convert('EST')
        
    print(f'Loaded {len(df_geodata)} data points.')
    return df_geodata


def daterange(start_date, end_date):
    for n in range(int((end_date - start_date).days)):
        yield start_date + timedelta(n)


def iterweek(start_day, end_day):
    if start_day > end_day:
        print('Error: start day after end day')
        return None
    
    start_weekday = start_day.isocalendar()[2]
    if start_weekday != 1:
        start_day = start_day - timedelta(days=start_weekday - 1)
    
    end_weekday = end_day.isocalendar()[2]
    if end_weekday != 7:
        end_day = end_day + timedelta(days=(7 - end_weekday))
    
    num_weeks = (int((end_day - start_day).days) + 1) // 7
    week_start = start_day
    for i in range(num_weeks):
        week_end = week_start + timedelta(days=6)
        yield week_start, week_end
        week_start = week_end + timedelta(days=1)

## Data Exploration

### Loading Data

Data from the riots.

In [3]:
df_dc_riots = load_geodata(dc_attack=True)
df_dc_riots.head()

Loaded 1209 data points.


Unnamed: 0_level_0,Longitude,Latitude,ID
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2021-01-05 19:02:39-05:00,-77.031,38.896,fR3uMI0eP5i4
2021-01-05 19:02:39-05:00,-77.031,38.896,oUTz8VjR0TS0
2021-01-05 19:02:39-05:00,-77.031,38.896,g8U9LBKTG01A
2021-01-05 19:11:14-05:00,-77.0476,38.8962,jrd59p8tVRbY
2021-01-05 19:11:14-05:00,-77.0476,38.8962,vimMsf0RVPsw


Data with state and county information.

In [4]:
with open('usa_videos.json', 'r') as f:
    data = json.load(f)
    
data['data_points'][0]

{'location': {'lat': '39.47353',
  'lon': '-118.77737',
  'name': 'Fallon',
  'admin1': 'Nevada',
  'admin2': 'Churchill County',
  'cc': 'US'},
 'timestamp': '2011-08-01 22:22:40'}

In [5]:
geo_list = [{
    'timestamp': datetime.fromisoformat(p['timestamp']),
    'lat': p['location']['lat'],
    'lon': p['location']['lon'],
    'state': p['location']['admin1'],
    'county': p['location']['admin2'],
} for p in data['data_points']]

In [6]:
df = pd.DataFrame(geo_list)

# Only keep videos from 2020 and onwards
df = df[df['timestamp'].dt.year >= 2020]

print(len(df))
df.head()

57539


Unnamed: 0,timestamp,lat,lon,state,county
1487,2020-01-01 03:30:49,35.74541,-81.68482,North Carolina,Burke County
1488,2020-01-01 08:05:32,33.65344,-84.44937,Georgia,Fulton County
1489,2020-01-01 10:00:35,33.61252,-117.71283,California,Orange County
1490,2020-01-01 15:26:49,33.46697,-117.69811,California,Orange County
1491,2020-01-01 19:30:18,29.49523,-98.61863,Texas,Bexar County


### Cleaning Count Information and Adding FIPS

County data obtained from https://raw.githubusercontent.com/python-visualization/folium/master/examples/data/us_county_data.csv

In [7]:
df_county_data = pd.read_csv('data/us_county_data.csv')
df_county_data = df_county_data[['FIPS_Code', 'State', 'Area_name']]

df_county_data.head()

Unnamed: 0,FIPS_Code,State,Area_name
0,0,US,United States
1,1000,AL,Alabama
2,1001,AL,Autauga County
3,1003,AL,Baldwin County
4,1005,AL,Barbour County


In [8]:
df_state_abrevs = df_county_data[df_county_data['FIPS_Code'] % 1000 == 0]
df_state_abrevs.head()

Unnamed: 0,FIPS_Code,State,Area_name
0,0,US,United States
1,1000,AL,Alabama
69,2000,AK,Alaska
102,4000,AZ,Arizona
118,5000,AR,Arkansas


Setting the county for Washington, D.C.

In [9]:
df.loc[(df['state'] == 'Washington, D.C.'), 'county'] = 'District of Columbia'

Extracting State names and county fips

In [10]:
county_name_dict = {}

for state_idx, state_row in df_state_abrevs.iterrows():
    state_abbrev = state_row['State']
    state_name = state_row['Area_name']
    
    county_name_dict[state_name] = {}
    
    for county_idx, county_row in df_county_data.iterrows():
        county_name = county_row['Area_name']
        county_fips = county_row['FIPS_Code']
        if county_row['State'] == state_abbrev:
            county_name_dict[state_name][county_name] = county_fips

Adding D.C.

In [11]:
county_name_dict['Washington, D.C.'] = {
    'District of Columbia': 11001
}

Adding Missing county

In [12]:
df.loc[((df['state'] == 'New York') &
        (df['county'] == '')), 'county'] = 'New York County'

Fixing Key Errors
* All: `'... city'` -> `'City of ...'`
* All: `'St. ... County'` -> `'Saint ... County'`
* New York: `'Bronx County'` -> `'Bronx'`
* Louisiana: `'... Parish'` -> `'... County'`
* Louisiana, Texas, Illinois: `'La Salle County'` -> `'LaSalle County'`
* Mississippi: `'DeSoto County'` -> `'De Soto County'`
* Florida: `'DeSoto County'` -> `'De Soto County'`
* Alaska: `'Prince of Wales-Outer Ketchikan Census A'` -> `'Annette Island Reserve'`

In [13]:
# St. -> State
counties_st_cleaned = {}
for state, county_fips in county_name_dict.items():
    new_state_counties = county_fips.copy()
    for county, fips in county_fips.items():
        if county.startswith('Ste.'):
            county_name = county[5:]
            new_state_counties[f'Sainte {county_name}'] = fips
        elif county.startswith('St.'):
            county_name = county[4:]
            new_state_counties[f'Saint {county_name}'] = fips
            
    counties_st_cleaned[state] = new_state_counties
county_name_dict = counties_st_cleaned.copy()

# ... city -> City of ...
counties_st_cleaned = {}
for state, county_fips in county_name_dict.items():
    new_state_counties = county_fips.copy()
    for county, fips in county_fips.items():
        if county.endswith('city'):
            city_name = county[:-5]
            new_state_counties[f'City of {city_name}'] = fips
    counties_st_cleaned[state] = new_state_counties
county_name_dict = counties_st_cleaned.copy()

# Louisiana
new_louisiana_counties = county_name_dict['Louisiana'].copy()
for county, fips in county_name_dict['Louisiana'].items():
    if county.endswith('Parish'):
        county_name = county[:-7]
        new_louisiana_counties[f'{county_name} County'] = fips
county_name_dict['Louisiana'] = new_louisiana_counties

county_name_dict['New York']['Bronx'] = county_name_dict['New York']['Bronx County']
county_name_dict['Louisiana']['LaSalle County'] = county_name_dict['Louisiana']['La Salle County']
county_name_dict['Texas']['LaSalle County'] = county_name_dict['Texas']['La Salle County']
county_name_dict['Illinois']['LaSalle County'] = county_name_dict['Illinois']['La Salle County']
county_name_dict['Mississippi']['De Soto County'] = county_name_dict['Mississippi']['DeSoto County']
county_name_dict['Florida']['De Soto County'] = county_name_dict['Florida']['DeSoto County']
county_name_dict['Alaska']['Annette Island Reserve'] = county_name_dict['Alaska']['Prince of Wales-Outer Ketchikan Census A']

Adding FIPS

In [14]:
df['fips'] = df.apply(lambda row: county_name_dict[row.state][row.county], axis=1)

df.head()

Unnamed: 0,timestamp,lat,lon,state,county,fips
1487,2020-01-01 03:30:49,35.74541,-81.68482,North Carolina,Burke County,37023
1488,2020-01-01 08:05:32,33.65344,-84.44937,Georgia,Fulton County,13121
1489,2020-01-01 10:00:35,33.61252,-117.71283,California,Orange County,6059
1490,2020-01-01 15:26:49,33.46697,-117.69811,California,Orange County,6059
1491,2020-01-01 19:30:18,29.49523,-98.61863,Texas,Bexar County,48029


### Videos Uploaded from each State

We group by state to plot a choropleth map

In [15]:
df_state_counts = df.copy()
df_state_counts['count'] = 1
df_state_counts = df_state_counts[['state', 'count']].groupby('state').sum()
df_state_counts = df_state_counts.reset_index()
df_state_counts.head()

Unnamed: 0,state,count
0,Alabama,637
1,Alaska,131
2,Arizona,1955
3,Arkansas,555
4,California,6249


We load the geojson files from github.

In [16]:
url_git = "https://raw.githubusercontent.com/python-visualization/folium/master"
url_state_map = f"{url_git}/examples/data/us-states.json"

In [17]:
m_counts = folium.Map(location=[40, -95], zoom_start=4)

folium.Choropleth(
    geo_data=url_state_map,
    name="choropleth",
    data=df_state_counts,
    columns=["state", "count"],
    key_on="feature.properties.name",
    fill_color="YlGn",
    fill_opacity=0.7,
    line_opacity=.1,
    highlight=True,
    legend_name="Videos Coming From Each State",
).add_to(m_counts)

folium.LayerControl().add_to(m_counts)

m_counts

### Heatmap of Datapoints

In [18]:
usa_map_setup = {
    'lat': 37.0902,
    'lon': -95.7129,
    'zoom': 4,
}

heatmap_usa = folium.Map(
    location=[usa_map_setup['lat'], usa_map_setup['lon']],
    zoom_start=usa_map_setup['zoom'])
usa_heatmap_data = [[row['lat'], row['lon']] for index, row in df.iterrows()]

folium_plugins.HeatMap(
    usa_heatmap_data,
    radius=5,
    gradient={0.2: 'blue', 0.3: 'lime', 0.4: 'red'},
).add_to(heatmap_usa)

heatmap_usa