# Create Weekly Exports

#### Primary Author
Chris Carey

#### Description:
This notebook drops unneeded columns from point of interest (POI) weekly trip data, exports the result, and then inverts the trip data to produce weekly trip data grouped by home census block group (CBG).
 
#### Inputs:
```
data/cbg_attr_and_cluster_1021.csv
data/device_count_nyc_weekly.csv
data/weekly_and_core_with_area.csv
data/weekly_trips_by_cbg.csv
```
 
#### Outputs:
```
exports/home_weekly.csv
exports/poi_weekly.csv
```

In [1]:
import pandas as pd

import warnings
warnings.filterwarnings('ignore')

In [2]:
def peek(df):
    """Display the first three rows of ``df`` and print its number of rows."""
    preview = df.iloc[:3]
    display(preview)
    row_count = len(df)
    print(row_count)
    
def repair_dates(df):
    """Convert ``df['date_range_start']`` to US/Eastern timestamps in place.

    The column is parsed as timezone-aware UTC datetimes and then converted
    to the 'US/Eastern' timezone. ``df`` is modified in place and nothing is
    returned.
    """
    parsed_utc = pd.to_datetime(df['date_range_start'], utc=True)
    df['date_range_start'] = parsed_utc.dt.tz_convert('US/Eastern')

## Export Weekly Trip Data by POI CBG

In [3]:
# Load only the columns this notebook uses. The remaining columns of the
# source file are never read, so skipping them at parse time saves memory and
# avoids read-time warnings about columns that are discarded anyway.
poi_df = pd.read_csv(
    './data/weekly_and_core_with_area.csv',
    usecols=[
        'placekey', 'poi_cbg', 'date_range_start', 'date_range_end',
        'raw_visit_counts', 'area_square_feet'
    ])

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
# Reduce the POI table to its identifying columns plus contact density
# (raw visit count divided by POI area in square feet). The new column is
# added with assign() on the selection rather than by assigning onto a
# column-subset slice, so the cell does not depend on suppressed
# SettingWithCopy warnings.
poi_df = (
    poi_df[['placekey', 'poi_cbg', 'date_range_start',
            'raw_visit_counts', 'area_square_feet']]
    .assign(contact_density=lambda frame: (frame['raw_visit_counts'] /
                                           frame['area_square_feet']))
    .drop(columns=['raw_visit_counts', 'area_square_feet'])
)

In [5]:
# Parse date_range_start and convert it to US/Eastern (modifies poi_df in place).
repair_dates(poi_df)

In [6]:
# Order rows by POI census block group, then by week start, and preview them.
poi_sort_keys = ['poi_cbg', 'date_range_start']
poi_df = poi_df.sort_values(poi_sort_keys)
peek(poi_df)

Unnamed: 0,placekey,poi_cbg,date_range_start,contact_density
2818653,22b-222@627-rwr-4n5,360050002001,2018-12-31 00:00:00-05:00,0.039394
2818654,22b-222@627-rwr-4n5,360050002001,2019-01-07 00:00:00-05:00,0.026984
2818655,22b-222@627-rwr-4n5,360050002001,2019-01-14 00:00:00-05:00,0.019014


3693204


In [7]:
# Write the per-POI weekly contact density table without the index column.
# (Plain string path: the original f-string had no placeholders.)
poi_df.to_csv('./exports/poi_weekly.csv', index=False)

## Prepare Device Count Data

In [8]:
# Load the weekly device count table.
dc_df = pd.read_csv(
    filepath_or_buffer='./data/device_count_nyc_weekly.csv',
)

In [9]:
# Key device counts on home_cbg and discard the week-end column.
dc_df = (
    dc_df
    .rename(columns={'origin_census_block_group': 'home_cbg'})
    .drop(columns=['date_range_end'])
)

In [10]:
# Parse date_range_start and convert it to US/Eastern (modifies dc_df in place).
repair_dates(dc_df)

## Export Weekly Trip Data by Home CBG

In [11]:
# Load the weekly trips-by-CBG table.
trips_df = pd.read_csv(
    filepath_or_buffer='./data/weekly_trips_by_cbg.csv',
)

In [12]:
# Key trips on home_cbg, discard columns not used below, then fix the dates.
trips_df = (
    trips_df
    .rename(columns={'visitor_home_cbg_id': 'home_cbg'})
    .drop(columns=['poi_cbg', 'date_range_end', 'distance'])
)
repair_dates(trips_df)
peek(trips_df)

Unnamed: 0,placekey,date_range_start,home_cbg,visitor_count
0,222-222@627-s94-nwk,2020-12-21 00:00:00-05:00,360470399002,4
1,222-222@627-s94-nwk,2020-12-21 00:00:00-05:00,360471142022,4
2,222-222@627-s94-nwk,2020-12-21 00:00:00-05:00,360470395002,4


27839727


In [13]:
# Attach the weekly device count of each home CBG to its trips. The inner
# join keeps only trips whose (home_cbg, week) pair has a device count row.
merged_df = pd.merge(
    trips_df,
    dc_df,
    how='inner',
    on=['home_cbg', 'date_range_start'],
)
peek(merged_df)

Unnamed: 0,placekey,date_range_start,home_cbg,visitor_count,device_count
0,222-222@627-s94-nwk,2020-12-21 00:00:00-05:00,360470399002,4,192
1,22m-222@627-s96-pqf,2020-12-21 00:00:00-05:00,360470399002,5,192
2,22d-222@627-s7r-49z,2020-12-21 00:00:00-05:00,360470399002,4,192


27825638


In [15]:
# Load the CBG attribute table.
population_df = pd.read_csv(
    filepath_or_buffer='./data/cbg_attr_and_cluster_1021.csv',
)

In [16]:
# Keep only the CBG identifier and its total population, renamed to match
# the column names used by the trip tables.
population_df = (
    population_df
    .loc[:, ['census_block_group', 'Total Population']]
    .rename(columns={
        'census_block_group': 'home_cbg',
        'Total Population': 'population',
    })
)

In [17]:
# Preview the first rows and the row count of the population table.
peek(population_df)

Unnamed: 0,home_cbg,population
0,360050001001,6864
1,360050002001,1744
2,360050002002,1569


6221


In [18]:
# Attach each home CBG's population; the inner join keeps only trips whose
# home_cbg appears in the population table.
est_df = pd.merge(merged_df, population_df, how='inner', on=['home_cbg'])

In [19]:
# Estimated visitors = visitor_count * population / device_count.
# The multiplication is done first, matching the original evaluation order.
visitors_times_population = est_df['visitor_count'] * est_df['population']
est_df['estimated_visitor_count'] = (visitors_times_population /
                                     est_df['device_count'])
peek(est_df)

Unnamed: 0,placekey,date_range_start,home_cbg,visitor_count,device_count,population,estimated_visitor_count
0,222-222@627-s94-nwk,2020-12-21 00:00:00-05:00,360470399002,4,192,1487,30.979167
1,22m-222@627-s96-pqf,2020-12-21 00:00:00-05:00,360470399002,5,192,1487,38.723958
2,22d-222@627-s7r-49z,2020-12-21 00:00:00-05:00,360470399002,4,192,1487,30.979167


27738422


In [20]:
# The scaling inputs are no longer needed once the estimate is computed.
est_df = est_df.drop(['device_count', 'population'], axis='columns')

In [21]:
# Total the raw and estimated visitor counts for each home CBG and week.
weekly_home_groups = est_df.groupby(['home_cbg', 'date_range_start'])
total_df = weekly_home_groups.agg({
    'visitor_count': 'sum',
    'estimated_visitor_count': 'sum',
}).reset_index()
peek(total_df)

Unnamed: 0,home_cbg,date_range_start,visitor_count,estimated_visitor_count
0,360050001001,2018-12-31 00:00:00-05:00,16,1056.0
1,360050001001,2019-01-07 00:00:00-05:00,8,406.755556
2,360050001001,2019-01-14 00:00:00-05:00,20,1064.186047


680133


In [22]:
# Put each trip row next to the totals of its home CBG and week. Overlapping
# column names get '_to_poi' (per-trip value) and '_to_all' (weekly total).
pct_df = pd.merge(
    est_df,
    total_df,
    on=['home_cbg', 'date_range_start'],
    how='left',
    suffixes=('_to_poi', '_to_all'),
)
peek(pct_df)

Unnamed: 0,placekey,date_range_start,home_cbg,visitor_count_to_poi,estimated_visitor_count_to_poi,visitor_count_to_all,estimated_visitor_count_to_all
0,222-222@627-s94-nwk,2020-12-21 00:00:00-05:00,360470399002,4,30.979167,41,317.536458
1,22m-222@627-s96-pqf,2020-12-21 00:00:00-05:00,360470399002,5,38.723958,41,317.536458
2,22d-222@627-s7r-49z,2020-12-21 00:00:00-05:00,360470399002,4,30.979167,41,317.536458


27738422


In [23]:
# Share of the home CBG's weekly visitors that went to this POI, for both
# the raw and the estimated counts.
raw_share = pct_df['visitor_count_to_poi'] / pct_df['visitor_count_to_all']
estimated_share = (pct_df['estimated_visitor_count_to_poi'] /
                   pct_df['estimated_visitor_count_to_all'])
pct_df['pct_visitor_count'] = raw_share
pct_df['pct_estimated_visitor_count'] = estimated_share

In [24]:
# Order rows by home census block group, then by week start.
home_sort_keys = ['home_cbg', 'date_range_start']
pct_df = pct_df.sort_values(home_sort_keys)

In [25]:
# Sanity check: within each home CBG and week, the per-POI counts should add
# up to the stored totals and the percentage columns should add up to 1.
check_aggregations = {
    'visitor_count_to_poi': 'sum',
    'estimated_visitor_count_to_poi': 'sum',
    'visitor_count_to_all': 'first',
    'estimated_visitor_count_to_all': 'first',
    'pct_visitor_count': 'sum',
    'pct_estimated_visitor_count': 'sum',
}
pct_check_groups = pct_df.groupby(['home_cbg', 'date_range_start'])
pct_check_df = pct_check_groups.agg(check_aggregations).reset_index()
peek(pct_check_df)

Unnamed: 0,home_cbg,date_range_start,visitor_count_to_poi,estimated_visitor_count_to_poi,visitor_count_to_all,estimated_visitor_count_to_all,pct_visitor_count,pct_estimated_visitor_count
0,360050001001,2018-12-31 00:00:00-05:00,16,1056.0,16,1056.0,1.0,1.0
1,360050001001,2019-01-07 00:00:00-05:00,8,406.755556,8,406.755556,1.0,1.0
2,360050001001,2019-01-14 00:00:00-05:00,20,1064.186047,20,1064.186047,1.0,1.0


680133


In [26]:
# Replace missing values with 0, restore the plain per-POI column names, and
# discard the weekly totals now that the percentages are computed.
pct_df = (
    pct_df
    .fillna(0)
    .rename(columns={
        'visitor_count_to_poi': 'visitor_count',
        'estimated_visitor_count_to_poi': 'estimated_visitor_count',
    })
    .drop(columns=['visitor_count_to_all', 'estimated_visitor_count_to_all'])
)

In [27]:
# Preview the first rows and the row count of the percentage table.
peek(pct_df)

Unnamed: 0,placekey,date_range_start,home_cbg,visitor_count,estimated_visitor_count,pct_visitor_count,pct_estimated_visitor_count
4722183,236-222@627-s7z-b6k,2018-12-31 00:00:00-05:00,360050001001,4,264.0,0.25,0.25
4722184,223-222@627-wg7-7h5,2018-12-31 00:00:00-05:00,360050001001,4,264.0,0.25,0.25
4722185,22v-222@627-wg4-t5f,2018-12-31 00:00:00-05:00,360050001001,4,264.0,0.25,0.25


27738422


In [28]:
# Attach each POI's weekly contact density (and poi_cbg) to the trip rows;
# the inner join keeps only rows whose (placekey, week) exists in poi_df.
cd_df = pd.merge(pct_df, poi_df, on=['placekey', 'date_range_start'], how='inner')

In [29]:
# Replace any missing values in the merged table with 0, then preview it.
cd_df = cd_df.fillna(value=0)
peek(cd_df)

Unnamed: 0,placekey,date_range_start,home_cbg,visitor_count,estimated_visitor_count,pct_visitor_count,pct_estimated_visitor_count,poi_cbg,contact_density
0,236-222@627-s7z-b6k,2018-12-31 00:00:00-05:00,360050001001,4,264.0,0.25,0.25,360471194001,0.076046
1,236-222@627-s7z-b6k,2018-12-31 00:00:00-05:00,360050210021,4,6.697436,0.013158,0.013158,360471194001,0.076046
2,236-222@627-s7z-b6k,2018-12-31 00:00:00-05:00,360470285021,4,7.414691,0.007273,0.007273,360471194001,0.076046


27738422


In [30]:
# cdi = contact_density * estimated_visitor_count; the POI-level inputs are
# dropped once the product is stored.
cd_df = (
    cd_df
    .assign(cdi=cd_df['contact_density'] * cd_df['estimated_visitor_count'])
    .drop(columns=['poi_cbg', 'contact_density'])
)
peek(cd_df)

Unnamed: 0,placekey,date_range_start,home_cbg,visitor_count,estimated_visitor_count,pct_visitor_count,pct_estimated_visitor_count,cdi
0,236-222@627-s7z-b6k,2018-12-31 00:00:00-05:00,360050001001,4,264.0,0.25,0.25,20.076046
1,236-222@627-s7z-b6k,2018-12-31 00:00:00-05:00,360050210021,4,6.697436,0.013158,0.013158,0.509311
2,236-222@627-s7z-b6k,2018-12-31 00:00:00-05:00,360470285021,4,7.414691,0.007273,0.007273,0.563855


27738422


In [31]:
# Write the per-home-CBG weekly trip table without the index column.
# (Plain string path: the original f-string had no placeholders.)
cd_df.to_csv('./exports/home_weekly.csv', index=False)