In [1]:
import pandas as pd

In [34]:
# Load only the first 100,000 rows for now. This sample is used to prototype
# the preprocessing steps; once they are settled, the same steps are applied
# to the complete file chunk by chunk.
data = pd.read_csv('social_dist_all_trimmed_new.csv', nrows=100_000)

In this notebook we extract the columns "median_home_dwell_time", "full_time_work_behavior_devices", and "part_time_work_behavior_devices", because we confirmed earlier that these data are reliable enough for analysis (unlike distance_traveled, which is full of outliers).

In [35]:
data.head()

Unnamed: 0.1,Unnamed: 0,date_range_start,date_range_end,state,state_code,cnamelong,county_code,origin_census_block_group,candidate_device_count,device_count,completely_home_device_count,part_time_work_behavior_devices,full_time_work_behavior_devices,delivery_behavior_devices,median_home_dwell_time,median_non_home_dwell_time,median_percentage_time_home,distance_traveled_from_home
0,0,2019-01-01T00:00:00-06:00,2019-01-02T00:00:00-06:00,1.0,AL,Colbert County,1033,10330210004,179,78,28,7,1,1,714,52,92,687
1,1,2019-01-01T00:00:00-06:00,2019-01-02T00:00:00-06:00,1.0,AL,Jefferson County,1073,10730049022,1312,259,93,13,2,1,58,44,53,5381
2,2,2019-01-01T00:00:00-06:00,2019-01-02T00:00:00-06:00,1.0,AL,Talladega County,1121,11210118001,299,91,45,3,1,1,487,0,100,1591
3,3,2019-01-01T00:00:00-06:00,2019-01-02T00:00:00-06:00,1.0,AL,Tuscaloosa County,1125,11250106021,1059,392,171,26,8,1,860,20,97,4623
4,4,2019-01-01T00:00:00-09:00,2019-01-02T00:00:00-09:00,2.0,AK,Northwest Arctic Borough,2188,21880002003,36,10,3,1,1,1,1197,10,99,0


Even from the first five rows we can see that median_home_dwell_time varies widely between census block groups -- one has 58, while others are close to or above 1000. Using the raw median_home_dwell_time for our analysis is therefore probably a bad choice. Instead, we want to calculate the median_percentage_time_home ourselves and use that as our variable for PSM. That means we need the columns 'median_home_dwell_time' and 'median_non_home_dwell_time'.

In [36]:
# Discard, in place, the columns that the rest of the analysis does not use.
data.drop(
    columns=[
        'Unnamed: 0',
        'date_range_end',
        'origin_census_block_group',
        'candidate_device_count',
        'completely_home_device_count',
        'delivery_behavior_devices',
        'median_percentage_time_home',
        'distance_traveled_from_home',
    ],
    inplace=True,
)

In [37]:
data.head()

Unnamed: 0,date_range_start,state,state_code,cnamelong,county_code,device_count,part_time_work_behavior_devices,full_time_work_behavior_devices,median_home_dwell_time,median_non_home_dwell_time
0,2019-01-01T00:00:00-06:00,1.0,AL,Colbert County,1033,78,7,1,714,52
1,2019-01-01T00:00:00-06:00,1.0,AL,Jefferson County,1073,259,13,2,58,44
2,2019-01-01T00:00:00-06:00,1.0,AL,Talladega County,1121,91,3,1,487,0
3,2019-01-01T00:00:00-06:00,1.0,AL,Tuscaloosa County,1125,392,26,8,860,20
4,2019-01-01T00:00:00-09:00,2.0,AK,Northwest Arctic Borough,2188,10,1,1,1197,10


Now we want to aggregate all the rows of the same county on the same date together. We'll treat medians like means: for each row we multiply median_home_dwell_time by device_count, and then add these products together. In the end we will divide this sum by the number of devices in that specific county.

In [38]:
# Device-weighted home dwell time: the block group's median scaled by its device count.
data['home_dwell_time'] = data['median_home_dwell_time'].mul(data['device_count'])

In [39]:
# Device-weighted non-home dwell time, built the same way as home_dwell_time.
data['non_home_dwell_time'] = data['median_non_home_dwell_time'].mul(data['device_count'])

In [47]:
# Preview the aggregation: one row per (date, county), with the device counts
# and the device-weighted dwell times summed over the county's rows.
(
    data
    .groupby(['date_range_start', 'state', 'state_code', 'cnamelong', 'county_code'])
    .agg(
        device_count=('device_count', 'sum'),
        part_time_work_behavior_devices=('part_time_work_behavior_devices', 'sum'),
        full_time_work_behavior_devices=('full_time_work_behavior_devices', 'sum'),
        home_dwell_time=('home_dwell_time', 'sum'),
        non_home_dwell_time=('non_home_dwell_time', 'sum'),
    )
    .reset_index()
)

Unnamed: 0,date_range_start,state,state_code,cnamelong,county_code,device_count,part_time_work_behavior_devices,full_time_work_behavior_devices,home_dwell_time,non_home_dwell_time
0,2019-01-01T00:00:00+10:00,66.0,GU,Guam,66010,1328,100,95,869872,6444
1,2019-01-01T00:00:00+10:00,69.0,MP,Rota Municipality,69100,9,1,1,2943,0
2,2019-01-01T00:00:00+10:00,69.0,MP,Saipan Municipality,69110,394,21,19,210899,4480
3,2019-01-01T00:00:00+10:00,69.0,MP,Tinian Municipality,69120,8,1,1,3440,0
4,2019-01-01T00:00:00-04:00,23.0,ME,Washington County,23029,106,3,2,53770,1038
...,...,...,...,...,...,...,...,...,...,...
3147,2019-01-01T00:00:00-10:00,15.0,HI,Honolulu County,15003,26668,870,306,17196806,1329406
3148,2019-01-01T00:00:00-10:00,15.0,HI,Kauai County,15007,1235,64,17,591858,65443
3149,2019-01-01T00:00:00-10:00,15.0,HI,Maui County,15009,4912,212,56,3065207,201779
3150,2019-01-01T00:00:00-11:00,60.0,AS,Eastern District,60010,26,3,3,15948,1120


In [48]:
# Same aggregation as previewed above, this time kept in groupByData.
# Every aggregated column is a plain sum, so the named aggregations are
# generated from the list of column names.
groupByData = (
    data
    .groupby(['date_range_start', 'state', 'state_code', 'cnamelong', 'county_code'])
    .agg(**{
        column: (column, 'sum')
        for column in [
            'device_count',
            'part_time_work_behavior_devices',
            'full_time_work_behavior_devices',
            'home_dwell_time',
            'non_home_dwell_time',
        ]
    })
    .reset_index()
)

Now we can apply this process to all rows.

In [42]:
from datetime import datetime

In [77]:
# Start from an empty frame that has exactly groupByData's columns; the
# processed chunks are accumulated into it.
aggregatedData = pd.DataFrame(columns=groupByData.columns)

In [78]:
aggregatedData

Unnamed: 0,date_range_start,state,state_code,cnamelong,county_code,device_count,part_time_work_behavior_devices,full_time_work_behavior_devices,home_dwell_time,non_home_dwell_time


In [79]:
# First attempt at the chunked pipeline over the whole CSV: clean each
# 1,000,000-row chunk, append it to the running result and re-aggregate, so
# aggregatedData always holds one row per (date, county) seen so far.
# NOTE: this version contains a bug in the two dwell-time products below
# (they use data['device_count'] instead of chunk['device_count']); the
# outputs recorded for this cell therefore have wrong dwell-time totals.
counter = 0
for chunk in pd.read_csv("social_dist_all_trimmed_new.csv", chunksize = 1000000):
    # drop the columns we don't need.
    chunk.drop(['Unnamed: 0', 'date_range_end', 'origin_census_block_group', 
                'candidate_device_count', 'completely_home_device_count', 
                'delivery_behavior_devices', 'median_percentage_time_home',
                'distance_traveled_from_home'], axis = 1, inplace = True)
    
    # convert chunk's columns to the right datatype.
    # errors='coerce' turns any value that cannot be parsed into NaN.
    chunk['device_count'] = pd.to_numeric(chunk['device_count'], errors='coerce')
    chunk['part_time_work_behavior_devices'] = pd.to_numeric(chunk['part_time_work_behavior_devices'], errors='coerce')
    chunk['full_time_work_behavior_devices'] = pd.to_numeric(chunk['full_time_work_behavior_devices'], errors='coerce')
    chunk['median_home_dwell_time'] = pd.to_numeric(chunk['median_home_dwell_time'], errors='coerce')
    chunk['median_non_home_dwell_time'] = pd.to_numeric(chunk['median_non_home_dwell_time'], errors='coerce')
    chunk['county_code'] = pd.to_numeric(chunk['county_code'], errors='coerce')
    chunk['state'] = pd.to_numeric(chunk['state'], errors='coerce')
    # drop the null rows (get rid of the row that is header)
    # also gets rid of 2 counties whose FIPS code is null.
    chunk.dropna(inplace = True)
    # convert string column to datetime object
    # (only the first 10 characters, i.e. the YYYY-MM-DD date, are parsed;
    # the time and UTC offset are discarded)
    chunk['date_range_start'] = chunk['date_range_start'].apply(lambda x: datetime.strptime(x[0:10], '%Y-%m-%d'))
    # get the total home_dwell time + total non_home_dwell time
    # BUG: `data` is the 100,000-row sample frame loaded earlier, not this chunk.
    # pandas aligns the two Series on their index labels, so chunk rows whose
    # label is absent from `data` become NaN (and are skipped by the sum below).
    # Both lines should multiply by chunk['device_count'].
    chunk['home_dwell_time'] = chunk['median_home_dwell_time'] * data['device_count']
    chunk['non_home_dwell_time'] = chunk['median_non_home_dwell_time'] * data['device_count']
    
    chunk.drop(['median_home_dwell_time', 'median_non_home_dwell_time'], axis = 1, inplace = True)
    
    # groupby and aggregate data of the same county of the same date.
    # NOTE(review): DataFrame.append is deprecated and was removed in pandas 2.0;
    # pd.concat is the replacement.
    aggregatedData = aggregatedData.append(chunk, ignore_index = True)
    
    # Re-aggregating after every chunk merges counties whose rows are split
    # across chunk boundaries and keeps the accumulated frame small.
    aggregatedData = aggregatedData.groupby(['date_range_start', 'state', 'state_code', 'cnamelong', 'county_code'])\
                        .agg(device_count = ('device_count', 'sum'),\
                             part_time_work_behavior_devices = ('part_time_work_behavior_devices', 'sum'),
                             full_time_work_behavior_devices = ('full_time_work_behavior_devices', 'sum'),\
                             home_dwell_time = ('home_dwell_time', 'sum'), \
                             non_home_dwell_time = ('non_home_dwell_time', 'sum')).reset_index()
    
    print(counter) # track progress
    print(aggregatedData.shape[0]) # how large is our data right now
    counter += 1

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


0
16078
1
31642
2
45136
3
61012
4
74182
5
90190
6
103226
7
119274
8
132264
9
148347
10
163825
11
177399
12
193246
13
206447
14
222430
15
235492
16
251533
17
264531
18
280606
19
295997
20
309664
21
325489
22
338709
23
354688
24
367752
25
383789
26
396787
27
412857
28
428225
29
441896
30
457720
31
470943
32
486931
33
499984
34
516029
35
529020
36
545091
37
560551
38
574151
39
590011
40
603191
41
619198
42
632236
43
648282
44
661988
45
677344
46
692973
47
706397
48
722305
49
735441
50
751465
51
764477
52
780531
53
795342
54
809585
55
825311
56
838642
57
854583
58
867690
59
883726
60
896726
61
912784
62
928123
63
941832
64
957699
65
970862
66
986867
67
999887
68
1015942
69
1031267
70
1044986
71
1060864
72
1074027
73
1090036
74
1103047
75
1119100
76
1134233
77
1148166
78
1163993
79
1177214
80
1193181
81
1206260
82
1222307
83
1235299
84
1251375
85
1266898
86
1280432
87
1296308
88
1309465
89
1325449
90
1338475
91
1354518
92
1369029
93
1383565
94
1399231
95
1412621
96
1428537
97
1441670
98
145

In [80]:
aggregatedData.head()

Unnamed: 0,date_range_start,state,state_code,cnamelong,county_code,device_count,part_time_work_behavior_devices,full_time_work_behavior_devices,home_dwell_time,non_home_dwell_time
0,2019-01-01,1.0,AL,Autauga County,1001.0,4708.0,296.0,66.0,1768723.0,84592.0
1,2019-01-01,1.0,AL,Baldwin County,1003.0,19655.0,1125.0,340.0,6893812.0,323146.0
2,2019-01-01,1.0,AL,Barbour County,1005.0,1570.0,84.0,27.0,395535.0,21684.0
3,2019-01-01,1.0,AL,Bibb County,1007.0,1702.0,102.0,21.0,374234.0,20531.0
4,2019-01-01,1.0,AL,Blount County,1009.0,5224.0,315.0,84.0,1826813.0,134819.0


In [81]:
aggregatedData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1889786 entries, 0 to 1889785
Data columns (total 10 columns):
 #   Column                           Dtype         
---  ------                           -----         
 0   date_range_start                 datetime64[ns]
 1   state                            float64       
 2   state_code                       object        
 3   cnamelong                        object        
 4   county_code                      float64       
 5   device_count                     float64       
 6   part_time_work_behavior_devices  float64       
 7   full_time_work_behavior_devices  float64       
 8   home_dwell_time                  float64       
 9   non_home_dwell_time              float64       
dtypes: datetime64[ns](1), float64(7), object(2)
memory usage: 144.2+ MB


Now let's check that our aggregation is right by repeating the calculation on the first million rows.

In [82]:
# Load the first 1,000,000 rows in one go; they are aggregated in a single
# pass and used as the reference for checking the chunked result.
firstMillionRows = pd.read_csv('social_dist_all_trimmed_new.csv', nrows=1_000_000)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [83]:
firstMillionRows.head()

Unnamed: 0.1,Unnamed: 0,date_range_start,date_range_end,state,state_code,cnamelong,county_code,origin_census_block_group,candidate_device_count,device_count,completely_home_device_count,part_time_work_behavior_devices,full_time_work_behavior_devices,delivery_behavior_devices,median_home_dwell_time,median_non_home_dwell_time,median_percentage_time_home,distance_traveled_from_home
0,0.0,2019-01-01T00:00:00-06:00,2019-01-02T00:00:00-06:00,1,AL,Colbert County,1033,10330210004,179,78,28,7,1,1,714,52,92,687
1,1.0,2019-01-01T00:00:00-06:00,2019-01-02T00:00:00-06:00,1,AL,Jefferson County,1073,10730049022,1312,259,93,13,2,1,58,44,53,5381
2,2.0,2019-01-01T00:00:00-06:00,2019-01-02T00:00:00-06:00,1,AL,Talladega County,1121,11210118001,299,91,45,3,1,1,487,0,100,1591
3,3.0,2019-01-01T00:00:00-06:00,2019-01-02T00:00:00-06:00,1,AL,Tuscaloosa County,1125,11250106021,1059,392,171,26,8,1,860,20,97,4623
4,4.0,2019-01-01T00:00:00-09:00,2019-01-02T00:00:00-09:00,2,AK,Northwest Arctic Borough,2188,21880002003,36,10,3,1,1,1,1197,10,99,0


In [84]:
# Discard, in place, the same unused columns that were removed from the sample.
firstMillionRows.drop(
    columns=[
        'Unnamed: 0',
        'date_range_end',
        'origin_census_block_group',
        'candidate_device_count',
        'completely_home_device_count',
        'delivery_behavior_devices',
        'median_percentage_time_home',
        'distance_traveled_from_home',
    ],
    inplace=True,
)

In [85]:
# Coerce each numeric column to a number; values that cannot be parsed become NaN.
for numeric_column in [
    'device_count',
    'part_time_work_behavior_devices',
    'full_time_work_behavior_devices',
    'median_home_dwell_time',
    'median_non_home_dwell_time',
    'county_code',
    'state',
]:
    firstMillionRows[numeric_column] = pd.to_numeric(
        firstMillionRows[numeric_column], errors='coerce'
    )

# Remove every row that contains a null. Per the author's notes this removes
# the rows that are repeated headers, and also 2 counties whose FIPS code is null.
firstMillionRows.dropna(inplace=True)

In [86]:
# Parse only the calendar date (the first 10 characters, YYYY-MM-DD) of each
# timestamp string; the time of day and UTC offset are discarded.
firstMillionRows['date_range_start'] = firstMillionRows['date_range_start'].map(
    lambda stamp: datetime.strptime(stamp[:10], '%Y-%m-%d')
)

# Device-weighted totals: each median dwell time scaled by the row's device count.
for dwell_column in ('home_dwell_time', 'non_home_dwell_time'):
    firstMillionRows[dwell_column] = (
        firstMillionRows['median_' + dwell_column] * firstMillionRows['device_count']
    )

# The raw medians are no longer needed once the weighted totals exist.
firstMillionRows.drop(
    columns=['median_home_dwell_time', 'median_non_home_dwell_time'],
    inplace=True,
)

In [87]:
# Collapse to one row per (date, county) by summing every count and weighted total.
firstMillionRows = (
    firstMillionRows
    .groupby(['date_range_start', 'state', 'state_code', 'cnamelong', 'county_code'])
    .agg(
        device_count=('device_count', 'sum'),
        part_time_work_behavior_devices=('part_time_work_behavior_devices', 'sum'),
        full_time_work_behavior_devices=('full_time_work_behavior_devices', 'sum'),
        home_dwell_time=('home_dwell_time', 'sum'),
        non_home_dwell_time=('non_home_dwell_time', 'sum'),
    )
    .reset_index()
)

In [88]:
firstMillionRows.head()

Unnamed: 0,date_range_start,state,state_code,cnamelong,county_code,device_count,part_time_work_behavior_devices,full_time_work_behavior_devices,home_dwell_time,non_home_dwell_time
0,2019-01-01,1.0,AL,Autauga County,1001.0,4708.0,296.0,66.0,4019193.0,162117.0
1,2019-01-01,1.0,AL,Baldwin County,1003.0,19655.0,1125.0,340.0,14443397.0,684136.0
2,2019-01-01,1.0,AL,Barbour County,1005.0,1570.0,84.0,27.0,1009240.0,66210.0
3,2019-01-01,1.0,AL,Bibb County,1007.0,1702.0,102.0,21.0,1450137.0,87795.0
4,2019-01-01,1.0,AL,Blount County,1009.0,5224.0,315.0,84.0,4620005.0,261769.0


In [89]:
aggregatedData.head()

Unnamed: 0,date_range_start,state,state_code,cnamelong,county_code,device_count,part_time_work_behavior_devices,full_time_work_behavior_devices,home_dwell_time,non_home_dwell_time
0,2019-01-01,1.0,AL,Autauga County,1001.0,4708.0,296.0,66.0,1768723.0,84592.0
1,2019-01-01,1.0,AL,Baldwin County,1003.0,19655.0,1125.0,340.0,6893812.0,323146.0
2,2019-01-01,1.0,AL,Barbour County,1005.0,1570.0,84.0,27.0,395535.0,21684.0
3,2019-01-01,1.0,AL,Bibb County,1007.0,1702.0,102.0,21.0,374234.0,20531.0
4,2019-01-01,1.0,AL,Blount County,1009.0,5224.0,315.0,84.0,1826813.0,134819.0


... It seems like we have all the other columns right but we don't have the right home_dwell_time and non_home_dwell_time... what is going on???

Let's try the chunked process with the first million rows, and see if the result for home_dwell_time aligns.

In [78]:
# Reload the 100,000-row sample; it is needed here to rebuild the column
# headers used for aggregatedData.
data = pd.read_csv('social_dist_all_trimmed_new.csv', nrows=100_000)

In [79]:
# Discard, in place, the columns that the rest of the analysis does not use.
data.drop(
    columns=[
        'Unnamed: 0',
        'date_range_end',
        'origin_census_block_group',
        'candidate_device_count',
        'completely_home_device_count',
        'delivery_behavior_devices',
        'median_percentage_time_home',
        'distance_traveled_from_home',
    ],
    inplace=True,
)

In [80]:
# Device-weighted home dwell time: the block group's median scaled by its device count.
data['home_dwell_time'] = data['median_home_dwell_time'].mul(data['device_count'])

In [81]:
# Device-weighted non-home dwell time, built the same way as home_dwell_time.
data['non_home_dwell_time'] = data['median_non_home_dwell_time'].mul(data['device_count'])

In [82]:
# Aggregate the sample to one row per (date, county); the resulting column
# layout is what aggregatedData is initialised from.
groupByData = (
    data
    .groupby(['date_range_start', 'state', 'state_code', 'cnamelong', 'county_code'])
    .agg(**{
        column: (column, 'sum')
        for column in [
            'device_count',
            'part_time_work_behavior_devices',
            'full_time_work_behavior_devices',
            'home_dwell_time',
            'non_home_dwell_time',
        ]
    })
    .reset_index()
)

In [98]:
# Reset the accumulator to an empty frame with groupByData's columns.
aggregatedData = pd.DataFrame(columns=groupByData.columns)

In [99]:
aggregatedData

Unnamed: 0,date_range_start,state,state_code,cnamelong,county_code,device_count,part_time_work_behavior_devices,full_time_work_behavior_devices,home_dwell_time,non_home_dwell_time


In [100]:
# Debug run of the chunked pipeline, limited to the first 1,000,000 rows read
# as ten chunks of 100,000, so that the result can be compared with the
# single-pass aggregation of the same rows.
# NOTE: this is still the buggy version (see the two dwell-time products
# below), so the dwell-time totals it produces are wrong.
counter = 0
for chunk in pd.read_csv("social_dist_all_trimmed_new.csv", chunksize = 100000, nrows = 1000000):
    # drop the columns we don't need.
    chunk.drop(['Unnamed: 0', 'date_range_end', 'origin_census_block_group', 
                'candidate_device_count', 'completely_home_device_count', 
                'delivery_behavior_devices', 'median_percentage_time_home',
                'distance_traveled_from_home'], axis = 1, inplace = True)
    
    # convert chunk's columns to the right datatype.
    # errors='coerce' turns any value that cannot be parsed into NaN.
    chunk['device_count'] = pd.to_numeric(chunk['device_count'], errors='coerce')
    chunk['part_time_work_behavior_devices'] = pd.to_numeric(chunk['part_time_work_behavior_devices'], errors='coerce')
    chunk['full_time_work_behavior_devices'] = pd.to_numeric(chunk['full_time_work_behavior_devices'], errors='coerce')
    chunk['median_home_dwell_time'] = pd.to_numeric(chunk['median_home_dwell_time'], errors='coerce')
    chunk['median_non_home_dwell_time'] = pd.to_numeric(chunk['median_non_home_dwell_time'], errors='coerce')
    chunk['county_code'] = pd.to_numeric(chunk['county_code'], errors='coerce')
    chunk['state'] = pd.to_numeric(chunk['state'], errors='coerce')
    # drop the null rows (get rid of the row that is header)
    # also gets rid of 2 counties whose FIPS code is null.
    chunk.dropna(inplace = True)
    # convert string column to datetime object
    # (only the first 10 characters, i.e. the YYYY-MM-DD date, are parsed)
    chunk['date_range_start'] = chunk['date_range_start'].apply(lambda x: datetime.strptime(x[0:10], '%Y-%m-%d'))
    # get the total home_dwell time + total non_home_dwell time
    # BUG: `data` is the 100,000-row sample frame, not this chunk. pandas aligns
    # the two Series on their index labels, so chunk rows whose label is absent
    # from `data` become NaN (and are skipped by the sum below).
    # Both lines should multiply by chunk['device_count'].
    chunk['home_dwell_time'] = chunk['median_home_dwell_time'] * data['device_count']
    chunk['non_home_dwell_time'] = chunk['median_non_home_dwell_time'] * data['device_count']
    
    chunk.drop(['median_home_dwell_time', 'median_non_home_dwell_time'], axis = 1, inplace = True)
    
    # groupby and aggregate data of the same county of the same date.
    # NOTE(review): DataFrame.append is deprecated and was removed in pandas 2.0;
    # pd.concat is the replacement.
    aggregatedData = aggregatedData.append(chunk, ignore_index = True)
    
    # Re-aggregating after every chunk merges counties whose rows are split
    # across chunk boundaries.
    aggregatedData = aggregatedData.groupby(['date_range_start', 'state', 'state_code', 'cnamelong', 'county_code'])\
                        .agg(device_count = ('device_count', 'sum'),\
                             part_time_work_behavior_devices = ('part_time_work_behavior_devices', 'sum'),
                             full_time_work_behavior_devices = ('full_time_work_behavior_devices', 'sum'),\
                             home_dwell_time = ('home_dwell_time', 'sum'), \
                             non_home_dwell_time = ('non_home_dwell_time', 'sum')).reset_index()
    
    print(counter) # track progress
    print(aggregatedData.shape[0]) # how large is our data right now
    counter += 1

0
3140
1
3219


  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


2
6330
3
6444
4
9476
5
9657
6
12545
7
12871
8
15411
9
16078


In [101]:
aggregatedData.head()

Unnamed: 0,date_range_start,state,state_code,cnamelong,county_code,device_count,part_time_work_behavior_devices,full_time_work_behavior_devices,home_dwell_time,non_home_dwell_time
0,2019-01-01,1.0,AL,Autauga County,1001.0,4708.0,296.0,66.0,1768723.0,84592.0
1,2019-01-01,1.0,AL,Baldwin County,1003.0,19655.0,1125.0,340.0,6893812.0,323146.0
2,2019-01-01,1.0,AL,Barbour County,1005.0,1570.0,84.0,27.0,395535.0,21684.0
3,2019-01-01,1.0,AL,Bibb County,1007.0,1702.0,102.0,21.0,374234.0,20531.0
4,2019-01-01,1.0,AL,Blount County,1009.0,5224.0,315.0,84.0,1826813.0,134819.0


As we can see, the home_dwell_time in the first row is wrong, but the other columns are good. It has got to be our chunked process that is wrong.

Okay.... I swear I didn't mean for this to happen, but in the line that computes home_dwell_time I wrote
```python
chunk['home_dwell_time'] = chunk['median_home_dwell_time'] * data['device_count']
```
where it should be
```python
chunk['home_dwell_time'] = chunk['median_home_dwell_time'] * chunk['device_count']
```
I cannot believe I didn't spot it. Maybe it was because I was copying the code from what I've done above... 

In [103]:
# Empty the accumulator again before re-running the corrected loop.
aggregatedData = pd.DataFrame(columns=groupByData.columns)

In [104]:
# Corrected chunked pipeline, run on the first 1,000,000 rows (ten chunks of
# 100,000) so the result can be compared with the single-pass aggregation of
# the same rows. Expects `aggregatedData` to be an empty accumulator frame.
group_keys = ['date_range_start', 'state', 'state_code', 'cnamelong', 'county_code']
numeric_columns = ['device_count', 'part_time_work_behavior_devices',
                   'full_time_work_behavior_devices', 'median_home_dwell_time',
                   'median_non_home_dwell_time', 'county_code', 'state']
summed_columns = ['device_count', 'part_time_work_behavior_devices',
                  'full_time_work_behavior_devices', 'home_dwell_time',
                  'non_home_dwell_time']

chunk_reader = pd.read_csv("social_dist_all_trimmed_new.csv", chunksize = 100000, nrows = 1000000)
for counter, chunk in enumerate(chunk_reader):
    # drop the columns we don't need.
    chunk.drop(columns=['Unnamed: 0', 'date_range_end', 'origin_census_block_group',
                        'candidate_device_count', 'completely_home_device_count',
                        'delivery_behavior_devices', 'median_percentage_time_home',
                        'distance_traveled_from_home'], inplace=True)

    # convert chunk's columns to the right datatype; unparseable values become NaN.
    for column in numeric_columns:
        chunk[column] = pd.to_numeric(chunk[column], errors='coerce')
    # drop the null rows (get rid of the row that is header)
    # also gets rid of 2 counties whose FIPS code is null.
    chunk.dropna(inplace=True)

    # Keep only the calendar date (first 10 characters) of each timestamp.
    # Vectorised parsing replaces the per-row datetime.strptime call.
    chunk['date_range_start'] = pd.to_datetime(chunk['date_range_start'].str[:10],
                                               format='%Y-%m-%d')

    # Device-weighted totals. Both factors must come from `chunk`: using another
    # frame here silently misaligns the rows on their index labels.
    chunk['home_dwell_time'] = chunk['median_home_dwell_time'] * chunk['device_count']
    chunk['non_home_dwell_time'] = chunk['median_non_home_dwell_time'] * chunk['device_count']
    chunk.drop(columns=['median_home_dwell_time', 'median_non_home_dwell_time'], inplace=True)

    # Merge the chunk into the running result. DataFrame.append was removed in
    # pandas 2.0, so pd.concat is used; the still-empty accumulator is skipped
    # so its object-dtype columns cannot affect the resulting dtypes.
    if aggregatedData.empty:
        combined = chunk
    else:
        combined = pd.concat([aggregatedData, chunk], ignore_index=True)

    # groupby and aggregate data of the same county of the same date; doing this
    # after every chunk also merges counties split across chunk boundaries.
    aggregatedData = (
        combined
        .groupby(group_keys)
        .agg(**{column: (column, 'sum') for column in summed_columns})
        .reset_index()
    )

    print(counter) # track progress
    print(aggregatedData.shape[0]) # how large is our data right now

0
3140
1
3219


  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


2
6330
3
6444
4
9476
5
9657
6
12545
7
12871
8
15411
9
16078


In [105]:
aggregatedData.head()

Unnamed: 0,date_range_start,state,state_code,cnamelong,county_code,device_count,part_time_work_behavior_devices,full_time_work_behavior_devices,home_dwell_time,non_home_dwell_time
0,2019-01-01,1.0,AL,Autauga County,1001.0,4708.0,296.0,66.0,4019193.0,162117.0
1,2019-01-01,1.0,AL,Baldwin County,1003.0,19655.0,1125.0,340.0,14443397.0,684136.0
2,2019-01-01,1.0,AL,Barbour County,1005.0,1570.0,84.0,27.0,1009240.0,66210.0
3,2019-01-01,1.0,AL,Bibb County,1007.0,1702.0,102.0,21.0,1450137.0,87795.0
4,2019-01-01,1.0,AL,Blount County,1009.0,5224.0,315.0,84.0,4620005.0,261769.0


Now the home_dwell_time of the aggregated data from the chunked process is equal to the one we get from aggregating the first 1 million rows in a single pass: for county 1001 on 2019-01-01, home_dwell_time = 4019193 in both.

In [106]:
# Empty the accumulator once more before processing the complete file.
aggregatedData = pd.DataFrame(columns=groupByData.columns)

In [108]:
# Corrected chunked pipeline over the complete CSV, read 1,000,000 rows at a
# time. After each chunk, aggregatedData holds one row per (date, county) seen
# so far. Expects `aggregatedData` to be an empty accumulator frame.
group_keys = ['date_range_start', 'state', 'state_code', 'cnamelong', 'county_code']
numeric_columns = ['device_count', 'part_time_work_behavior_devices',
                   'full_time_work_behavior_devices', 'median_home_dwell_time',
                   'median_non_home_dwell_time', 'county_code', 'state']
summed_columns = ['device_count', 'part_time_work_behavior_devices',
                  'full_time_work_behavior_devices', 'home_dwell_time',
                  'non_home_dwell_time']

chunk_reader = pd.read_csv("social_dist_all_trimmed_new.csv", chunksize = 1000000)
for counter, chunk in enumerate(chunk_reader):
    # drop the columns we don't need.
    chunk.drop(columns=['Unnamed: 0', 'date_range_end', 'origin_census_block_group',
                        'candidate_device_count', 'completely_home_device_count',
                        'delivery_behavior_devices', 'median_percentage_time_home',
                        'distance_traveled_from_home'], inplace=True)

    # convert chunk's columns to the right datatype; unparseable values become NaN.
    for column in numeric_columns:
        chunk[column] = pd.to_numeric(chunk[column], errors='coerce')
    # drop the null rows (get rid of the row that is header)
    # also gets rid of 2 counties whose FIPS code is null.
    chunk.dropna(inplace=True)

    # Keep only the calendar date (first 10 characters) of each timestamp.
    # Vectorised parsing replaces the per-row datetime.strptime call.
    chunk['date_range_start'] = pd.to_datetime(chunk['date_range_start'].str[:10],
                                               format='%Y-%m-%d')

    # Device-weighted totals. Both factors must come from `chunk`: using another
    # frame here silently misaligns the rows on their index labels.
    chunk['home_dwell_time'] = chunk['median_home_dwell_time'] * chunk['device_count']
    chunk['non_home_dwell_time'] = chunk['median_non_home_dwell_time'] * chunk['device_count']
    chunk.drop(columns=['median_home_dwell_time', 'median_non_home_dwell_time'], inplace=True)

    # Merge the chunk into the running result. DataFrame.append was removed in
    # pandas 2.0, so pd.concat is used; the still-empty accumulator is skipped
    # so its object-dtype columns cannot affect the resulting dtypes.
    if aggregatedData.empty:
        combined = chunk
    else:
        combined = pd.concat([aggregatedData, chunk], ignore_index=True)

    # groupby and aggregate data of the same county of the same date; doing this
    # after every chunk also merges counties split across chunk boundaries and
    # keeps the accumulated frame small.
    aggregatedData = (
        combined
        .groupby(group_keys)
        .agg(**{column: (column, 'sum') for column in summed_columns})
        .reset_index()
    )

    print(counter) # track progress
    print(aggregatedData.shape[0]) # how large is our data right now

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


0
16078
1
31642
2
45136
3
61012
4
74182
5
90190
6
103226
7
119274
8
132264
9
148347
10
163825
11
177399
12
193246
13
206447
14
222430
15
235492
16
251533
17
264531
18
280606
19
295997
20
309664
21
325489
22
338709
23
354688
24
367752
25
383789
26
396787
27
412857
28
428225
29
441896
30
457720
31
470943
32
486931
33
499984
34
516029
35
529020
36
545091
37
560551
38
574151
39
590011
40
603191
41
619198
42
632236
43
648282
44
661988
45
677344
46
692973
47
706397
48
722305
49
735441
50
751465
51
764477
52
780531
53
795342
54
809585
55
825311
56
838642
57
854583
58
867690
59
883726
60
896726
61
912784
62
928123
63
941832
64
957699
65
970862
66
986867
67
999887
68
1015942
69
1031267
70
1044986
71
1060864
72
1074027
73
1090036
74
1103047
75
1119100
76
1134233
77
1148166
78
1163993
79
1177214
80
1193181
81
1206260
82
1222307
83
1235299
84
1251375
85
1266898
86
1280432
87
1296308
88
1309465
89
1325449
90
1338475
91
1354518
92
1369029
93
1383565
94
1399231
95
1412621
96
1428537
97
1441670
98
145

It is good I caught my error early on. If we continued with this wrong data, we could've gotten wrong results without knowing it. 

In [110]:
aggregatedData.head(1)

Unnamed: 0,date_range_start,state,state_code,cnamelong,county_code,device_count,part_time_work_behavior_devices,full_time_work_behavior_devices,home_dwell_time,non_home_dwell_time
0,2019-01-01,1.0,AL,Autauga County,1001.0,4708.0,296.0,66.0,4019193.0,162117.0


This matches our result from aggregating first million rows. Let's save this file.

In [112]:
# Persist the county-by-day aggregate; the row index is not written to the file.
aggregatedData.to_csv('aggregated_data.csv', index=False)