<a href="https://colab.research.google.com/github/chouhandiksha/bigdataproject/blob/colab/notebooks/Clean%20NY%202020%20Social%20Distancing%20Dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Clean New York 2020 Social Distancing Dataset

**Instructions:**

1. Mount Google Drive to access the data by following the **Steps to Mount the Drive** below.
2. In the fourth code cell, provide the Drive path to the data, the city name, and the year.

**Steps to Mount the Drive:**

1. Execute the second code cell.
2. The cell output shows a link for authorizing your Google account to access Drive. Open that link.
3. An authorization code will be generated. Copy it.
4. Return to the cell where the mount is running, paste the code from step 3 into the text box, and press Enter.

In [None]:
pip install geopandas

In [None]:
# Mount Google Drive at /content/drive; the call asks for account authorization
# (see "Steps to Mount the Drive" at the top of the notebook).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
#import libraries

import os
import geopandas
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm, trange

import altair as alt
from vega_datasets import data

# # Allow altair to visualize large datasets
# alt.data_transformers.disable_max_rows()

In [None]:
# Dataset selection: display name of the city, its directory name, and the year.
city = 'New York'
city_directory = 'ny'
year = '2020'

# Root of the cleaned data on the mounted Drive. The path is anchored at the
# mount point ('/content/drive') so it resolves regardless of the current
# working directory.
path = Path('/content/drive/MyDrive/big-data-project/data/clean-data/')
# Directories whose CSV files will be read (one entry per year to load).
path_list = [path/city_directory/'social'/year]

In [None]:
#Appending into dataframe

def get_df(path_list):
    """Read every CSV file in the given directories into one DataFrame.

    Parameters
    ----------
    path_list : iterable of pathlib.Path
        Directories to scan. Only entries whose name ends in '.csv' are read.

    Returns
    -------
    pandas.DataFrame
        All rows from all files, sorted by 'date_range_start'. The 'cbg'
        column is read with dtype object, i.e. kept as text rather than
        parsed as a number.

    Raises
    ------
    FileNotFoundError
        If a directory does not exist, or if no CSV file is found at all.
    """
    frames = []
    for directory in tqdm(path_list, desc='Years'):
        # os.listdir returns entries in arbitrary order; sort for a reproducible read order.
        csv_files = sorted(directory/f for f in os.listdir(directory) if f.endswith('.csv'))
        for csv in tqdm(csv_files, desc='Days'):
            frames.append(pd.read_csv(csv, dtype={'cbg': object}))
    if not frames:
        raise FileNotFoundError('No CSV files found in: {}'.format([str(p) for p in path_list]))
    # A single concat avoids re-copying the accumulated frame for every file
    # (and DataFrame.append no longer exists in pandas >= 2.0).
    return pd.concat(frames, ignore_index=True).sort_values(by=['date_range_start'])

In [None]:
%%time

# Load all CSV files found under path_list into a single DataFrame; %%time reports how long the cell took.
df = get_df(path_list)

In [None]:
# Preview the combined frame (the rendered output is truncated to the first and last rows).
df

Unnamed: 0.1,Unnamed: 0,cbg,date_range_start,date_range_end,device_count,distance_traveled_from_home,bucketed_distance_traveled,median_dwell_at_bucketed_distance_traveled,completely_home_device_count,median_home_dwell_time,bucketed_home_dwell_time,at_home_by_each_hour,part_time_work_behavior_devices,full_time_work_behavior_devices,destination_cbgs,delivery_behavior_devices,median_non_home_dwell_time,candidate_device_count,bucketed_away_from_home_time,median_percentage_time_home,bucketed_percentage_time_home,mean_home_dwell_time,mean_non_home_dwell_time,mean_distance_traveled_from_home
1847850,59305,360810361001,2020-01-01T00:00:00-05:00,2020-01-02T00:00:00-05:00,105,3069.0,"{""16001-50000"":2,""0"":36,"">50000"":3,""2001-8000""...","{""16001-50000"":97,"">50000"":47,""<1000"":145,""200...",35,384,"{""721-1080"":3,""361-720"":10,""61-360"":10,""<60"":3...","[30,31,40,42,48,48,40,48,51,47,45,45,47,44,39,...",2,1,"{""360810329002"":1,""360810427002"":1,""3608701270...",1,65,183,"{""21-45"":3,""481-540"":2,""46-60"":1,""1201-1320"":3...",84,"{""26-50"":1,""51-75"":7,""0-25"":33,"">100"":1,""76-10...",,,
1850452,148260,360050227021,2020-01-01T00:00:00-05:00,2020-01-02T00:00:00-05:00,74,6196.0,"{""16001-50000"":8,""0"":26,"">50000"":4,""2001-8000""...","{""16001-50000"":49,"">50000"":489,""<1000"":126,""20...",29,758,"{""721-1080"":11,""361-720"":4,""61-360"":14,""<60"":2...","[35,34,39,34,36,40,40,42,40,40,40,40,40,38,35,...",1,1,"{""340130041002"":1,""360610139003"":1,""3600502090...",1,18,165,"{""21-45"":1,""481-540"":1,""541-600"":2,""46-60"":2,""...",94,"{""26-50"":1,""51-75"":11,""0-25"":16,"">100"":1,""76-1...",,,
1850451,148066,360811010012,2020-01-01T00:00:00-05:00,2020-01-02T00:00:00-05:00,201,2018.0,"{""16001-50000"":26,""0"":86,"">50000"":6,""2001-8000...","{""16001-50000"":51,"">50000"":276,""<1000"":39,""200...",87,140,"{""721-1080"":30,""361-720"":22,""61-360"":27,""<60"":...","[67,66,63,67,68,68,75,82,73,74,76,70,69,73,67,...",2,3,"{""360810934020"":1,""360810062025"":1,""3608100860...",1,13,533,"{""21-45"":20,""481-540"":2,""541-600"":9,""46-60"":6,...",95,"{""26-50"":8,""51-75"":8,""0-25"":64,"">100"":3,""76-10...",,,
1850450,148065,360810667011,2020-01-01T00:00:00-05:00,2020-01-02T00:00:00-05:00,81,2498.0,"{""16001-50000"":5,""0"":22,"">50000"":2,""2001-8000""...","{""16001-50000"":40,"">50000"":245,""<1000"":93,""200...",24,1011,"{""721-1080"":16,""361-720"":11,""61-360"":9,""<60"":1...","[31,33,41,42,41,42,47,50,51,53,52,51,54,50,50,...",2,1,"{""360810208001"":1,""360810515001"":1,""3608105210...",1,65,143,"{""21-45"":5,""721-840"":1,""301-360"":1,""<20"":33,""6...",92,"{""26-50"":2,""51-75"":4,""0-25"":10,"">100"":3,""76-10...",,,
1850449,148061,360610263004,2020-01-01T00:00:00-05:00,2020-01-02T00:00:00-05:00,61,4793.0,"{""16001-50000"":1,""0"":23,"">50000"":1,""2001-8000""...","{""16001-50000"":777,"">50000"":314,""<1000"":202,""2...",21,635,"{""721-1080"":5,""361-720"":7,""61-360"":5,""<60"":17,...","[26,25,27,27,29,29,28,28,26,29,34,31,33,30,31,...",1,1,"{""360610265002"":2,""120950147041"":1,""3600502470...",1,34,142,"{""21-45"":6,""481-540"":1,""721-840"":3,""301-360"":2...",89,"{""26-50"":3,""51-75"":2,""0-25"":17,"">100"":7,""76-10...",,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2211312,76456,360810457001,2020-12-31T00:00:00-05:00,2021-01-01T00:00:00-05:00,61,2637.0,"{""16001-50000"":4,""0"":24,"">50000"":4,""2001-8000""...","{""16001-50000"":53,"">50000"":24,""<1000"":170,""200...",24,826,"{""721-1080"":21,""361-720"":6,""61-360"":4,""<60"":8,...","[48,48,47,47,48,50,39,44,40,38,35,31,33,25,26,...",6,3,"{""360810992002"":1,""360810086001"":1,""3608107130...",2,42,116,"{""21-45"":5,""481-540"":2,""541-600"":1,""721-840"":1...",94,"{""0-25"":6,""76-100"":40,""51-75"":11,""26-50"":3}",863.0,181.0,7071.0
2211311,76454,360471014001,2020-12-31T00:00:00-05:00,2021-01-01T00:00:00-05:00,38,330.0,"{""0"":16,"">50000"":1,""2001-8000"":4,""1-1000"":8,""1...","{"">50000"":58,""1001-2000"":438,""2001-8000"":21,""<...",17,974,"{""721-1080"":6,""361-720"":1,""61-360"":5,""<60"":8,""...","[21,21,19,21,21,24,24,23,21,23,22,17,23,24,22,...",1,1,"{""360810432001"":1,""360470994001"":1,""3604711820...",1,10,75,"{""21-45"":1,""721-840"":1,""301-360"":1,""<20"":20,""6...",99,"{""0-25"":5,""76-100"":28,""51-75"":1,""26-50"":1}",804.0,194.0,1415.0
2211310,76453,360470352001,2020-12-31T00:00:00-05:00,2021-01-01T00:00:00-05:00,36,778.0,"{""16001-50000"":5,""0"":14,"">50000"":1,""2001-8000""...","{""16001-50000"":120,"">50000"":22,""<1000"":101,""20...",14,425,"{""721-1080"":9,""361-720"":1,""61-360"":3,""<60"":8,""...","[16,18,14,14,14,12,14,14,16,14,14,21,17,19,15,...",1,1,"{""360470362003"":1,""120110901011"":1,""1208600420...",1,16,81,"{""21-45"":1,""541-600"":1,""46-60"":1,""721-840"":3,""...",97,"{""0-25"":6,""76-100"":22,""51-75"":6,""26-50"":1}",572.0,153.0,3272.0
2211308,76236,360810972021,2020-12-31T00:00:00-05:00,2021-01-01T00:00:00-05:00,114,1947.0,"{""16001-50000"":5,""0"":57,"">50000"":4,""2001-8000""...","{""16001-50000"":102,"">50000"":409,""<1000"":169,""2...",55,712,"{""721-1080"":15,""361-720"":21,""61-360"":16,""<60"":...","[64,68,67,69,70,67,71,68,68,64,62,66,55,54,49,...",1,1,"{""360810208001"":1,""360810992002"":1,""3605930180...",1,4,225,"{""21-45"":5,""541-600"":5,""46-60"":1,""721-840"":1,""...",100,"{""0-25"":16,""76-100"":77,""51-75"":14,""26-50"":7}",736.0,181.0,3250.0


In [None]:
# Selecting the columns that we need for the analysis.
# Wider candidate list, kept commented out for reference (not used below):
# col = ['date_range_start',
#        'cbg',
#        'device_count',  # Total active devices
#        'completely_home_device_count',
#        'part_time_work_behavior_devices',
#        'full_time_work_behavior_devices',
#        'delivery_behavior_devices',
#        'median_percentage_time_home',
#        'median_home_dwell_time',
#        'mean_home_dwell_time',
#        'median_non_home_dwell_time',
#        'mean_non_home_dwell_time',
#        'distance_traveled_from_home', # Median Distance traveled
#        'mean_distance_traveled_from_home']
# Columns actually used: the date, the cbg code and the device-count columns.
col = ['date_range_start',
       'cbg',
       'device_count',  # Total active devices
       'completely_home_device_count',
       'part_time_work_behavior_devices',
       'full_time_work_behavior_devices']
devices_mobility = df[col]

In [None]:
# Shorten the raw column names to the labels used in the rest of the notebook.
devices_mobility = devices_mobility.rename(
    columns={
        'date_range_start': 'date',
        'device_count': 'total',
        'completely_home_device_count': 'completely_home',
        'part_time_work_behavior_devices': 'part_time_work',
        'full_time_work_behavior_devices': 'full_time_work',
    }
)
devices_mobility

Unnamed: 0,date,cbg,total,completely_home,part_time_work,full_time_work
1847850,2020-01-01T00:00:00-05:00,360810361001,105,35,2,1
1850452,2020-01-01T00:00:00-05:00,360050227021,74,29,1,1
1850451,2020-01-01T00:00:00-05:00,360811010012,201,87,2,3
1850450,2020-01-01T00:00:00-05:00,360810667011,81,24,2,1
1850449,2020-01-01T00:00:00-05:00,360610263004,61,21,1,1
...,...,...,...,...,...,...
2211312,2020-12-31T00:00:00-05:00,360810457001,61,24,6,3
2211311,2020-12-31T00:00:00-05:00,360471014001,38,17,1,1
2211310,2020-12-31T00:00:00-05:00,360470352001,36,14,1,1
2211308,2020-12-31T00:00:00-05:00,360810972021,114,55,1,1


In [None]:
# devices_mobility

In [None]:
# Device-count columns that are turned into percentages and normalised further down.
device_columns = [
    'completely_home',
    'part_time_work',
    'full_time_work',
]

In [None]:
# Calculating percentage for all the device columns.
# Vectorised: one column-wise division per device column instead of a Python
# function call for every row through DataFrame.apply(axis=1).
total_devices = devices_mobility['total']
for column in device_columns:
    share = devices_mobility[column] / total_devices * 100.0
    # Rows with a total of 0 get 0.0 instead of the inf/NaN a division by zero produces.
    devices_mobility['percentage_'+column] = share.where(total_devices != 0, 0.0)
devices_mobility

Unnamed: 0,date,cbg,total,completely_home,part_time_work,full_time_work,percentage_completely_home,percentage_part_time_work,percentage_full_time_work
1847850,2020-01-01T00:00:00-05:00,360810361001,105,35,2,1,33.333333,1.904762,0.952381
1850452,2020-01-01T00:00:00-05:00,360050227021,74,29,1,1,39.189189,1.351351,1.351351
1850451,2020-01-01T00:00:00-05:00,360811010012,201,87,2,3,43.283582,0.995025,1.492537
1850450,2020-01-01T00:00:00-05:00,360810667011,81,24,2,1,29.629630,2.469136,1.234568
1850449,2020-01-01T00:00:00-05:00,360610263004,61,21,1,1,34.426230,1.639344,1.639344
...,...,...,...,...,...,...,...,...,...
2211312,2020-12-31T00:00:00-05:00,360810457001,61,24,6,3,39.344262,9.836066,4.918033
2211311,2020-12-31T00:00:00-05:00,360471014001,38,17,1,1,44.736842,2.631579,2.631579
2211310,2020-12-31T00:00:00-05:00,360470352001,36,14,1,1,38.888889,2.777778,2.777778
2211308,2020-12-31T00:00:00-05:00,360810972021,114,55,1,1,48.245614,0.877193,0.877193


In [None]:
# Standardise each percentage column: subtract the column mean and divide by
# the column standard deviation.
for column in device_columns:
    percentage = devices_mobility['percentage_' + column]
    devices_mobility['norm_' + column] = (percentage - percentage.mean()) / percentage.std()

devices_mobility

Unnamed: 0,date,cbg,total,completely_home,part_time_work,full_time_work,percentage_completely_home,percentage_part_time_work,percentage_full_time_work,norm_completely_home,norm_part_time_work,norm_full_time_work
1847850,2020-01-01T00:00:00-05:00,360810361001,105,35,2,1,33.333333,1.904762,0.952381,-0.333820,-0.903751,-0.982197
1850452,2020-01-01T00:00:00-05:00,360050227021,74,29,1,1,39.189189,1.351351,1.351351,0.078979,-1.018687,-0.889712
1850451,2020-01-01T00:00:00-05:00,360811010012,201,87,2,3,43.283582,0.995025,1.492537,0.367607,-1.092691,-0.856984
1850450,2020-01-01T00:00:00-05:00,360810667011,81,24,2,1,29.629630,2.469136,1.234568,-0.594907,-0.786538,-0.916783
1850449,2020-01-01T00:00:00-05:00,360610263004,61,21,1,1,34.426230,1.639344,1.639344,-0.256778,-0.958875,-0.822952
...,...,...,...,...,...,...,...,...,...,...,...,...
2211312,2020-12-31T00:00:00-05:00,360810457001,61,24,6,3,39.344262,9.836066,4.918033,0.089911,0.743473,-0.062923
2211311,2020-12-31T00:00:00-05:00,360471014001,38,17,1,1,44.736842,2.631579,2.631579,0.470053,-0.752801,-0.592943
2211310,2020-12-31T00:00:00-05:00,360470352001,36,14,1,1,38.888889,2.777778,2.777778,0.057810,-0.722437,-0.559053
2211308,2020-12-31T00:00:00-05:00,360810972021,114,55,1,1,48.245614,0.877193,0.877193,0.717398,-1.117163,-0.999626


In [None]:
# Conversion to datetime format.
# utc=True converts every timestamp to UTC, so a raw value such as
# 2020-01-01T00:00:00-05:00 becomes 2020-01-01 05:00:00+00:00.
devices_mobility['date'] = pd.to_datetime(devices_mobility['date'],utc= True)
devices_mobility

Unnamed: 0,date,cbg,total,completely_home,part_time_work,full_time_work,percentage_completely_home,percentage_part_time_work,percentage_full_time_work,norm_completely_home,norm_part_time_work,norm_full_time_work
1847850,2020-01-01 05:00:00+00:00,360810361001,105,35,2,1,33.333333,1.904762,0.952381,-0.333820,-0.903751,-0.982197
1850452,2020-01-01 05:00:00+00:00,360050227021,74,29,1,1,39.189189,1.351351,1.351351,0.078979,-1.018687,-0.889712
1850451,2020-01-01 05:00:00+00:00,360811010012,201,87,2,3,43.283582,0.995025,1.492537,0.367607,-1.092691,-0.856984
1850450,2020-01-01 05:00:00+00:00,360810667011,81,24,2,1,29.629630,2.469136,1.234568,-0.594907,-0.786538,-0.916783
1850449,2020-01-01 05:00:00+00:00,360610263004,61,21,1,1,34.426230,1.639344,1.639344,-0.256778,-0.958875,-0.822952
...,...,...,...,...,...,...,...,...,...,...,...,...
2211312,2020-12-31 05:00:00+00:00,360810457001,61,24,6,3,39.344262,9.836066,4.918033,0.089911,0.743473,-0.062923
2211311,2020-12-31 05:00:00+00:00,360471014001,38,17,1,1,44.736842,2.631579,2.631579,0.470053,-0.752801,-0.592943
2211310,2020-12-31 05:00:00+00:00,360470352001,36,14,1,1,38.888889,2.777778,2.777778,0.057810,-0.722437,-0.559053
2211308,2020-12-31 05:00:00+00:00,360810972021,114,55,1,1,48.245614,0.877193,0.877193,0.717398,-1.117163,-0.999626


In [None]:
# Normalize Time: reset the time-of-day to 00:00 so each row carries only its
# calendar day (the column stays timezone-aware, in UTC).
devices_mobility['date'] = devices_mobility['date'].dt.normalize()
devices_mobility

Unnamed: 0,date,cbg,total,completely_home,part_time_work,full_time_work,percentage_completely_home,percentage_part_time_work,percentage_full_time_work,norm_completely_home,norm_part_time_work,norm_full_time_work
1847850,2020-01-01 00:00:00+00:00,360810361001,105,35,2,1,33.333333,1.904762,0.952381,-0.333820,-0.903751,-0.982197
1850452,2020-01-01 00:00:00+00:00,360050227021,74,29,1,1,39.189189,1.351351,1.351351,0.078979,-1.018687,-0.889712
1850451,2020-01-01 00:00:00+00:00,360811010012,201,87,2,3,43.283582,0.995025,1.492537,0.367607,-1.092691,-0.856984
1850450,2020-01-01 00:00:00+00:00,360810667011,81,24,2,1,29.629630,2.469136,1.234568,-0.594907,-0.786538,-0.916783
1850449,2020-01-01 00:00:00+00:00,360610263004,61,21,1,1,34.426230,1.639344,1.639344,-0.256778,-0.958875,-0.822952
...,...,...,...,...,...,...,...,...,...,...,...,...
2211312,2020-12-31 00:00:00+00:00,360810457001,61,24,6,3,39.344262,9.836066,4.918033,0.089911,0.743473,-0.062923
2211311,2020-12-31 00:00:00+00:00,360471014001,38,17,1,1,44.736842,2.631579,2.631579,0.470053,-0.752801,-0.592943
2211310,2020-12-31 00:00:00+00:00,360470352001,36,14,1,1,38.888889,2.777778,2.777778,0.057810,-0.722437,-0.559053
2211308,2020-12-31 00:00:00+00:00,360810972021,114,55,1,1,48.245614,0.877193,0.877193,0.717398,-1.117163,-0.999626


In [None]:
# Check the number of rows for each CBG.
# Group once and reuse the counts for both the minimum and the maximum.
rows_per_cbg = devices_mobility.groupby('cbg')['date'].count()
print('Min: {}\nMax: {}'.format(rows_per_cbg.min(), rows_per_cbg.max()))

Min: 1
Max: 366


In [None]:
# Print the value range of every column as a quick sanity check.
# The loop variable is named `column` so that the `col` list of selected
# column names defined earlier in the notebook is not overwritten.
for column in devices_mobility.columns:
    print('\n')
    print(column)
    print('Min: {}\nMax: {}'.format(devices_mobility[column].min(), devices_mobility[column].max()))



date
Min: 2020-01-01 00:00:00+00:00
Max: 2020-12-31 00:00:00+00:00


cbg
Min: 360050001001
Max: 360859901000


total
Min: 5
Max: 12047


completely_home
Min: 1
Max: 6564


part_time_work
Min: 1
Max: 1675


full_time_work
Min: 1
Max: 387


percentage_completely_home
Min: 0.4098360655737705
Max: 94.44444444444444


percentage_part_time_work
Min: 0.10559662090813093
Max: 66.66666666666666


percentage_full_time_work
Min: 0.05678591709256105
Max: 53.84615384615385


norm_completely_home
Min: -2.6547110942593712
Max: 3.9741129867470444


norm_part_time_work
Min: -1.2774130697364041
Max: 12.546417085216044


norm_full_time_work
Min: -1.1898037765546834
Max: 11.279059114325067


In [None]:
# Count the rows that contain at least one null/NaN value and report the number.
rows_with_nulls = devices_mobility.isna().any(axis=1)
print('We have {} null values.'.format(rows_with_nulls.sum()))

We have 0 null values.


In [None]:
# Grouping based on day: average every numeric column over all CBGs for each date.
# numeric_only=True keeps the text column `cbg` out of the mean; pandas >= 2.0
# raises a TypeError for it instead of silently dropping it.
grouped_dm = devices_mobility.groupby('date').mean(numeric_only=True)
grouped_dm

Unnamed: 0_level_0,total,completely_home,part_time_work,full_time_work,percentage_completely_home,percentage_part_time_work,percentage_full_time_work,norm_completely_home,norm_part_time_work,norm_full_time_work
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2020-01-01 00:00:00+00:00,63.947108,22.696511,2.041102,1.285487,34.331163,3.650978,2.582719,-0.263480,-0.541085,-0.604270
2020-01-02 00:00:00+00:00,62.679841,17.663586,5.937530,5.262151,27.482716,9.418901,8.447394,-0.746250,0.656834,0.755215
2020-01-03 00:00:00+00:00,62.834981,17.845982,6.197226,5.092793,27.976160,9.903114,8.142616,-0.711466,0.757398,0.684565
2020-01-04 00:00:00+00:00,62.050750,21.446537,2.992020,1.689275,33.942585,5.117607,3.271717,-0.290872,-0.236487,-0.444554
2020-01-05 00:00:00+00:00,62.661404,23.729665,2.929346,1.474482,37.064090,4.989433,2.923406,-0.070826,-0.263107,-0.525295
...,...,...,...,...,...,...,...,...,...,...
2020-12-27 00:00:00+00:00,47.963688,19.315974,1.951585,1.325530,37.910624,4.878801,3.900465,-0.011151,-0.286083,-0.298804
2020-12-28 00:00:00+00:00,47.990602,18.083466,2.728576,2.223320,35.553871,6.652046,5.857633,-0.177287,0.082196,0.154885
2020-12-29 00:00:00+00:00,48.470279,17.939283,2.529084,1.965737,34.662580,5.831772,4.999802,-0.240117,-0.088164,-0.043968
2020-12-30 00:00:00+00:00,48.406882,17.282141,2.510116,1.937072,33.070879,5.850389,4.963771,-0.352321,-0.084298,-0.052320


In [None]:
# Show the daily means with `date` as a regular column (display only: the
# result is not assigned, so grouped_dm itself keeps `date` as its index).
grouped_dm.reset_index()

Unnamed: 0,date,total,completely_home,part_time_work,full_time_work,percentage_completely_home,percentage_part_time_work,percentage_full_time_work,norm_completely_home,norm_part_time_work,norm_full_time_work
0,2020-01-01 00:00:00+00:00,63.947108,22.696511,2.041102,1.285487,34.331163,3.650978,2.582719,-0.263480,-0.541085,-0.604270
1,2020-01-02 00:00:00+00:00,62.679841,17.663586,5.937530,5.262151,27.482716,9.418901,8.447394,-0.746250,0.656834,0.755215
2,2020-01-03 00:00:00+00:00,62.834981,17.845982,6.197226,5.092793,27.976160,9.903114,8.142616,-0.711466,0.757398,0.684565
3,2020-01-04 00:00:00+00:00,62.050750,21.446537,2.992020,1.689275,33.942585,5.117607,3.271717,-0.290872,-0.236487,-0.444554
4,2020-01-05 00:00:00+00:00,62.661404,23.729665,2.929346,1.474482,37.064090,4.989433,2.923406,-0.070826,-0.263107,-0.525295
...,...,...,...,...,...,...,...,...,...,...,...
361,2020-12-27 00:00:00+00:00,47.963688,19.315974,1.951585,1.325530,37.910624,4.878801,3.900465,-0.011151,-0.286083,-0.298804
362,2020-12-28 00:00:00+00:00,47.990602,18.083466,2.728576,2.223320,35.553871,6.652046,5.857633,-0.177287,0.082196,0.154885
363,2020-12-29 00:00:00+00:00,48.470279,17.939283,2.529084,1.965737,34.662580,5.831772,4.999802,-0.240117,-0.088164,-0.043968
364,2020-12-30 00:00:00+00:00,48.406882,17.282141,2.510116,1.937072,33.070879,5.850389,4.963771,-0.352321,-0.084298,-0.052320


In [None]:
# Visualising mobility: one line chart per percentage column, stacked as rows,
# with the date on the x-axis.
title = '{} {}'.format(city,year)
alt.Chart(grouped_dm.reset_index()).mark_line().encode(
    x=alt.X('date', title='Date'),
    y=alt.Y(alt.repeat('row'), type='quantitative')
).properties(
    width=300,
    height=250
).repeat(
    row=['percentage_completely_home','percentage_part_time_work','percentage_full_time_work']
).properties(
    # Attach the "<city> <year>" title to the repeated chart as a whole so it is shown once.
    title=title
).interactive()

![ny-daily-2020](https://github.com/chouhandiksha/bigdataproject/raw/main/media/social-dist/ny-daily-2020.png)

In [None]:
# Adding month column for further usage: the month number (1-12) taken from
# the `date` column.
devices_mobility['month'] = devices_mobility['date'].dt.month
devices_mobility

Unnamed: 0,date,cbg,total,completely_home,part_time_work,full_time_work,percentage_completely_home,percentage_part_time_work,percentage_full_time_work,norm_completely_home,norm_part_time_work,norm_full_time_work,month
1847850,2020-01-01 00:00:00+00:00,360810361001,105,35,2,1,33.333333,1.904762,0.952381,-0.333820,-0.903751,-0.982197,1
1850452,2020-01-01 00:00:00+00:00,360050227021,74,29,1,1,39.189189,1.351351,1.351351,0.078979,-1.018687,-0.889712,1
1850451,2020-01-01 00:00:00+00:00,360811010012,201,87,2,3,43.283582,0.995025,1.492537,0.367607,-1.092691,-0.856984,1
1850450,2020-01-01 00:00:00+00:00,360810667011,81,24,2,1,29.629630,2.469136,1.234568,-0.594907,-0.786538,-0.916783,1
1850449,2020-01-01 00:00:00+00:00,360610263004,61,21,1,1,34.426230,1.639344,1.639344,-0.256778,-0.958875,-0.822952,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2211312,2020-12-31 00:00:00+00:00,360810457001,61,24,6,3,39.344262,9.836066,4.918033,0.089911,0.743473,-0.062923,12
2211311,2020-12-31 00:00:00+00:00,360471014001,38,17,1,1,44.736842,2.631579,2.631579,0.470053,-0.752801,-0.592943,12
2211310,2020-12-31 00:00:00+00:00,360470352001,36,14,1,1,38.888889,2.777778,2.777778,0.057810,-0.722437,-0.559053,12
2211308,2020-12-31 00:00:00+00:00,360810972021,114,55,1,1,48.245614,0.877193,0.877193,0.717398,-1.117163,-0.999626,12


In [None]:
# Grouping data on basis of month value: average every numeric column per month.
# numeric_only=True keeps the text column `cbg` out of the mean; pandas >= 2.0
# raises a TypeError for it instead of silently dropping it.
grouped_dm = devices_mobility.groupby('month').mean(numeric_only=True)
grouped_dm

Unnamed: 0_level_0,total,completely_home,part_time_work,full_time_work,percentage_completely_home,percentage_part_time_work,percentage_full_time_work,norm_completely_home,norm_part_time_work,norm_full_time_work
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,56.747442,15.731918,5.235401,4.372121,27.152249,9.372379,7.942775,-0.769546,0.647172,0.63824
2,60.324868,16.804988,5.29205,4.08575,27.244239,8.859695,6.98712,-0.763061,0.540694,0.41671
3,57.413595,21.522013,3.796671,3.05123,38.200698,6.793587,5.69395,0.009297,0.111592,0.116942
4,53.385405,29.499757,1.776196,1.560461,54.043534,4.047668,3.708108,1.126113,-0.458698,-0.343394
5,55.731906,28.235325,1.977632,1.620823,49.802733,4.288053,3.745494,0.827165,-0.408774,-0.334728
6,54.955332,23.980395,2.416256,1.845153,42.53301,5.061646,4.173453,0.314697,-0.248109,-0.235523
7,53.642778,21.457478,2.609792,1.925425,38.356485,5.569414,4.469662,0.020279,-0.142652,-0.166859
8,56.267639,22.633664,2.698432,1.951091,35.76923,5.844539,4.651017,-0.162105,-0.085513,-0.124819
9,52.261083,18.865013,2.88858,2.195664,35.339143,6.287344,5.160846,-0.192424,0.006452,-0.006637
10,52.358595,19.19382,2.971899,2.26846,35.500757,6.331513,5.223213,-0.181031,0.015625,0.007821


In [None]:
# Reshape the monthly means into long-form records (one per column and month) for Altair.
# NOTE: expects grouped_dm to be the month-indexed frame produced by the monthly groupby.
percentage_columns = ['percentage_completely_home','percentage_part_time_work','percentage_full_time_work']
# Iterate over the months actually present in the index instead of assuming
# 1..12 (a missing month would raise a KeyError), and cast to built-in
# int/float so the records hold plain JSON-serialisable values.
records = [
    {'month': int(month), 'column': name, 'value': float(grouped_dm.loc[month, name])}
    for name in percentage_columns
    for month in grouped_dm.index
]
d = alt.Data(values=records)

In [None]:
# Monthly view: one coloured line per percentage column, with the month on the
# x-axis and the mean percentage on the y-axis.
title = f'{city} {year}'
alt.Chart(d, title=title).mark_line().encode(
    x=alt.X('month:O', title='Month'),
    y=alt.Y('value:Q', title='Percentage'),
    color='column:N',
).properties(width=300, height=250).interactive()

![ny-monthly-2020](https://github.com/chouhandiksha/bigdataproject/raw/main/media/social-dist/ny-monthly-2020.png)