# Clean Los Angeles 2020 Social Distancing Dataset

**Instructions:**

1. Mount Google Drive to access the data by following the **Steps to Mount the Drive** below.
2. Provide the drive file path to the data, city name, and year in the fourth code cell. 

**Steps to Mount the Drive:**

1. Execute the third code cell (the one that calls `drive.mount`).
2. The cell prints a link for authorizing the Google account for Drive access. Open that link.
3. After you authorize the Google account, an authorization code is generated. Copy that code.
4. Go back to the cell where the drive is being mounted and paste the code from step 3 into the text box in the cell.




In [None]:
pip install geopandas

In [2]:
import os
import geopandas
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm, trange

import altair as alt
from vega_datasets import data

# # Allow altair to visualize large datasets
# alt.data_transformers.disable_max_rows()

In [3]:
# Mount Google Drive at /content/drive; the call prompts for an authorization code.
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.activity.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fexperimentsandconfigs%20https%3a%2f%2fwww.googleapis.com%2fauth%2fphotos.native&response_type=code

Enter your authorization code:
<authorization code redacted>
Mounted at /content/drive


In [4]:
# Notebook configuration: display name of the city, its data sub-directory, and the year.
city = 'Los Angeles'
city_directory = 'la'
year = '2020'

# Root of the cleaned data and the list of directories whose CSV files are read.
# NOTE(review): the root is a relative path, so it resolves against the current
# working directory - presumably /content, where the drive is mounted; confirm.
path = Path('drive/MyDrive/big-data-project/data/clean-data')
path_list = [path.joinpath(city_directory, 'social', year)]

In [6]:
#Appending into dataframe

def get_df(path_list):
    """Load every CSV file found in the given directories into one DataFrame.

    Args:
        path_list: Iterable of directories (``pathlib.Path`` or ``str``). Each
            directory is scanned non-recursively for file names ending in
            ``.csv``.

    Returns:
        A ``pandas.DataFrame`` holding the rows of all files, re-indexed from 0
        and then sorted by ``date_range_start``. The ``cbg`` column is read
        with ``object`` dtype, i.e. kept as text instead of being parsed as a
        number.

    Raises:
        FileNotFoundError: If no CSV file is found in any of the directories.
    """
    frames = []
    for directory in tqdm(path_list, desc='Years'):
        directory = Path(directory)
        # Sort the names so files are read in a deterministic order.
        csv_files = [directory / name
                     for name in sorted(os.listdir(directory))
                     if name.endswith('.csv')]
        for csv_path in tqdm(csv_files, desc='Days'):
            frames.append(pd.read_csv(csv_path, dtype={'cbg': object}))

    if not frames:
        raise FileNotFoundError('No CSV files found in the given directories.')

    # Concatenate once instead of appending inside the loop: DataFrame.append
    # is gone from current pandas and re-copies the growing frame on every call.
    return pd.concat(frames, ignore_index=True).sort_values(by=['date_range_start'])

In [None]:
%%time

# Read all CSV files under the configured directories into `df`; %%time reports how long the load takes.
df = get_df(path_list)

In [None]:
# Display the loaded frame for a visual check.
df

In [None]:
# Keep only the columns needed for the analysis.
# A wider selection is kept here for reference (currently disabled); besides the
# columns below it also listed:
#   delivery_behavior_devices, median_percentage_time_home,
#   median_home_dwell_time, mean_home_dwell_time,
#   median_non_home_dwell_time, mean_non_home_dwell_time,
#   distance_traveled_from_home (median distance traveled),
#   mean_distance_traveled_from_home
col = [
    'date_range_start',
    'cbg',
    'device_count',  # total active devices
    'completely_home_device_count',
    'part_time_work_behavior_devices',
    'full_time_work_behavior_devices',
]
devices_mobility = df[col]

In [None]:
# Replace the raw column names with the shorter labels used by the following cells.
devices_mobility = devices_mobility.rename(
    columns={
        'date_range_start': 'date',
        'device_count': 'total',
        'completely_home_device_count': 'completely_home',
        'part_time_work_behavior_devices': 'part_time_work',
        'full_time_work_behavior_devices': 'full_time_work',
    }
)
devices_mobility

In [None]:
# devices_mobility

In [None]:
# Device-count columns that the following cells turn into percentages.
device_columns = [
    'completely_home',
    'part_time_work',
    'full_time_work',
]

In [None]:
# Percentage of each device behaviour relative to the total device count.
# Computed column-wise instead of with a row-by-row apply, which calls a Python
# lambda once per row. Rows whose total is 0 get 0.0 instead of the division result.
for column in device_columns:
    devices_mobility['percentage_' + column] = (
        devices_mobility[column] / devices_mobility['total'] * 100.0
    ).where(devices_mobility['total'] != 0.0, 0.0)
devices_mobility

In [None]:
# Standardise each percentage column: subtract its mean, then divide by its standard deviation.
for column in device_columns:
    percentage = devices_mobility['percentage_' + column]
    devices_mobility['norm_' + column] = (percentage - percentage.mean()) / percentage.std()

devices_mobility

In [None]:
# Convert the `date` column to timezone-aware UTC timestamps.
devices_mobility = devices_mobility.assign(
    date=pd.to_datetime(devices_mobility['date'], utc=True)
)
devices_mobility

In [None]:
# Reset the time-of-day of every timestamp to midnight so rows of the same day share one value.
devices_mobility = devices_mobility.assign(
    date=devices_mobility['date'].dt.normalize()
)
devices_mobility

In [None]:
# Check the number of rows (non-null dates) for each CBG.
# The per-CBG count is computed once, on the `date` column only, and reused for
# both extremes instead of running the full-frame groupby twice.
rows_per_cbg = devices_mobility.groupby('cbg')['date'].count()
print('Min: {}\nMax: {}'.format(rows_per_cbg.min(), rows_per_cbg.max()))

In [None]:
# Print the smallest and largest value of every column as a quick range check.
for col in devices_mobility.columns:
    values = devices_mobility[col]
    print('\n')
    print(col)
    print('Min: {}\nMax: {}'.format(values.min(), values.max()))

In [None]:
# Count the rows that contain at least one null/NaN value.
# The boolean row mask is summed directly instead of building a filtered copy
# of the frame just to take its length, and the message now says what is counted.
rows_with_nulls = devices_mobility.isna().any(axis=1).sum()
print('We have {} rows with null values.'.format(rows_with_nulls))

In [None]:
# Group by day and average every numeric column over all CBGs.
# numeric_only=True keeps the text `cbg` column out of the mean; without it,
# recent pandas versions raise a TypeError on non-numeric columns.
grouped_dm = devices_mobility.groupby('date').mean(numeric_only=True)
grouped_dm

In [None]:
# Show the grouped frame with the group key moved from the index back into a regular column.
grouped_dm.reset_index()

In [None]:
# Visualising mobility: one line chart per percentage column, stacked as rows,
# with the date on the x-axis.
title = '{} {}'.format(city,year)
alt.Chart(grouped_dm.reset_index()).mark_line().encode(
    x=alt.X('date', title='Date'),
    y=alt.Y(alt.repeat('row'), type='quantitative')
).properties(
    width=300,
    height=250
).repeat(
    row=['percentage_completely_home','percentage_part_time_work','percentage_full_time_work']
).properties(
    title=title  # the title was computed but never attached to the chart
).interactive()

![la-daily-2020](https://github.com/chouhandiksha/bigdataproject/raw/main/media/social-dist/la-daily-2020.png)

In [None]:
# Derive the calendar month from `date`; the monthly aggregation groups on it.
devices_mobility = devices_mobility.assign(
    month=devices_mobility['date'].dt.month
)
devices_mobility

In [None]:
# Group by month and average every numeric column.
# numeric_only=True leaves the text `cbg` column (and the `date` timestamps)
# out of the mean; without it, recent pandas versions raise a TypeError on
# non-numeric columns.
grouped_dm = devices_mobility.groupby('month').mean(numeric_only=True)
grouped_dm

In [None]:
# Reshape the monthly means into long-form records (one per metric and month)
# so the chart can colour lines by metric.
# Months come from the grouped index instead of a fixed 1..12 range, so data
# covering only part of a year no longer raises a KeyError.
records = [
    {
        'month': int(month),
        'column': column_name,
        'value': float(grouped_dm.loc[month, column_name]),
    }
    for column_name in ['percentage_completely_home','percentage_part_time_work','percentage_full_time_work']
    for month in grouped_dm.index
]
d = alt.Data(values=records)

In [None]:
# Monthly trend per mobility type: month on the x-axis, percentage on the y-axis,
# one coloured line per percentage column.
title = '{} {}'.format(city,year)
month_axis = alt.X('month:O', title='Month')
value_axis = alt.Y('value:Q', type='quantitative', title='Percentage')
monthly_chart = alt.Chart(d, title=title).mark_line().encode(
    x=month_axis,
    y=value_axis,
    color='column:N'
).properties(
    width=300,
    height=250
)
monthly_chart.interactive()

![la-monthly-2020](https://github.com/chouhandiksha/bigdataproject/raw/main/media/social-dist/la-monthly-2020.png)