<a href="https://colab.research.google.com/github/chouhandiksha/bigdataproject/blob/main/notebooks/Extract%20Chicago%20from%20Social%20Distancing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Extract Chicago Social Distancing Data

**Instructions:**

1. Mount Google Drive to access the data by following the **Steps to Mount the Drive** below.
2. In the third code cell, provide the Drive paths to the 2020 and 2019 source data, the 2020 and 2019 target directories, and the 5-digit combined (state + county) FIPS codes of the counties to extract.

**Steps to Mount the Drive:**

1. Execute the first code cell (the one that calls `drive.mount`).
2. A link will appear for authorizing your Google account to access Drive. Open that link.
3. An authorization code will be generated. Copy it.
4. Return to the cell that is mounting the drive, paste the code from step 3 into its text box, and press Enter.

In [2]:
# Mount Google Drive at /content/drive so the notebook can read and write the project data.
# The first run asks for an authorization code (see "Steps to Mount the Drive" above).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import pandas as pd

import os
from tqdm.notebook import tqdm, trange

In [4]:
# Chicago counties, given as 5-digit combined FIPS codes
# (the list spans the state prefixes 17, 18 and 55).
fips = [
    '17031', '17037', '17043', '17063', '17091',
    '17089', '17093', '17111', '17197', '17097',
    '18073', '18089', '18111', '18127',
    '55059',
]

# Source trees holding the unprocessed social-distancing files, one per year.
path_2019 = 'drive/MyDrive/big-data-project/data/unprocessed/social-distancing/2019/'
path_2020 = 'drive/MyDrive/big-data-project/data/unprocessed/social-distancing/2020/'

# Directories that receive the filtered CSV files, one per year.
target_2019 = 'drive/MyDrive/big-data-project/data/clean-data/ch/social/2019/'
target_2020 = 'drive/MyDrive/big-data-project/data/clean-data/ch/social/2020/'


In [5]:
def filter_fips(df, fips=fips):
    """Return the rows of ``df`` whose ``cbg`` value starts with one of ``fips``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``cbg`` column.
    fips : list of str
        5-character prefixes to keep. The default is the module-level ``fips``
        list as it existed when this function was defined.

    Returns
    -------
    pandas.DataFrame
        The subset of ``df`` whose first five ``cbg`` characters are in ``fips``.
    """
    # The first five characters of the stringified cbg are compared against fips.
    county_prefix = df['cbg'].astype(str).str[:5]
    keep_row = county_prefix.isin(fips)
    return df[keep_row]

In [6]:
def get_fips(fname, filepath, fips, dirout):
    """Read one gzipped CSV, keep the rows for the requested counties, and save them.

    Parameters
    ----------
    fname : str
        Name of the source file. Not used by the function; kept so existing
        callers keep working.
    filepath : str
        Path of the gzip-compressed CSV to read.
    fips : list of str
        5-character county prefixes to keep; forwarded to ``filter_fips``.
    dirout : str
        Despite the name, this is the path of the CSV *file* to write.

    Returns
    -------
    None
        The filtered frame is written to ``dirout``.
    """
    # Read the block-group id as text so it is not parsed as a number.
    raw = pd.read_csv(
        filepath,
        compression='gzip',
        dtype={'origin_census_block_group': object},
    )
    renamed = raw.rename(columns={'origin_census_block_group': 'cbg'})

    # Forward the caller's list explicitly. Relying on filter_fips' default would
    # silently ignore the `fips` argument and use whatever list was bound when
    # filter_fips was defined.
    selected = filter_fips(renamed, fips)

    # Create the destination directory on first use so to_csv does not fail.
    out_dir = os.path.dirname(dirout)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # The index is written too (pandas default), exactly as before.
    selected.to_csv(dirout)

In [7]:
# Walk path_2020/<month>/<day>/ and write a county-filtered CSV into target_2020
# for every .gz file found.

# Only directories are descended into, so .DS_Store and other stray files at
# either level are skipped instead of crashing os.listdir. Sorting gives a
# deterministic processing order.
months = sorted(
    entry for entry in os.listdir(path_2020)
    if os.path.isdir(os.path.join(path_2020, entry))
)
for month in tqdm(months, desc='months'):
    month_path = os.path.join(path_2020, month)
    days = sorted(
        entry for entry in os.listdir(month_path)
        if os.path.isdir(os.path.join(month_path, entry))
    )
    for day in tqdm(days, desc='days'):
        day_path = os.path.join(month_path, day)
        gz_files = [name for name in os.listdir(day_path) if name.endswith('.gz')]
        for file in gz_files:
            f_path = os.path.join(day_path, file)
            # Output keeps the source file name minus its '.gz' suffix.
            target_file = os.path.join(target_2020, file[:-len('.gz')])
            get_fips(file, f_path, fips, target_file)

HBox(children=(FloatProgress(value=0.0, description='months', max=12.0, style=ProgressStyle(description_width=…

HBox(children=(FloatProgress(value=0.0, description='days', max=31.0, style=ProgressStyle(description_width='i…




HBox(children=(FloatProgress(value=0.0, description='days', max=31.0, style=ProgressStyle(description_width='i…




HBox(children=(FloatProgress(value=0.0, description='days', max=30.0, style=ProgressStyle(description_width='i…




HBox(children=(FloatProgress(value=0.0, description='days', max=31.0, style=ProgressStyle(description_width='i…




HBox(children=(FloatProgress(value=0.0, description='days', max=29.0, style=ProgressStyle(description_width='i…




HBox(children=(FloatProgress(value=0.0, description='days', max=30.0, style=ProgressStyle(description_width='i…




HBox(children=(FloatProgress(value=0.0, description='days', max=31.0, style=ProgressStyle(description_width='i…




HBox(children=(FloatProgress(value=0.0, description='days', max=30.0, style=ProgressStyle(description_width='i…




HBox(children=(FloatProgress(value=0.0, description='days', max=31.0, style=ProgressStyle(description_width='i…




HBox(children=(FloatProgress(value=0.0, description='days', max=30.0, style=ProgressStyle(description_width='i…




HBox(children=(FloatProgress(value=0.0, description='days', max=31.0, style=ProgressStyle(description_width='i…




HBox(children=(FloatProgress(value=0.0, description='days', max=31.0, style=ProgressStyle(description_width='i…





In [8]:
# Walk path_2019/<month>/<day>/ and write a county-filtered CSV into target_2019
# for every .gz file found.

# Only directories are descended into, so .DS_Store and other stray files at
# either level are skipped instead of crashing os.listdir. Sorting gives a
# deterministic processing order.
months = sorted(
    entry for entry in os.listdir(path_2019)
    if os.path.isdir(os.path.join(path_2019, entry))
)
for month in tqdm(months, desc='months'):
    month_path = os.path.join(path_2019, month)
    days = sorted(
        entry for entry in os.listdir(month_path)
        if os.path.isdir(os.path.join(month_path, entry))
    )
    for day in tqdm(days, desc='days'):
        day_path = os.path.join(month_path, day)
        gz_files = [name for name in os.listdir(day_path) if name.endswith('.gz')]
        for file in gz_files:
            f_path = os.path.join(day_path, file)
            # Output keeps the source file name minus its '.gz' suffix.
            target_file = os.path.join(target_2019, file[:-len('.gz')])
            get_fips(file, f_path, fips, target_file)

HBox(children=(FloatProgress(value=0.0, description='months', max=12.0, style=ProgressStyle(description_width=…

HBox(children=(FloatProgress(value=0.0, description='days', max=31.0, style=ProgressStyle(description_width='i…




HBox(children=(FloatProgress(value=0.0, description='days', max=28.0, style=ProgressStyle(description_width='i…




HBox(children=(FloatProgress(value=0.0, description='days', max=31.0, style=ProgressStyle(description_width='i…




HBox(children=(FloatProgress(value=0.0, description='days', max=30.0, style=ProgressStyle(description_width='i…




HBox(children=(FloatProgress(value=0.0, description='days', max=31.0, style=ProgressStyle(description_width='i…




HBox(children=(FloatProgress(value=0.0, description='days', max=30.0, style=ProgressStyle(description_width='i…




HBox(children=(FloatProgress(value=0.0, description='days', max=31.0, style=ProgressStyle(description_width='i…




HBox(children=(FloatProgress(value=0.0, description='days', max=31.0, style=ProgressStyle(description_width='i…




HBox(children=(FloatProgress(value=0.0, description='days', max=30.0, style=ProgressStyle(description_width='i…




HBox(children=(FloatProgress(value=0.0, description='days', max=31.0, style=ProgressStyle(description_width='i…




HBox(children=(FloatProgress(value=0.0, description='days', max=30.0, style=ProgressStyle(description_width='i…




HBox(children=(FloatProgress(value=0.0, description='days', max=31.0, style=ProgressStyle(description_width='i…



