<a href="https://colab.research.google.com/github/chouhandiksha/bigdataproject/blob/colab/sketch/justin/SPARK-Extract%20New%20York%20from%20Social%20Distancing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Extract New York Social Distancing Data
**Instructions:**

1. Mount Google Drive to access the data by following **Steps to Mount the Drive**.
2. In the third code cell, provide the Drive file paths to the 2020 source data, the 2019 source data, the 2020 target directory, and the 2019 target directory, along with the 5-digit combined FIPS codes.

**Steps to Mount the Drive:**

1. Execute the second code cell.
2. A link will appear for authorizing your Google account for Drive. Open that link.
3. An authorization code will be generated. Copy it.
4. Return to the cell where the drive mount is running, paste the code from step 3 into the text box, and press Enter.

In [1]:
import os
import pandas as pd
from tqdm.notebook import tqdm, trange

In [2]:
# Mount Google Drive at /content/drive so the project data stored there can be read
# and written with ordinary file paths. Running this cell asks for account authorization.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# New York Counties: 5-digit combined FIPS codes to keep.
# This list is the single place to edit the county selection.
fips = ['36061','36047','36005','36085','36081']
# SQL "IN (...)" operand derived from `fips`, so the two can never drift apart.
f_string = " ( " + ",".join("'" + code + "'" for code in fips) + " ) "

# NOTE: these paths are relative ('drive/...') while Drive is mounted at
# '/content/drive', so they resolve only when the working directory is /content.
# Source directories are laid out as <path>/<month>/<day>/<file>.gz.
path_2020 = 'drive/MyDrive/big-data-project/data/unprocessed/social-distancing/2020/'
target_2020 = 'drive/MyDrive/big-data-project/data/clean-data/ny/social/2020/'

path_2019 = 'drive/MyDrive/big-data-project/data/unprocessed/social-distancing/2019/'
target_2019 = 'drive/MyDrive/big-data-project/data/clean-data/ny/social/2019/'


In [4]:
# Install required dependencies (shell commands run inside the notebook runtime).
# NOTE(review): no versions are pinned, so a later run may install a different
# pyspark than the 3.1.1 recorded in this cell's saved output.
!pip install pyspark
# NOTE(review): PyDrive is not referenced by any cell visible here — confirm it is still needed.
!pip install -U -q PyDrive
# Java 8 (headless JDK) for Spark's JVM.
!apt install openjdk-8-jdk-headless -qq
import os
# Point JAVA_HOME at the JDK installed by the apt command above.
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"

Collecting pyspark
[?25l  Downloading https://files.pythonhosted.org/packages/45/b0/9d6860891ab14a39d4bddf80ba26ce51c2f9dc4805e5c6978ac0472c120a/pyspark-3.1.1.tar.gz (212.3MB)
[K     |████████████████████████████████| 212.3MB 69kB/s 
[?25hCollecting py4j==0.10.9
[?25l  Downloading https://files.pythonhosted.org/packages/9e/b6/6a4fb90cd235dc8e265a6a2067f2a2c99f0d91787f06aca4bcf7c23f3f80/py4j-0.10.9-py2.py3-none-any.whl (198kB)
[K     |████████████████████████████████| 204kB 37.5MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.1.1-py2.py3-none-any.whl size=212767604 sha256=92f7177f4a4097d7e2d71995f8ad4b61aa36e357a06fedcc5be1621337027727
  Stored in directory: /root/.cache/pip/wheels/0b/90/c0/01de724414ef122bd05f056541fb6a0ecf47c7ca655f8b3c0f
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9 pyspark-3.1.1
The 

In [5]:
# Import modules
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
%matplotlib inline

import pyspark
# NOTE(review): the wildcard imports below add many names to the notebook namespace.
# pyspark.sql.functions defines sum, min, max, abs and round, so after this cell those
# names refer to the Spark column functions rather than the Python built-ins.
from pyspark.sql import *
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark import SparkContext, SparkConf

import altair as alt
# Remove Altair's row-count limit on chart data; the call's return value is what
# shows up as this cell's output.
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

In [6]:
# create the session
conf = SparkConf().set("spark.ui.port", "4050")

# create the context
sc = pyspark.SparkContext(conf=conf)
spark = SparkSession.builder.getOrCreate()

In [7]:
def filter_fips(df, fips=fips):
    """Keep only the rows of ``df`` whose ``cbg`` begins with one of ``fips``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``cbg`` column.
    fips : list of str
        Accepted 5-character prefixes; defaults to the notebook-level ``fips``
        list as it was when this function was defined.

    Returns
    -------
    pandas.DataFrame
        The matching rows of ``df``.
    """
    # Compare on the string form so the first five characters can be sliced off.
    county_prefix = df['cbg'].astype(str).str[:5]
    wanted = county_prefix.isin(fips)
    return df[wanted]

In [8]:
def get_fips(fname, filepath, fips, dirout):
    """Extract the rows for the requested counties from one CSV and save them.

    The file is read through the notebook-level ``spark`` session, filtered on
    the first five characters of ``origin_census_block_group``, converted to
    pandas, and written out with that column renamed to ``cbg``.

    Parameters
    ----------
    fname : str
        Name of the source file. Not used; kept so existing calls keep working.
    filepath : str
        Path of the CSV to read (header row expected; the double quote is used
        as both the quote and the escape character).
    fips : iterable of str
        5-character prefixes of ``origin_census_block_group`` to keep. An empty
        iterable yields an output file with no data rows.
    dirout : str
        Path of the CSV *file* to write (despite the name, not a directory).

    Returns
    -------
    None
    """
    import os
    from pyspark.sql import functions as F

    df_soc = (
        spark.read.format('csv')
        .option('header', 'true')
        .option('quote', "\"")
        .option('escape', "\"")
        .load(filepath)
    )

    # Filter with the DataFrame API using the `fips` argument, instead of
    # concatenating a global string into a SQL statement.
    county_prefix = F.substring(F.col('origin_census_block_group'), 1, 5)
    df_soc = df_soc.filter(county_prefix.isin(list(fips)))

    df = df_soc.toPandas()
    df = df.rename(columns={'origin_census_block_group': 'cbg'})

    # Make sure the destination folder exists before writing.
    out_folder = os.path.dirname(dirout)
    if out_folder:
        os.makedirs(out_folder, exist_ok=True)
    # The pandas index is written as the first, unnamed column (default to_csv behaviour).
    df.to_csv(dirout)

In [9]:
# Walk path_2020/<month>/<day>/ and run get_fips on every *.gz file found,
# writing each result into target_2020 under the same file name minus ".gz".
# Only directories are descended into, so stray files such as .DS_Store at
# either level are skipped; entries are sorted so the run order is repeatable.
months = sorted(
    entry for entry in os.listdir(path_2020)
    if os.path.isdir(os.path.join(path_2020, entry))
)
for month in tqdm(months, desc='months'):
    month_path = os.path.join(path_2020, month)
    days = sorted(
        entry for entry in os.listdir(month_path)
        if os.path.isdir(os.path.join(month_path, entry))
    )
    for day in tqdm(days, desc='days'):
        sub_path = os.path.join(month_path, day)
        csv_files = sorted(name for name in os.listdir(sub_path) if name.endswith('.gz'))
        for file in csv_files:
            f_path = os.path.join(sub_path, file)
            # Output file path: same name as the source with the .gz suffix removed.
            target_dir = os.path.join(target_2020, file[:-len('.gz')])
            get_fips(file, f_path, fips, target_dir)

HBox(children=(FloatProgress(value=0.0, description='months', max=12.0, style=ProgressStyle(description_width=…

HBox(children=(FloatProgress(value=0.0, description='days', max=31.0, style=ProgressStyle(description_width='i…




HBox(children=(FloatProgress(value=0.0, description='days', max=31.0, style=ProgressStyle(description_width='i…




HBox(children=(FloatProgress(value=0.0, description='days', max=30.0, style=ProgressStyle(description_width='i…




HBox(children=(FloatProgress(value=0.0, description='days', max=31.0, style=ProgressStyle(description_width='i…




HBox(children=(FloatProgress(value=0.0, description='days', max=29.0, style=ProgressStyle(description_width='i…




HBox(children=(FloatProgress(value=0.0, description='days', max=30.0, style=ProgressStyle(description_width='i…




HBox(children=(FloatProgress(value=0.0, description='days', max=31.0, style=ProgressStyle(description_width='i…




HBox(children=(FloatProgress(value=0.0, description='days', max=30.0, style=ProgressStyle(description_width='i…




HBox(children=(FloatProgress(value=0.0, description='days', max=31.0, style=ProgressStyle(description_width='i…




HBox(children=(FloatProgress(value=0.0, description='days', max=30.0, style=ProgressStyle(description_width='i…




HBox(children=(FloatProgress(value=0.0, description='days', max=31.0, style=ProgressStyle(description_width='i…




HBox(children=(FloatProgress(value=0.0, description='days', max=31.0, style=ProgressStyle(description_width='i…





In [10]:
# Walk path_2019/<month>/<day>/ and run get_fips on every *.gz file found,
# writing each result into target_2019 under the same file name minus ".gz".
# Only directories are descended into, so stray files such as .DS_Store at
# either level are skipped; entries are sorted so the run order is repeatable.
months = sorted(
    entry for entry in os.listdir(path_2019)
    if os.path.isdir(os.path.join(path_2019, entry))
)
for month in tqdm(months, desc='months'):
    month_path = os.path.join(path_2019, month)
    days = sorted(
        entry for entry in os.listdir(month_path)
        if os.path.isdir(os.path.join(month_path, entry))
    )
    for day in tqdm(days, desc='days'):
        sub_path = os.path.join(month_path, day)
        csv_files = sorted(name for name in os.listdir(sub_path) if name.endswith('.gz'))
        for file in csv_files:
            f_path = os.path.join(sub_path, file)
            # Output file path: same name as the source with the .gz suffix removed.
            target_dir = os.path.join(target_2019, file[:-len('.gz')])
            get_fips(file, f_path, fips, target_dir)

HBox(children=(FloatProgress(value=0.0, description='months', max=12.0, style=ProgressStyle(description_width=…

HBox(children=(FloatProgress(value=0.0, description='days', max=31.0, style=ProgressStyle(description_width='i…




HBox(children=(FloatProgress(value=0.0, description='days', max=28.0, style=ProgressStyle(description_width='i…




HBox(children=(FloatProgress(value=0.0, description='days', max=31.0, style=ProgressStyle(description_width='i…




HBox(children=(FloatProgress(value=0.0, description='days', max=30.0, style=ProgressStyle(description_width='i…




HBox(children=(FloatProgress(value=0.0, description='days', max=31.0, style=ProgressStyle(description_width='i…




HBox(children=(FloatProgress(value=0.0, description='days', max=30.0, style=ProgressStyle(description_width='i…




HBox(children=(FloatProgress(value=0.0, description='days', max=31.0, style=ProgressStyle(description_width='i…




HBox(children=(FloatProgress(value=0.0, description='days', max=31.0, style=ProgressStyle(description_width='i…




HBox(children=(FloatProgress(value=0.0, description='days', max=30.0, style=ProgressStyle(description_width='i…




HBox(children=(FloatProgress(value=0.0, description='days', max=31.0, style=ProgressStyle(description_width='i…




HBox(children=(FloatProgress(value=0.0, description='days', max=30.0, style=ProgressStyle(description_width='i…




HBox(children=(FloatProgress(value=0.0, description='days', max=31.0, style=ProgressStyle(description_width='i…



