In [154]:
import sys
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

In [155]:
######################### Initialize SparkSession #########################
spark = SparkSession \
    .builder \
    .appName("SparkSQL") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()

In [156]:
######################### Import oca_index data set  #########################
# cases has basic information about each case. (One row for each case)
# addresses has address of the properties that the case concerns. (Can be one or more entries for each case)
######################### Import oca_index data set  #########################

# load cases and addresses tables
cases = spark.read.format('csv').options(
    header='true', inferschema='true').load("data/cases.csv")
addresses = spark.read.format('csv').options(
    header='true', inferschema='true').load("data/addresses.csv")

# create temp view for spark sql 
cases.createOrReplaceTempView("cases")
addresses.createOrReplaceTempView("addresses")

# print schema
cases.printSchema()
addresses.printSchema()



root
 |-- indexnumberid: string (nullable = true)
 |-- court: string (nullable = true)
 |-- fileddate: string (nullable = true)
 |-- propertytype: string (nullable = true)
 |-- classification: string (nullable = true)
 |-- specialtydesignationtypes: string (nullable = true)
 |-- status: string (nullable = true)
 |-- disposeddate: string (nullable = true)
 |-- disposedreason: string (nullable = true)
 |-- firstpaper: string (nullable = true)
 |-- primaryclaimtotal: string (nullable = true)
 |-- dateofjurydemand: string (nullable = true)

root
 |-- indexnumberid: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- postalcode: string (nullable = true)



In [157]:
######################### Total filling of eviction in NYC after NYC lockdown (03/20/2020) #########################

# These classifications are consider eviction fillings: Holdover, Non-Payment

# These court are in the NYC: Bronx County Civil Court, Kings County Civil Court, New York County Civil Court, Queens County Civil Court, 
# Richmond County Civil Court, Redhook Community Justice Center and Harlem Community Justice Center

# We only retreive the cases after 03/20/2020 when NYC declared a city lockdown 



######################### Total filling of eviction in NYC after NYC lockdown (03/20/2020) #########################
query = """
select count(*)
from cases
where fileddate > '2020-03-20'
  and classification in ('Holdover','Non-Payment')
  and court in (
					'Bronx County Civil Court',
					'Kings County Civil Court',
					'New York County Civil Court',
					'Queens County Civil Court',
					'Richmond County Civil Court',
					'Redhook Community Justice Center',
					'Harlem Community Justice Center'
				)

"""
total_filling_after_lockdown = spark.sql(query).show()

+--------+
|count(1)|
+--------+
|   42724|
+--------+



In [158]:
######################### case file date and disposed date #########################

# Each row represents each eviction case's fileddate, disposeddate, week_filed and week_disposed
# This is used for later data cleanning and integration

######################### case file date and disposed date #########################
query_after_lockdown_cases = """
select fileddate,
       disposeddate,
       cast(date_trunc('week', fileddate) as date)    as week_filed,
       cast(date_trunc('week', disposeddate) as date) as week_disposed
from cases
where classification in ('Holdover', 'Non-Payment')
  and court in ('Bronx County Civil Court',
                'Kings County Civil Court',
                'New York County Civil Court',
                'Queens County Civil Court',
                'Richmond County Civil Court',
                'Redhook Community Justice Center',
                'Harlem Community Justice Center')
  and fileddate > '2020-03-20'
  --and propertytype = 'Residential' # commented out to show Statewide evictions, which includes commercial
order by fileddate asc
"""
after_lockdown_cases = spark.sql(query_after_lockdown_cases).show()

+----------+------------+----------+-------------+
| fileddate|disposeddate|week_filed|week_disposed|
+----------+------------+----------+-------------+
|2020-03-24|        null|2020-03-23|         null|
|2020-04-14|  2021-03-08|2020-04-13|   2021-03-08|
|2020-04-14|        null|2020-04-13|         null|
|2020-04-17|        null|2020-04-13|         null|
|2020-05-12|        null|2020-05-11|         null|
|2020-05-12|        null|2020-05-11|         null|
|2020-05-12|        null|2020-05-11|         null|
|2020-05-12|        null|2020-05-11|         null|
|2020-06-15|        null|2020-06-15|         null|
|2020-06-23|        null|2020-06-22|         null|
|2020-06-23|        null|2020-06-22|         null|
|2020-06-25|        null|2020-06-22|         null|
|2020-06-25|        null|2020-06-22|         null|
|2020-06-26|        null|2020-06-22|         null|
|2020-06-26|        null|2020-06-22|         null|
|2020-06-29|        null|2020-06-29|         null|
|2020-06-30|        null|2020-0

In [159]:
######################### case file date and disposed date with running sum and total active cases #########################

# Each row represents each week (starting from 03/20/2020), the number of cases has been filed, the number of cases has been disposed,
# the cumulative cases for each one
# and the total active cases (filed_cases - disposed_cases)

# Will draw a time series graph using this data

######################### case file date and disposed date with running sum and total active cases #########################
query_cases_time_and_summary = """
with after_lockdown as (
    select fileddate,
           cast(date_trunc('week', fileddate) as date)    as week_filed,
           disposeddate,
           cast(date_trunc('week', disposeddate) as date) as week_disposed
    from cases
    where classification in ('Holdover', 'Non-Payment')
      and court in ('Bronx County Civil Court',
                    'Kings County Civil Court',
                    'New York County Civil Court',
                    'Queens County Civil Court',
                    'Richmond County Civil Court',
                    'Redhook Community Justice Center',
                    'Harlem Community Justice Center')
      and fileddate > '2020-03-20'
    order by fileddate asc),

     group_by_week as (
         select week_filed                                        as first_day_of_week,
                count(*) filter (where week_filed is not null)    as cases_filed,
                count(*) filter (where week_disposed is not null) as cases_disposed
         from after_lockdown
         group by week_filed
         order by week_filed)

select first_day_of_week,
       cases_filed,
       cases_disposed,
       sum(cases_filed) over (order by first_day_of_week)      as cumulative_cases_filed,
       sum(cases_disposed) over (order by first_day_of_week)   as cumulative_cases_disposed,
       (sum(cases_filed) over (order by first_day_of_week) -
        sum(cases_disposed) over (order by first_day_of_week)) as active_cases
from group_by_week
"""

cases_time_and_summary = spark.sql(query_cases_time_and_summary).show()

+-----------------+-----------+--------------+----------------------+-------------------------+------------+
|first_day_of_week|cases_filed|cases_disposed|cumulative_cases_filed|cumulative_cases_disposed|active_cases|
+-----------------+-----------+--------------+----------------------+-------------------------+------------+
|       2020-03-23|          1|             0|                     1|                        0|           1|
|       2020-04-13|          3|             1|                     4|                        1|           3|
|       2020-05-11|          4|             0|                     8|                        1|           7|
|       2020-06-15|          1|             0|                     9|                        1|           8|
|       2020-06-22|          6|             0|                    15|                        1|          14|
|       2020-06-29|         22|             2|                    37|                        3|          34|
|       2020-07-06|

In [160]:
######################### time serise data visz #########################
# TODO:



In [161]:
######################### Case by zip code  #########################
query_cases_by_zipcode = """
select substr(postalcode, 1, 5) as zip_code
from cases
         left join addresses address on cases.indexnumberid = address.indexnumberid
where classification in ('Holdover', 'Non-Payment')
  and court in ('Bronx County Civil Court',
                'Kings County Civil Court',
                'New York County Civil Court',
                'Queens County Civil Court',
                'Richmond County Civil Court',
                'Redhook Community Justice Center',
                'Harlem Community Justice Center')
  and fileddate > '2020-03-20'
  and postalcode is not null
order by fileddate asc
"""

cases_by_zipcode = spark.sql(query_cases_by_zipcode).show()

+--------+
|zip_code|
+--------+
|   11372|
|   10456|
|   10456|
|   11208|
|   11203|
|   11203|
|   11203|
|   11203|
|   11213|
|   10001|
|   10001|
|   10462|
|   10457|
|   11420|
|   10465|
|   10451|
|   10031|
|   10028|
|   10017|
|   10469|
+--------+
only showing top 20 rows



In [162]:
######################### case by zip code data visz #########################
# TODO:


In [163]:
spark.stop()