In [38]:
import sys
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from geopy.geocoders import Nominatim

In [39]:
######################### Initialize SparkSession #########################
spark = SparkSession \
    .builder \
    .appName("SparkSQL") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()

## Import data set

In [40]:
######################### Import data set  #########################
# cases has basic information about each case. (One row for each case)
# addresses has address of the properties that the case concerns. (Can be one or more entries for each case)
######################### Import oca_index data set  #########################

# load cases and addresses tables
eviction_cases = spark.read.format('csv').options(
    header='true', inferschema='true').load("data/eviction_cases.csv")
eviction_addresses = spark.read.format('csv').options(
    header='true', inferschema='true').load("data/eviction_addresses.csv")

# create temp view for spark sql 
eviction_cases.createOrReplaceTempView("eviction_cases")
eviction_addresses.createOrReplaceTempView("eviction_addresses")

# print schema
eviction_cases.printSchema()
eviction_addresses.printSchema()



root
 |-- indexnumberid: string (nullable = true)
 |-- court: string (nullable = true)
 |-- fileddate: string (nullable = true)
 |-- propertytype: string (nullable = true)
 |-- classification: string (nullable = true)
 |-- specialtydesignationtypes: string (nullable = true)
 |-- status: string (nullable = true)
 |-- disposeddate: string (nullable = true)
 |-- disposedreason: string (nullable = true)
 |-- firstpaper: string (nullable = true)
 |-- primaryclaimtotal: string (nullable = true)
 |-- dateofjurydemand: string (nullable = true)

root
 |-- indexnumberid: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- postalcode: string (nullable = true)



## Total filling of eviction in NYC after NYC lockdown (03/20/2020)

In [41]:
######################### Total filling of eviction in NYC after NYC lockdown (03/20/2020) #########################

# These classifications are consider eviction fillings: Holdover, Non-Payment

# These court are in the NYC: Bronx County Civil Court, Kings County Civil Court, New York County Civil Court, Queens County Civil Court, 
# Richmond County Civil Court, Redhook Community Justice Center and Harlem Community Justice Center

# We only retreive the cases after 03/20/2020 when NYC declared a city lockdown 



######################### Total filling of eviction in NYC after NYC lockdown (03/20/2020) #########################
query = """
select count(*)
from eviction_cases
where fileddate > '2020-03-20'
  and classification in ('Holdover','Non-Payment')
  and court in (
					'Bronx County Civil Court',
					'Kings County Civil Court',
					'New York County Civil Court',
					'Queens County Civil Court',
					'Richmond County Civil Court',
					'Redhook Community Justice Center',
					'Harlem Community Justice Center'
				)

"""
total_eviction_filling_after_lockdown = spark.sql(query).show()

+--------+
|count(1)|
+--------+
|   42724|
+--------+



## Per case filed date and disposed date

In [42]:
######################### case file date and disposed date #########################

# Each row represents each eviction case's fileddate, disposeddate, week_filed and week_disposed
# This is used for later data cleanning and integration

######################### case file date and disposed date #########################
query_after_lockdown_eviction_cases = """
select fileddate,
       disposeddate,
       cast(date_trunc('week', fileddate) as date)    as week_filed,
       cast(date_trunc('week', disposeddate) as date) as week_disposed
from eviction_cases
where classification in ('Holdover', 'Non-Payment')
  and court in ('Bronx County Civil Court',
                'Kings County Civil Court',
                'New York County Civil Court',
                'Queens County Civil Court',
                'Richmond County Civil Court',
                'Redhook Community Justice Center',
                'Harlem Community Justice Center')
  and fileddate > '2020-03-20'
  --and propertytype = 'Residential' # commented out to show Statewide evictions, which includes commercial
order by fileddate asc
"""
after_lockdown_eviction_cases = spark.sql(query_after_lockdown_eviction_cases)
after_lockdown_eviction_cases.show()

+----------+------------+----------+-------------+
| fileddate|disposeddate|week_filed|week_disposed|
+----------+------------+----------+-------------+
|2020-03-24|        null|2020-03-23|         null|
|2020-04-14|  2021-03-08|2020-04-13|   2021-03-08|
|2020-04-14|        null|2020-04-13|         null|
|2020-04-17|        null|2020-04-13|         null|
|2020-05-12|        null|2020-05-11|         null|
|2020-05-12|        null|2020-05-11|         null|
|2020-05-12|        null|2020-05-11|         null|
|2020-05-12|        null|2020-05-11|         null|
|2020-06-15|        null|2020-06-15|         null|
|2020-06-23|        null|2020-06-22|         null|
|2020-06-23|        null|2020-06-22|         null|
|2020-06-25|        null|2020-06-22|         null|
|2020-06-25|        null|2020-06-22|         null|
|2020-06-26|        null|2020-06-22|         null|
|2020-06-26|        null|2020-06-22|         null|
|2020-06-29|        null|2020-06-29|         null|
|2020-06-30|        null|2020-0

## Per week case filed date and disposed date with running sum and total active cases

In [43]:
######################### case file date and disposed date with running sum and total active cases #########################

# Each row represents each week (starting from 03/20/2020), the number of cases has been filed, the number of cases has been disposed,
# the cumulative cases for each one
# and the total active cases (filed_cases - disposed_cases)

# Will draw a time series graph using this data

######################### case file date and disposed date with running sum and total active cases #########################
query_eviction_cases_time_and_summary = """
with after_lockdown as (
    select fileddate,
           cast(date_trunc('week', fileddate) as date)    as week_filed,
           disposeddate,
           cast(date_trunc('week', disposeddate) as date) as week_disposed
    from eviction_cases
    where classification in ('Holdover', 'Non-Payment')
      and court in ('Bronx County Civil Court',
                    'Kings County Civil Court',
                    'New York County Civil Court',
                    'Queens County Civil Court',
                    'Richmond County Civil Court',
                    'Redhook Community Justice Center',
                    'Harlem Community Justice Center')
      and fileddate > '2020-03-20'
    order by fileddate asc),

     group_by_week as (
         select week_filed                                        as first_day_of_week,
                count(*) filter (where week_filed is not null)    as cases_filed,
                count(*) filter (where week_disposed is not null) as cases_disposed
         from after_lockdown
         group by week_filed
         order by week_filed)

select first_day_of_week,
       cases_filed,
       cases_disposed,
       sum(cases_filed) over (order by first_day_of_week)      as cumulative_cases_filed,
       sum(cases_disposed) over (order by first_day_of_week)   as cumulative_cases_disposed,
       (sum(cases_filed) over (order by first_day_of_week) -
        sum(cases_disposed) over (order by first_day_of_week)) as active_cases
from group_by_week
"""

eviction_cases_time_and_summary = spark.sql(query_eviction_cases_time_and_summary)
eviction_cases_time_and_summary.show()

+-----------------+-----------+--------------+----------------------+-------------------------+------------+
|first_day_of_week|cases_filed|cases_disposed|cumulative_cases_filed|cumulative_cases_disposed|active_cases|
+-----------------+-----------+--------------+----------------------+-------------------------+------------+
|       2020-03-23|          1|             0|                     1|                        0|           1|
|       2020-04-13|          3|             1|                     4|                        1|           3|
|       2020-05-11|          4|             0|                     8|                        1|           7|
|       2020-06-15|          1|             0|                     9|                        1|           8|
|       2020-06-22|          6|             0|                    15|                        1|          14|
|       2020-06-29|         22|             2|                    37|                        3|          34|
|       2020-07-06|

In [44]:
######################### time serise data visz #########################
# TODO:



## Case by zip code

In [45]:
######################### Case by zip code  #########################
query_eviction_cases_by_zipcode = """
with cases_zip as (select substr(postalcode, 1, 5) as zip_code
                   from eviction_cases
                            left join eviction_addresses on eviction_cases.indexnumberid = eviction_addresses.indexnumberid
                   where classification in ('Holdover', 'Non-Payment')
                     and court in ('Bronx County Civil Court',
                                   'Kings County Civil Court',
                                   'New York County Civil Court',
                                   'Queens County Civil Court',
                                   'Richmond County Civil Court',
                                   'Redhook Community Justice Center',
                                   'Harlem Community Justice Center')
                     and fileddate > '2020-03-20'
                     and postalcode is not null
                     and cast(substr(postalcode, 1, 5) as int) > 1
                     and cast(substr(postalcode, 1, 5) as int) < 20000
                   order by fileddate asc)

select zip_code,
       count(*) as total
from cases_zip
group by zip_code
order by zip_code
"""

eviction_cases_by_zipcode = spark.sql(query_eviction_cases_by_zipcode)
eviction_cases_by_zipcode.show()

+--------+-----+
|zip_code|total|
+--------+-----+
|   10001|  232|
|   10002|  189|
|   10003|  151|
|   10004|   24|
|   10005|   53|
|   10006|   17|
|   10007|   13|
|   10009|  108|
|   10010|   45|
|   10011|  207|
|   10012|   94|
|   10013|   62|
|   10014|  138|
|   10015|    1|
|   10016|  184|
|   10017|   83|
|   10018|  128|
|   10019|  342|
|   10020|    1|
|   10021|  108|
+--------+-----+
only showing top 20 rows



In [67]:
# geolocator client
geolocator = Nominatim(user_agent="my-app")
locations = geolocator.geocode({"postalcode": 10010})


In [73]:
# Zip code to lon lat
lat = locations.raw['lat']
long = locations.raw['lon']
print(lat, long)
# print(locations.raw)
# geolocator.reverse((lat, long))


40.73988035963598 -73.98521246413527


## Export cleaned data

In [46]:
after_lockdown_eviction_cases
eviction_cases_by_zipcode.coalesce(1).write.format('com.databricks.spark.csv').option('header', 'true').mode('overwrite').save('data/cleaned_data/eviction_cases_by_zipcode')
eviction_cases_time_and_summary.coalesce(1).write.format('com.databricks.spark.csv').option('header', 'true').mode('overwrite').save('data/cleaned_data/eviction_cases_time_and_summary')

