# Exercises

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

spark = SparkSession.builder.getOrCreate()

### This exercises uses the ```case.csv```, ```dept.csv```, and ```source.csv``` files from the san antonio 311 call dataset.

### 1. Read the case, department, and source data into their own spark dataframes.

In [2]:
# Read case in CSV file 
case = (spark.read.csv("case.csv",
                     sep=",",
                     header=True,
                     inferSchema=True)
     )

In [3]:
case.show(3, False, True)

-RECORD 0-----------------------------------------------------
 case_id              | 1014127332                            
 case_opened_date     | 1/1/18 0:42                           
 case_closed_date     | 1/1/18 12:29                          
 SLA_due_date         | 9/26/20 0:42                          
 case_late            | NO                                    
 num_days_late        | -998.5087616000001                    
 case_closed          | YES                                   
 dept_division        | Field Operations                      
 service_request_type | Stray Animal                          
 SLA_days             | 999.0                                 
 case_status          | Closed                                
 source_id            | svcCRMLS                              
 request_address      | 2315  EL PASO ST, San Antonio, 78207  
 council_district     | 5                                     
-RECORD 1----------------------------------------------

In [4]:
# Read dept in CSV file 
dept = (spark.read.csv("dept.csv",
                     sep=",",
                     header=True,
                     inferSchema=True)
     )

In [5]:
dept.show(3, False, True)

-RECORD 0----------------------------------------
 dept_division          | 311 Call Center        
 dept_name              | Customer Service       
 standardized_dept_name | Customer Service       
 dept_subject_to_SLA    | YES                    
-RECORD 1----------------------------------------
 dept_division          | Brush                  
 dept_name              | Solid Waste Management 
 standardized_dept_name | Solid Waste            
 dept_subject_to_SLA    | YES                    
-RECORD 2----------------------------------------
 dept_division          | Clean and Green        
 dept_name              | Parks and Recreation   
 standardized_dept_name | Parks & Recreation     
 dept_subject_to_SLA    | YES                    
only showing top 3 rows



In [6]:
# Read source in CSV file 
source = (spark.read.csv("source.csv",
                     sep=",",
                     header=True,
                     inferSchema=True)
     )

In [7]:
source.show(3, False, True)

-RECORD 0---------------------------
 source_id       | 100137           
 source_username | Merlene Blodgett 
-RECORD 1---------------------------
 source_id       | 103582           
 source_username | Carmen Cura      
-RECORD 2---------------------------
 source_id       | 106463           
 source_username | Richard Sanchez  
only showing top 3 rows



### 2. Let's see how writing to the local disk works in spark:

- Write the code necessary to store the source data in both csv and json format, store these as ```sources_csv```  and  ```sources_json```

***- sources_json***

In [8]:
# write data to a destination using .write property

source.write.json("sources_json", mode="overwrite")

***- sources_csv***

In [9]:
source.write.csv("sources_csv", mode="overwrite")

- Inspect your folder structure. What do you notice?

it creates a folder with the giving name but inside the folder have two files  one is partition

#### 3. Inspect the data in your dataframes. Are the data types appropriate? Write the code necessary to cast the values to the appropriate types

In [10]:
#check the data types
source.dtypes

[('source_id', 'string'), ('source_username', 'string')]

In [11]:
source.show(2)

+---------+----------------+
|source_id| source_username|
+---------+----------------+
|   100137|Merlene Blodgett|
|   103582|     Carmen Cura|
+---------+----------------+
only showing top 2 rows



**note:** soucer_id shold be  numerical

In [12]:
# source_id as an int instead of string
source =source.withColumn('source_id', col('source_id').cast('int'))

In [13]:
#check the type again
source.dtypes

[('source_id', 'int'), ('source_username', 'string')]

____________

In [14]:
#check the types in dept
dept.dtypes

[('dept_division', 'string'),
 ('dept_name', 'string'),
 ('standardized_dept_name', 'string'),
 ('dept_subject_to_SLA', 'string')]

In [15]:
dept.show(3)

+---------------+--------------------+----------------------+-------------------+
|  dept_division|           dept_name|standardized_dept_name|dept_subject_to_SLA|
+---------------+--------------------+----------------------+-------------------+
|311 Call Center|    Customer Service|      Customer Service|                YES|
|          Brush|Solid Waste Manag...|           Solid Waste|                YES|
|Clean and Green|Parks and Recreation|    Parks & Recreation|                YES|
+---------------+--------------------+----------------------+-------------------+
only showing top 3 rows



***note:*** dept_subject_to_SLA can be changed to boolean value

In [16]:
# use .withColumn to change columns from string to boolean values

dept = dept.withColumn('dept_subject_to_SLA', expr('dept_subject_to_SLA == "YES"'))

In [17]:
dept.show(2)

+---------------+--------------------+----------------------+-------------------+
|  dept_division|           dept_name|standardized_dept_name|dept_subject_to_SLA|
+---------------+--------------------+----------------------+-------------------+
|311 Call Center|    Customer Service|      Customer Service|               true|
|          Brush|Solid Waste Manag...|           Solid Waste|               true|
+---------------+--------------------+----------------------+-------------------+
only showing top 2 rows



In [18]:
#check the type in case
case.dtypes

[('case_id', 'int'),
 ('case_opened_date', 'string'),
 ('case_closed_date', 'string'),
 ('SLA_due_date', 'string'),
 ('case_late', 'string'),
 ('num_days_late', 'double'),
 ('case_closed', 'string'),
 ('dept_division', 'string'),
 ('service_request_type', 'string'),
 ('SLA_days', 'double'),
 ('case_status', 'string'),
 ('source_id', 'string'),
 ('request_address', 'string'),
 ('council_district', 'int')]

In [19]:
case.show(3, False, True)

-RECORD 0-----------------------------------------------------
 case_id              | 1014127332                            
 case_opened_date     | 1/1/18 0:42                           
 case_closed_date     | 1/1/18 12:29                          
 SLA_due_date         | 9/26/20 0:42                          
 case_late            | NO                                    
 num_days_late        | -998.5087616000001                    
 case_closed          | YES                                   
 dept_division        | Field Operations                      
 service_request_type | Stray Animal                          
 SLA_days             | 999.0                                 
 case_status          | Closed                                
 source_id            | svcCRMLS                              
 request_address      | 2315  EL PASO ST, San Antonio, 78207  
 council_district     | 5                                     
-RECORD 1----------------------------------------------

**note** all the dates are in string type should be datetime format, case_closed and case_late to boolean, council_district as a string

In [20]:
# use .withColumn to change columns from string to boolean values

case = case.withColumn('case_closed', expr('case_closed == "YES"'))\
.withColumn('case_late', expr('case_late == "YES"'))

In [21]:
case.select("case_closed", "case_late").show(2)

+-----------+---------+
|case_closed|case_late|
+-----------+---------+
|       true|    false|
|       true|    false|
+-----------+---------+
only showing top 2 rows



In [22]:
# council_district as a string instead of int
case = case.withColumn('council_district', col('council_district').cast('string'))

In [23]:
# to_timestamp, fmt

fmt = "M/d/yy H:mm"

case = case.withColumn('case_opened_date', to_timestamp('case_opened_date', fmt))\
.withColumn('case_closed_date', to_timestamp('case_closed_date', fmt))\
.withColumn('SLA_due_date', to_timestamp('SLA_due_date', fmt))

In [24]:
#check all the changes
case.dtypes

[('case_id', 'int'),
 ('case_opened_date', 'timestamp'),
 ('case_closed_date', 'timestamp'),
 ('SLA_due_date', 'timestamp'),
 ('case_late', 'boolean'),
 ('num_days_late', 'double'),
 ('case_closed', 'boolean'),
 ('dept_division', 'string'),
 ('service_request_type', 'string'),
 ('SLA_days', 'double'),
 ('case_status', 'string'),
 ('source_id', 'string'),
 ('request_address', 'string'),
 ('council_district', 'string')]

In [25]:
case.show(3, False, True)

-RECORD 0-----------------------------------------------------
 case_id              | 1014127332                            
 case_opened_date     | 2018-01-01 00:42:00                   
 case_closed_date     | 2018-01-01 12:29:00                   
 SLA_due_date         | 2020-09-26 00:42:00                   
 case_late            | false                                 
 num_days_late        | -998.5087616000001                    
 case_closed          | true                                  
 dept_division        | Field Operations                      
 service_request_type | Stray Animal                          
 SLA_days             | 999.0                                 
 case_status          | Closed                                
 source_id            | svcCRMLS                              
 request_address      | 2315  EL PASO ST, San Antonio, 78207  
 council_district     | 5                                     
-RECORD 1----------------------------------------------

_________________

###  new features

In [26]:
# Rename 'SLA_due_date' to 'case_due_date' using .withColumnRenamed

case = case.withColumnRenamed('SLA_due_date', 'case_due_date')

In [27]:
case = case.withColumn('request_address', trim(lower(case.request_address)))

In [28]:
# convert the number of days a case is late to a number of weeks

case = case.withColumn('num_weeks_late', expr('num_days_late/7'))

case.select("num_days_late", "num_weeks_late").show(5)

+-------------------+--------------------+
|      num_days_late|      num_weeks_late|
+-------------------+--------------------+
| -998.5087616000001|        -142.6441088|
|-2.0126041669999997|-0.28751488099999994|
|       -3.022337963|-0.43176256614285713|
|       -15.01148148| -2.1444973542857144|
|0.37216435200000003|         0.053166336|
+-------------------+--------------------+
only showing top 5 rows



In [29]:
# use format_string function to pad zeros for council_district

case = case.withColumn('council_district', format_string('%03d', col('council_district').cast('int')))

In [30]:
# create a new column for zipcode:

case = case.withColumn('zipcode', regexp_extract('request_address', r"(\d+$)", 1))

case.select('zipcode').show(5)

+-------+
|zipcode|
+-------+
|  78207|
|  78223|
|  78223|
|  78223|
|  78228|
+-------+
only showing top 5 rows



In [31]:
case.show(1, False, True)

-RECORD 0----------------------------------------------------
 case_id              | 1014127332                           
 case_opened_date     | 2018-01-01 00:42:00                  
 case_closed_date     | 2018-01-01 12:29:00                  
 case_due_date        | 2020-09-26 00:42:00                  
 case_late            | false                                
 num_days_late        | -998.5087616000001                   
 case_closed          | true                                 
 dept_division        | Field Operations                     
 service_request_type | Stray Animal                         
 SLA_days             | 999.0                                
 case_status          | Closed                               
 source_id            | svcCRMLS                             
 request_address      | 2315  el paso st, san antonio, 78207 
 council_district     | 005                                  
 num_weeks_late       | -142.6441088                         
 zipcode

In [32]:
#create three new columns 'case_age', 'days_to_closed', 'case_lifetime'

case = (
    case.withColumn(
        "case_age", datediff(current_timestamp(), "case_opened_date")
    )
    .withColumn(
        "days_to_closed", datediff("case_closed_date", "case_opened_date")
    )
    .withColumn(
        "case_lifetime",
        when(expr("! case_closed"), col("case_age")).otherwise(
            col("days_to_closed")
        ),
    )
)

### 1. How old is the latest (in terms of days past SLA) currently open issue? How long has the oldest (in terms of days since opened) currently opened issue been open?

In [33]:
case.show(1, False, True)

-RECORD 0----------------------------------------------------
 case_id              | 1014127332                           
 case_opened_date     | 2018-01-01 00:42:00                  
 case_closed_date     | 2018-01-01 12:29:00                  
 case_due_date        | 2020-09-26 00:42:00                  
 case_late            | false                                
 num_days_late        | -998.5087616000001                   
 case_closed          | true                                 
 dept_division        | Field Operations                     
 service_request_type | Stray Animal                         
 SLA_days             | 999.0                                
 case_status          | Closed                               
 source_id            | svcCRMLS                             
 request_address      | 2315  el paso st, san antonio, 78207 
 council_district     | 005                                  
 num_weeks_late       | -142.6441088                         
 zipcode

In [34]:
case.filter(case.case_status == 'Open').orderBy(desc(case.case_age)).show(1, False, True)

-RECORD 0------------------------------------------------------
 case_id              | 1013225646                             
 case_opened_date     | 2017-01-01 13:48:00                    
 case_closed_date     | null                                   
 case_due_date        | 2017-01-17 08:30:00                    
 case_late            | true                                   
 num_days_late        | 348.6458333                            
 case_closed          | false                                  
 dept_division        | Code Enforcement                       
 service_request_type | No Address Posted                      
 SLA_days             | 15.77859954                            
 case_status          | Open                                   
 source_id            | svcCRMSS                               
 request_address      | 7299  shadow ridge, san antonio, 78250 
 council_district     | 006                                    
 num_weeks_late       | 49.8065476142857

In [35]:
case.filter(case.case_status == 'Open').orderBy(desc(case.SLA_days)).show(1, False, True)

-RECORD 0--------------------------------------------------------
 case_id              | 1013896575                               
 case_opened_date     | 2017-09-22 08:27:00                      
 case_closed_date     | null                                     
 case_due_date        | 2021-08-11 08:30:00                      
 case_late            | false                                    
 num_days_late        | -1318.354167                             
 case_closed          | false                                    
 dept_division        | Signals                                  
 service_request_type | Signal Timing Modification By Engineer   
 SLA_days             | 1419.00191                               
 case_status          | Open                                     
 source_id            | CRM_Listener                             
 request_address      | 4200  harry wurzbach, san antonio, 78209 
 council_district     | 010                                      
 num_weeks

### 2. How many Stray Animal cases are there?

In [36]:
#total cases of Stray Animal
case.filter(case.service_request_type == 'Stray Animal').count()

26760

In [37]:
# open cases of Stray Animal
case.filter(case.service_request_type == 'Stray Animal').where(case.case_closed).count()

26745

In [38]:
# closed cases of Stray Animal
case.filter(case.service_request_type == 'Stray Animal').where( case.case_closed == False).count()

15

### 3. How many service requests that are assigned to the Field Operations department (dept_division) are not classified as "Officer Standby" request type (service_request_type)?

In [54]:
case.filter(case.dept_division == "Field Operations")\
.where(case.service_request_type != 'Officer Standby').count()

113902

In [53]:
# (case.filter(expr('dept_division == "Field Operations"'))\
# .where(exp('service_request_type != "Officer Standby"'))).count()

AnalysisException: cannot resolve '`service_request_type != "Officer Standby"`' given input columns: [SLA_days, case_age, case_closed, case_closed_date, case_closed_year, case_due_date, case_id, case_late, case_lifetime, case_opened_date, case_status, council_district, days_to_closed, dept_division, num_days_late, num_hours_late, num_weeks_late, request_address, service_request_type, source_id, zipcode];
'Filter EXP('service_request_type != "Officer Standby")
+- Filter (dept_division#23 = Field Operations department)
   +- Project [case_id#16, case_opened_date#389, case_closed_date#404, case_due_date#505, case_late#346, num_days_late#21, case_closed#331, dept_division#23, service_request_type#24, SLA_days#25, case_status#26, source_id#27, request_address#520, council_district#1115, num_weeks_late#535, zipcode#579, case_age#682, days_to_closed#700, case_lifetime#719, case_closed_year#1135, (num_days_late#21 * cast(24 as double)) AS num_hours_late#1614]
      +- Project [case_id#16, case_opened_date#389, case_closed_date#404, case_due_date#505, case_late#346, num_days_late#21, case_closed#331, dept_division#23, service_request_type#24, SLA_days#25, case_status#26, source_id#27, request_address#520, council_district#1115, num_weeks_late#535, zipcode#579, case_age#682, days_to_closed#700, case_lifetime#719, year(cast(case_closed_date#404 as date)) AS case_closed_year#1135]
         +- Project [case_id#16, case_opened_date#389, case_closed_date#404, case_due_date#505, case_late#346, num_days_late#21, case_closed#331, dept_division#23, service_request_type#24, SLA_days#25, case_status#26, source_id#27, request_address#520, cast(council_district#563 as string) AS council_district#1115, num_weeks_late#535, zipcode#579, case_age#682, days_to_closed#700, case_lifetime#719]
            +- Project [case_id#16, case_opened_date#389, case_closed_date#404, case_due_date#505, case_late#346, num_days_late#21, case_closed#331, dept_division#23, service_request_type#24, SLA_days#25, case_status#26, source_id#27, request_address#520, council_district#563, num_weeks_late#535, zipcode#579, case_age#682, days_to_closed#700, CASE WHEN NOT case_closed#331 THEN case_age#682 ELSE days_to_closed#700 END AS case_lifetime#719]
               +- Project [case_id#16, case_opened_date#389, case_closed_date#404, case_due_date#505, case_late#346, num_days_late#21, case_closed#331, dept_division#23, service_request_type#24, SLA_days#25, case_status#26, source_id#27, request_address#520, council_district#563, num_weeks_late#535, zipcode#579, case_age#682, datediff(cast(case_closed_date#404 as date), cast(case_opened_date#389 as date)) AS days_to_closed#700]
                  +- Project [case_id#16, case_opened_date#389, case_closed_date#404, case_due_date#505, case_late#346, num_days_late#21, case_closed#331, dept_division#23, service_request_type#24, SLA_days#25, case_status#26, source_id#27, request_address#520, council_district#563, num_weeks_late#535, zipcode#579, datediff(cast(current_timestamp() as date), cast(case_opened_date#389 as date)) AS case_age#682]
                     +- Project [case_id#16, case_opened_date#389, case_closed_date#404, case_due_date#505, case_late#346, num_days_late#21, case_closed#331, dept_division#23, service_request_type#24, SLA_days#25, case_status#26, source_id#27, request_address#520, council_district#563, num_weeks_late#535, regexp_extract(request_address#520, (\d+$), 1) AS zipcode#579]
                        +- Project [case_id#16, case_opened_date#389, case_closed_date#404, case_due_date#505, case_late#346, num_days_late#21, case_closed#331, dept_division#23, service_request_type#24, SLA_days#25, case_status#26, source_id#27, request_address#520, format_string(%03d, cast(council_district#374 as int)) AS council_district#563, num_weeks_late#535]
                           +- Project [case_id#16, case_opened_date#389, case_closed_date#404, case_due_date#505, case_late#346, num_days_late#21, case_closed#331, dept_division#23, service_request_type#24, SLA_days#25, case_status#26, source_id#27, request_address#520, council_district#374, (num_days_late#21 / cast(7 as double)) AS num_weeks_late#535]
                              +- Project [case_id#16, case_opened_date#389, case_closed_date#404, case_due_date#505, case_late#346, num_days_late#21, case_closed#331, dept_division#23, service_request_type#24, SLA_days#25, case_status#26, source_id#27, trim(lower(request_address#28), None) AS request_address#520, council_district#374]
                                 +- Project [case_id#16, case_opened_date#389, case_closed_date#404, SLA_due_date#419 AS case_due_date#505, case_late#346, num_days_late#21, case_closed#331, dept_division#23, service_request_type#24, SLA_days#25, case_status#26, source_id#27, request_address#28, council_district#374]
                                    +- Project [case_id#16, case_opened_date#389, case_closed_date#404, to_timestamp('SLA_due_date, Some(M/d/yy H:mm)) AS SLA_due_date#419, case_late#346, num_days_late#21, case_closed#331, dept_division#23, service_request_type#24, SLA_days#25, case_status#26, source_id#27, request_address#28, council_district#374]
                                       +- Project [case_id#16, case_opened_date#389, to_timestamp('case_closed_date, Some(M/d/yy H:mm)) AS case_closed_date#404, SLA_due_date#19, case_late#346, num_days_late#21, case_closed#331, dept_division#23, service_request_type#24, SLA_days#25, case_status#26, source_id#27, request_address#28, council_district#374]
                                          +- Project [case_id#16, to_timestamp('case_opened_date, Some(M/d/yy H:mm)) AS case_opened_date#389, case_closed_date#18, SLA_due_date#19, case_late#346, num_days_late#21, case_closed#331, dept_division#23, service_request_type#24, SLA_days#25, case_status#26, source_id#27, request_address#28, council_district#374]
                                             +- Project [case_id#16, case_opened_date#17, case_closed_date#18, SLA_due_date#19, case_late#346, num_days_late#21, case_closed#331, dept_division#23, service_request_type#24, SLA_days#25, case_status#26, source_id#27, request_address#28, cast(council_district#29 as string) AS council_district#374]
                                                +- Project [case_id#16, case_opened_date#17, case_closed_date#18, SLA_due_date#19, (case_late#20 = YES) AS case_late#346, num_days_late#21, case_closed#331, dept_division#23, service_request_type#24, SLA_days#25, case_status#26, source_id#27, request_address#28, council_district#29]
                                                   +- Project [case_id#16, case_opened_date#17, case_closed_date#18, SLA_due_date#19, case_late#20, num_days_late#21, (case_closed#22 = YES) AS case_closed#331, dept_division#23, service_request_type#24, SLA_days#25, case_status#26, source_id#27, request_address#28, council_district#29]
                                                      +- Relation[case_id#16,case_opened_date#17,case_closed_date#18,SLA_due_date#19,case_late#20,num_days_late#21,case_closed#22,dept_division#23,service_request_type#24,SLA_days#25,case_status#26,source_id#27,request_address#28,council_district#29] csv


### 4. Convert the council_district column to a string column.

In [40]:
# council_district as a string instead of int
case = case.withColumn('council_district', col('council_district').cast('string'))

### 5. Extract the year from the case_closed_date column.

In [41]:
case = case.withColumn("case_closed_year", year("case_closed_date"))

In [42]:
case.show(2, False, True)

-RECORD 0----------------------------------------------------
 case_id              | 1014127332                           
 case_opened_date     | 2018-01-01 00:42:00                  
 case_closed_date     | 2018-01-01 12:29:00                  
 case_due_date        | 2020-09-26 00:42:00                  
 case_late            | false                                
 num_days_late        | -998.5087616000001                   
 case_closed          | true                                 
 dept_division        | Field Operations                     
 service_request_type | Stray Animal                         
 SLA_days             | 999.0                                
 case_status          | Closed                               
 source_id            | svcCRMLS                             
 request_address      | 2315  el paso st, san antonio, 78207 
 council_district     | 005                                  
 num_weeks_late       | -142.6441088                         
 zipcode

### 6. Convert num_days_late from days to hours in new columns num_hours_late.

In [46]:
case.withColumn('num_hours_late', case.num_days_late * 24).show(1, False, True)

-RECORD 0----------------------------------------------------
 case_id              | 1014127332                           
 case_opened_date     | 2018-01-01 00:42:00                  
 case_closed_date     | 2018-01-01 12:29:00                  
 case_due_date        | 2020-09-26 00:42:00                  
 case_late            | false                                
 num_days_late        | -998.5087616000001                   
 case_closed          | true                                 
 dept_division        | Field Operations                     
 service_request_type | Stray Animal                         
 SLA_days             | 999.0                                
 case_status          | Closed                               
 source_id            | svcCRMLS                             
 request_address      | 2315  el paso st, san antonio, 78207 
 council_district     | 005                                  
 num_weeks_late       | -142.6441088                         
 zipcode

In [47]:
case = case.withColumn('num_hours_late', case.num_days_late * 24)

### 7. Join the case data with the source and department data.

In [49]:
dept.show(3)

+---------------+--------------------+----------------------+-------------------+
|  dept_division|           dept_name|standardized_dept_name|dept_subject_to_SLA|
+---------------+--------------------+----------------------+-------------------+
|311 Call Center|    Customer Service|      Customer Service|               true|
|          Brush|Solid Waste Manag...|           Solid Waste|               true|
|Clean and Green|Parks and Recreation|    Parks & Recreation|               true|
+---------------+--------------------+----------------------+-------------------+
only showing top 3 rows



In [55]:
case.show(1, False, True)

-RECORD 0----------------------------------------------------
 case_id              | 1014127332                           
 case_opened_date     | 2018-01-01 00:42:00                  
 case_closed_date     | 2018-01-01 12:29:00                  
 case_due_date        | 2020-09-26 00:42:00                  
 case_late            | false                                
 num_days_late        | -998.5087616000001                   
 case_closed          | true                                 
 dept_division        | Field Operations                     
 service_request_type | Stray Animal                         
 SLA_days             | 999.0                                
 case_status          | Closed                               
 source_id            | svcCRMLS                             
 request_address      | 2315  el paso st, san antonio, 78207 
 council_district     | 005                                  
 num_weeks_late       | -142.6441088                         
 zipcode

In [57]:
df = (
    case
    # left join on dept_division
    .join(dept, "dept_division", "left")
    # drop all the columns except for standardized name, as it has much fewer unique values
    .drop(dept.dept_division)
    .drop(dept.dept_name)
    .withColumnRenamed("standardized_dept_name", "department")
)



In [58]:
df.show(2, False, True)

-RECORD 0----------------------------------------------------
 dept_division        | Field Operations                     
 case_id              | 1014127332                           
 case_opened_date     | 2018-01-01 00:42:00                  
 case_closed_date     | 2018-01-01 12:29:00                  
 case_due_date        | 2020-09-26 00:42:00                  
 case_late            | false                                
 num_days_late        | -998.5087616000001                   
 case_closed          | true                                 
 service_request_type | Stray Animal                         
 SLA_days             | 999.0                                
 case_status          | Closed                               
 source_id            | svcCRMLS                             
 request_address      | 2315  el paso st, san antonio, 78207 
 council_district     | 005                                  
 num_weeks_late       | -142.6441088                         
 zipcode

### 8. Are there any cases that do not have a request source?

In [124]:
# missing values 
df.filter("source_id is null").show(5,False, True)

(0 rows)



In [122]:
#other way
df.where(df.source_id.isNull()).show(3, False, True)

(0 rows)



In [118]:
df.source_id.isNull()

Column<'(source_id IS NULL)'>

In [96]:
df.select("service_request_type").filter("service_request_type is null").show()

+--------------------+
|service_request_type|
+--------------------+
+--------------------+



### 9. What are the top 10 service request types in terms of number of requests?

In [61]:
df.groupBy("service_request_type").count().show(truncate= False)

+--------------------------------------+-----+
|service_request_type                  |count|
+--------------------------------------+-----+
|Minimum Housing-Owner Occupied        |8543 |
|Tree Removal                          |298  |
|Service Information                   |160  |
|Sign Maintenance                      |82   |
|Park Building Maint Invest            |48   |
|Brush Property Damage                 |184  |
|Graffiti: Private Property (Corridors)|8525 |
|Traffic Sign Graffiti                 |2123 |
|License Renewal Invoice               |1349 |
|Used/Scrap Tire Facility Registration |19   |
|Guardrail- New Request                |100  |
|Markings Installation SMO (NEW)       |8    |
|CCO_Request for Research/Information_1|2    |
|Sewer Line Broken                     |1107 |
|Zoning: Multi-Family In Single        |735  |
|Engineering Investigation             |489  |
|Zoning: Setbacks                      |809  |
|Traffic Sign Faded                    |2122 |
|Permits, Fen

In [65]:
df.groupBy("service_request_type").count().orderBy(desc("count")).show(10, truncate= False)

+--------------------------------+-----+
|service_request_type            |count|
+--------------------------------+-----+
|No Pickup                       |86855|
|Overgrown Yard/Trash            |65895|
|Bandit Signs                    |32910|
|Damaged Cart                    |30338|
|Front Or Side Yard Parking      |28794|
|Stray Animal                    |26760|
|Aggressive Animal(Non-Critical) |24882|
|Cart Exchange Request           |22024|
|Junk Vehicle On Private Property|21473|
|Pot Hole Repair                 |20616|
+--------------------------------+-----+
only showing top 10 rows



### 10. What are the top 10 service request types in terms of average days late?

In [129]:
df.groupBy("service_request_type").mean("num_days_late").orderBy(desc('avg(num_days_late)')).show(10, truncate= False)

+--------------------------------------+------------------+
|service_request_type                  |avg(num_days_late)|
+--------------------------------------+------------------+
|Zoning: Junk Yards                    |175.9563621042095 |
|Labeling for Used Mattress            |162.43032902285717|
|Record Keeping of Used Mattresses     |153.99724039428568|
|Signage Requied for Sale of Used Mattr|151.63868055333333|
|Storage of Used Mattress              |142.112556415     |
|Zoning: Recycle Yard                  |135.92851612479797|
|Donation Container Enforcement        |131.75610506358706|
|License Requied Used Mattress Sales   |128.79828704142858|
|Traffic Signal Graffiti               |77.90021217000002 |
|Complaint                             |72.51790932659713 |
+--------------------------------------+------------------+
only showing top 10 rows



In [130]:
df.groupBy("service_request_type").mean("num_days_late").orderBy('avg(num_days_late)').show(10, truncate= False)

+--------------------------------------+-------------------+
|service_request_type                  |avg(num_days_late) |
+--------------------------------------+-------------------+
|CCO_Request for Research/Information_1|null               |
|Request for Research/Information      |null               |
|Engineering Design                    |-1399.1272335      |
|Signal Timing Modification By Engineer|-1247.0797799732143|
|Stray Animal                          |-998.804572616083  |
|Major Park Improvement Install        |-280.2546235360405 |
|Sidewalk Cost Sharing Program         |-184.87626063647144|
|Multi Tenant Exterior                 |-135.71588128047625|
|CPS Energy Towers                     |-129.84778717829747|
|CPS Energy Wood Poles                 |-129.30905202721226|
+--------------------------------------+-------------------+
only showing top 10 rows



In [133]:
df.where(df.service_request_type == 'Request for Research/Information').show(10, False, True)

-RECORD 0-------------------------------------------------------
 dept_division        | District 2                              
 case_id              | 1014206419                              
 case_opened_date     | 2018-01-30 16:54:00                     
 case_closed_date     | null                                    
 case_due_date        | null                                    
 case_late            | false                                   
 num_days_late        | null                                    
 case_closed          | false                                   
 service_request_type | Request for Research/Information        
 SLA_days             | null                                    
 case_status          | Open                                    
 source_id            | 139868                                  
 request_address      | 5118  village trail, san antonio, 78218 
 council_district     | 002                                     
 num_weeks_late       | n

### 11. Does number of days late depend on department?

In [75]:
df.groupBy("department").mean("num_days_late").show(20)

+--------------------+-------------------+
|          department| avg(num_days_late)|
+--------------------+-------------------+
|         Solid Waste| -2.193864424022545|
|Animal Care Services|-226.16549770717506|
|Trans & Cap Impro...|-20.509793501785314|
|  Parks & Recreation| -5.283345998745901|
|    Customer Service|  59.49019459221518|
|        Metro Health| -4.904223205386017|
|        City Council|               null|
|DSD/Code Enforcement| -38.32346772537388|
+--------------------+-------------------+



In [76]:
df.groupBy("department").count().show(20)

+--------------------+------+
|          department| count|
+--------------------+------+
|         Solid Waste|279270|
|Animal Care Services|116915|
|Trans & Cap Impro...| 96193|
|  Parks & Recreation| 19907|
|    Customer Service|  2849|
|        Metro Health|  5163|
|        City Council|    33|
|DSD/Code Enforcement|321374|
+--------------------+------+



In [79]:
df.show(1, False, True)

-RECORD 0----------------------------------------------------
 dept_division        | Field Operations                     
 case_id              | 1014127332                           
 case_opened_date     | 2018-01-01 00:42:00                  
 case_closed_date     | 2018-01-01 12:29:00                  
 case_due_date        | 2020-09-26 00:42:00                  
 case_late            | false                                
 num_days_late        | -998.5087616000001                   
 case_closed          | true                                 
 service_request_type | Stray Animal                         
 SLA_days             | 999.0                                
 case_status          | Closed                               
 source_id            | svcCRMLS                             
 request_address      | 2315  el paso st, san antonio, 78207 
 council_district     | 005                                  
 num_weeks_late       | -142.6441088                         
 zipcode

In [83]:
#df.filter(df.deparment == "City Council").show(5,True, False)

In [134]:
case.filter(case.dept_division == "Field Operations")\
.where(case.service_request_type != 'Officer Standby').count()

113902

### 12. How do number of days late depend on department and request type?

In [84]:
df.groupBy("department", "service_request_type").mean("num_days_late").show(50, truncate= False)

+------------------------+----------------------------------------+-------------------+
|department              |service_request_type                    |avg(num_days_late) |
+------------------------+----------------------------------------+-------------------+
|Trans & Cap Improvements|Flashing Beacon New Request             |-68.19004095633333 |
|Solid Waste             |Request for Commercial Service          |4.081734210910448  |
|DSD/Code Enforcement    |Graffiti: Commercial (Occupied)         |-58.870344428275864|
|DSD/Code Enforcement    |DSDB Ordered/Hold Harmless/Regular: ASEC|-22.5688841082     |
|Metro Health            |Public Right of Way                     |-1.1180635370092933|
|Parks & Recreation      |Amenity Park Improvement                |10.114974371919642 |
|Trans & Cap Improvements|Speed Limit Sign                        |-12.105217054218754|
|Parks & Recreation      |Pools                                   |-1.8421989853587784|
|DSD/Code Enforcement    |Emerge