## SQL Options
This is the first thing to do when working with pyspark. The spark variable will also provide access to a UI to monitor jobs.

In [1]:
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("SQLOptions").getOrCreate()

In [4]:
path = 'Datasets/'
    

In [5]:
crime = spark.read.csv(path+"rec-crime-pfa.csv",header=True,inferSchema=True)

In [6]:
crime.limit(5).toPandas()

Unnamed: 0,12 months ending,PFA,Region,Offence,Rolling year total number of offences
0,31/03/2003,Avon and Somerset,South West,All other theft offences,25959
1,31/03/2003,Avon and Somerset,South West,Bicycle theft,3090
2,31/03/2003,Avon and Somerset,South West,Criminal damage and arson,26202
3,31/03/2003,Avon and Somerset,South West,Death or serious injury caused by illegal driving,2
4,31/03/2003,Avon and Somerset,South West,Domestic burglary,14561


In [7]:
crime.printSchema()

root
 |-- 12 months ending: string (nullable = true)
 |-- PFA: string (nullable = true)
 |-- Region: string (nullable = true)
 |-- Offence: string (nullable = true)
 |-- Rolling year total number of offences: integer (nullable = true)



#### Renamce column

In [8]:
df = crime.withColumnRenamed("Rolling year total number of offences","Count")

In [9]:
df.printSchema()

root
 |-- 12 months ending: string (nullable = true)
 |-- PFA: string (nullable = true)
 |-- Region: string (nullable = true)
 |-- Offence: string (nullable = true)
 |-- Count: integer (nullable = true)



## Creating View of dataframe
This is very similar to how we create a view of a table in SQL. Similarly, it is also temporary and only exists for the current session

In [10]:
df.createOrReplaceTempView("tempview")

### Selecting data from view

In [11]:
spark.sql("SELECT * FROM tempview").limit(5).toPandas()

Unnamed: 0,12 months ending,PFA,Region,Offence,Count
0,31/03/2003,Avon and Somerset,South West,All other theft offences,25959
1,31/03/2003,Avon and Somerset,South West,Bicycle theft,3090
2,31/03/2003,Avon and Somerset,South West,Criminal damage and arson,26202
3,31/03/2003,Avon and Somerset,South West,Death or serious injury caused by illegal driving,2
4,31/03/2003,Avon and Somerset,South West,Domestic burglary,14561


### Selecting columns from View where count>100
We can literally write SQL statements

In [12]:
spark.sql("SELECT * FROM tempview WHERE Count>1000").limit(5).toPandas()

Unnamed: 0,12 months ending,PFA,Region,Offence,Count
0,31/03/2003,Avon and Somerset,South West,All other theft offences,25959
1,31/03/2003,Avon and Somerset,South West,Bicycle theft,3090
2,31/03/2003,Avon and Somerset,South West,Criminal damage and arson,26202
3,31/03/2003,Avon and Somerset,South West,Domestic burglary,14561
4,31/03/2003,Avon and Somerset,South West,Drug offences,2308


In [13]:
sql_results = spark.sql("SELECT Region, Offence FROM tempview").limit(5).toPandas()
sql_results

Unnamed: 0,Region,Offence
0,South West,All other theft offences
1,South West,Bicycle theft
2,South West,Criminal damage and arson
3,South West,Death or serious injury caused by illegal driving
4,South West,Domestic burglary


In [14]:
spark.sql("SELECT Region, sum(Count) AS Total FROM tempview GROUP BY Region").limit(5).toPandas()


Unnamed: 0,Region,Total
0,Fraud: CIFAS,7678981
1,North West,30235732
2,British Transport Police,3029117
3,Wales,11137260
4,London,42691902


### Another way of utilizing Pyspark for SQL

In [16]:
from pyspark.ml.feature import SQLTransformer

### Note we cannot use anything else in place of __THIS__ 
It is similar to a reference. When combined with transform, __THIS__ points to the dataframe passed in the argument

In [17]:
sqlTrans = SQLTransformer(statement = "SELECT PFA,Region,Offence FROM __THIS__")

In [18]:
sqlTrans.transform(df).show(5)

+-----------------+----------+--------------------+
|              PFA|    Region|             Offence|
+-----------------+----------+--------------------+
|Avon and Somerset|South West|All other theft o...|
|Avon and Somerset|South West|       Bicycle theft|
|Avon and Somerset|South West|Criminal damage a...|
|Avon and Somerset|South West|Death or serious ...|
|Avon and Somerset|South West|   Domestic burglary|
+-----------------+----------+--------------------+
only showing top 5 rows



In [19]:
type(sqlTrans)

pyspark.ml.feature.SQLTransformer

In [20]:
sqlTrans = SQLTransformer(statement = "SELECT Offence,sum(Count) FROM __THIS__ GROUP BY Offence")

In [21]:
sqlTrans.transform(df).show()

+--------------------+----------+
|             Offence|sum(Count)|
+--------------------+----------+
|Public order offe...|  10925676|
|       Bicycle theft|   5297006|
|Residential burglary|   1671469|
|Violence without ...|  16590158|
|All other theft o...|  30979393|
|             Robbery|   3788128|
|               CIFAS|   7678981|
|      Fraud offences|   2596554|
|     Sexual offences|   4006741|
|Criminal damage a...|  37767463|
|            Homicide|     34154|
|Possession of wea...|   1555951|
|          UK Finance|   2925861|
|Stalking and hara...|   5587434|
|Theft from the pe...|   5105153|
|         Shoplifting|  16781641|
|       Drug offences|   9999435|
|    Vehicle offences|  26075670|
|   Domestic burglary|  11694636|
|Miscellaneous cri...|   3143136|
+--------------------+----------+
only showing top 20 rows



In [22]:
sqlTrans = SQLTransformer(statement = "SELECT sum(Count) FROM __THIS__ ")

In [23]:
sqlTrans.transform(df).show()

+----------+
|sum(Count)|
+----------+
| 244720928|
+----------+



### Working with Expressions.
Pyspark's wide sql functionality also provides ways to utilize expressions to work with dataframes

In [24]:
from pyspark.sql.functions import expr

In [25]:
df.withColumn("percentage",expr('round((count/244720928)*100,2)')).show()

+----------------+-----------------+----------+--------------------+-----+----------+
|12 months ending|              PFA|    Region|             Offence|Count|percentage|
+----------------+-----------------+----------+--------------------+-----+----------+
|      31/03/2003|Avon and Somerset|South West|All other theft o...|25959|      0.01|
|      31/03/2003|Avon and Somerset|South West|       Bicycle theft| 3090|       0.0|
|      31/03/2003|Avon and Somerset|South West|Criminal damage a...|26202|      0.01|
|      31/03/2003|Avon and Somerset|South West|Death or serious ...|    2|       0.0|
|      31/03/2003|Avon and Somerset|South West|   Domestic burglary|14561|      0.01|
|      31/03/2003|Avon and Somerset|South West|       Drug offences| 2308|       0.0|
|      31/03/2003|Avon and Somerset|South West|      Fraud offences| 5339|       0.0|
|      31/03/2003|Avon and Somerset|South West|            Homicide|   19|       0.0|
|      31/03/2003|Avon and Somerset|South West|Miscell

In [26]:
df.select("*",expr('round((count/244720928)*100,2)')).toPandas()

Unnamed: 0,12 months ending,PFA,Region,Offence,Count,"round(((CAST(count AS DOUBLE) / CAST(244720928 AS DOUBLE)) * CAST(100 AS DOUBLE)), 2)"
0,31/03/2003,Avon and Somerset,South West,All other theft offences,25959,0.01
1,31/03/2003,Avon and Somerset,South West,Bicycle theft,3090,0.00
2,31/03/2003,Avon and Somerset,South West,Criminal damage and arson,26202,0.01
3,31/03/2003,Avon and Somerset,South West,Death or serious injury caused by illegal driving,2,0.00
4,31/03/2003,Avon and Somerset,South West,Domestic burglary,14561,0.01
...,...,...,...,...,...,...
46464,31/12/2018,Wiltshire,South West,Stalking and harassment,2380,0.00
46465,31/12/2018,Wiltshire,South West,Theft from the person,347,0.00
46466,31/12/2018,Wiltshire,South West,Vehicle offences,2895,0.00
46467,31/12/2018,Wiltshire,South West,Violence with injury,5701,0.00


In [27]:
df.selectExpr("*",'round((count/244720928)*100,2) AS Percent').toPandas()

Unnamed: 0,12 months ending,PFA,Region,Offence,Count,Percent
0,31/03/2003,Avon and Somerset,South West,All other theft offences,25959,0.01
1,31/03/2003,Avon and Somerset,South West,Bicycle theft,3090,0.00
2,31/03/2003,Avon and Somerset,South West,Criminal damage and arson,26202,0.01
3,31/03/2003,Avon and Somerset,South West,Death or serious injury caused by illegal driving,2,0.00
4,31/03/2003,Avon and Somerset,South West,Domestic burglary,14561,0.01
...,...,...,...,...,...,...
46464,31/12/2018,Wiltshire,South West,Stalking and harassment,2380,0.00
46465,31/12/2018,Wiltshire,South West,Theft from the person,347,0.00
46466,31/12/2018,Wiltshire,South West,Vehicle offences,2895,0.00
46467,31/12/2018,Wiltshire,South West,Violence with injury,5701,0.00


In [28]:
df.selectExpr("*",'round((count/244720928)*100,2) AS Percent').filter("Region='South West'").toPandas()

Unnamed: 0,12 months ending,PFA,Region,Offence,Count,Percent
0,31/03/2003,Avon and Somerset,South West,All other theft offences,25959,0.01
1,31/03/2003,Avon and Somerset,South West,Bicycle theft,3090,0.00
2,31/03/2003,Avon and Somerset,South West,Criminal damage and arson,26202,0.01
3,31/03/2003,Avon and Somerset,South West,Death or serious injury caused by illegal driving,2,0.00
4,31/03/2003,Avon and Somerset,South West,Domestic burglary,14561,0.01
...,...,...,...,...,...,...
5265,31/12/2018,Wiltshire,South West,Stalking and harassment,2380,0.00
5266,31/12/2018,Wiltshire,South West,Theft from the person,347,0.00
5267,31/12/2018,Wiltshire,South West,Vehicle offences,2895,0.00
5268,31/12/2018,Wiltshire,South West,Violence with injury,5701,0.00
