In [1]:
from pyspark.sql import SparkSession

spark = SparkSession \
    .builder \
    .appName("Python Spark data cleaning and engineering") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()

In [2]:
file_location = "datasets/corona_data.csv"
file_type = "csv"

# CSV options
infer_schema = "true"
first_row_is_header = "True"
delimiter = ","

# The applied options are for CSV files. For other file types, these will be ignored.
df = spark.read.format(file_type) \
  .option("inferSchema", infer_schema) \
  .option("header", first_row_is_header) \
  .option("sep", delimiter) \
  .option("nanValue", ' ') \
  .option("nullValue", ' ') \
  .load(file_location)

display(df)

DataFrame[_c0: int, State: string, Country: string, Lat: double, Long: double, Date: string, Confirmed: int, Death: int, Recovered: int, state_cleaned: string, City: string]

In [3]:
df.show()

+---+----------------+--------------------+--------+---------+----------+---------+-----+---------+--------------------+----+
|_c0|           State|             Country|     Lat|     Long|      Date|Confirmed|Death|Recovered|       state_cleaned|City|
+---+----------------+--------------------+--------+---------+----------+---------+-----+---------+--------------------+----+
|  0|            null|            Thailand|    15.0|    101.0|2020-01-22|        2|    0|        0|             Bangkok|null|
|  1|            null|               Japan|    36.0|    138.0|2020-01-22|        2|    0|        0|             Hiraide|null|
|  2|            null|           Singapore|  1.2833| 103.8333|2020-01-22|        0|    0|        0|           Singapore|null|
|  3|            null|               Nepal| 28.1667|    84.25|2020-01-22|        0|    0|        0|           Kathmandu|null|
|  4|            null|            Malaysia|     2.5|    112.5|2020-01-22|        0|    0|        0|             Sarawa

In [4]:
df.count()

28143

In [8]:
import pyspark.sql.functions as F

In [9]:
corona_df_max = df.join(df.groupBy("Country", "State_cleaned").agg(F.max("Date").alias("Date")), on=["Country", "State_cleaned",
                                                                                                    "Date"], how="inner" )

In [10]:
corona_df_max.show()

+--------------------+--------------------+----------+-----+----------------+--------+---------+---------+-----+---------+----+
|             Country|       state_cleaned|      Date|  _c0|           State|     Lat|     Long|Confirmed|Death|Recovered|City|
+--------------------+--------------------+----------+-----+----------------+--------+---------+---------+-----+---------+----+
|            Thailand|             Bangkok|2020-03-20|27666|            null|    15.0|    101.0|      322|    1|       42|null|
|               Japan|             Hiraide|2020-03-20|27667|            null|    36.0|    138.0|      963|   33|      191|null|
|           Singapore|           Singapore|2020-03-20|27668|            null|  1.2833| 103.8333|      385|    0|      124|null|
|               Nepal|           Kathmandu|2020-03-20|27669|            null| 28.1667|    84.25|        1|    0|        1|null|
|            Malaysia|             Sarawak|2020-03-20|27670|            null|     2.5|    112.5|     103

In [12]:
from pyspark.sql.functions import *

In [14]:
df.select("Country", "State_cleaned", "Confirmed", "Recovered").filter(col("Country").isin("Australia", "China")).groupBy("Country").sum().show()

+---------+--------------+--------------+
|  Country|sum(Confirmed)|sum(Recovered)|
+---------+--------------+--------------+
|    China|       3287028|       1570179|
|Australia|          4996|           601|
+---------+--------------+--------------+



In [15]:
df.select("Country", "State_cleaned", "Confirmed", "Recovered").filter(col("Country").isin("Australia", "China")).cube("Country").sum().show()

+---------+--------------+--------------+
|  Country|sum(Confirmed)|sum(Recovered)|
+---------+--------------+--------------+
|Australia|          4996|           601|
|     null|       3292024|       1570780|
|    China|       3287028|       1570179|
+---------+--------------+--------------+



In [17]:
df.select("Country", "State_cleaned", "Confirmed", "Recovered").filter(col("Country").isin("Australia", "China")).rollup("Country", "State_cleaned").sum().show()

+---------+------------------+--------------+--------------+
|  Country|     State_cleaned|sum(Confirmed)|sum(Recovered)|
+---------+------------------+--------------+--------------+
|    China|          Shandong|         31306|         18332|
|    China|         Chongqing|         26929|         16257|
|    China|           Ningxia|          3257|          2351|
|    China|              null|       3287028|       1570179|
|Australia|Northern Territory|            15|             0|
|    China|          Shanghai|         16214|         10137|
|    China|             Anhui|         44677|         28737|
|    China|           Qinghai|           893|           662|
|    China|         Hong Kong|          4314|          1485|
|    China|            Shanxi|          6199|          4055|
|    China|            Hainan|          7740|          4864|
|     null|              null|       3292024|       1570780|
|    China|           Guangxi|         11590|          6437|
|    China|           Ji

In [18]:
from pyspark import StorageLevel
df.persist(StorageLevel.MEMORY_AND_DISK)


DataFrame[_c0: int, State: string, Country: string, Lat: double, Long: double, Date: string, Confirmed: int, Death: int, Recovered: int, state_cleaned: string, City: string]

In [19]:
%timeit df.count()

62 ms ± 2.81 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [20]:
!nvidia-smi

Mon Sep 14 20:27:09 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 451.22       Driver Version: 451.22       CUDA Version: 11.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name            TCC/WDDM | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  GeForce GTX 1650   WDDM  | 00000000:01:00.0 Off |                  N/A |
| N/A   47C    P8     6W /  N/A |    134MiB /  4096MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                                  |
|  GPU   GI   CI        PID   Type   Process name                  GPU Memory |
|       

In [24]:
import urllib.request
url = 'https://repo1.maven.org/maven2/ai/rapids/cudf/0.9.2/cudf-0.9.2-cuda10-1.jar'
filename = 'rapid_cudf.jar'
urllib.request.urlretrieve(url, filename)

('rapid_cudf.jar', <http.client.HTTPMessage at 0x1f677fcf9c8>)

In [25]:
url = 'https://repo1.maven.org/maven2/ai/rapids/xgboost4j_2.x/1.0.0-Beta5/xgboost4j_2.x-1.0.0-Beta5.jar'
filename = 'rapid_xgboost.jar'
urllib.request.urlretrieve(url, filename)

('rapid_xgboost.jar', <http.client.HTTPMessage at 0x1f677f8b208>)

In [26]:
url = 'https://repo1.maven.org/maven2/ai/rapids/xgboost4j-spark_2.x/1.0.0-Beta5/xgboost4j-spark_2.x-1.0.0-Beta5.jar'
filename = 'rapid_xgboost_spark.jar'
urllib.request.urlretrieve(url, filename)

('rapid_xgboost_spark.jar', <http.client.HTTPMessage at 0x1f677f6bbc8>)

In [27]:
# !wget https://repo1.maven.org/maven2/ai/rapids/cudf/0.9.2/cudf-0.9.2-cuda10-1.jar
# !wget https://repo1.maven.org/maven2/ai/rapids/xgboost4j_2.x/1.0.0-Beta5/xgboost4j_2.x-1.0.0-Beta5.jar
# !wget https://repo1.maven.org/maven2/ai/rapids/xgboost4j-spark_2.x/1.0.0-Beta5/xgboost4j-spark_2.x-1.0.0-Beta5.jar

In [28]:
import os

In [29]:
os.environ['PYSPARK_SUBMIT_ARGS'] = '--jars rapid_cudf.jar,rapid_xgboost.jar,rapid_xgboost_spark.jar pyspark-shell'

In [32]:
!pip install findspark

Collecting findspark
  Downloading findspark-1.4.2-py2.py3-none-any.whl (4.2 kB)
Installing collected packages: findspark
Successfully installed findspark-1.4.2
