<a href="https://colab.research.google.com/github/ducline/edit-data_processing/blob/main/spark_streaming/example1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Use case 1
- Defining a sample dataset
- Splitting the dataset into many CSVs and writing them to the input folder asynchronously
- Using Spark Structured Streaming to read from the input folder
- Checking the results by querying the in-memory table

# Setting up PySpark

In [None]:
%pip install pyspark



# Reading sample dataset

# Splitting the dataset into many CSVs and writing them asynchronously

In [16]:
from pyspark import SparkFiles
from pyspark.sql import SparkSession

# Local-mode Spark session for the streaming demo; the Spark UI is pinned to port 4050.
spark = (
    SparkSession.builder
    .master('local')
    .appName('Test streaming')
    .config('spark.ui.port', '4050')
    .getOrCreate()
)

# Remote location of the sample squirrel census CSV.
url = "https://raw.githubusercontent.com/lucprosa/dataeng-basic-course/main/data/squirrel-data/squirrel-data.csv"

from pyspark.sql.types import *
# Explicit schema for the squirrel CSV: every column is a nullable string,
# listed in the order the columns appear in the file.
schema = StructType([
    StructField(column_name, StringType(), True)
    for column_name in (
        'Area Name',
        'Area ID',
        'Park Name',
        'Park ID',
        'Squirrel ID',
        'Primary Fur Color',
        'Highlights in Fur Color',
        'Color Notes',
        'Location',
        'Above Ground (Height in Feet)',
        'Specific Location',
        'Activities',
        'Interactions with Humans',
        'Squirrel Latitude (DD.DDDDDD)',
        'Squirrel Longitude (-DD.DDDDDD)',
    )
])

# Register the remote CSV with Spark so it is downloaded and can be
# resolved locally through SparkFiles.
spark.sparkContext.addFile(url)

# Batch-read the downloaded file with the explicit schema; the first line is the header.
df = (
    spark.read
    .schema(schema)
    .option("header", True)
    .csv(SparkFiles.get("squirrel-data.csv"))
)

# Print a preview of the loaded rows.
df.show()

from pyspark.sql import DataFrame
import time
import asyncio

async def test():
    """Print the literal string ``"dd"``.

    NOTE(review): looks like a leftover debugging coroutine; nothing in the
    visible source calls it.
    """
    print("dd")

async def splitDf(df: DataFrame, weight: float, files: int):
    """Randomly split ``df`` into ``files`` parts that all share the same weight.

    Args:
        df: DataFrame to split.
        weight: Weight passed to ``randomSplit`` for every part.
        files: Number of parts to produce.

    Returns:
        Whatever ``df.randomSplit`` returns for ``files`` equal weights.
    """
    equal_weights = [weight] * files
    return df.randomSplit(equal_weights)

async def writeFile(dfs: list[DataFrame], path: str, seconds_per_file: int):
    """Write each DataFrame in ``dfs`` to ``path`` as CSV, pausing between writes.

    Args:
        dfs: DataFrames to write, one write per element, in order.
        path: Output directory that receives the CSV part files.
        seconds_per_file: Seconds to sleep after each write so the files
            arrive in the directory gradually.

    Notes:
        * Writes use ``append`` mode. ``overwrite`` would delete the files of
          the previous batch on every iteration, so a stream reading from
          ``path`` would see its input disappear.
        * A header row is written because the streaming reader in this file
          loads the directory with ``header=True``; without it the first data
          row of every part file would be consumed as a header and lost.
        * ``save`` is a synchronous call, so the event loop is blocked for the
          duration of each write; only the sleep yields control.
    """
    for part in dfs:
        (
            part.write
            .mode("append")
            .format("csv")
            .option("header", True)
            .save(path)
        )
        await asyncio.sleep(seconds_per_file)

# Strong references to fire-and-forget tasks. The event loop only keeps weak
# references, so an otherwise unreferenced task may be garbage-collected
# before it finishes.
_background_tasks = set()


async def main(df, files: int = 10, seconds_per_file: int = 5, path: str = "/content/input/"):
    """Split ``df`` and start a background task that writes the parts to ``path``.

    Args:
        df: DataFrame to split; it is cached before splitting.
        files: Number of parts to split ``df`` into.
        seconds_per_file: Pause, in seconds, between consecutive writes.
        path: Directory the CSV files are written to.

    Returns:
        The ``asyncio.Task`` running ``writeFile``. This coroutine returns as
        soon as the task is scheduled; await the returned task to wait for all
        files to be written.
    """
    df = df.cache()

    dfs = await splitDf(df, 1.0, files)

    writer_task = asyncio.create_task(writeFile(dfs, path, seconds_per_file))
    # Hold the task until it completes, then drop the reference.
    _background_tasks.add(writer_task)
    writer_task.add_done_callback(_background_tasks.discard)
    return writer_task

# In case of an already running event loop (as in a notebook), schedule the
# coroutine with create_task / ensure_future instead of asyncio.run(), which
# cannot be called from a running loop.
if __name__ == "__main__":
    loop = asyncio.get_event_loop()
    # Keep a reference to the scheduled task: the loop only holds weak
    # references, and the handle allows awaiting or inspecting it later.
    main_task = loop.create_task(main(df))

# Streaming CSV source over everything matching the input-folder glob,
# using the same explicit schema and treating the first line of each file as a header.
stream1 = (
    spark.readStream
    .schema(schema)
    .format('csv')
    .option('header', True)
    .load('/content/input/*')
)

# Sanity check: print whether stream1 is a streaming DataFrame.
print(stream1.isStreaming)

# Start the streaming query: new rows are appended to an in-memory table
# that can be queried by the name 'my_query'.
query = stream1.writeStream.queryName('my_query').outputMode('append').format('memory').start()



+---------------+-------+-------------------+-------+-----------+-----------------+-----------------------+-----------+------------+-----------------------------+-----------------+--------------------+------------------------+-----------------------------+-------------------------------+
|      Area Name|Area ID|          Park Name|Park ID|Squirrel ID|Primary Fur Color|Highlights in Fur Color|Color Notes|    Location|Above Ground (Height in Feet)|Specific Location|          Activities|Interactions with Humans|Squirrel Latitude (DD.DDDDDD)|Squirrel Longitude (-DD.DDDDDD)|
+---------------+-------+-------------------+-------+-----------+-----------------+-----------------------+-----------+------------+-----------------------------+-----------------+--------------------+------------------------+-----------------------------+-------------------------------+
|UPPER MANHATTAN|      A|    Fort Tryon Park|     01|    A-01-01|             Gray|                  White|       NULL|Ground Plane| 

# Read CSVs as streaming

In [14]:
# delete input and checkpoint folders
! rm -rf /content/input
! rm -rf /content/checkpoint

# Checking results using query in memory

In [17]:
# Query the in-memory table populated by the streaming query named 'my_query'
# and print the rows received so far.
spark.sql("select * from my_query").show()

+-----------------+-------+--------------------+-------+-----------+-----------------+-----------------------+-----------+--------------------+-----------------------------+-----------------+--------------------+------------------------+-----------------------------+-------------------------------+
|        Area Name|Area ID|           Park Name|Park ID|Squirrel ID|Primary Fur Color|Highlights in Fur Color|Color Notes|            Location|Above Ground (Height in Feet)|Specific Location|          Activities|Interactions with Humans|Squirrel Latitude (DD.DDDDDD)|Squirrel Longitude (-DD.DDDDDD)|
+-----------------+-------+--------------------+-------+-----------+-----------------+-----------------------+-----------+--------------------+-----------------------------+-----------------+--------------------+------------------------+-----------------------------+-------------------------------+
|         BROOKLYN|      D|       McCarren Park|     22|    D-22-06|         Cinnamon|              

In [12]:
# Stop the running streaming query started with writeStream...start().
query.stop()