# Stream generation

This notebook generates data that simulates a streaming queue. Each output file contains a single record in JSON format.

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    col, row_number
)
from pyspark.sql import Window
import os

In [None]:
# Obtain the Spark session for this notebook: reuse an already running one
# if it exists, otherwise start a new one under the given application name.
session_builder = SparkSession.builder.appName('Streaming Prepare Data')
spark = session_builder.getOrCreate()

In [None]:
# Resolve the project root relative to where the notebook is executed:
# the root is the current working directory with its last three path
# components removed.
base_path = os.getcwd()
_cwd_parts = base_path.split('/')
project_path = '/'.join(_cwd_parts[:-3])

# Input: the transformed questions dataset.
questions_input_path = os.path.join(project_path, 'output/questions-transformed')

# Output: directory that will receive the single-record JSON files.
output_path = os.path.join(project_path, 'data/questions-queue')

In [None]:
# Load the transformed questions. No explicit format is given, so Spark
# reads the path with its configured default data source.
questionsDF = spark.read.load(questions_input_path)

In [None]:
# A window with no partitioning, ordered by creation date: the whole sample
# is numbered as one sequence 1..N.
creation_order = Window.orderBy('creation_date')

# Take 1000 questions, keep only the columns needed for the queue, and tag
# each row with its position `r` in creation-date order.
# Note that the limit is applied before the ordering, so the sample is
# whichever 1000 rows Spark returns, not the 1000 earliest questions.
sample = questionsDF.limit(1000)
numbered = (
    sample
    .select('question_id', 'creation_date', 'title')
    .withColumn('r', row_number().over(creation_order))
)

In [None]:
# Print the schema of the numbered sample to check the selected columns
# and the added row-number column `r`.
numbered.printSchema()

In [None]:
# Write the numbered records to `output_path` one at a time, in `r` order,
# so that every append adds a separate JSON output for a single record.
#
# `numbered` is cached before the loop. Without the cache, the count and
# every iteration re-run the full lineage (read, limit, global window sort).
# Because `limit(1000)` is applied to unordered data, Spark is not
# guaranteed to pick the same 1000 rows on each recomputation, so a given
# `r` could map to different records across iterations and records could be
# duplicated or dropped.
numbered.cache()
try:
    # The count is evaluated once and also materializes the cache.
    total_records = numbered.count()

    for i in range(total_records):
        print(i)  # progress indicator
        (
            numbered
            .filter(col('r') == (i + 1))  # `r` is 1-based
            .write
            .mode('append')
            .option('path', output_path)
            .format('json')
            .save()
        )
finally:
    # Release the cached data whether or not all writes succeeded.
    numbered.unpersist()

In [None]:
# Sanity check: read back everything written to the queue directory and
# display how many records it contains.
written_records = spark.read.json(output_path)
written_records.count()

In [None]:
# Stop the Spark session now that the notebook's work is finished.
spark.stop()