## Spark Streaming

In [None]:
from pyspark.sql import SparkSession
# Build (or reuse, if one already exists) the Spark session for this notebook,
# named "streaming" and configured with 4g of executor memory.
stream_session = (
    SparkSession.builder
    .appName("streaming")
    .config("spark.executor.memory", "4g")
    .getOrCreate()
)

Here, the Spark streaming session demonstrates reading the records that were passed via Kafka and written to CSV files in batches. These batch files are then merged into the master CSV file, gun-violence.csv.

In [2]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType, BooleanType, TimestampType

# Explicit schema for the incident CSV files.
# Each entry below is (column name, Spark type class); the comprehension turns
# every entry into a nullable StructField, preserving the column order.
schema = StructType([
    StructField(column_name, spark_type(), True)
    for column_name, spark_type in (
        ("incident_id", StringType),
        ("date", TimestampType),
        ("state", StringType),
        ("city_or_county", StringType),
        ("address", StringType),
        ("n_killed", IntegerType),
        ("n_injured", IntegerType),
        ("incident_url", StringType),
        ("source_url", StringType),
        ("incident_url_fields_missing", BooleanType),
        ("congressional_district", IntegerType),
        ("gun_stolen", StringType),
        ("gun_type", StringType),
        ("incident_characteristics", StringType),
        ("latitude", DoubleType),
        ("location_description", StringType),
        ("longitude", DoubleType),
        ("n_guns_involved", IntegerType),
        ("notes", StringType),
        ("participant_age", StringType),
        ("participant_age_group", StringType),
        ("participant_gender", StringType),
        ("participant_name", StringType),
        ("participant_relationship", StringType),
        ("participant_status", StringType),
        ("participant_type", StringType),
        ("sources", StringType),
        ("state_house_district", IntegerType),
        ("state_senate_district", IntegerType),
    )
])


In [3]:
# Streaming source: CSV files under 'newDir/' parsed with the explicit schema.
# NOTE(review): no `header` option is set here, while the merge cell later in
# this notebook skips a first row per batch file - confirm whether the batch
# files carry a header row.
incident_stream = (
    stream_session.readStream
    .schema(schema)
    .csv('newDir/')
)

# Streaming sink: start a query that appends incoming rows to an in-memory
# table named "incidenttable", which can then be queried with SQL.
incquery = (
    incident_stream.writeStream
    .queryName("incidenttable")
    .format("memory")
    .outputMode("append")
    .start()
)

24/03/11 00:50:13 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /private/var/folders/90/k1t6q47n6wq06bm2v8x2fdf40000gn/T/temporary-e7be039c-1bdc-4794-8c89-d72796dd04f4. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
24/03/11 00:50:13 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
24/03/11 00:50:14 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

In [None]:
import time
# Poll the in-memory table 30 times, pausing 3 seconds between polls, and
# print the current row count on every pass.
count_query = "SELECT count(*) AS TotalIncidents FROM incidenttable"
for poll_number in range(30):
    total_incidents = stream_session.sql(count_query)
    total_incidents.show()
    time.sleep(3)

In [13]:
import csv
import os
# Directory containing the batch CSV files to merge
input_directory = 'newDir/'

# Master file that receives the combined data
output_file = 'gun-violence.csv'


def _has_content(path):
    """Return True when `path` exists and is not empty."""
    return os.path.exists(path) and os.path.getsize(path) > 0


def _ends_without_newline(path):
    """Return True when the last byte of the non-empty file `path` is not a line break."""
    with open(path, 'rb') as existing:
        existing.seek(-1, os.SEEK_END)
        return existing.read(1) not in (b'\n', b'\r')


def merge_batch_files(input_directory, output_file):
    """Append the rows of every ``*.csv`` file in `input_directory` to `output_file`.

    Batch files are processed in sorted filename order. The first row of each
    batch file is treated as its header: it is written to `output_file` only
    when that file is missing or empty (so an existing master file never gets
    a second header row in the middle of its data), and is skipped otherwise.
    Empty batch files are ignored.

    NOTE: the output is opened in append mode, so calling this twice with the
    same batch files appends their rows twice.

    Args:
        input_directory: Directory to scan for files whose name ends in '.csv'.
        output_file: CSV file to append to; created if it does not exist.

    Returns:
        The number of data rows (headers excluded) appended to `output_file`.

    Raises:
        OSError: If the directory or any file cannot be read or written.
        csv.Error: If a batch file cannot be parsed as CSV.
    """
    output_path = os.path.abspath(output_file)
    batch_paths = [
        os.path.join(input_directory, filename)
        for filename in sorted(os.listdir(input_directory))
        if filename.endswith('.csv')
    ]
    # Never read from the file we are appending to, should it live in the same directory.
    batch_paths = [path for path in batch_paths if os.path.abspath(path) != output_path]

    # A non-empty output file already starts with its header row.
    header_written = _has_content(output_file)
    needs_line_break = header_written and _ends_without_newline(output_file)

    rows_appended = 0
    with open(output_file, 'a', newline='') as output_csv:
        if needs_line_break:
            # Keep the first appended row from being glued onto the last existing line.
            output_csv.write('\n')
        csv_writer = csv.writer(output_csv)

        for batch_file_path in batch_paths:
            # newline='' lets the csv module handle line endings inside quoted fields.
            with open(batch_file_path, 'r', newline='') as batch_file:
                csv_reader = csv.reader(batch_file)

                # A default of None avoids StopIteration on an empty batch file.
                header = next(csv_reader, None)
                if header is None:
                    continue
                if not header_written:
                    csv_writer.writerow(header)
                    header_written = True

                for row in csv_reader:
                    csv_writer.writerow(row)
                    rows_appended += 1

    return rows_appended


try:
    merge_batch_files(input_directory, output_file)
    print(f"All batch files are merged into {output_file}")
except (OSError, csv.Error) as e:
    # Report expected I/O and parsing failures; anything else surfaces with a traceback.
    print(f"Error occurred: {str(e)}")


All batch files are merged into gun-violence-new.csv
