In [None]:
# Query 1 RDD

import csv,time
from io import StringIO
from pyspark.sql import SparkSession

# Obtain a Spark session for this query via getOrCreate() and keep only its
# SparkContext, which is the entry point for the RDD API used below.
sc = (
    SparkSession.builder
    .appName("Query 1 RDD")
    .config("spark.executor.instances", "4")
    .getOrCreate()
    .sparkContext
)

def parse_csv_line(line):
    """Parse a single line of CSV text into a list of field strings.

    The standard ``csv`` reader is used so that quoted fields containing
    commas are kept together as one field.

    Args:
        line: One line of CSV text (without a trailing newline).

    Returns:
        list[str]: The parsed fields. An empty ``line`` yields ``[]`` instead
        of raising ``StopIteration``, which would otherwise abort the Spark
        task that is mapping this function over the input.
    """
    reader = csv.reader(StringIO(line))
    # The default value covers the case where the reader has no row to yield.
    return next(reader, [])

def help1(data):
    """Map a raw victim-age value to an age-group label.

    Args:
        data: The raw age value; anything ``int()`` accepts (typically the
            string taken from the CSV row).

    Returns:
        str: One of
            * ``"no individual victim"`` - age is zero or negative,
            * ``"child"``                - 1 to 17,
            * ``"young adult"``          - 18 to 24,
            * ``"adult"``                - 25 to 64,
            * ``"old"``                  - 65 and above,
            * ``"error"``                - the value cannot be converted to int.
    """
    try:
        age = int(data)
    except (TypeError, ValueError, OverflowError):
        # Non-numeric, empty, None or non-finite values are reported rather
        # than crashing the job.
        return "error"

    # Non-positive ages must be handled first; otherwise they fall into the
    # "< 25" bucket and get counted as young adults.
    if age <= 0:
        return "no individual victim"
    if age < 18:
        return "child"
    if age < 25:
        return "young adult"
    if age < 65:
        return "adult"
    return "old"
    

# 0-based positions of the columns used by this query in a parsed CSV row.
CRIME_DESC_COL = 9    # "Crm Cd Desc"
VICT_AGE_COL = 11     # "Vict Age"

start_time = time.time()

# Load both yearly extracts and split every line into its CSV fields.
rdd1 = sc.textFile("s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2010_to_2019_20241101.csv") \
    .map(parse_csv_line)

rdd2 = sc.textFile("s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2020_to_Present_20241101.csv") \
    .map(parse_csv_line)

# The first record of each file is its header row.
header1 = rdd1.first()
header2 = rdd2.first()

# Filter out the header
rdd1_data = rdd1.filter(lambda line: line != header1)
rdd2_data = rdd2.filter(lambda line: line != header2)

# Count aggravated-assault records per victim age group, most frequent first.
crime_data = (
    rdd1_data.union(rdd2_data)
    # Skip rows too short to contain the needed columns instead of failing
    # the whole job with an IndexError.
    .filter(lambda row: len(row) > VICT_AGE_COL and "AGGRAVATED" in row[CRIME_DESC_COL])
    # Key on the full age-group label. Indexing the label with [1] would key
    # on its second character and merge unrelated groups.
    .map(lambda row: (help1(row[VICT_AGE_COL]), 1))
    .reduceByKey(lambda a, b: a + b)
    .sortBy(lambda pair: pair[1], ascending=False)
    # The result is tiny and is consumed twice (collect + parquet write);
    # caching avoids re-reading and re-aggregating the source files.
    .cache()
)

print(crime_data.collect())

end_time = time.time()

print(f"Time taken: {end_time-start_time:.2f} seconds")

s3_path = "s3://groups-bucket-dblab-905418150721/group35/query1_RDD/"
# An RDD has no `.write` attribute; convert it to a DataFrame before using
# the parquet writer.
crime_data.toDF(["age_group", "count"]).write.mode("overwrite").parquet(s3_path)


In [None]:
# Query 1 DataFrame

from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType, TimestampType


# Obtain the Spark session used by the DataFrame version of Query 1.
# NOTE(review): getOrCreate() may hand back a session that already exists in
# this kernel; confirm the app name / executor setting take effect in that case.
spark = (
    SparkSession.builder
    .appName("Query 1 Dataframe")
    .config("spark.executor.instances", "4")
    .getOrCreate()
)


# Explicit schema for the crime-data CSV files: one nullable field per column,
# listed in file order as (column name, Spark type).
schema = StructType([
    StructField(column_name, column_type(), True)
    for column_name, column_type in (
        ("DR_NO", StringType),               # Division of Records Number
        ("Date Rptd", TimestampType),        # Date Reported
        ("DATE OCC", TimestampType),         # Date Occurred
        ("TIME OCC", StringType),            # Time Occurred
        ("AREA", StringType),                # Area ID
        ("AREA NAME", StringType),           # Area Name
        ("Rpt Dist No", StringType),         # Reporting District Number
        ("Part 1-2", IntegerType),           # Part 1-2 Classification
        ("Crm Cd", StringType),              # Crime Code
        ("Crm Cd Desc", StringType),         # Crime Code Description
        ("Mocodes", StringType),             # Modus Operandi
        ("Vict Age", IntegerType),           # Victim's Age
        ("Vict Sex", StringType),            # Victim's Sex
        ("Vict Descent", StringType),        # Victim's Descent
        ("Premis Cd", StringType),           # Premise Code
        ("Premis Desc", StringType),         # Premise Description
        ("Weapon Used Cd", StringType),      # Weapon Used Code
        ("Weapon Desc", StringType),         # Weapon Description
        ("Status", StringType),              # Case Status
        ("Status Desc", StringType),         # Status Description
        ("Crm Cd 1", StringType),            # Crime Code 1
        ("Crm Cd 2", StringType),            # Crime Code 2
        ("Crm Cd 3", StringType),            # Crime Code 3
        ("Crm Cd 4", StringType),            # Crime Code 4
        ("LOCATION", StringType),            # Crime Location
        ("Cross Street", StringType),        # Cross Street
        ("LAT", FloatType),                  # Latitude
        ("LON", FloatType),                  # Longitude
    )
])



# Read both yearly extracts with the explicit schema, treating the first line
# of each file as a header.
# NOTE(review): "Date Rptd" and "DATE OCC" are declared as TimestampType but no
# timestampFormat is passed; confirm the files' date strings match Spark's
# default format, otherwise those columns presumably load as null.
dataframe1 = (
    spark.read
    .schema(schema)
    .option("header", True)
    .csv("s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2010_to_2019_20241101.csv")
)
dataframe2 = (
    spark.read
    .schema(schema)
    .option("header", True)
    .csv("s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2020_to_Present_20241101.csv")
)

# Stack the two periods into one DataFrame (union matches columns by position).
dataframe = dataframe1.union(dataframe2)

