In [3]:
from pyspark import SparkConf, SparkContext

conf = SparkConf().setMaster("local").setAppName("ArrestAnalysis")
sc = SparkContext.getOrCreate(conf=conf)

In [4]:
# It is assumed that the data has already been uploaded in CSV format and is available in HDFS or the local file system
data_path = "/content/nypd-arrest-data-year-to-date-1.csv"

In [8]:
# Reading data in RDD
rdd = sc.textFile(data_path)

# Skip the title
header = rdd.first()
rdd = rdd.filter(lambda line: line != header)

In [6]:
def parseLine(line):
    fields = line.split(',')
    try:
        age_group = fields[11] # We assume that the age group is in the 12th column
        arrest_boro = fields[8] # We assume that the area of arrest is in the 9th column
        offense_type = fields[5] # We assume that the type of crime is in the 6th column
        return (offense_type, (arrest_boro, age_group, 1))
    except:
        return ()

In [7]:
# Data processing
parsedData = rdd.map(parseLine).filter(lambda x: x)

# Aggregation of data by type of crime with counting of cases by districts and age groups
offenseSummary = parsedData.map(lambda x: (x[0], x[1][2])).reduceByKey(lambda x, y: x + y)

In [9]:
# Conclusion of the top 5 most frequent crimes
topOffenses = offenseSummary.takeOrdered(5, key=lambda x: -x[1])
print("Топ-5 преступлений по количеству арестов:")
for offense in topOffenses:
    print(offense)

Топ-5 преступлений по количеству арестов:
('UNCLASSIFIED"', 22789)
('ASSAULT 3 & RELATED OFFENSES', 15289)
('235', 13494)
('348', 10683)
('117', 5736)
