##  Aggregations


### Step 1: Initialize PySpark Session


In [28]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, when, lit , avg, coalesce , struct,array , explode, create_map,approx_count_distinct,sumDistinct, sum, mean,asc,desc
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType

from pyspark.sql.window import Window

from pyspark.sql import functions as F

# Create a Spark session
spark = SparkSession.builder.appName("day4").getOrCreate()


In [4]:
# Load the Chipotle dataset into a Spark DataFrame
data_path = "./US_Crime_Rates_1960_2014.csv"  # Replace with the actual path
US_df = spark.read.csv(data_path, header=True, inferSchema=True)

data_path = "./US_Crime_Rates_1960_2014.csv"  # Replace with the actual path
df = spark.read.csv(data_path, header=True, inferSchema=True)

# Load the Chipotle dataset into a Spark DataFrame
data_path = "./titanic.csv"  # Replace with the actual path
titanic_df = spark.read.csv(data_path, header=True, inferSchema=True)


In [5]:
US_df.printSchema()

root
 |-- Year: integer (nullable = true)
 |-- Population: integer (nullable = true)
 |-- Total: integer (nullable = true)
 |-- Violent: integer (nullable = true)
 |-- Property: integer (nullable = true)
 |-- Murder: integer (nullable = true)
 |-- Forcible_Rape: integer (nullable = true)
 |-- Robbery: integer (nullable = true)
 |-- Aggravated_assault: integer (nullable = true)
 |-- Burglary: integer (nullable = true)
 |-- Larceny_Theft: integer (nullable = true)
 |-- Vehicle_Theft: integer (nullable = true)



In [6]:
US_df.show()

+----+----------+--------+-------+--------+------+-------------+-------+------------------+--------+-------------+-------------+
|Year|Population|   Total|Violent|Property|Murder|Forcible_Rape|Robbery|Aggravated_assault|Burglary|Larceny_Theft|Vehicle_Theft|
+----+----------+--------+-------+--------+------+-------------+-------+------------------+--------+-------------+-------------+
|1960| 179323175| 3384200| 288460| 3095700|  9110|        17190| 107840|            154320|  912100|      1855400|       328200|
|1961| 182992000| 3488000| 289390| 3198600|  8740|        17220| 106670|            156760|  949600|      1913000|       336000|
|1962| 185771000| 3752200| 301510| 3450700|  8530|        17550| 110860|            164570|  994300|      2089600|       366800|
|1963| 188483000| 4109500| 316970| 3792500|  8640|        17650| 116470|            174210| 1086400|      2297800|       408300|
|1964| 191141000| 4564600| 364220| 4200400|  9360|        21420| 130390|            203050| 12132

### count

Question: How many records are there in the US_Crime_Rates_1960_2014_df DataFrame?

In [7]:
count_us = US_df.count()

print(f"Total number of records : ", count_us)

Total number of records :  55


### countDistinct
Question: How many distinct years are present in the US_Crime_Rates_1960_2014_df DataFrame?
Answer:

In [8]:
#findinf the distinct values in the "Year" column:
distinct_years = US_df.select("Year").distinct().count()

#printing the result
print(f"Number of distinct years:" , distinct_years)

Number of distinct years: 55


### approx_count_distinct

Question: Estimate the approximate number of distinct values in the "Total" column of the US_Crime_Rates_1960_2014_df DataFrame.

In [9]:
#calculating the approximate count of distinct values in the given column


approx_distinct_total = US_df.select(approx_count_distinct("Total").alias("ApproxDistinctTotal")).first()[0]

#printing the outcome
print(f"Approximate distinct values in 'Total' column:", approx_distinct_total)


23/09/03 20:06:43 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


Approximate distinct values in 'Total' column: 55


###  first and last

Question: Find the first and last year in the US_Crime_Rates_1960_2014_df DataFrame.

In [10]:
#calculating the minimum year in the us_df and alias it as first_year
first_year = US_df.selectExpr("min(Year) as FirstYear").first()["FirstYear"]

#calculating the minimum year in the us_df and alias it as first_year
last_year = US_df.selectExpr("max(Year) as LastYear").first()["LastYear"]


#printing the outcome

print(f"First Year:", first_year)
print(f"Last Year:", last_year)

First Year: 1960
Last Year: 2014


### min and max

Question: Find the minimum and maximum population values in the US_Crime_Rates_1960_2014_df DataFrame.

In [11]:
#calculating the minimum population in the dataframe and alias it as "MinPopulation"
min_population = US_df.selectExpr("min(Population) as MinPopulation").first()["MinPopulation"]

#calculating the maximum population in the dataframe and alias it as "MaxPopulation"

max_population = US_df.selectExpr("max(Population) as MaxPopulation").first()["MaxPopulation"]

print(f"Minimum Population: {min_population}")
print(f"Maximum Population: {max_population}")

Minimum Population: 179323175
Maximum Population: 318857056


### sumDistinct

Question: Calculate the sum of distinct "Property" values for each year in the US_Crime_Rates_1960_2014_df DataFrame.

In [12]:
#calculating the sum of distinct values in the "Property" column for each year and aliasing the result as "SumDistinctProperty"
distinct_property_sum = US_df.groupBy("Year").agg(sumDistinct("Property").alias("SumDistinctProperty"))

#displaying the resulting dataframe
distinct_property_sum.show()





+----+-------------------+
|Year|SumDistinctProperty|
+----+-------------------+
|1990|           12655500|
|1975|           10252700|
|1977|            9955000|
|2003|           10442862|
|2007|            9843481|
|1974|            9278700|
|2006|            9983568|
|1978|           10123400|
|1961|            3198600|
|2013|            8650761|
|1988|           12356900|
|1997|           11558175|
|1994|           12131900|
|1968|            6125200|
|2014|            8277829|
|1973|            7842200|
|1979|           11041500|
|1971|            7771700|
|1966|            4793300|
|2004|           10319386|
+----+-------------------+
only showing top 20 rows



### avg

Question: Calculate the average "Murder" rate for the entire dataset in the US_Crime_Rates_1960_2014_df DataFrame.
Answer:

In [13]:
average_murder_rate = US_df.select(avg("Murder")).first()[0]

print(f"Average murder rate: {average_murder_rate}")

Average murder rate: 17317.236363636363


### Aggregating to Complex Types

Question: Calculate the total sum of "Violent" and "Property" crimes for each year in the US_Crime_Rates_1960_2014_df DataFrame. Store the results in a struct type column.

In [14]:
# Group the DataFrame 'US_df' by the "Year" column and calculate the sum of "Violent" crimes for each yearaliasing the result as "TotalViolentCrimes"

violent_crimes = US_df.groupBy("Year").agg(sum(col("Violent")).alias("TotalViolentCrimes"))
property_crimes = US_df.groupBy("Year").agg(sum(col("Property")).alias("TotalPropertyCrimes"))

#joining the two DataFrames on the "Year" column and create a struct column
crime_totals = violent_crimes.join(property_crimes, "Year", "inner") \
    .select("Year", struct(col("TotalViolentCrimes"), col("TotalPropertyCrimes")).alias("CrimeSums")) \
    .orderBy("Year")  # Sort by Year in ascending order

# Show the resulting DataFrame
crime_totals.show(truncate=False)

+----+-------------------+
|Year|CrimeSums          |
+----+-------------------+
|1960|{288460, 3095700}  |
|1961|{289390, 3198600}  |
|1962|{301510, 3450700}  |
|1963|{316970, 3792500}  |
|1964|{364220, 4200400}  |
|1965|{387390, 4352000}  |
|1966|{430180, 4793300}  |
|1967|{499930, 5403500}  |
|1968|{595010, 6125200}  |
|1969|{661870, 6749000}  |
|1970|{738820, 7359200}  |
|1971|{816500, 7771700}  |
|1972|{834900, 7413900}  |
|1973|{875910, 7842200}  |
|1974|{974720, 9278700}  |
|1975|{1039710, 10252700}|
|1976|{1004210, 10345500}|
|1977|{1029580, 9955000} |
|1978|{1085550, 10123400}|
|1979|{1208030, 11041500}|
+----+-------------------+
only showing top 20 rows



### Grouping

Question: In the given US_Crime_Rates_1960_2014_df DataFrame, you are tasked with finding the average of all crimes combined for each year. Calculate the sum of all crime categories (Violent, Property, Murder, Forcible_Rape, Robbery, Aggravated_assault, Burglary, Larceny_Theft, Vehicle_Theft) for each year and then determine the average of these combined crime sums. Provide the result as the average of all crimes across the entire dataset.

In [25]:
total_sum = US_df.withColumn("Total_crime_sum",
                                        col('violent') + col('property') + col('murder') + col('forcible_rape') + 
                                        col('robbery') + col('Aggravated_assault') + col('Burglary')+ col('Larceny_Theft')
                                        + col('Vehicle_Theft')).alias('Total_crime_sum')

# total_sum.show()


avegare_crime = total_sum.agg(avg('total_crime_sum'))
print(f"Average of all crime: {avegare_crime.collect()[0][0]}")

total_sum.select('year','total_crime_sum').show()

Average of all crime: 21201546.145454545
+----+---------------+
|year|total_crime_sum|
+----+---------------+
|1960|        6768320|
|1961|        6975980|
|1962|        7504420|
|1963|        8218940|
|1964|        9129240|
|1965|        9478780|
|1966|       10446960|
|1967|       11806860|
|1968|       13440420|
|1969|       14821740|
|1970|       16196040|
|1971|       17176400|
|1972|       16497600|
|1973|       17436220|
|1974|       20506940|
|1975|       22584730|
|1976|       22699410|
|1977|       21969060|
|1978|       22417910|
|1979|       24499060|
+----+---------------+
only showing top 20 rows



### Window Functions

Question: Calculate the cumulative sum of "Property" values over the years using a window function in the US_Crime_Rates_1960_2014_df DataFrame.

In [26]:
#Creating a window specification
window_spec = Window.partitionBy("Year").orderBy("Year")

# Calculate the cumulative sum of "Property" values using the window function
US_df = US_df.withColumn(
    "CumulativePropertySum",
    sum("Property").over(window_spec)
)

# Select only the columns you want in the final DataFrame
US_df = US_df.select("Year", "Population", "Total", "Violent", "Property", "Murder", "Forcible_Rape", "Robbery", "Aggravated_assault", "Burglary", "Larceny_Theft", "Vehicle_Theft","CumulativePropertySum")

# Show the DataFrame
US_df.show()

+----+----------+--------+-------+--------+------+-------------+-------+------------------+--------+-------------+-------------+---------------------+
|Year|Population|   Total|Violent|Property|Murder|Forcible_Rape|Robbery|Aggravated_assault|Burglary|Larceny_Theft|Vehicle_Theft|CumulativePropertySum|
+----+----------+--------+-------+--------+------+-------------+-------+------------------+--------+-------------+-------------+---------------------+
|1960| 179323175| 3384200| 288460| 3095700|  9110|        17190| 107840|            154320|  912100|      1855400|       328200|              3095700|
|1961| 182992000| 3488000| 289390| 3198600|  8740|        17220| 106670|            156760|  949600|      1913000|       336000|              3198600|
|1962| 185771000| 3752200| 301510| 3450700|  8530|        17550| 110860|            164570|  994300|      2089600|       366800|              3450700|
|1963| 188483000| 4109500| 316970| 3792500|  8640|        17650| 116470|            174210| 10

In [30]:
#created a pivot table of year where the value of robbery of the year is shown.

pivot_table_robery = US_df.groupBy('year').pivot('year').sum('robbery').sort(asc('year'))
pivot_table_robery.show()


+----+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+
|year|  1960|  1961|  1962|  1963|  1964|  1965|  1966|  1967|  1968|  1969|  1970|  1971|  1972|  1973|  1974|  1975|  1976|  1977|  1978|  1979|1980|1981|1982|1983|1984|1985|1986|1987|1988|1989|1990|1991|1992|1993|1994|1995|1996|1997|1998|1999|2000|2001|2002|2003|2004|2005|2006|2007|2008|2009|2010|2011|2012|2013|2014|
+----+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+
|1960|107840|  null|  null|  null|

In [None]:
crime_totals.printSchema()

root
 |-- Year: integer (nullable = true)
 |-- Total_Violent: long (nullable = true)
 |-- Total_Property: long (nullable = true)
 |-- Total_Murder: long (nullable = true)
 |-- Total_Forcible_Rape: long (nullable = true)
 |-- Total_Robbery: long (nullable = true)
 |-- Total_Aggravated_assault: long (nullable = true)
 |-- Total_Burglary: long (nullable = true)
 |-- Total_Larceny_Theft: long (nullable = true)
 |-- Total_Vehicle_Theft: long (nullable = true)

