In [25]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("Lab2").getOrCreate()

df = spark.read.csv('owid-covid-data.csv', header=True, inferSchema=True)

df.show()

num_partitions = df.rdd.getNumPartitions()

print(f"Number of partitions: {num_partitions}")

+--------+---------+-----------+----------+-----------+---------+------------------+------------+----------+-------------------+-----------------------+---------------------+------------------------------+------------------------+----------------------+-------------------------------+-----------------+------------+------------------------+-------------+-------------------------+---------------------+---------------------------------+----------------------+----------------------------------+-----------+---------+------------------------+----------------------+------------------+-------------------------------+-------------+--------------+-----------+------------------+-----------------+-----------------------+--------------+----------------+-------------------------+------------------------------+-----------------------------+-----------------------------------+--------------------------+-------------------------------------+------------------------------+-------------------------------

In [26]:
# list of countries
df.select("continent").distinct().show()

+-------------+
|    continent|
+-------------+
|       Europe|
|       Africa|
|North America|
|South America|
|      Oceania|
|         Asia|
|         NULL|
+-------------+



In [27]:
# list of locations
df.select("location").distinct().show()

+--------------------+
|            location|
+--------------------+
|            Anguilla|
|         Afghanistan|
|              Africa|
|British Virgin Is...|
|             Algeria|
|           Argentina|
|              Angola|
|             Belgium|
|Bonaire Sint Eust...|
|             Albania|
|               Benin|
|             Bahamas|
|             Belarus|
|      American Samoa|
|             Andorra|
|             Bolivia|
|          Bangladesh|
|            Barbados|
|              Bhutan|
|               Aruba|
+--------------------+
only showing top 20 rows




Hash Partitioning (Default) and some transformation pipeline

grouping by continent, sorting locations a-z, and removing duplicates

In [28]:
rdd = df.rdd    

# Partition by continent using key-value pairs ex (Asia, Afghanistan)
part_rdd_continent = rdd.map(lambda row: (row['continent'], row['location']))

num_partitions = 4 #pag split ng data based on the continent, to process it more efficiently across multiple cores
part_rdd_continent = part_rdd_continent.partitionBy(num_partitions)

#group all values yung location that share the same key continent.
gBy_continent = part_rdd_continent.groupByKey()

#remove duplicates tas sort the values alphabetically
results_continent = gBy_continent.mapValues(lambda locations: sorted(set(locations))).collect()


for continent, locations in results_continent:
    print(f"Continent: {continent}")
    for location in locations:
        print(location)
    print()

Continent: Asia
Afghanistan
Armenia
Azerbaijan
Bahrain
Bangladesh
Bhutan
Brunei
Cambodia
China
East Timor
Georgia
Hong Kong
India
Indonesia
Iran
Iraq
Israel
Japan
Jordan
Kazakhstan
Kuwait
Kyrgyzstan
Laos
Lebanon
Macao
Malaysia
Maldives
Mongolia
Myanmar
Nepal
North Korea
Northern Cyprus
Oman
Pakistan
Palestine
Philippines
Qatar
Saudi Arabia
Singapore
South Korea
Sri Lanka
Syria
Taiwan
Tajikistan
Thailand
Turkey
Turkmenistan
United Arab Emirates
Uzbekistan
Vietnam
Yemen

Continent: None
Africa
Asia
Europe
European Union (27)
High-income countries
Low-income countries
Lower-middle-income countries
North America
Oceania
South America
Upper-middle-income countries
World

Continent: Europe
Albania
Andorra
Austria
Belarus
Belgium
Bosnia and Herzegovina
Bulgaria
Croatia
Cyprus
Czechia
Denmark
England
Estonia
Faroe Islands
Finland
France
Germany
Gibraltar
Greece
Guernsey
Hungary
Iceland
Ireland
Isle of Man
Italy
Jersey
Kosovo
Latvia
Liechtenstein
Lithuania
Luxembourg
Malta
Moldova
Monaco
Monten

In [29]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from pyspark.sql.functions import when, col, sum

#Filter out rows with null values
df_filtered = df.filter((df.continent.isNotNull()) & (df.location.isNotNull()) & (df.new_cases.isNotNull()))

#Group by continent and country, then sum new cases
df_grouped = df_filtered.groupBy("continent", "location").agg(sum("new_cases").alias("total_new_cases"))

#Define a window to order data by total_new_cases within each continent, still useful for ordering within partitions
window_spec = Window.partitionBy("continent").orderBy("total_new_cases")

#Assign partition IDs based on explicit conditions (strict range partitioning)
df_partitioned = df_grouped.withColumn(
    "partition_id",
    when(col("total_new_cases") <= 100000, 1)
    .when((col("total_new_cases") > 100000) & (col("total_new_cases") <= 1000000), 2)
    .when((col("total_new_cases") > 1000000) & (col("total_new_cases") <= 10000000), 3)
    .otherwise(4)  #For cases > 10,000,000
)

rdd = df_partitioned.select("continent", "location", "total_new_cases", "partition_id").rdd

#Check partitioning of the DataFrame (optional, for verification)
df_partition_info = df_partitioned.rdd.mapPartitionsWithIndex(
    lambda index, iter: [(index, list(iter))]
).collect()

#Organize the data by partition range (same as before)
partition_ranges = {
    1: "0 - 100,000 cases",
    2: "100,001 - 1,000,000 cases",
    3: "1,000,001 - 10,000,000 cases",
    4: "More than 10,000,000 cases"
}

partitioned_data = {1: [], 2: [], 3: [], 4: []}

for index, data in df_partition_info:
    for row in data:
        partition_id = row.partition_id
        partitioned_data[partition_id].append(f" - {row.location}: {row.total_new_cases:,} cases")


for partition_id, countries in partitioned_data.items():
    print(f"**{partition_ranges[partition_id]}**")
    for country in countries:
        print(country)
    print()
    

**0 - 100,000 cases**
 - Antigua and Barbuda: 9,106 cases
 - British Virgin Islands: 7,557 cases
 - Bhutan: 62,697 cases
 - Anguilla: 3,904 cases
 - American Samoa: 8,359 cases
 - Bahamas: 39,127 cases
 - Andorra: 48,015 cases
 - Benin: 28,036 cases
 - Bonaire Sint Eustatius and Saba: 11,922 cases
 - Bermuda: 18,860 cases
 - Aruba: 44,224 cases
 - Belize: 71,416 cases
 - Cook Islands: 7,345 cases
 - Comoros: 9,109 cases
 - Cayman Islands: 31,472 cases
 - Chad: 7,702 cases
 - Djibouti: 15,690 cases
 - Curacao: 45,883 cases
 - Central African Republic: 15,441 cases
 - Cape Verde: 64,474 cases
 - Dominica: 16,047 cases
 - East Timor: 23,460 cases
 - Burkina Faso: 22,139 cases
 - Congo: 25,227 cases
 - Cote d'Ivoire: 88,434 cases
 - Burundi: 54,674 cases
 - Fiji: 69,054 cases
 - Gabon: 49,051 cases
 - Eswatini: 75,356 cases
 - Faroe Islands: 34,658 cases
 - Falkland Islands: 1,923 cases
 - Eritrea: 10,189 cases
 - Guinea: 38,572 cases
 - French Polynesia: 79,387 cases
 - Equatorial Guinea: