In [3]:
from pyspark import SparkContext
import os
os.chdir('/Users/chkapsalis/Documents/GitHub/Big_Data_Architectures/Assignments/my_assignment_3')

# For some reason i need to run this every time in order to get it work
import os
os.environ["JAVA_HOME"] = "/opt/homebrew/opt/openjdk@11/libexec/openjdk.jdk/Contents/Home" 

spark = SparkContext("local[1]", "app")


25/03/28 13:03:02 WARN Utils: Your hostname, ChristoorossAir resolves to a loopback address: 127.0.0.1; using 192.168.1.18 instead (on interface en0)
25/03/28 13:03:02 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/03/28 13:03:02 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
# This exercise is asking for a vanilla spark application - no usage of Spark DFs will be made.
# As such, the ingestion of the selected csv file will be different; i need to manually remove its header column!
# Then i will be ready to ingest it:
cols = 'As Of,NAV per Share,Daily NAV Change,Daily NAV Change %'.split(',')

rdd = spark.textFile('file:///' + os.getcwd() + '/BGF-World-Technology-Fund-Class-A2-EUR_fund_noH.csv')

In [5]:
# Peaking into the RDD to understand its value structure
#print(rdd.collect())

In [42]:
# Question 1 - Min and Max Daily and Percentage Change for the FULL history
#i have an rdd of multiple rdds, each
# representing a single day of trading blackrock's fund

In [28]:
import datetime 

def parse(line):
    fields = line.split(',')
    try:
        date = datetime.datetime.strptime(fields[0], "%d/%b/%Y")
        nav = float(fields[1])
        daily = float(fields[2])
        pct = float(fields[3])
        return (nav, daily, pct, date)
    except:
        return None  # ignoring invalid lines

In [29]:
parsed_rdd = rdd.map(parse).filter(lambda x: x is not None)   # filtering out the lines for which 
# the parser returned None; i.e., the invalid lines

In [30]:
print("First few rows in parsed RDD:")
for row in parsed_rdd.take(5):
    print(repr(row))


First few rows in parsed RDD:
(79.42, -1.1, -1.37, datetime.datetime(2025, 3, 27, 0, 0))
(80.52, -1.33, -1.62, datetime.datetime(2025, 3, 26, 0, 0))
(81.85, 0.3, 0.37, datetime.datetime(2025, 3, 25, 0, 0))
(81.55, 2.4, 3.03, datetime.datetime(2025, 3, 24, 0, 0))
(79.15, -0.97, -1.21, datetime.datetime(2025, 3, 21, 0, 0))


In [33]:
# So i can iterate over them and calculate the max & min daily and pct changes
# At the same time, I can accumulate the total count of values, their total sum, and the min and max vals
# Then i am able to calculate the avg
# Then i will iterate over it again to calculate the std

# So i will first utilize the 'aggregate' transformation
# Aggregate: zerovalue like (count, sum of NAV, min/max of daily change, min/max of pct change)
zero = (0, 0.0, float('inf'), float('-inf'), float('inf'), float('-inf'))  # count, sum_nav, min_daily, max_daily, min_pct, max_pct

# Within-partition processing (happening directly on the nodes where data is stored)
# Each worker will be appointed (in the background) various lines of this file; 
# I want each of them to count the number of NAV values they come across and their sum,
# plus the min daily/pct and max daily/pct values they come across.


def seqOp(acc, v):
    # de-constructing the standardized form of the accumulator
    count, total_nav, min_daily, max_daily, min_pct, max_pct = acc
    # de-constructing the standardized form of parsed lines
    nav, daily, pct, date = v

    # updating the values of the accumulator:
    return (
        count + 1,
        total_nav + nav,
        min(min_daily, daily),
        max(max_daily, daily),
        min(min_pct, pct),
        max(max_pct, pct)
    )

# Combination of intermediate results to produce final results:
# I need the sum of NAV counts, the sum of NAV sums, the min out of all daily/pct mins, and the max out of all daily/pct maxs
def combOp(acc1, acc2):
    return (
        acc1[0] + acc2[0],
        acc1[1] + acc2[1],
        min(acc1[2], acc2[2]),
        max(acc1[3], acc2[3]),
        min(acc1[4], acc2[4]),
        max(acc1[5], acc2[5])
    )



In [34]:
count_nav, sum_nav, min_daily, max_daily, min_pct, max_pct = parsed_rdd.aggregate(zero, seqOp, combOp)

In [36]:
avg_nav = sum_nav / count_nav

print(f"Total NAV Entries: {count_nav}")
print(f"Average NAV: {avg_nav:.4f}")
print(f"Min Daily Change: {min_daily}, Max Daily Change: {max_daily}")
print(f"Min % Change: {min_pct}, Max % Change: {max_pct}")


Total NAV Entries: 2936
Average NAV: 37.0629
Min Daily Change: -969.7, Max Daily Change: 3.78
Min % Change: -99.0, Max % Change: 8.18


In [37]:
# STD calculation - requires a second pass
std_dev = parsed_rdd.map(lambda x: (x[0] - avg_nav) ** 2) \
            .mean() ** 0.5

In [38]:
print('NAV STD:', std_dev)

NAV STD: 24.17502997438014


In [41]:
# Question 3 - filtering only for 2020 and re-iterating
rdd_2020 = parsed_rdd.filter(lambda x: x[-1].year == 2020)
navs_2020 = rdd_2020.map(lambda x: x[1])  # i only want to keep the second value of its rdd, i.e. the corresponding daily nav value

count_2020 = navs_2020.count()
sum_2020 = navs_2020.sum()
avg_2020 = sum_2020 / count_2020

std_2020 = navs_2020.map(lambda x: (x - avg_2020) ** 2).mean() ** 0.5

print(f"2020 NAV Average: {avg_2020:.4f}")
print(f"2020 NAV Std Dev: {std_2020:.4f}")

2020 NAV Average: 0.1133
2020 NAV Std Dev: 0.8117


In [None]:
# Question 4 - grouping my key==month

# from parsed_rdd, i will apply a transformation so as to create a new rdd containing sub-rdds 
# made of (year_month, nav, 1) pairs  - the "1" values will help in conveniently calculating the counts
monthly_navs = parsed_rdd.map(lambda x: ((x[-1].year, x[-1].month), (x[1], 1)))  

# summing navs and nav value counts per month
monthly_sums_counts = monthly_navs.reduceByKey(lambda x,y: (x[0]+y[0], x[1]+y[1]))
#print(monthly_sums_counts.collect())

[((2025, 3), (-7.47, 19)), ((2025, 2), (-7.719999999999999, 20)), ((2025, 1), (2.9400000000000004, 22)), ((2024, 12), (2.78, 19)), ((2024, 11), (6.540000000000001, 20)), ((2024, 10), (2.21, 23)), ((2024, 8), (0.4099999999999995, 21)), ((2024, 7), (-5.84, 23)), ((2024, 6), (8.419999999999998, 20)), ((2024, 5), (2.46, 20)), ((2024, 4), (-2.3600000000000003, 21)), ((2024, 3), (1.54, 20)), ((2024, 2), (5.18, 21)), ((2024, 1), (4.069999999999999, 22)), ((2023, 12), (2.44, 19)), ((2023, 11), (6.45, 21)), ((2023, 10), (-2.4200000000000004, 22)), ((2023, 8), (-0.06999999999999991, 22)), ((2023, 7), (1.5100000000000002, 21)), ((2023, 6), (1.55, 21)), ((2023, 5), (7.140000000000001, 19)), ((2023, 4), (-2.11, 18)), ((2023, 3), (1.7800000000000007, 23)), ((2023, 2), (1.5299999999999998, 20)), ((2023, 1), (4.239999999999999, 22)), ((2022, 12), (-3.8200000000000007, 21)), ((2022, 11), (-1.7200000000000002, 21)), ((2022, 10), (-0.6699999999999995, 21)), ((2022, 8), (-0.4199999999999996, 22)), ((2022,

In [45]:
# Calculate the monthly averages

# In Spark, mapValues() is a transformation operation on RDDs (Resilient Distributed Datasets) that transforms the values of a key-value pair RDD without changing the keys. It applies a specified function to the values of each 
# # key-value pair in the RDD, returning a new RDD with the same keys and the transformed values.
monthly_avgs = monthly_sums_counts.mapValues(lambda x: x[0] / x[1])
monthly_avgs_sorted = monthly_avgs.sortBy(lambda x: (x[0][0], x[0][1]), ascending=False)

In [None]:
# Printing Q4 results
for (year, month), avg in monthly_avgs_sorted.collect():
    print(f"{year}-{month:02d}: Average NAV = {avg:.4f}")

2025-03: Average NAV = -0.3932
2025-02: Average NAV = -0.3860
2025-01: Average NAV = 0.1336
2024-12: Average NAV = 0.1463
2024-11: Average NAV = 0.3270
2024-10: Average NAV = 0.0961
2024-08: Average NAV = 0.0195
2024-07: Average NAV = -0.2539
2024-06: Average NAV = 0.4210
2024-05: Average NAV = 0.1230
2024-04: Average NAV = -0.1124
2024-03: Average NAV = 0.0770
2024-02: Average NAV = 0.2467
2024-01: Average NAV = 0.1850
2023-12: Average NAV = 0.1284
2023-11: Average NAV = 0.3071
2023-10: Average NAV = -0.1100
2023-08: Average NAV = -0.0032
2023-07: Average NAV = 0.0719
2023-06: Average NAV = 0.0738
2023-05: Average NAV = 0.3758
2023-04: Average NAV = -0.1172
2023-03: Average NAV = 0.0774
2023-02: Average NAV = 0.0765
2023-01: Average NAV = 0.1927
2022-12: Average NAV = -0.1819
2022-11: Average NAV = -0.0819
2022-10: Average NAV = -0.0319
2022-08: Average NAV = -0.0191
2022-07: Average NAV = 0.3719
2022-06: Average NAV = -0.2385
2022-05: Average NAV = -0.2695
2022-04: Average NAV = -0.2

25/03/28 13:47:29 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 1020764 ms exceeds timeout 120000 ms
25/03/28 13:47:29 WARN SparkContext: Killing executors is not supported by current scheduler.
25/03/28 14:03:12 ERROR Inbox: Ignoring error
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:56)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:310)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRefByURI(RpcEnv.scala:102)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRef(RpcEnv.scala:110)
	at org.apache.spark.util.RpcUtils$.makeDriverRef(RpcUtils.scala:36)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.driverEndpoint$lzycompute(BlockManagerMasterEndpoint.scala:124)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.org$apache$spark$storage$BlockManagerMasterEndpoint$