In [1]:
# Reference: https://www.anyscale.com/blog/writing-your-first-distributed-python-application-with-ray (the post links to the dataset).
# For this tutorial, you only need to download the files for 1980 and 2020.


from collections import namedtuple
import csv
import tarfile
import time
from datetime import datetime

import ray

@ray.remote
class GSODActor():
    """Ray actor that loads one year of GSOD rows from ``<year>.tar.gz`` and answers queries on them.

    Because the class is wrapped by ``ray.remote``, instances are created with
    ``GSODActor.remote(year, high_temp)`` and methods are invoked as
    ``actor.method.remote(...)``; results are fetched with ``ray.get``.

    Attributes are populated lazily: ``rows`` and ``stations`` stay empty/None
    until ``load_data`` has run.
    """

    def __init__(self, year, high_temp):
        """Store the query parameters; no data is read here.

        Args:
            year: Year whose archive ``<year>.tar.gz`` will be read by ``load_data``.
            high_temp: Threshold compared against each row's TEMP; coerced to float.
        """
        self.high_temp = float(high_temp)
        # Cache for the unfiltered high-temperature count (None = not computed yet).
        self.high_temp_count = None
        self.rows = []
        # Set of distinct STATION values; None until load_data() has run.
        self.stations = None
        self.year = year

    def get_row_count(self):
        """Return the number of rows currently loaded."""
        return len(self.rows)

    def get_station_count(self):
        """Return the number of distinct stations, or 0 if no data has been loaded yet."""
        print(f"{datetime.now()} get station count")
        # self.stations is None before load_data(); len(None) would raise TypeError.
        return len(self.stations) if self.stations is not None else 0

    def get_stations(self):
        """Return the set of distinct STATION values (None before ``load_data``)."""
        print(f"{datetime.now()} get stations")
        return self.stations

    def get_high_temp_count(self, stations=None):
        """Count rows whose TEMP is at or above ``high_temp``.

        The class previously defined this method twice; the later definition
        shadowed the earlier one, so the no-argument cached form was
        unreachable. Both behaviours now live in this single method.

        Args:
            stations: Optional container of STATION ids. When given, only rows
                whose STATION is in it are counted. When omitted, all rows are
                counted and the result is cached on the actor.

        Returns:
            int: Number of matching rows.
        """
        print(f'{datetime.now()} get high temp count')
        if stations is None:
            if self.high_temp_count is None:
                self.high_temp_count = sum(
                    1 for r in self.rows if float(r.TEMP) >= self.high_temp
                )
            return self.high_temp_count
        return sum(
            1 for r in self.rows
            if float(r.TEMP) >= self.high_temp and r.STATION in stations
        )

    def load_data(self):
        """Read every CSV member of ``<year>.tar.gz`` into ``self.rows``.

        The archive is looked up relative to the current working directory.
        Each member's first line is treated as a header and skipped. Calling
        this method again replaces the previously loaded rows instead of
        appending duplicates, and resets the cached high-temperature count.

        Raises:
            FileNotFoundError / tarfile.TarError: if the archive is missing or unreadable.
            TypeError: if a data line does not have exactly the 28 expected columns.
        """
        print(f"{datetime.now()} loading data of {self.year}")
        # f-string so that an int year works as well as a str year.
        file_name = f'{self.year}.tar.gz'
        row = namedtuple('Row', ('STATION', 'DATE', 'LATITUDE', 'LONGITUDE', 'ELEVATION', 'NAME', 'TEMP', 'TEMP_ATTRIBUTES', 'DEWP',
                                 'DEWP_ATTRIBUTES', 'SLP', 'SLP_ATTRIBUTES', 'STP', 'STP_ATTRIBUTES', 'VISIB', 'VISIB_ATTRIBUTES',
                                 'WDSP', 'WDSP_ATTRIBUTES', 'MXSPD',
                                 'GUST', 'MAX', 'MAX_ATTRIBUTES', 'MIN', 'MIN_ATTRIBUTES', 'PRCP',
                                 'PRCP_ATTRIBUTES', 'SNDP', 'FRSHTT'))

        loaded_rows = []
        # Context manager guarantees the archive handle is closed, even on error.
        with tarfile.open(file_name, 'r:gz') as tar:
            for member in tar.getmembers():
                member_handle = tar.extractfile(member)
                # extractfile() returns None for members that are not regular files
                # (e.g. directories), so there is nothing to read from them.
                if member_handle is None:
                    continue
                lines = member_handle.read().decode().splitlines()
                reader = csv.reader(lines, delimiter=',')

                # Skip the header line; an empty member has no header and no rows.
                if next(reader, None) is None:
                    continue
                loaded_rows.extend(row(*fields) for fields in reader)

        self.rows = loaded_rows
        self.stations = {r.STATION for r in self.rows}
        # Any previously cached count refers to the old rows.
        self.high_temp_count = None
        print(f"{datetime.now()} loaded data of {self.year}")

In [2]:
# Code assumes you have the 1980.tar.gz and 2020.tar.gz files in your current working directory.
def compare_years(year1, year2, high_temp, threshold_year1=0, threshold_year2=0):
    """Load two years of data in parallel actors and report on their common stations.

    Prints the number of stations present in both years, then takes exactly one
    of three branches depending on how that number compares with the thresholds:

    * more common stations than ``threshold_year1``: print year1's high-temp count
      for the common stations;
    * otherwise, more than ``threshold_year2``: print year2's high-temp count
      for the common stations;
    * otherwise: print year1's total station count.

    Args:
        year1: First year; ``<year1>.tar.gz`` must be in the working directory.
        year2: Second year; ``<year2>.tar.gz`` must be in the working directory.
        high_temp: Temperature threshold passed to both actors.
        threshold_year1: Common-station count that must be exceeded for the first branch.
        threshold_year2: Common-station count that must be exceeded for the second branch.

    Returns:
        None. Results are printed.
    """
    # ray.init() refuses to run twice in the same process, which made every
    # call after the first one fail unless ray.shutdown() was run manually.
    # Only start Ray when it is not already running.
    # If you know that you need fewer than the default number of workers,
    # you can modify the num_cpus parameter.
    if not ray.is_initialized():
        ray.init(num_cpus=2)

    # Create actor processes
    gsod_y1 = GSODActor.remote(year1, high_temp)
    gsod_y2 = GSODActor.remote(year2, high_temp)

    # Submit both loads before waiting so they are in flight at the same time.
    ray.get([gsod_y1.load_data.remote(), gsod_y2.load_data.remote()])

    y1_stations, y2_stations = ray.get([gsod_y1.get_stations.remote(),
                                        gsod_y2.get_stations.remote()])

    intersection = y1_stations & y2_stations
    common_count = len(intersection)

    print('Number of stations in common: {}'.format(common_count))

    # control flow: only the selected branch issues a remote call
    if common_count > threshold_year1:
        y1_count = ray.get(gsod_y1.get_high_temp_count.remote(intersection))
        print('{} - High temp count for common stations: {}'.format(year1, y1_count))
    elif common_count > threshold_year2:
        y2_count = ray.get(gsod_y2.get_high_temp_count.remote(intersection))
        print('{} - High temp count for common stations: {}'.format(year2, y2_count))
    else:
        cnt = ray.get(gsod_y1.get_station_count.remote())
        print(f"else condition output {cnt}.")

#Running the code below will output which year had more extreme temperatures
# compare_years('1980', '2020', 100)

In [3]:
# `if` branch: both thresholds keep their default of 0, so any non-empty set of
# common stations satisfies `len(intersection) > threshold_year1`.
compare_years('1980', '2020', 1000)

2022-04-18 07:04:07,370	INFO services.py:1412 -- View the Ray dashboard at [1m[32mhttp://127.0.0.1:8265[39m[22m


 pid=18412)[0m 2022-04-18 07:04:11.066428 loading data of 1980
 pid=10036)[0m 2022-04-18 07:04:11.065427 loading data of 2020
 pid=18412)[0m 2022-04-18 07:04:36.833663 loaded data of 1980
 pid=10036)[0m 2022-04-18 07:05:02.598428 loaded data of 2020
 pid=18412)[0m 2022-04-18 07:05:02.607404 get stations
 pid=10036)[0m 2022-04-18 07:05:02.607404 get stations
Number of stations in common: 4844
 pid=18412)[0m 2022-04-18 07:05:02.628405 get high temp count
1980 - High temp count for common stations: 0


In [6]:
# `elif` branch: threshold_year1=5000 is not exceeded by the 4844 common stations
# reported in the output, while threshold_year2 keeps its default of 0 and is.
compare_years('1980', '2020', 1000, 5000)

2022-04-18 07:22:09,243	INFO services.py:1412 -- View the Ray dashboard at [1m[32mhttp://127.0.0.1:8265[39m[22m


 pid=34780)[0m 2022-04-18 07:22:12.438352 loading data of 2020
 pid=33128)[0m 2022-04-18 07:22:12.433352 loading data of 1980
 pid=33128)[0m 2022-04-18 07:22:36.877323 loaded data of 1980
 pid=34780)[0m 2022-04-18 07:23:04.112843 loaded data of 2020
Number of stations in common: 4844
 pid=34780)[0m 2022-04-18 07:23:04.120844 get stations
 pid=34780)[0m 2022-04-18 07:23:04.138842 get high temp count
 pid=33128)[0m 2022-04-18 07:23:04.120844 get stations
2020 - High temp count for common stations: 0


In [8]:
# `else` branch: both thresholds are 5000, which the 4844 common stations
# reported in the output do not exceed, so neither `if` nor `elif` is taken.
compare_years('1980', '2020', 1000, 5000, 5000)

2022-04-18 07:24:01,506	INFO services.py:1412 -- View the Ray dashboard at [1m[32mhttp://127.0.0.1:8265[39m[22m


 pid=32380)[0m 2022-04-18 07:24:04.556470 loading data of 1980
 pid=44592)[0m 2022-04-18 07:24:04.557469 loading data of 2020
 pid=32380)[0m 2022-04-18 07:24:29.246193 loaded data of 1980
Number of stations in common: 4844
else condition output 8512.
 pid=32380)[0m 2022-04-18 07:24:54.831127 get stations
 pid=32380)[0m 2022-04-18 07:24:54.844105 get station count
 pid=44592)[0m 2022-04-18 07:24:54.826105 loaded data of 2020
 pid=44592)[0m 2022-04-18 07:24:54.831127 get stations


[2m[36m(pid=)[0m [2022-04-18 10:09:41,464 C 19600 9628] (raylet.exe) node_manager.cc:172: This node has beem marked as dead.
[2m[36m(pid=)[0m *** StackTrace Information ***
[2m[36m(pid=)[0m     BaseThreadInitThunk
[2m[36m(pid=)[0m     RtlUserThreadStart
[2m[36m(pid=)[0m 


In [7]:
# Shut down the Ray runtime that compare_years() started via ray.init().
# NOTE(review): the execution count In [7] shows this cell was run out of order
# (before In [8]); re-run the notebook top to bottom before sharing.
ray.shutdown()