## **Map Reduce**


In [4]:
from collections import defaultdict
import csv
import multiprocessing

def map_function(record):
    try:
        fields = record.strip().split(',')
        return (fields[0], float(fields[3]) / 10.0)  # (year, temperature in Celsius)
    except (IndexError, ValueError):
        return None

def reduce_function(year_temps):
    year, temperatures = year_temps
    if not temperatures:
        return (year, None, None, None)
    return (year, min(temperatures), max(temperatures), sum(temperatures) / len(temperatures))



In [5]:
def main():
    # Read data from CSV file
    data = []
    with open('temperature_data.csv', 'r') as file:
        csv_reader = csv.reader(file)
        next(csv_reader)  # Skip header row
        data = [','.join(row) for row in csv_reader]

    # Map phase
    with multiprocessing.Pool() as pool:
        mapped_data = [item for item in pool.map(map_function, data) if item]

    # Shuffle and sort phase
    grouped_data = defaultdict(list)
    for year, temp in mapped_data:
        grouped_data[year].append(temp)

    # Reduce phase and find coolest/hottest years
    results = [reduce_function((year, temps)) for year, temps in grouped_data.items()]
    coolest_year = min(results, key=lambda x: x[2] if x[2] is not None else float('inf'))
    hottest_year = max(results, key=lambda x: x[3] if x[3] is not None else float('-inf'))

    print(f"Coolest Year: {coolest_year[0]}, Min Temp: {coolest_year[2]}°C")
    print(f"Hottest Year: {hottest_year[0]}, Avg Temp: {hottest_year[3]}°C")

if __name__ == "__main__":
    main()


Coolest Year: 1902, Min Temp: 23.3°C
Hottest Year: 2023, Avg Temp: 26.320000000000004°C
