In [None]:
import pymongo
from pymongo import MongoClient
client = MongoClient('10.192.96.162', 27017

## Task 01
- Based on the two datasets provided hotspot_historic.csv and climate_historic.csv, a suitable datamodel to support the efficient querying of the two datasets in MongoDB would be to Embed both the documents together

- Looking at the climate history and the hotspot history csv we can see that we can join embed both of these csv files by their dates

- Each station would record the climate data for a set time interval everyday, the climate data would include variables such as air_temperature, relative_humidity, winspeedknots, max_wind_speed and precipitation

- Each station would have 0 or more hotspots and would each hotspot record would have the time or record, latitude, longitude, confidence and the surface_temperature_celsius.

- By embedding climate and hotspot we can retrieve the all date records of the recorded station before it switches to another station and from this we can also view the all hotspots and their relevant details such as the precise location and time

- By fetching a single document this is more efficient that performing mutliple queries across separate collections or requiring join algroithms

``` python
{'GHI_w/m2': 177,
  'air_temperature_celcius': 21,
  'date': Timestamp('2023-08-03 00:00:00'),
  'hotspots': [{'confidence': 68,
                'datetime': Timestamp('2023-03-08 04:51:00'),
                'latitude': -37.7885,
                'longitude': 141.9352,
                'surface_temperature_celsius': 55},
               {'confidence': 75,
                'datetime': Timestamp('2023-03-08 00:30:30'),
                'latitude': -38.1031,
                'longitude': 142.4797,
                'surface_temperature_celsius': 48}],
  'max_wind_speed': 13.0,
  'precipitation_type': 'I',
  'precipitation_value': 0.0,
  'relative_humidity': 51.7,
  'station': 948701,
  'windspeed_knots': 7.2}
 ```

## Task 2.1
- First read both the files having the sperator as a comma since it is a csv file, next format the date for both dataframes to appropriate date time format
- Precipitation is actually a string type value so I made two new columns called "precipitation_value" and "precipitation_type" to seperate the string 
- The value is set to numeric and the original precipitate column is dropped
- Both dataframes are then converted to dictionary format

In [None]:
import pandas as pd
# read the csv delimited by the comma as the seperator
climate_csv = pd.read_csv("./climate_historic.csv", sep=",")
# create date time column using date column from csv and setting date time data type
climate_csv['date'] = pd.to_datetime(climate_csv['date'])

# seperates the string into precipitation value and type
climate_csv[['precipitation_value', 'precipitation_type']] = climate_csv['precipitation'].str.extract(r'(\d+\.?\d*)(\D+)')
# convert to numeric data type
climate_csv['precipitation_value'] = pd.to_numeric(climate_csv['precipitation_value'], errors='coerce')
# drop the precipitaion column since it already has been processed
climate_csv = climate_csv.drop('precipitation', axis=1)

# create climate dictionary
climate_dict = climate_csv.to_dict(orient='records')
print(climate_dict)


hotspot_csv = pd.read_csv("./hotspot_historic.csv", sep=",")
# convert to date 
hotspot_csv['date'] = pd.to_datetime(hotspot_csv['date'])
# convert to date time data type value
hotspot_csv['datetime'] = pd.to_datetime(hotspot_csv['datetime'], format='%Y-%m-%d %H:%M:%S')
print(hotspot_csv['datetime'].iloc[0])
# create hotspot dictionary, where each column would be a key in the dictionary
hotspot_dict = hotspot_csv.to_dict(orient='records')
print(hotspot_dict)


Timestamp('2023-10-03 00:00:00'), 'surface_temperature_celcius': 105}, {'latitude': -37.2252, 'longitude': 147.9363, 'datetime': Timestamp('2023-03-10 04:46:20'), 'confidence': 100, 'date': Timestamp('2023-10-03 00:00:00'), 'surface_temperature_celcius': 109}, {'latitude': -37.2284, 'longitude': 147.9187, 'datetime': Timestamp('2023-03-10 04:45:30'), 'confidence': 94, 'date': Timestamp('2023-10-03 00:00:00'), 'surface_temperature_celcius': 73}, {'latitude': -37.6572, 'longitude': 142.0703, 'datetime': Timestamp('2023-03-10 04:45:30'), 'confidence': 97, 'date': Timestamp('2023-10-03 00:00:00'), 'surface_temperature_celcius': 80}, {'latitude': -37.6592, 'longitude': 142.058, 'datetime': Timestamp('2023-03-10 04:43:50'), 'confidence': 81, 'date': Timestamp('2023-10-03 00:00:00'), 'surface_temperature_celcius': 55}, {'latitude': -36.2544, 'longitude': 148.0353, 'datetime': Timestamp('2023-03-10 04:43:00'), 'confidence': 55, 'date': Timestamp('2023-10-03 00:00:00'), 'surface_temperature_celcius': 42}, {'latitude': -37.2197, 'longitude': 147.9621, 'datetime': Timestamp('2023-03-10 04:42:30'), 'confidence': 54, 'date': Timestamp('2023-10-03 00:00:00'), 'surface_temperature_celcius': 43}, {'latitude': -37.2128, 'longitude': 147.9308, 'datetime': Timestamp('2023-03-10 04:39:00'), 'confidence': 74, 'date': Timestamp('2023-10-03 00:00:00'), 'surface_temperature_celcius': 48}, {'latitude': -37.996, 'longitude': 146.3535, 'datetime': Timestamp('2023-03-09 13:23:40'), 'confidence': 86, 'date': Timestamp('2023-09-03 00:00:00'), 'surface_temperature_celcius': 41}, {'latitude': -37.7171, 'longitude': 147.5866, 'datetime': Timestamp('2023-03-09 03:57:00'), 'confidence': 54, 'date': Timestamp('2023-09-03 00:00:00'), 'surface_temperature_celcius': 44}, {'latitude': -37.7074, 'longitude': 147.5849, 'datetime': Timestamp('2023-03-09 03:56:50'), 'confidence': 78, 'date': Timestamp('2023-09-03 00:00:00'), 'surface_temperature_celcius': 55}, {'latitude': -37.7885, 'longitude': 141.9352, 'datetime': Timestamp('2023-03-08 04:51:00'), 'confidence': 68, 'date': Timestamp('2023-08-03 00:00:00'), 'surface_temperature_celcius': 55}, {'latitude': -38.1031, 'longitude': 142.4797, 'datetime': Timestamp('2023-03-08 00:30:30'), 'confidence': 75, 'date': Timestamp('2023-08-03 00:00:00'), 'surface_temperature_celcius': 48}, {'latitude': -37.7752, 'longitude': 141.9086, 'datetime': Timestamp('2023-03-07 04:16:10'), 'confidence': 88, 'date': Timestamp('2023-07-03 00:00:00'), 'surface_temperature_celcius': 64}, {'latitude': -34.3795, 'longitude': 141.6331, 'datetime': Timestamp('2023-03-06 05:06:30'), 'confidence': 87, 'date': Timestamp('2023-06-03 00:00:00'), 'surface_temperature_celcius': 62}, {'latitude': -34.3735, 'longitude': 141.6604, 'datetime': Timestamp('2023-03-06 05:06:20'), 'confidence': 85, 'date': Timestamp('2023-06-03 00:00:00'), 'surface_temperature_celcius': 59}]
/tmp/ipykernel_66372/1756324149.py:3: UserWarning: Parsing dates in DD/MM/YYYY format when dayfirst=False (the default) was specified. This may lead to inconsistently parsed dates! Specify a format to ensure consistent parsing.
  climate_csv['date'] = pd.to_datetime(climate_csv['date'])
/tmp/ipykernel_66372/1756324149.py:14: UserWarning: Parsing dates in DD/MM/YYYY format when dayfirst=False (the default) was specified. This may lead to inconsistently parsed dates! Specify a format to ensure consistent parsing.
  hotspot_csv['date'] = pd.to_datetime(hotspot_csv['date'])

## Task 2.2

## Task 2.2a
- Finidng all climate data on 12th December 2023

In [None]:
collections = collection.find()
for tour in collections:
    pprint(tour)

{'GHI_w/m2': 154,
 '_id': ObjectId('6650a2f9f374f529b6ea16e0'),
 'air_temperature_celcius': 19,
 'date': datetime.datetime(2022, 12, 31, 0, 0),
 'hotspots': [],
 'max_wind_speed': 11.1,
 'precipitation_type': 'I',
 'precipitation_value': 0.0,
 'relative_humidity': 56.8,
 'station': 948700,
 'windspeed_knots': 7.9}
{'GHI_w/m2': 128,
 '_id': ObjectId('6650a2f9f374f529b6ea16e1'),
 'air_temperature_celcius': 15,
 'date': datetime.datetime(2023, 2, 1, 0, 0),
 'hotspots': [],
 'max_wind_speed': 13.0,
 'precipitation_type': 'G',
 'precipitation_value': 0.02,
 'relative_humidity': 50.7,
 'station': 948700,
 'windspeed_knots': 9.2}
{'GHI_w/m2': 133,
 '_id': ObjectId('6650a2f9f374f529b6ea16e2'),
 'air_temperature_celcius': 16,
...
 'precipitation_value': 0.0,
 'relative_humidity': 52.9,
 'station': 948702,
 'windspeed_knots': 8.1}
Output is truncated. View as a scrollable element or open in a text editor. Adjust cell output settings...

In [None]:
from datetime import datetime

date = "12/12/23"
# converting it to date format
date = datetime.strptime(date, '%d/%m/%y')

# finding the climate record that has the matching date
pprint(collection.find_one({"date":date}))

{'GHI_w/m2': 156,
 '_id': ObjectId('6650a2f9f374f529b6ea1839'),
 'air_temperature_celcius': 19,
 'date': datetime.datetime(2023, 12, 12, 0, 0),
 'hotspots': [{'confidence': 53,
               'datetime': datetime.datetime(2023, 12, 12, 0, 45, 38),
               'latitude': -37.903,
               'longitude': 145.25,
               'surface_temperature_celsius': 44}],
 'max_wind_speed': 12.0,
 'precipitation_type': 'I',
 'precipitation_value': 0.0,
 'relative_humidity': 55.3,
 'station': 948702,
 'windspeed_knots': 6.2}

## Task 2.2b
- Finding the latitude, longitude, surface temperature (°C), and confidence when the surface temperature (°C) was between 65 °C and 100 °C.

In [None]:
# find records where surface temperature celsius is greater than or equals to 65 and lesser than equals to 100
# display the latitude, longitude, temperature and confidence 
results = collection.find({"hotspots": {"$elemMatch": {"surface_temperature_celsius": {"$gte": 65, "$lte": 100}}}}, {"hotspots.latitude":1, "hotspots.longitude":1, "hotspots.surface_temperature_celsius":1, "hotspots.confidence": 1})

for result in results:
    pprint(result)

{'_id': ObjectId('6650a2f9f374f529b6ea1724'),
 'hotspots': [{'confidence': 100,
               'latitude': -37.223,
               'longitude': 147.9431,
               'surface_temperature_celsius': 105},
              {'confidence': 100,
               'latitude': -37.2252,
               'longitude': 147.9363,
               'surface_temperature_celsius': 109},
              {'confidence': 94,
               'latitude': -37.2284,
               'longitude': 147.9187,
               'surface_temperature_celsius': 73},
              {'confidence': 97,
               'latitude': -37.6572,
               'longitude': 142.0703,
               'surface_temperature_celsius': 80},
              {'confidence': 81,
               'latitude': -37.6592,
               'longitude': 142.058,
               'surface_temperature_celsius': 55},
              {'confidence': 55,
               'latitude': -36.2544,
               'longitude': 148.0353,
               'surface_temperature_celsius': 42},
...
              {'confidence': 86,
               'latitude': -35.543,
               'longitude': 143.316,
               'surface_temperature_celsius': 67}]}

In [None]:
## Task 2.2c
- Finding the date, surface temperature (°C), air temperature (°C), relative humidity and maxwind speed on 15th and 16th of December 2023.

In [None]:
start = datetime.strptime("15/12/23", '%d/%m/%y')
end = datetime.strptime("16/12/23", '%d/%m/%y')


pprint(collection.find_one({"date": {"$gte":start, "$lte": end}}, {"date": 1, "air_temperature_celcius": 1, "hotspot.surface_temperature_celcius": 1, "max_wind_speed": 1,"relative_humidity":1}))

{'_id': ObjectId('6650a2f9f374f529b6ea183c'),
 'air_temperature_celcius': 18,
 'date': datetime.datetime(2023, 12, 15, 0, 0),
 'max_wind_speed': 14.0,
 'relative_humidity': 52.0}

## Task 2.2d
- Finding datetime, air temperature (°C), surface temperature (°C) and confidence when the confidence is between 80 and 100.

In [None]:
pprint(collection.find_one({"hotspots.confidence": {"$gte":80, "$lte": 100}}, {"hotspots.datetime": 1, "air_temperature_celcius": 1, "hotspots.surface_temperature_celcius": 1, "max_wind_speed": 1,"hotspots.confidence":1}))

{'_id': ObjectId('6650a2f9f374f529b6ea1720'),
 'air_temperature_celcius': 20,
 'hotspots': [{'confidence': 87,
               'datetime': datetime.datetime(2023, 3, 6, 5, 6, 30)},
              {'confidence': 85,
               'datetime': datetime.datetime(2023, 3, 6, 5, 6, 20)}],
 'max_wind_speed': 21.0}

## Task 2.2e
- Finding the top 10 records with the highest surface temperature (°C)

In [None]:
results = collection.find().sort("hotspots.surface_temperature_celcius",-1).limit(10)
for result in results:
    pprint(result)

{'GHI_w/m2': 154,
 '_id': ObjectId('6650a2f9f374f529b6ea16e8'),
 'air_temperature_celcius': 19,
 'date': datetime.datetime(2023, 9, 1, 0, 0),
 'hotspots': [],
 'max_wind_speed': 8.9,
 'precipitation_type': 'I',
 'precipitation_value': 0.0,
 'relative_humidity': 56.3,
 'station': 948700,
 'windspeed_knots': 5.8}
{'GHI_w/m2': 161,
 '_id': ObjectId('6650a2f9f374f529b6ea16e9'),
 'air_temperature_celcius': 20,
 'date': datetime.datetime(2023, 10, 1, 0, 0),
 'hotspots': [],
 'max_wind_speed': 13.0,
 'precipitation_type': 'I',
 'precipitation_value': 0.0,
 'relative_humidity': 57.0,
 'station': 948700,
 'windspeed_knots': 8.7}
{'GHI_w/m2': 185,
 '_id': ObjectId('6650a2f9f374f529b6ea16e4'),
 'air_temperature_celcius': 24,
...
 'precipitation_value': 0.0,
 'relative_humidity': 54.1,
 'station': 948700,
 'windspeed_knots': 12.8}

## Task 2.2f
- Finding the number of fires each day. You are required to only display the total number of fires and the date in the output.

In [None]:
results = collection.aggregate([{"$project":{"_id":0, "date": 1, "count":{"$size":"$hotspots"}}}])
for result in results:
    pprint(result)

{'count': 0, 'date': datetime.datetime(2022, 12, 31, 0, 0)}
{'count': 0, 'date': datetime.datetime(2023, 2, 1, 0, 0)}
{'count': 0, 'date': datetime.datetime(2023, 3, 1, 0, 0)}
{'count': 0, 'date': datetime.datetime(2023, 4, 1, 0, 0)}
{'count': 0, 'date': datetime.datetime(2023, 5, 1, 0, 0)}
{'count': 0, 'date': datetime.datetime(2023, 6, 1, 0, 0)}
{'count': 0, 'date': datetime.datetime(2023, 7, 1, 0, 0)}
{'count': 0, 'date': datetime.datetime(2023, 8, 1, 0, 0)}
{'count': 0, 'date': datetime.datetime(2023, 9, 1, 0, 0)}
{'count': 0, 'date': datetime.datetime(2023, 10, 1, 0, 0)}
{'count': 0, 'date': datetime.datetime(2023, 11, 1, 0, 0)}
{'count': 0, 'date': datetime.datetime(2023, 12, 1, 0, 0)}
{'count': 0, 'date': datetime.datetime(2023, 1, 13, 0, 0)}
{'count': 0, 'date': datetime.datetime(2023, 1, 14, 0, 0)}
{'count': 0, 'date': datetime.datetime(2023, 1, 15, 0, 0)}
{'count': 0, 'date': datetime.datetime(2023, 1, 16, 0, 0)}
{'count': 0, 'date': datetime.datetime(2023, 1, 17, 0, 0)}
{'count': 0, 'date': datetime.datetime(2023, 1, 18, 0, 0)}
{'count': 0, 'date': datetime.datetime(2023, 1, 19, 0, 0)}
{'count': 0, 'date': datetime.datetime(2023, 1, 20, 0, 0)}
{'count': 0, 'date': datetime.datetime(2023, 1, 21, 0, 0)}
{'count': 0, 'date': datetime.datetime(2023, 1, 22, 0, 0)}
{'count': 0, 'date': datetime.datetime(2023, 1, 23, 0, 0)}
{'count': 0, 'date': datetime.datetime(2023, 1, 24, 0, 0)}
{'count': 0, 'date': datetime.datetime(2023, 1, 25, 0, 0)}
...
{'count': 0, 'date': datetime.datetime(2023, 12, 29, 0, 0)}
{'count': 0, 'date': datetime.datetime(2023, 12, 30, 0, 0)}
{'count': 0, 'date': datetime.datetime(2023, 12, 31, 0, 0)}
{'count': 0, 'date': datetime.datetime(2024, 1, 1, 0, 0)}

## Task 2.2g
- Finding the records of fires where the confidence is below 70

In [None]:
results = collection.aggregate([{"$unwind": "$hotspots"},{"$match": {"hotspots.confidence": {"$lt": 70}}},{"$project": {"_id": 0,"date": 1,"records": "$hotspots"}}])

for result in results:
    pprint(result)

{'date': datetime.datetime(2023, 8, 3, 0, 0),
 'records': {'confidence': 68,
             'datetime': datetime.datetime(2023, 3, 8, 4, 51),
             'latitude': -37.7885,
             'longitude': 141.9352,
             'surface_temperature_celsius': 55}}
{'date': datetime.datetime(2023, 9, 3, 0, 0),
 'records': {'confidence': 54,
             'datetime': datetime.datetime(2023, 3, 9, 3, 57),
             'latitude': -37.7171,
             'longitude': 147.5866,
             'surface_temperature_celsius': 44}}
{'date': datetime.datetime(2023, 10, 3, 0, 0),
 'records': {'confidence': 55,
             'datetime': datetime.datetime(2023, 3, 10, 4, 43),
             'latitude': -36.2544,
             'longitude': 148.0353,
             'surface_temperature_celsius': 42}}
{'date': datetime.datetime(2023, 10, 3, 0, 0),
 'records': {'confidence': 54,
             'datetime': datetime.datetime(2023, 3, 10, 4, 42, 30),
             'latitude': -37.2197,
             'longitude': 147.9621,
             'surface_temperature_celsius': 43}}
{'date': datetime.datetime(2023, 3, 13, 0, 0),
...
             'datetime': datetime.datetime(2023, 12, 27, 0, 2, 15),
             'latitude': -35.554,
             'longitude': 143.307,
             'surface_temperature_celsius': 53}}

## Task 2.2h
- Finding the average surface temperature (°C) for each day. You are required to only display average surface temperature (°C) and the date in the output.

In [None]:
results = collection.aggregate([{"$unwind": "$hotspots"},{"$group": {"_id": "$date", "avg": {"$avg": "$hotspots.surface_temperature_celsius"}}}])

for result in results:
    pprint(result)

{'_id': datetime.datetime(2023, 3, 15, 0, 0), 'avg': 46.0}
{'_id': datetime.datetime(2023, 4, 24, 0, 0), 'avg': 59.375}
{'_id': datetime.datetime(2023, 4, 26, 0, 0), 'avg': 34.0}
{'_id': datetime.datetime(2023, 12, 13, 0, 0), 'avg': 60.0}
{'_id': datetime.datetime(2023, 4, 17, 0, 0), 'avg': 50.921052631578945}
{'_id': datetime.datetime(2023, 9, 26, 0, 0), 'avg': 33.0}
{'_id': datetime.datetime(2023, 10, 10, 0, 0), 'avg': 53.333333333333336}
{'_id': datetime.datetime(2023, 3, 28, 0, 0), 'avg': 60.925925925925924}
{'_id': datetime.datetime(2023, 11, 23, 0, 0), 'avg': 58.8}
{'_id': datetime.datetime(2023, 9, 21, 0, 0), 'avg': 40.5}
{'_id': datetime.datetime(2023, 10, 16, 0, 0), 'avg': 36.0}
{'_id': datetime.datetime(2023, 4, 25, 0, 0), 'avg': 48.666666666666664}
{'_id': datetime.datetime(2023, 10, 23, 0, 0), 'avg': 38.0}
{'_id': datetime.datetime(2023, 3, 29, 0, 0), 'avg': 51.0}
{'_id': datetime.datetime(2023, 3, 6, 0, 0), 'avg': 47.0}
{'_id': datetime.datetime(2023, 11, 14, 0, 0), 'avg': 52.0}
{'_id': datetime.datetime(2023, 10, 12, 0, 0), 'avg': 46.0}
{'_id': datetime.datetime(2023, 3, 18, 0, 0), 'avg': 79.33333333333333}
{'_id': datetime.datetime(2023, 4, 19, 0, 0), 'avg': 54.16}
{'_id': datetime.datetime(2023, 5, 16, 0, 0), 'avg': 39.666666666666664}
{'_id': datetime.datetime(2023, 12, 12, 0, 0), 'avg': 44.0}
{'_id': datetime.datetime(2023, 6, 7, 0, 0), 'avg': 56.0}
{'_id': datetime.datetime(2023, 10, 5, 0, 0), 'avg': 52.86842105263158}
{'_id': datetime.datetime(2023, 6, 3, 0, 0), 'avg': 60.5}
{'_id': datetime.datetime(2023, 11, 30, 0, 0), 'avg': 52.41935483870968}
...
{'_id': datetime.datetime(2023, 12, 27, 0, 0), 'avg': 62.75}
{'_id': datetime.datetime(2023, 7, 4, 0, 0), 'avg': 50.69230769230769}
{'_id': datetime.datetime(2023, 12, 15, 0, 0), 'avg': 39.0}
{'_id': datetime.datetime(2023, 10, 8, 0, 0), 'avg': 63.0}

## Task 2.2i
- Finding the top 10 records with the lowest GHI.

In [None]:
results = collection.find().sort("GHI_w",1).limit(10)
for result in results:
    pprint(result)

{'GHI_w/m2': 154,
 '_id': ObjectId('6650a2f9f374f529b6ea16e8'),
 'air_temperature_celcius': 19,
 'date': datetime.datetime(2023, 9, 1, 0, 0),
 'hotspots': [],
 'max_wind_speed': 8.9,
 'precipitation_type': 'I',
 'precipitation_value': 0.0,
 'relative_humidity': 56.3,
 'station': 948700,
 'windspeed_knots': 5.8}
{'GHI_w/m2': 161,
 '_id': ObjectId('6650a2f9f374f529b6ea16e9'),
 'air_temperature_celcius': 20,
 'date': datetime.datetime(2023, 10, 1, 0, 0),
 'hotspots': [],
 'max_wind_speed': 13.0,
 'precipitation_type': 'I',
 'precipitation_value': 0.0,
 'relative_humidity': 57.0,
 'station': 948700,
 'windspeed_knots': 8.7}
{'GHI_w/m2': 185,
 '_id': ObjectId('6650a2f9f374f529b6ea16e4'),
 'air_temperature_celcius': 24,
...
 'precipitation_value': 0.0,
 'relative_humidity': 54.1,
 'station': 948700,
 'windspeed_knots': 12.8}

## Task 2.2j
-  Finding the records with a 24-hour precipitation recorded between 0.20 to 0.35.

In [None]:
results = collection.find({"precipitation_value": {"$gte":0.20, "$lte": 0.35}})
for result in results:
    pprint(result)

{'GHI_w/m2': 157,
 '_id': ObjectId('6650a2f9f374f529b6ea16ec'),
 'air_temperature_celcius': 19,
 'date': datetime.datetime(2023, 1, 13, 0, 0),
 'hotspots': [],
 'max_wind_speed': 18.1,
 'precipitation_type': 'G',
 'precipitation_value': 0.31,
 'relative_humidity': 54.1,
 'station': 948700,
 'windspeed_knots': 11.2}
{'GHI_w/m2': 146,
 '_id': ObjectId('6650a2f9f374f529b6ea1737'),
 'air_temperature_celcius': 17,
 'date': datetime.datetime(2023, 3, 29, 0, 0),
 'hotspots': [{'confidence': 69,
               'datetime': datetime.datetime(2023, 3, 29, 0, 48, 40),
               'latitude': -34.2648,
               'longitude': 141.6325,
               'surface_temperature_celsius': 51}],
 'max_wind_speed': 21.0,
 'precipitation_type': 'G',
 'precipitation_value': 0.24,
 'relative_humidity': 49.9,
 'station': 948701,
...
 'precipitation_value': 0.2,
 'relative_humidity': 61.0,
 'station': 948702,
 'windspeed_knots': 9.3}


## Task 3
- Optimisation of current database using compound index

- Considering the use cases in task 2, many query operations involve searching by date, filtering by station and aggregation on hotspots

- for high data ingestion rate we wish to minimize index overhead by limiting the number of index

- we can create a compound index that covers multiple query patterns such as between date and station ID since we might want to search for records for a particular station over a specific date period 

- we can also create between data and confidence or if queries commonly involve retrieving hotspots based on the date and their confidence level)

In [None]:
db.collection.createIndex({"date": 1, "station": 1})
db.collection.createIndex({"date": 1, "hotspots.confidence": 1})