# Test visualisation of the bronze test data.

In [1]:
import pandas as pd

In [2]:
# Load the bronze test data from its Parquet file and print the whole frame.
bronze_parquet_path = "../data/data_2025-05-26.parquet"
data_belagavi = pd.read_parquet(bronze_parquet_path)
print(data_belagavi)

                   timestamp  air_temperature location_id crop_id
0  2025-05-26 13:00:00+00:00             21.6    Belagavi   maize
1  2025-05-26 14:00:00+00:00             21.2    Belagavi   maize
2  2025-05-26 15:00:00+00:00             20.9    Belagavi   maize
3  2025-05-26 16:00:00+00:00             21.1    Belagavi   maize
4  2025-05-26 17:00:00+00:00             20.9    Belagavi   maize
5  2025-05-26 18:00:00+00:00             20.7    Belagavi   maize
6  2025-05-26 19:00:00+00:00             20.4    Belagavi   maize
7  2025-05-26 20:00:00+00:00             20.3    Belagavi   maize
8  2025-05-26 21:00:00+00:00             20.6    Belagavi   maize
9  2025-05-26 22:00:00+00:00             20.6    Belagavi   maize
10 2025-05-26 23:00:00+00:00             20.6    Belagavi   maize


In [None]:
# Check the loaded day for completeness before deriving daily statistics from it.
# Assumes the Parquet file is intended to contain hourly data for a single, full day.
# Hours are read from the timestamps exactly as stored; no timezone conversion is applied.
if not data_belagavi.empty:
    expected_hourly_records = 24
    actual_records = len(data_belagavi)

    first_timestamp_in_data = data_belagavi["timestamp"].min()
    last_timestamp_in_data = data_belagavi["timestamp"].max()

    event_date = first_timestamp_in_data.date()

    # Work out which hours of the day have no record at all. A bare record count
    # cannot tell leading, trailing and mid-day gaps apart, and repeated
    # timestamps can make a day with gaps still add up to 24 rows.
    observed_hours = set(data_belagavi["timestamp"].dt.hour.dropna().astype(int))
    missing_hours = sorted(set(range(expected_hourly_records)) - observed_hours)
    missing_hours_text = ", ".join(f"{hour:02d}:00" for hour in missing_hours)

    if actual_records < expected_hourly_records:
        print(
            f"WARNING: The loaded data for {event_date} appears to be incomplete for a full day."
        )
        print(
            f"  Expected {expected_hourly_records} hourly records, but found {actual_records}."
        )
        print(
            f"  Data currently loaded ranges from {first_timestamp_in_data.time()} to {last_timestamp_in_data.time()}."
        )
        # Report the hours that are really absent instead of assuming the gap
        # always lies before the first record.
        print(f"  Hours with no record on that day: {missing_hours_text}.")
        print(
            "  Consequently, the calculated daily T_min, T_max, and GDD might not be accurate for the entire day."
        )
    elif actual_records == expected_hourly_records:
        is_standard_hourly_range = (
            first_timestamp_in_data.time() == pd.Timestamp("00:00:00").time()
            and last_timestamp_in_data.time() == pd.Timestamp("23:00:00").time()
        )
        if is_standard_hourly_range and not missing_hours:
            print(
                f"Data for {event_date} appears complete: {actual_records} hourly records from 00:00 to 23:00."
            )
        elif not is_standard_hourly_range:
            print(
                f"WARNING: Data for {event_date} has {actual_records} records, but the time range ({first_timestamp_in_data.time()} to {last_timestamp_in_data.time()}) is not the standard 00:00-23:00."
            )
            print(
                "  Please verify data integrity. This may affect daily T_min, T_max, and GDD calculations."
            )
        else:
            # 24 rows spanning 00:00-23:00 with hours still missing means at
            # least one hour holds more than one record.
            print(
                f"WARNING: Data for {event_date} has {actual_records} records from 00:00 to 23:00, but no record exists for: {missing_hours_text}."
            )
            print(
                "  Some hours hold more than one record. Please verify data integrity. This may affect daily T_min, T_max, and GDD calculations."
            )
    else:  # More than 24 records
        print(
            f"INFO: For {event_date}, found {actual_records} records, which is more than the typical 24 hourly records."
        )
        print(
            f"  Data ranges from {first_timestamp_in_data.time()} to {last_timestamp_in_data.time()}."
        )
        if missing_hours:
            print(f"  Hours with no record: {missing_hours_text}.")
        print(
            "  Ensure this data granularity is appropriate for the GDD calculation method used."
        )
else:
    print(
        "WARNING: The DataFrame `data_belagavi` is empty. No data to check or process."
    )

  Expected 24 hourly records, but found 11.
  Data currently loaded ranges from 13:00:00 to 23:00:00.
  This means data before 13:00:00 on that day (e.g., 00:00-04:00) might be missing.
  Consequently, the calculated daily T_min, T_max, and GDD might not be accurate for the entire day.


In [4]:
# Lowest and highest air temperature among the loaded records.
air_temperatures = data_belagavi["air_temperature"]
data_belagavi_min = air_temperatures.min()
data_belagavi_max = air_temperatures.max()
print(data_belagavi_min)
print(data_belagavi_max)

20.3
21.6


### GDD = ((T_max + T_min) / 2) - T_base
T_base for maize and sorghum is 10 °C.
- T_base depends on the plant variety in question and may vary with each variety's biological adaptability to specific conditions.
- Sorghum variety *Sorghum bicolor* and maize (TODO: this note is unfinished; the maize variety is not stated).

In [5]:
# Unclamped GDD: midpoint of the max and min temperature minus the base temperature of 10.
daily_mean_temperature = (data_belagavi_max + data_belagavi_min) / 2
GDD_maize = daily_mean_temperature - 10
print(GDD_maize)

10.950000000000003


In [6]:
# GDD with a floor at zero: a mean temperature below the base contributes nothing.
T_base = 10
avg_temp = (data_belagavi_max + data_belagavi_min) / 2

GDD_maize = 0 if avg_temp < T_base else avg_temp - T_base

print(GDD_maize)

10.950000000000003


In [7]:
# List the column labels of the loaded frame.
print(data_belagavi.columns)

Index(['timestamp', 'air_temperature', 'location_id', 'crop_id'], dtype='object')


In [8]:
# Print the frame's index range, per-column non-null counts, dtypes and memory usage.
data_belagavi.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11 entries, 0 to 10
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype              
---  ------           --------------  -----              
 0   timestamp        11 non-null     datetime64[ns, UTC]
 1   air_temperature  11 non-null     float64            
 2   location_id      11 non-null     object             
 3   crop_id          11 non-null     object             
dtypes: datetime64[ns, UTC](1), float64(1), object(2)
memory usage: 484.0+ bytes


In [9]:
# Summary statistics (count, mean, std, min, quartiles, max), shown as the cell's result.
data_belagavi.describe()

Unnamed: 0,air_temperature
count,11.0
mean,20.809091
std,0.380669
min,20.3
25%,20.6
50%,20.7
75%,21.0
max,21.6


In [10]:
# Count how many records fall on each calendar day.
# The parsed timestamps are written back and a "day" column is added to the frame in place.
parsed_timestamps = pd.to_datetime(data_belagavi["timestamp"])
data_belagavi["timestamp"] = parsed_timestamps
data_belagavi["day"] = parsed_timestamps.dt.date

records_per_day = data_belagavi.groupby("day").size()
data_belagavi_days = records_per_day.reset_index(name="count")
print(data_belagavi_days)

          day  count
0  2025-05-26     11
