In [1]:
!pip install fastparquet



In [2]:
import pandas as pd
from fastparquet import ParquetFile
import matplotlib.pyplot as plt
import numpy as np
from mpl_toolkits.axes_grid1 import make_axes_locatable

In [3]:
# Import the Google Colab Drive helper and mount Google Drive at /content/drive.
# The google.colab package is only importable inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
import os

# Switch to the project folder on the mounted Drive so that relative file names
# (e.g. the Parquet file loaded below) resolve there.
# The path is absolute, anchored at the /content/drive mount point, so this cell
# can be re-run safely: a relative "drive/My Drive/DataViz" only resolves on the
# first run and raises FileNotFoundError once the working directory has moved.
os.chdir("/content/drive/My Drive/DataViz")

In [5]:
# Load the dataset
pf = ParquetFile("daily_weather.parquet")
df = pf.to_pandas()

In [6]:
# Number of distinct (non-null) values in each column.
df.nunique()

station_id                 1227
city_name                  1234
date                      69648
season                        4
avg_temp_c                 1048
min_temp_c                 1029
max_temp_c                 1142
precipitation_mm           2812
snow_depth_mm               888
avg_wind_dir_deg            361
avg_wind_speed_kmh          944
peak_wind_gust_kmh          663
avg_sea_level_pres_hpa     1170
sunshine_total_min          997
dtype: int64

In [7]:
# Total number of cells in the frame (rows x columns), not the number of rows.
df.size

386900682

In [8]:
# Display the first few rows of the dataset (head() returns five by default)
# to check the column names and what typical values look like.
df.head()

Unnamed: 0,station_id,city_name,date,season,avg_temp_c,min_temp_c,max_temp_c,precipitation_mm,snow_depth_mm,avg_wind_dir_deg,avg_wind_speed_kmh,peak_wind_gust_kmh,avg_sea_level_pres_hpa,sunshine_total_min
0,41515,Asadabad,1957-07-01,Summer,27.0,21.1,35.6,0.0,,,,,,
1,41515,Asadabad,1957-07-02,Summer,22.8,18.9,32.2,0.0,,,,,,
2,41515,Asadabad,1957-07-03,Summer,24.3,16.7,35.6,1.0,,,,,,
3,41515,Asadabad,1957-07-04,Summer,26.6,16.1,37.8,4.1,,,,,,
4,41515,Asadabad,1957-07-05,Summer,30.8,20.0,41.7,0.0,,,,,,


First, let's see the number of weather reports this dataset has by year.

In [34]:
# Group the daily observations by station and calendar year.
# NOTE(review): the printed result below is indexed up to 235583, i.e. 235584
# rows = 1227 stations x 192 years, which suggests the grouping yields every
# station/year combination whether or not it has data (presumably station_id is
# categorical -- confirm). Passing observed=True to groupby would drop the empty
# combinations, but it would also change the rows written to the CSV, so the
# grouping is left as it was.
grouped = df.groupby(["station_id", pd.PeriodIndex(df['date'], freq="Y")])

# Yearly summary per station: the mean of the averaged measurements and the
# extreme value of the min / max / peak columns.
aggregated_data = grouped.agg({
    'avg_temp_c': 'mean',
    'precipitation_mm': 'mean',
    'snow_depth_mm': 'mean',
    'avg_wind_dir_deg': 'mean',
    'avg_wind_speed_kmh': 'mean',
    'avg_sea_level_pres_hpa': 'mean',
    'min_temp_c': 'min',
    'max_temp_c': 'max',
    'peak_wind_gust_kmh': 'max',
})

# Yearly sunshine total. min_count=1 leaves the total as NaN when a group has
# no sunshine readings at all; a plain 'sum' reports 0 minutes for such groups,
# which cannot be told apart from a year that really had no sunshine.
# Added last so the column order matches the other aggregates followed by the
# sunshine total.
aggregated_data['sunshine_total_min'] = grouped['sunshine_total_min'].sum(min_count=1)

# Flatten the (station_id, date) index into ordinary columns.
final_dataset = aggregated_data.reset_index()

# Keep only the station-years that have an average temperature.
# reset_index() without drop=True keeps each row's position in final_dataset
# as an extra 'index' column.
final_dataset_filtered = final_dataset[final_dataset['avg_temp_c'].notna()].reset_index()
print(final_dataset_filtered)

# The unfiltered frame is what gets saved to disk.
final_dataset.to_csv("aggregated.csv", encoding='utf-8')

        index station_id  date  avg_temp_c  precipitation_mm  snow_depth_mm  \
0         143      01008  1975  -12.327473          0.376812            NaN   
1         144      01008  1976   -4.456044          0.822368      35.000000   
2         145      01008  1977   -6.672253          0.445141      42.136986   
3         146      01008  1978   -7.422740          0.433735     108.246575   
4         147      01008  1979   -7.496919          0.529070     145.150685   
...       ...        ...   ...         ...               ...            ...   
66699  235579      KPHF0  2019   16.469209          3.655616       0.000000   
66700  235580      KPHF0  2020   17.080110          4.539071       0.000000   
66701  235581      KPHF0  2021   15.196703          3.134795       0.000000   
66702  235582      KPHF0  2022   15.018681          3.255890       0.000000   
66703  235583      KPHF0  2023   16.607143          2.822594       0.000000   

       avg_wind_dir_deg  avg_wind_speed_kmh  avg_se