# 01 — Pandas `groupby` + Datetime on GHCN Parquet
This exercise loads the Parquet produced by **Notebook 02** and demonstrates standard `groupby` + datetime analyses.
You can read locally or (after you push to GitHub) via a **cloud HTTPS raw** URL.

In [1]:
import pandas as pd, numpy as np
from pathlib import Path

# Primary local artifact produced by 02_fetch_ghcn_il_to_parquet.ipynb
LOCAL_PARQUET = '/data/keeling/a/deffip2/ATMS_523/HW_3_Module_3/ATMS-523-Module-3-pandas-datetime-climate-deffip2/data/ghcn_il_top4_daily.parquet'

# After pushing to GitHub, set this to your repo's raw URL to read from the cloud:
CLOUD_PARQUET = None  # e.g., 'https://raw.githubusercontent.com/USER/REPO/main/data/ghcn_il_top4_daily.parquet'

def read_cloud_first(cloud_url, local_fallback):
    try:
        if cloud_url:
            df = pd.read_parquet(cloud_url)  # needs pyarrow/fastparquet
            print('Loaded from cloud:', cloud_url)
            return df
    except Exception as e:
        print('Cloud read failed → using local:', type(e).__name__, str(e)[:120])
    print('Loaded local:', local_fallback)
    return pd.read_parquet(local_fallback)

df = read_cloud_first(CLOUD_PARQUET, LOCAL_PARQUET).sort_values(['ID','DATE']).reset_index(drop=True)
df.dtypes

Loaded local: /data/keeling/a/deffip2/ATMS_523/HW_3_Module_3/ATMS-523-Module-3-pandas-datetime-climate-deffip2/data/ghcn_il_top4_daily.parquet


ID              object
DATE    datetime64[ns]
PRCP           float64
TMAX           float64
TMIN           float64
SNOW           float64
SNWD           float64
DAPR           float64
MDPR           float64
TOBS           float64
WT01           float64
WT04           float64
WT05           float64
WT06           float64
WT03           float64
WT07           float64
WT08           float64
WT09           float64
WT11           float64
WT14           float64
WT16           float64
WT18           float64
DASF           float64
MDSF           float64
WESD           float64
EVAP           float64
dtype: object

## 1) Add datetime helpers

In [2]:
df['year']  = pd.to_datetime(df['DATE']).dt.year
df['month'] = pd.to_datetime(df['DATE']).dt.month
df['ym']    = pd.to_datetime(df['DATE']).dt.to_period('M')
df.head()

Unnamed: 0,ID,DATE,PRCP,TMAX,TMIN,SNOW,SNWD,DAPR,MDPR,TOBS,...,WT14,WT16,WT18,DASF,MDSF,WESD,EVAP,year,month,ym
0,USC00110137,1892-12-02,0.0,,,,,,,,...,,1.0,,,,,,1892,12,1892-12
1,USC00110137,1892-12-03,,,,,,,,,...,,,,,,,,1892,12,1892-12
2,USC00110137,1892-12-06,20.8,,,,,,,,...,,1.0,,,,,,1892,12,1892-12
3,USC00110137,1892-12-07,22.1,,,,,,,,...,,,,,,,,1892,12,1892-12
4,USC00110137,1892-12-13,11.4,,,,,,,,...,,1.0,,,,,,1892,12,1892-12


## 2) Monthly means & totals by station

In [4]:
monthly = (
    df.groupby(['ID','ym'], as_index=False)
      .agg(TMIN=('TMIN','mean'), TMAX=('TMAX','mean'), PRCP=('PRCP','sum'))
)
monthly_piv = monthly.pivot(index='ym', columns='ID', values='TMIN')
monthly.tail(), monthly_piv.tail()

(               ID       ym       TMIN       TMAX   PRCP
 4853  USC00117391  2025-05  11.690323  22.812903  122.8
 4854  USC00117391  2025-06  19.466667  29.507143   70.4
 4855  USC00117391  2025-07  21.232258  31.145161  257.7
 4856  USC00117391  2025-08  17.038710  29.012903   75.8
 4857  USC00117391  2025-09  12.960714  25.896429   13.8,
 ID       USC00110137  USC00110338  USC00116526  USC00117391
 ym                                                         
 2025-05    12.503571     9.354839     8.683871    11.690323
 2025-06    19.324138    16.946667    16.436667    19.466667
 2025-07    22.307143    19.722581    19.125806    21.232258
 2025-08    18.459259    17.133333    15.580645    17.038710
 2025-09    15.013333    13.406667    11.751724    12.960714)

**Try it:** Compute monthly *median* `TMAX` by station.

In [5]:
monthly_med = (
    df.groupby(['ID','ym'], as_index=False)
      .agg(TMAX=('TMAX','median'))
)
monthly_med_piv = monthly_med.pivot(index='ym', columns='ID', values='TMAX')
monthly_med.tail(), monthly_med_piv.tail()

(               ID       ym  TMAX
 4853  USC00117391  2025-05  22.8
 4854  USC00117391  2025-06  29.7
 4855  USC00117391  2025-07  31.7
 4856  USC00117391  2025-08  28.9
 4857  USC00117391  2025-09  26.1,
 ID       USC00110137  USC00110338  USC00116526  USC00117391
 ym                                                         
 2025-05         22.5         20.0        21.10         22.8
 2025-06         30.0         28.9        28.05         29.7
 2025-07         31.7         30.0        30.00         31.7
 2025-08         30.3         27.8        26.70         28.9
 2025-09         27.8         26.7        27.20         26.1)

## 3) Annual precipitation totals and rankings

In [6]:
annual_prcp = (
    df.groupby(['ID','year'], as_index=False)
      .agg(annual_prcp_mm=('PRCP','sum'))
)
annual_prcp['rank_within_year'] = annual_prcp.groupby('year')['annual_prcp_mm'].rank(ascending=False, method='min')
annual_prcp.sort_values(['year','rank_within_year']).head(12)

Unnamed: 0,ID,year,annual_prcp_mm,rank_within_year
369,USC00117391,1866,358.1,1.0
370,USC00117391,1867,730.0,1.0
371,USC00117391,1868,859.7,1.0
372,USC00117391,1869,1051.1,1.0
373,USC00117391,1870,986.4,1.0
374,USC00117391,1871,831.3,1.0
375,USC00117391,1872,627.5,1.0
376,USC00117391,1873,448.6,1.0
377,USC00117391,1874,618.4,1.0
378,USC00117391,1875,681.7,1.0


**Try it:** Rank warmest station per year using mean `TAVG`.

In [8]:
annual_tavg = (
    df.groupby(['ID','year'], as_index=False)
      .agg(annual_tavg=('TMAX','mean'))
)
annual_tavg['rank_within_year'] = annual_tavg.groupby('year')['annual_tavg'].rank(ascending=False, method='min')
annual_tavg.sort_values(['year','rank_within_year']).head(12)

Unnamed: 0,ID,year,annual_tavg,rank_within_year
369,USC00117391,1866,,
370,USC00117391,1867,,
371,USC00117391,1868,,
372,USC00117391,1869,,
373,USC00117391,1870,,
374,USC00117391,1871,,
375,USC00117391,1872,18.475472,1.0
376,USC00117391,1873,14.854247,1.0
377,USC00117391,1874,15.639178,1.0
378,USC00117391,1875,12.956986,1.0


## 4) Station-by-month climatology (using whatever is present)

In [9]:
climo = df.groupby(['ID','month'], as_index=False)['TMAX'].mean()
climo_piv = climo.pivot(index='month', columns='ID', values='TMAX')
climo_piv

ID,USC00110137,USC00110338,USC00116526,USC00117391
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,3.206984,-0.64633,0.290932,-0.295235
2,5.861129,1.304014,2.374458,2.132727
3,11.82408,7.960794,9.20914,9.04622
4,18.641973,15.491488,17.048922,16.344476
5,24.038368,21.870903,23.236806,22.627047
6,29.194917,27.075834,28.192177,27.815774
7,31.286829,29.453647,30.386635,29.359379
8,30.389658,28.292638,29.317349,28.476923
9,26.789799,24.376057,25.657596,25.001848
10,20.332227,17.70926,18.992826,17.721014


**Try it:** Compute monthly precipitation climatology (sum of `PRCP` across years).

In [10]:
climo_prcp = df.groupby(['ID','month'], as_index=False)['PRCP'].sum()
climo_prcp_piv = climo_prcp.pivot(index='month', columns='ID', values='PRCP')
climo_prcp_piv

ID,USC00110137,USC00110338,USC00116526,USC00117391
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,4251.2,5833.9,5633.8,2162.5
2,4525.5,5218.8,5050.1,2188.6
3,7073.6,8311.4,8685.7,3687.1
4,8425.0,11176.0,11379.1,4418.1
5,9439.2,13311.4,13356.6,6248.5
6,7599.5,13877.8,13054.5,6575.8
7,8279.8,12297.3,12061.7,5385.4
8,6085.5,11971.6,11997.1,5153.0
9,5885.1,11432.2,11489.9,4127.6
10,5299.5,9476.2,8133.6,3261.0
