In [1]:
import pandas as pd

In [2]:
# Station metadata
stn_ids = pd.read_fwf('http://noaa-ghcn-pds.s3.amazonaws.com/ghcnd-stations.txt', header=None, infer_nrows=1000)
stn_ids.columns = ['ID','LAT','LON','ELEV','UKN','NAME','GSN','WBAN']
stn_ids

Unnamed: 0,ID,LAT,LON,ELEV,UKN,NAME,GSN,WBAN
0,ACW00011604,17.1167,-61.7833,10.1,,ST JOHNS COOLIDGE FLD,,
1,ACW00011647,17.1333,-61.7833,19.2,,ST JOHNS,,
2,AE000041196,25.3330,55.5170,34.0,,SHARJAH INTER. AIRP,GSN,41196.0
3,AEM00041194,25.2550,55.3640,10.4,,DUBAI INTL,,41194.0
4,AEM00041217,24.4330,54.6510,26.8,,ABU DHABI INTL,,41217.0
...,...,...,...,...,...,...,...,...
129653,ZI000067969,-21.0500,29.3670,861.0,,WEST NICHOLSON,,67969.0
129654,ZI000067975,-20.0670,30.8670,1095.0,,MASVINGO,,67975.0
129655,ZI000067977,-21.0170,31.5830,430.0,,BUFFALO RANGE,,67977.0
129656,ZI000067983,-20.2000,32.6160,1132.0,,CHIPINGE,GSN,67983.0


In [3]:
# Station periods of record
periods = pd.read_fwf('http://noaa-ghcn-pds.s3.amazonaws.com/ghcnd-inventory.txt', header=None, infer_nrows=1000)
periods.columns = ['ID','LAT','LON','ELEM','TiMIN','TiMAX']
periods

Unnamed: 0,ID,LAT,LON,ELEM,TiMIN,TiMAX
0,ACW00011604,17.1167,-61.7833,TMAX,1949,1949
1,ACW00011604,17.1167,-61.7833,TMIN,1949,1949
2,ACW00011604,17.1167,-61.7833,PRCP,1949,1949
3,ACW00011604,17.1167,-61.7833,SNOW,1949,1949
4,ACW00011604,17.1167,-61.7833,SNWD,1949,1949
...,...,...,...,...,...,...
767508,ZI000067983,-20.2000,32.6160,PRCP,1951,2025
767509,ZI000067983,-20.2000,32.6160,TAVG,1962,2025
767510,ZI000067991,-22.2170,30.0000,TMAX,1951,1990
767511,ZI000067991,-22.2170,30.0000,TMIN,1951,1990


In [5]:
# Merge the two DataFrames
merged_stns = pd.merge(stn_ids,periods,how='left',left_on='ID',right_on='ID')
merged_stns

Unnamed: 0,ID,LAT_x,LON_x,ELEV,UKN,NAME,GSN,WBAN,LAT_y,LON_y,ELEM,TiMIN,TiMAX
0,ACW00011604,17.1167,-61.7833,10.1,,ST JOHNS COOLIDGE FLD,,,17.1167,-61.7833,TMAX,1949.0,1949.0
1,ACW00011604,17.1167,-61.7833,10.1,,ST JOHNS COOLIDGE FLD,,,17.1167,-61.7833,TMIN,1949.0,1949.0
2,ACW00011604,17.1167,-61.7833,10.1,,ST JOHNS COOLIDGE FLD,,,17.1167,-61.7833,PRCP,1949.0,1949.0
3,ACW00011604,17.1167,-61.7833,10.1,,ST JOHNS COOLIDGE FLD,,,17.1167,-61.7833,SNOW,1949.0,1949.0
4,ACW00011604,17.1167,-61.7833,10.1,,ST JOHNS COOLIDGE FLD,,,17.1167,-61.7833,SNWD,1949.0,1949.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
767572,ZI000067983,-20.2000,32.6160,1132.0,,CHIPINGE,GSN,67983.0,-20.2000,32.6160,PRCP,1951.0,2025.0
767573,ZI000067983,-20.2000,32.6160,1132.0,,CHIPINGE,GSN,67983.0,-20.2000,32.6160,TAVG,1962.0,2025.0
767574,ZI000067991,-22.2170,30.0000,457.0,,BEITBRIDGE,,67991.0,-22.2170,30.0000,TMAX,1951.0,1990.0
767575,ZI000067991,-22.2170,30.0000,457.0,,BEITBRIDGE,,67991.0,-22.2170,30.0000,TMIN,1951.0,1990.0


In [6]:
# Grab all stations that have TMIN data from at least 1990 to 2020
merged_stns = merged_stns[(merged_stns['ELEM'] == 'TMIN') & (merged_stns['TiMAX'] >= 2020) & (merged_stns['TiMIN'] <= 1990)]
merged_stns

Unnamed: 0,ID,LAT_x,LON_x,ELEV,UKN,NAME,GSN,WBAN,LAT_y,LON_y,ELEM,TiMIN,TiMAX
19,AE000041196,25.333,55.517,34.0,,SHARJAH INTER. AIRP,GSN,41196.0,25.333,55.517,TMIN,1944.0,2025.0
23,AEM00041194,25.255,55.364,10.4,,DUBAI INTL,,41194.0,25.255,55.364,TMIN,1983.0,2025.0
27,AEM00041217,24.433,54.651,26.8,,ABU DHABI INTL,,41217.0,24.433,54.651,TMIN,1983.0,2025.0
40,AFM00040938,34.210,62.228,977.2,,HERAT,,40938.0,34.210,62.228,TMIN,1973.0,2020.0
45,AFM00040948,34.566,69.212,1791.3,,KABUL INTL,,40948.0,34.566,69.212,TMIN,1967.0,2021.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
767505,ZA000067743,-17.817,25.817,986.0,,LIVINGSTONE,GSN,67743.0,-17.817,25.817,TMIN,1973.0,2025.0
767510,ZAM00067663,-14.450,28.467,1207.0,,KABWE/MILLIKEN,,67663.0,-14.450,28.467,TMIN,1973.0,2025.0
767522,ZI000067775,-17.917,31.133,1480.0,,HARARE (KUTSAGA),GSN,67775.0,-17.917,31.133,TMIN,1956.0,2025.0
767563,ZI000067975,-20.067,30.867,1095.0,,MASVINGO,,67975.0,-20.067,30.867,TMIN,1951.0,2025.0


In [8]:
station_id = 'USC00087205'
# Make sure our station exists and its period of record is sufficient
merged_stns.loc[merged_stns['ID'] == station_id]

Unnamed: 0,ID,LAT_x,LON_x,ELEV,UKN,NAME,GSN,WBAN,LAT_y,LON_y,ELEM,TiMIN,TiMAX
469182,USC00087205,28.0208,-82.1392,33.2,FL,PLANT CITY,,,28.0208,-82.1392,TMIN,1892.0,2025.0


In [10]:
# Load the station data from S3 bucket
df = pd.read_csv(
    f"s3://noaa-ghcn-pds/csv/by_station/{station_id}.csv",
    storage_options={"anon": True},  # passed to `s3fs.S3FileSystem`
    dtype={'Q_FLAG': 'object', 'M_FLAG': 'object'},
    parse_dates=['DATE']
).set_index('DATE')

df

  df = pd.read_csv(


Unnamed: 0_level_0,ID,ELEMENT,DATA_VALUE,M_FLAG,Q_FLAG,S_FLAG,OBS_TIME
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1892-09-01,USC00087205,TMAX,322,,,6,
1892-09-02,USC00087205,TMAX,317,,,6,
1892-09-03,USC00087205,TMAX,317,,,6,
1892-09-04,USC00087205,TMAX,322,,,6,
1892-09-05,USC00087205,TMAX,333,,,6,
...,...,...,...,...,...,...,...
2025-12-10,USC00087205,PRCP,0,,,H,1600.0
2025-12-11,USC00087205,PRCP,0,,,H,1600.0
2025-12-12,USC00087205,PRCP,0,,,H,1600.0
2025-12-13,USC00087205,PRCP,0,,,H,1600.0


In [22]:
# Grab TMIN data since we are conerned with freezing days
df_tmin = df.loc[df['ELEMENT'] == 'TMIN']
df_tmin['DATA_VALUE'] = df_tmin['DATA_VALUE'] / 10.0  # Convert to deg C
df_tmin

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_tmin['DATA_VALUE'] = df_tmin['DATA_VALUE'] / 10.0  # Convert to deg C


Unnamed: 0_level_0,ID,ELEMENT,DATA_VALUE,M_FLAG,Q_FLAG,S_FLAG,OBS_TIME
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1892-09-01,USC00087205,TMIN,20.6,,,6,
1892-09-02,USC00087205,TMIN,20.6,,,6,
1892-09-03,USC00087205,TMIN,21.1,,,6,
1892-09-04,USC00087205,TMIN,21.7,,,6,
1892-09-05,USC00087205,TMIN,21.1,,,6,
...,...,...,...,...,...,...,...
2025-12-10,USC00087205,TMIN,12.8,,,H,1600.0
2025-12-11,USC00087205,TMIN,10.0,,,H,1600.0
2025-12-12,USC00087205,TMIN,6.7,,,H,1600.0
2025-12-13,USC00087205,TMIN,10.0,,,H,1600.0


### Now that we have the min temp data from Plant City, lets move onto question 1.

Goal: Mean number of days per month (for October-January) for 1991-2020 period that were less than or equal to 32 deg F (frost) and 28 deg F (freeze).

In [23]:
df_tmin_f = df_tmin.copy()
df_tmin_f['DATA_VALUE'] = ((df_tmin['DATA_VALUE'] * (9/5)) + 32)  # Convert to deg F
df_tmin_f

Unnamed: 0_level_0,ID,ELEMENT,DATA_VALUE,M_FLAG,Q_FLAG,S_FLAG,OBS_TIME
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1892-09-01,USC00087205,TMIN,69.08,,,6,
1892-09-02,USC00087205,TMIN,69.08,,,6,
1892-09-03,USC00087205,TMIN,69.98,,,6,
1892-09-04,USC00087205,TMIN,71.06,,,6,
1892-09-05,USC00087205,TMIN,69.98,,,6,
...,...,...,...,...,...,...,...
2025-12-10,USC00087205,TMIN,55.04,,,H,1600.0
2025-12-11,USC00087205,TMIN,50.00,,,H,1600.0
2025-12-12,USC00087205,TMIN,44.06,,,H,1600.0
2025-12-13,USC00087205,TMIN,50.00,,,H,1600.0


In [30]:
# Select TMIN data for 1991-2020 and months October to January
df_tmin_f_period = df_tmin_f[(df_tmin_f.index.year >= 1991)
                              & (df_tmin_f.index.year <= 2020)
                              & (df_tmin_f.index.month.isin([10,11,12,1]))]

# Add month of year column as well
df_tmin_f_period['MONTH'] = df_tmin_f_period.index.month
df_tmin_f_period

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_tmin_f_period['MONTH'] = df_tmin_f_period.index.month


Unnamed: 0_level_0,ID,ELEMENT,DATA_VALUE,M_FLAG,Q_FLAG,S_FLAG,OBS_TIME,MONTH
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1991-01-01,USC00087205,TMIN,62.06,,,0,1800.0,1
1991-01-02,USC00087205,TMIN,64.94,,,0,1800.0,1
1991-01-03,USC00087205,TMIN,64.04,,,0,1800.0,1
1991-01-04,USC00087205,TMIN,64.04,,,0,1800.0,1
1991-01-05,USC00087205,TMIN,62.96,,,0,1800.0,1
...,...,...,...,...,...,...,...,...
2020-12-27,USC00087205,TMIN,33.98,,,7,1600.0,12
2020-12-28,USC00087205,TMIN,48.02,,,7,1600.0,12
2020-12-29,USC00087205,TMIN,53.06,,,7,1600.0,12
2020-12-30,USC00087205,TMIN,55.04,,,7,1600.0,12


In [32]:
# Add columns to denote frost and freeze days
df_tmin_f_period['FROST'] = df_tmin_f_period['DATA_VALUE'] <= 32
df_tmin_f_period['FREEZE'] = df_tmin_f_period['DATA_VALUE'] <= 28

df_tmin_f_period

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_tmin_f_period['FROST'] = df_tmin_f_period['DATA_VALUE'] <= 32
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_tmin_f_period['FREEZE'] = df_tmin_f_period['DATA_VALUE'] <= 28


Unnamed: 0_level_0,ID,ELEMENT,DATA_VALUE,M_FLAG,Q_FLAG,S_FLAG,OBS_TIME,MONTH,FROST,FREEZE
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1991-01-01,USC00087205,TMIN,62.06,,,0,1800.0,1,False,False
1991-01-02,USC00087205,TMIN,64.94,,,0,1800.0,1,False,False
1991-01-03,USC00087205,TMIN,64.04,,,0,1800.0,1,False,False
1991-01-04,USC00087205,TMIN,64.04,,,0,1800.0,1,False,False
1991-01-05,USC00087205,TMIN,62.96,,,0,1800.0,1,False,False
...,...,...,...,...,...,...,...,...,...,...
2020-12-27,USC00087205,TMIN,33.98,,,7,1600.0,12,False,False
2020-12-28,USC00087205,TMIN,48.02,,,7,1600.0,12,False,False
2020-12-29,USC00087205,TMIN,53.06,,,7,1600.0,12,False,False
2020-12-30,USC00087205,TMIN,55.04,,,7,1600.0,12,False,False


In [37]:
# Group by month and sum the number of frost and freeze days over the 30-year period
df_mean_ff_days = df_tmin_f_period.groupby('MONTH').agg({'FROST':'sum','FREEZE':'sum'}) / 30.0  # Divide by 30 years to get mean number of days per month
df_mean_ff_days

Unnamed: 0_level_0,FROST,FREEZE
MONTH,Unnamed: 1_level_1,Unnamed: 2_level_1
1,1.866667,0.5
10,0.0,0.0
11,0.033333,0.0
12,0.6,0.166667


### Part 2: 