# Extreme Temperature Events: Feature Extraction
Erin De Pree

This notebook extracts features from the ERA5 temperature data.

In [1]:
import pandas as pd
import numpy as np
import os
import warnings
from pandas.errors import SettingWithCopyWarning

from datetime import datetime

## Suppressing pandas `SettingWithCopyWarning`

In [2]:
# Silence pandas' SettingWithCopyWarning for the whole session; the
# wave-finding functions below add columns to filtered DataFrames.
warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)

## Functions

In [3]:
def monthly_temperatures(df):
    """
    Attach per-month temperature statistics to a dataframe of daily temperatures.

    The statistics are grouped by calendar month only, so every January in the
    input (regardless of year) contributes to the same January mean / std.

    Parameters
    ----------
    df : DataFrame
        daily entries with at least the columns 'date', 'temp_max', 'temp_min',
        'latitude', and 'longitude'.  Only the first row's latitude/longitude
        are read, so the frame is assumed to describe a single grid point.
        The input frame is not modified.

    Returns
    -------
    big_df : DataFrame
        the daily rows with 'temp_max'/'temp_min' renamed to 'max'/'min',
        'date' converted to datetime, an added 'month' column, and the monthly
        columns 'max_mean', 'max_std', 'min_mean', 'min_std', 'max_extreme'
        (mean + 2.5 std), 'min_extreme' (mean - 2.5 std).  'latitude' and
        'longitude' are replaced by the values of the first row rounded to
        the nearest integer.
    """

    # Positional access so the function also works when the index does not
    # contain the label 0 (e.g. after filtering or with a date index).
    lat = int(np.round(df['latitude'].iloc[0]))
    long = int(np.round(df['longitude'].iloc[0]))

    # rename returns a new frame, so the caller's dataframe is left untouched
    df = df.rename(columns={'temp_max': 'max', 'temp_min': 'min'})
    df['date'] = pd.to_datetime(df['date'])
    # vectorised month extraction; cast keeps the int64 dtype of the original row-wise version
    df['month'] = df['date'].dt.month.astype('int64')

    month_temp = df.groupby(by='month')[['max', 'min']].agg(['mean', 'std'])
    # "extreme" thresholds sit 2.5 standard deviations away from the monthly mean
    month_temp['max', 'extreme'] = month_temp['max', 'mean'] + 2.5 * month_temp['max', 'std']
    month_temp['min', 'extreme'] = month_temp['min', 'mean'] - 2.5 * month_temp['min', 'std']
    # flatten the (temperature, statistic) column MultiIndex into e.g. 'max_mean'
    month_temp.columns = ['_'.join(a) for a in month_temp.columns.to_flat_index()]
    month_temp['latitude'] = lat
    month_temp['longitude'] = long

    # month_temp.to_csv(f"../data/processed/monthly/monthly_summary_lat={lat}_long={long}.csv")

    # broadcast the monthly statistics back onto every daily row
    big_df = pd.merge(left=df.drop(columns=['latitude', 'longitude']), right=month_temp, on='month')

    return big_df

In [4]:
def find_cold_waves(df):
    """
    takes a pandas data frame of daily temperatures and returns the cold waves

    A cold day is a day whose minimum temperature is at or below the monthly
    extreme minimum.  A cold wave is a run of two or more cold days on
    consecutive calendar dates at the same latitude/longitude.

    Parameters
    ----------
    df : DataFrame
        daily rows with the columns 'date' (datetime), 'month', 'latitude',
        'longitude', 'max', 'min', 'min_mean', 'min_std', and 'min_extreme'
        (the layout produced by monthly_temperatures).  Rows are assumed to be
        in chronological order within each location, because events are found
        from the difference between successive dates.  The input is not modified.

    Returns
    -------
    cold_extreme : DataFrame
        single row for each cold wave with 'start_date' and 'start_month' (taken
        from the first day of the event), 'latitude', 'longitude', 'duration'
        in days, and the event averages of 'max', 'min', 'min_mean', 'min_std',
        'min_extreme', and the four 'delta_min_*' columns.
    """
    # days when the minimum temperature is at or below the extreme minimum;
    # copy so the columns added below go onto an independent frame
    cold_days = df[df['min'] <= df['min_extreme']].copy()

    # departure of the daily minimum from the monthly mean minimum and from the
    # extreme minimum, in degrees (_C) and in units of the monthly std (_std)
    cold_days['delta_min_mean_C'] = cold_days['min'] - cold_days['min_mean']
    cold_days['delta_min_mean_std'] = cold_days['delta_min_mean_C'] / cold_days['min_std']
    cold_days['delta_min_extreme_C'] = cold_days['min'] - cold_days['min_extreme']
    cold_days['delta_min_extreme_std'] = cold_days['delta_min_extreme_C'] / cold_days['min_std']

    # a new event starts wherever the gap to the previous cold day at the same
    # location is not exactly one day; the running sum of those starts labels the events
    starts_new_event = (
        cold_days.groupby(by=['latitude', 'longitude'])['date'].diff() != pd.Timedelta(days=1)
    )
    cold_days['event_label'] = starts_new_event.cumsum()
    cold_days['duration'] = cold_days.groupby(by='event_label')['date'].transform('count')
    cold_days = cold_days.reset_index(drop=True)

    # average temperatures for each event
    event_means = cold_days.groupby(by='event_label')[[
        'max', 'min', 'min_mean', 'min_std', 'min_extreme',
        'delta_min_mean_C', 'delta_min_mean_std',
        'delta_min_extreme_C', 'delta_min_extreme_std'
    ]].mean()

    # isolated cold days are not waves
    cold_waves = cold_days[cold_days['duration'] > 1]

    # only want a single entry (or row) per event, not one for each day:
    # keep the first day of every event so its date is the event's start date
    event_starts = cold_waves.drop_duplicates(subset='event_label', keep='first')

    cold_extreme = pd.merge(
        left=event_starts[['date', 'month', 'latitude', 'longitude', 'duration', 'event_label']],
        right=event_means,
        on='event_label'
    )

    cold_extreme = cold_extreme.rename(columns={'date': 'start_date', 'month': 'start_month'}).drop(columns=['event_label'])

    return cold_extreme

In [5]:
def find_heat_waves(df):
    """
    takes a pandas data frame of daily temperatures and returns the heat waves

    A hot day is a day whose maximum temperature is at or above the monthly
    extreme maximum.  A heat wave is a run of two or more hot days on
    consecutive calendar dates at the same latitude/longitude.

    Parameters
    ----------
    df : DataFrame
        daily rows with the columns 'date' (datetime), 'month', 'latitude',
        'longitude', 'max', 'min', 'max_mean', 'max_std', and 'max_extreme'
        (the layout produced by monthly_temperatures).  Rows are assumed to be
        in chronological order within each location, because events are found
        from the difference between successive dates.  The input is not modified.

    Returns
    -------
    heat_extreme : DataFrame
        single row for each heat wave with 'start_date' and 'start_month' (taken
        from the first day of the event), 'latitude', 'longitude', 'duration'
        in days, and the event averages of 'max', 'min', 'max_mean', 'max_std',
        'max_extreme', and the four 'delta_max_*' columns.
    """
    # days when the maximum temperature is at or above the extreme maximum;
    # copy so the columns added below go onto an independent frame
    hot_days = df[df['max'] >= df['max_extreme']].copy()

    # departure of the daily maximum from the monthly mean maximum and from the
    # extreme maximum, in degrees (_C) and in units of the monthly std (_std).
    # These are differences (as in find_cold_waves), not sums.
    hot_days['delta_max_mean_C'] = hot_days['max'] - hot_days['max_mean']
    hot_days['delta_max_mean_std'] = hot_days['delta_max_mean_C'] / hot_days['max_std']
    hot_days['delta_max_extreme_C'] = hot_days['max'] - hot_days['max_extreme']
    hot_days['delta_max_extreme_std'] = hot_days['delta_max_extreme_C'] / hot_days['max_std']

    # a new event starts wherever the gap to the previous hot day at the same
    # location is not exactly one day; the running sum of those starts labels the events
    starts_new_event = (
        hot_days.groupby(by=['latitude', 'longitude'])['date'].diff() != pd.Timedelta(days=1)
    )
    hot_days['event_label'] = starts_new_event.cumsum()
    hot_days['duration'] = hot_days.groupby(by='event_label')['date'].transform('count')
    hot_days = hot_days.reset_index(drop=True)

    # average temperatures for each event
    event_means = hot_days.groupby(by='event_label')[[
        'max', 'min', 'max_mean', 'max_std', 'max_extreme',
        'delta_max_mean_C', 'delta_max_mean_std',
        'delta_max_extreme_C', 'delta_max_extreme_std'
    ]].mean()

    # isolated hot days are not waves
    heat_waves = hot_days[hot_days['duration'] > 1]

    # only want a single entry (or row) per event, not one for each day:
    # keep the first day of every event so its date is the event's start date
    event_starts = heat_waves.drop_duplicates(subset='event_label', keep='first')

    heat_extreme = pd.merge(
        left=event_starts[['date', 'month', 'latitude', 'longitude', 'duration', 'event_label']],
        right=event_means,
        on='event_label'
    )

    heat_extreme = heat_extreme.rename(columns={'date': 'start_date', 'month': 'start_month'}).drop(columns=['event_label'])

    return heat_extreme

In [6]:
def find_extreme_temp_events(df):
    """
    runs the full extraction for one dataframe of daily temperatures

    The monthly statistics are computed once with monthly_temperatures and the
    resulting frame is handed to both wave finders.

    Parameters
    ----------
    df : DataFrame
        daily entries of date, maximum daily temperature, and minimum daily
        temperature along with latitude and longitude (the input expected by
        monthly_temperatures)

    Returns
    -------
    (cold, heat) : tuple of DataFrame
        the result of find_cold_waves followed by the result of find_heat_waves
    """
    with_monthly_stats = monthly_temperatures(df)

    cold = find_cold_waves(with_monthly_stats)
    heat = find_heat_waves(with_monthly_stats)

    return cold, heat

## Processing Data

In [7]:
# get file list: every entry name (not full path) found in the daily temperature
# grid directory, including any hidden files the OS or Jupyter left there
file_list = os.listdir('../data/processed/daily_temperature_grid')

In [8]:
# remove the unwanted hidden entries (e.g. '.ipynb_checkpoints', '.DS_Store').
# Filtering on the leading dot does not raise when one of them is absent,
# unlike list.remove, so the cell can be re-run safely.
file_list = [name for name in file_list if not name.startswith('.')]

In [9]:
# Hand-picked subset of five grid-point files, used by the processing cell
# below in place of the full file_list so the run finishes quickly.
short_file_list = [
    'temp_lat=70_long=-70.csv', 
    'temp_lat=50_long=80.csv', 
    'temp_lat=50_long=-90.csv', 
    'temp_lat=20_long=80.csv',
    'temp_lat=60_long=130.csv'
]

In [None]:
## this cell takes a LONG time to run

#status_notes = []
#cold_events = []
#heat_events = []
#
#data_directory = '../data/processed/daily_temperature_grid'

#for file in short_file_list:
#    data = pd.read_csv('../data/processed/daily_temperature_grid/' + file).drop(columns=['Unnamed: 0'])
    
#    status_entry = {}
#    lat = int(data.loc[0,'latitude'])
#    long = int(data.loc[0,'longitude'])

#    try: 
#        cold_waves, heat_waves = find_extreme_temp_events(data)

        # record final dataframe
        #file_path = f"../data/processed/temp_events_grid/temp_waves_lat={lat}_long={long}.csv"
        #final_data[(final_data['is_heat_wave'] == True) | (final_data['is_cold_wave']==True)].to_csv(file_path)
#        cold_events.append(cold_waves)
#        heat_events.append(heat_waves)
        
#    except:
#        status_entry = {
#            'latitude': int(data.loc[0,'latitude']), 
#            'longitude': int(data.loc[0,'longitude']), 
#            'status': 'error occurred', 
#            'file': file
#        }
#        print(status_entry)
        
#        status_notes.append(status_entry)

In [20]:
## example of prior cell with fewer files  :-)

# Extract cold and heat waves for every file in short_file_list.
# Files whose processing fails are recorded in status_notes and skipped.
status_notes = []
cold_events = []
heat_events = []

data_directory = '../data/processed/daily_temperature_grid/'

for file in short_file_list:
    # 'Unnamed: 0' is dropped from each daily file before processing
    data = pd.read_csv(os.path.join(data_directory, file)).drop(columns=['Unnamed: 0'])

    status_entry = {}
    lat = int(data.loc[0, 'latitude'])
    long = int(data.loc[0, 'longitude'])

    try:
        cold_waves, heat_waves = find_extreme_temp_events(data)
    except Exception as err:
        # `Exception` (not a bare except) so Ctrl-C / kernel interrupt still stops
        # the loop; the error itself is kept so failures can be diagnosed later.
        status_entry = {
            'latitude': lat,
            'longitude': long,
            'status': 'error occurred',
            'file': file,
            'error': repr(err)
        }
        print(status_entry)

        status_notes.append(status_entry)
    else:
        # NOTE: writing one CSV per grid point
        # (../data/processed/temp_events_grid/temp_waves_lat={lat}_long={long}.csv)
        # is currently disabled; events are only accumulated in memory.
        cold_events.append(cold_waves)
        heat_events.append(heat_waves)

# pd.concat raises on an empty list, so fall back to an empty frame when every file failed
extreme_cold = pd.concat(cold_events).reset_index(drop=True) if cold_events else pd.DataFrame()
extreme_heat = pd.concat(heat_events).reset_index(drop=True) if heat_events else pd.DataFrame()

In [22]:
# extreme_cold.to_csv('../data/final/cold_events_final.csv')

In [23]:
# Write the combined heat-wave events to the final data folder.  The index is
# written too (pandas default), so it comes back as an unnamed first column on read.
extreme_heat.to_csv('../data/final/heat_events_final.csv')

These datasets are now ready to be modeled.