# **Script4SnowDiagnostics**  
**Description:** Notebook to read in raw data of interest extracted using Script4ProfileConversion.ipynb and count the number of days/instances with problematic snow conditions.  
**Input Data:** .csv files of raw profile data for each point and ensemble member  
**Output Data:** .csv files of counts of days with problematic snow conditions each season  
**Creator:** Emma Perkins  
**Date:** November 2023, Updated January 2024

In [184]:
#import relevant packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
import glob
import xarray as xr
from statsmodels.formula.api import ols
import statsmodels.stats.api as sms
import statsmodels.api as sm
import os
from sklearn.preprocessing import PolynomialFeatures
import math

## **Collect All Data from 1950 - 2100 in Same .csv Files**  
**ONLY RUN THIS CELL ONCE, ALREADY RUN ON 11/14/2023**

In [2]:
#set path to csv files
main_path = '/glade/work/eperkins/SNOWPACK_data/Results/analysis/GE_alts/csv_files/'
hist_files = sorted(glob.glob(main_path+'1950_2014/LE*/*.csv'))
future_files = sorted(glob.glob(main_path+'2015_2100/LE*/*.csv'))

#historical and future files are paired by position, so both lists must line up one-to-one
if not hist_files:
    raise ValueError('No historical csv files found under '+main_path+'1950_2014/')
if len(hist_files) != len(future_files):
    raise ValueError('Found '+str(len(hist_files))+' historical files but '+str(len(future_files))+' future files')

#concatenate the historical and future record of every ensemble member / point pair
for hist_file, future_file in zip(hist_files, future_files):
    #take the ensemble label from the directory name rather than from the list position:
    #sorted() orders strings lexicographically (a name such as 'LE10' sorts before 'LE2'),
    #so a label computed from the position can disagree with the directory the data came from
    LE = os.path.basename(os.path.dirname(hist_file))
    point = os.path.basename(hist_file)[0:-4] #file name without the '.csv' extension

    #make sure both files of the pair describe the same ensemble member and point
    future_LE = os.path.basename(os.path.dirname(future_file))
    future_point = os.path.basename(future_file)[0:-4]
    if (LE, point) != (future_LE, future_point):
        raise ValueError('Mismatched file pair: '+hist_file+' vs '+future_file)

    out_dir = main_path+'1950_2100/'+point
    os.makedirs(out_dir, exist_ok=True)
    #the dataframe index is deliberately still written to the csv: the diagnostics cell further
    #down drops the resulting 'Unnamed: 0' / 'Unnamed: 0.1' columns when reading these files
    pd.concat([pd.read_csv(hist_file), pd.read_csv(future_file)]).to_csv(out_dir+'/cesm2_'+LE+'.csv')

## **Supporting Functions**

In [176]:
def _day_counter(unique_dates, duration):
    """Count runs of consecutive calendar days in a sequence of dates.

    Parameters
    ----------
    unique_dates : sequence of dates
        Distinct dates on which a condition was met. Consecutive entries are
        compared pairwise, so the dates are expected to be in chronological
        order.
    duration : int
        Minimum run length, in days, for a run to count as an event.

    Returns
    -------
    tuple
        ``(duration_met, max_days)``. ``duration_met`` is the number of runs
        lasting at least ``duration`` days. ``max_days`` is the number of
        day-to-day steps in the longest run, i.e. one less than the number of
        days in that run (0 when no two dates are consecutive).
    """
    if len(unique_dates) == 0:
        return 0, 0

    one_day = datetime.timedelta(days=1)
    duration_met = 0
    consec_days = 0  # day-to-day steps in the run currently being followed
    max_days = 0
    for previous, current in zip(unique_dates[:-1], unique_dates[1:]):
        if current == previous + one_day:
            consec_days += 1
            max_days = max(max_days, consec_days)
            continue
        # The run ended: count it if it was long enough, then always start
        # over. Resetting only after long runs would let separate short runs
        # add up to a spurious long one.
        if consec_days >= duration - 1:
            duration_met += 1
        consec_days = 0
    # A run that reaches the last date is never closed inside the loop.
    if consec_days >= duration - 1:
        duration_met += 1
    return duration_met, max_days


def calculate_stats(season_data, depth_cutoff):
    """Compute snowpack diagnostics for a single season.

    Parameters
    ----------
    season_data : pandas.DataFrame
        Rows of one season with the columns ``max_den``, ``WC_dmax``,
        ``max_ice_vol``, ``depth``, ``date`` (timestamps) and ``day``
        (calendar date of each row).
    depth_cutoff : number
        Snow depth above which a day counts as a deep-snow day.

    Returns
    -------
    list
        ``[hd5, hd_count_max, hd_total, ice5, ice_count_max, ice_total,
        season_start, season_end, season_length, deep5, deep_count_max,
        deep_total]``. For each of the three conditions (high density, icy,
        deep) the list holds the number of runs of at least 5 consecutive
        days, the longest run expressed in day-to-day steps, and the total
        number of distinct days. ``season_start`` / ``season_end`` are the
        day-of-year of the first / last row with ``depth > 0`` and
        ``season_length`` is the number of days between those two rows.

    Raises
    ------
    ValueError
        If no row of ``season_data`` has ``depth > 0``.
    """
    #high density days: maximum density above 350
    dense = season_data['max_den'] > 350
    high_density = season_data[dense]['day'].unique()
    hd5, hd_count_max = _day_counter(high_density, 5)

    #icy days: either (max_den > 350 and WC_dmax < 0.1) or max_ice_vol > 70
    icy = (dense & (season_data['WC_dmax'] < 0.1)) | (season_data['max_ice_vol'] > 70)
    icy_snowpack = season_data[icy]['day'].unique()
    ice5, ice_count_max = _day_counter(icy_snowpack, 5)

    #snow season: first and last row with snow on the ground
    snow_dates = season_data.loc[season_data['depth'] > 0, 'date']
    if snow_dates.empty:
        raise ValueError('season_data contains no rows with depth > 0')
    first_snow = snow_dates.iloc[0]
    last_snow = snow_dates.iloc[-1]
    season_start = first_snow.dayofyear
    season_end = last_snow.dayofyear
    season_length = (last_snow - first_snow).days

    #deep snow days: depth above the user-supplied cutoff
    deep_days = season_data[season_data['depth'] > depth_cutoff]['day'].unique()
    deep5, deep_count_max = _day_counter(deep_days, 5)

    return [hd5, hd_count_max, len(high_density), ice5, ice_count_max, len(icy_snowpack), season_start, season_end, season_length, deep5, deep_count_max, len(deep_days)]

In [177]:
#helper that computes the diagnostics of every season and adds one row per season to the summary table
def append_stats_multiyear(depth_cutoff):
    """Append one row of season diagnostics per season to ``summary_stats``.

    Reads the module-level dataframe ``data``, groups it by its ``season``
    column and runs ``calculate_stats`` on every group. Each result is added
    as a new row at the end of the module-level dataframe ``summary_stats``,
    which is modified in place.

    Parameters
    ----------
    depth_cutoff : number
        Deep-snow threshold forwarded to ``calculate_stats``.

    Returns
    -------
    pandas.DataFrame
        The module-level ``summary_stats`` dataframe after the rows were added.
    """
    #column names in the same order as the values returned by calculate_stats
    stat_names = ['hd5', 'hd_count_max', 'hd_total',
                  'ice5', 'ice_count_max', 'ice_total',
                  'season_start', 'season_end', 'length',
                  'deep5', 'deep_count_max', 'deep_total']

    per_season = data.groupby('season').apply(lambda df: calculate_stats(df, depth_cutoff))
    for season, values in per_season.items():
        row = {'season': season}
        for position, name in enumerate(stat_names):
            row[name] = values[position]
        #assigning to the label len(summary_stats) adds the row after the existing ones
        summary_stats.loc[len(summary_stats)] = pd.Series(row)
    return summary_stats

## **Calculate snowpack diagnostics for all points**

In [178]:
%%time

#locations of the combined 1950-2100 csv files (input) and of the diagnostics csv files (output)
main_path = '/glade/work/eperkins/SNOWPACK_data/Results/analysis/GE_alts/'
read_data_path = f'{main_path}csv_files/1950_2100/'
write_data_path = f'{main_path}snow_chars/1950_2100/'

file_lists = list(range(8)) #placeholder entries, each replaced by a list of file paths below
dfs = [] #not filled in this cell
depth_cutoff = 50 #adjust to liking, currently 50cm

#names of all summary columns; every one of them is stored as an integer
stat_columns = ['hd5', 'hd_count_max', 'hd_total','ice5','ice_count_max','ice_total', 'season_start', 'season_end',
                'length','deep5','deep_count_max','deep_total','season']

for point_number in range(1, 9):
    #NOTE: sorted() orders the paths lexicographically, and the run number used in the
    #output file name below is simply the position of the input file in that order
    point_files = sorted(glob.glob(f'{read_data_path}Point{point_number}/cesm2_*.csv'))
    file_lists[point_number - 1] = point_files
    df = [] #not used in this cell
    for run_number in range(1, 11):
        #read one ensemble member and drop the two leftover index columns
        data = pd.read_csv(point_files[run_number - 1]).drop(columns=['Unnamed: 0', 'Unnamed: 0.1'])
        data['date'] = pd.to_datetime(data['date']) #string -> datetime
        data['day'] = data['date'].dt.date #calendar day without the time of day, used to count consecutive days

        #start from an empty summary table; append_stats_multiyear reads the globals
        #`data` and `summary_stats` and fills in one row per season
        summary_stats = pd.DataFrame(columns=stat_columns)
        summary_stats = append_stats_multiyear(depth_cutoff)
        summary_stats = summary_stats.astype({name: int for name in stat_columns})
        summary_stats.to_csv(f'{write_data_path}Point{point_number}/start_end{run_number}.csv')
    print(f'Done with Point: {point_number}')

Done with Point: 1
Done with Point: 2
Done with Point: 3
Done with Point: 4
Done with Point: 5
Done with Point: 6
Done with Point: 7
Done with Point: 8
CPU times: user 2min 21s, sys: 2.36 s, total: 2min 23s
Wall time: 2min 36s


## **Convert diagnostic statistics to NetCDF**

In [180]:
#load all data
main_path = '/glade/work/eperkins/SNOWPACK_data/Results/analysis/GE_alts/snow_chars/1950_2100/'

def _start_end_number(path):
    """Return the integer N of a file named 'start_end<N>.csv'.

    Raises ValueError if the part of the file name between 'start_end' and
    '.csv' is not an integer.
    """
    file_name = os.path.basename(path)
    return int(file_name[len('start_end'):-len('.csv')])

file_lists = []
dfs = []

for i in range(8):
    #order the files by their run number: a plain lexicographic sort would put
    #'start_end10.csv' before 'start_end2.csv', so the position of a dataframe in the list
    #would no longer match the number in its file name
    point_files = sorted(glob.glob(main_path+'Point'+str(i+1)+'/start_end*.csv'), key=_start_end_number)
    if len(point_files) < 10:
        raise ValueError('Expected 10 start_end files for Point'+str(i+1)+', found '+str(len(point_files)))
    file_lists.append(point_files)

    #one dataframe per run, without the index column that was written to the csv
    df = [pd.read_csv(path).drop(['Unnamed: 0'], axis = 1) for path in point_files[:10]]
    dfs.append(df)

In [181]:
#turn the nested list of dataframes into one numpy array per diagnostic variable
stat_names = ['hd5', 'hd_count_max', 'hd_total', 'ice5', 'ice_count_max', 'ice_total',
              'season_start', 'season_end', 'length', 'deep5', 'deep_count_max', 'deep_total']

#for every variable: stack the per-season values of all 8 points x 10 runs
#(dfs is indexed as dfs[point][run]) and reverse the axis order with a transpose
var_lists = [
    np.transpose(np.array([[dfs[point_idx][run_idx][name].to_numpy() for run_idx in range(10)]
                           for point_idx in range(8)]))
    for name in stat_names
]

In [182]:
#assemble the per-variable numpy arrays into a single Xarray Dataset
seasons = dfs[0][0]['season'] #season labels are taken from the first point / first run
points = list(range(1, 9))
runs = list(range(1, 11))

#variable names, listed in the same order as the arrays stored in var_lists
stat_names = ['hd5', 'hd_count_max', 'hd_total', 'ice5', 'ice_count_max', 'ice_total',
              'season_start', 'season_end', 'length', 'deep5', 'deep_count_max', 'deep_total']

#every variable uses the dimensions (season, run, point)
data_vars = {}
for position, name in enumerate(stat_names):
    data_vars[name] = (['season', 'run', 'point'], var_lists[position])

coords = {
    'season': (['season'], seasons),
    'run': (['run'], runs),
    'point': (['point'], points),
}

full_data = xr.Dataset(data_vars=data_vars, coords=coords)

#write the dataset to disk as a NetCDF file
full_data.to_netcdf('/glade/work/eperkins/SNOWPACK_data/Results/analysis/GE_alts/NetCDF_files/1950_2100_md350_wc01_iv70_start_end.nc')