## Upper Air Data Preparation and Data Integration

**Import necessary modules and the ASOS/SNOTEL dataframes**

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import datetime as dtb
import os
from glob import glob
import datetime as dt
import seaborn as sns
import warnings

warnings.filterwarnings('ignore')   #suppress warning messages for cleaner presentation

**Import the surface dataframe**

In [2]:
# Read the cleaned ASOS/SNOTEL surface file, using its Date_Time column as a
# parsed datetime index.
surface_frames = [
    pd.read_csv(
        'asos_snotel_clean_w_LXV.dat',
        index_col='Date_Time',
        parse_dates=True,
    )
]

# Combine the pieces (currently a single file) and store every column as a
# 32-bit float.
asos_snotel_df = pd.concat(surface_frames).astype('float32')





#### Import KGJT and KDEN Upper Air Data

In [3]:
# Folder holding the RAOB (upper air sounding) text exports.  Defined once so
# the notebook can be pointed at a different data location by editing one line.
RAOB_DIR = r'C:\Users\RAPP\Documents\Capstone\data\RAOB'


def read_raob(filename, raob_dir=RAOB_DIR):
    """Read one RAOB text export into a dataframe indexed by ``validUTC``.

    Parameters
    ----------
    filename : str
        Name of the file inside ``raob_dir``.
    raob_dir : str, optional
        Directory that contains the file; defaults to ``RAOB_DIR``.

    Returns
    -------
    pandas.DataFrame
        File contents with the first line used as the header and the
        ``validUTC`` column parsed as datetimes and set as the index.
    """
    return pd.read_csv(
        os.path.join(raob_dir, filename),
        header=0,
        parse_dates=['validUTC'],
        index_col='validUTC',
    )


# Load each station's soundings straight into a dataframe.
UA_KGJT_df = read_raob('KGJT_2006010100_2018010100.txt')
UA_KDEN_df = read_raob('KDEN_2005010100_2018010100.txt')

print(UA_KGJT_df.head())
print(UA_KDEN_df.head())


                    station  levelcode  pressure_mb height_m tmpc dwpc drct  \
validUTC                                                                      
2006-12-03 12:00:00    KGJT          4       1000.0    370.0    M    M    M   
2006-12-03 12:00:00    KGJT          4        925.0    988.0    M    M    M   
2006-12-27 12:00:00    KGJT          4        850.0   1463.0    M    M    M   
2006-12-05 00:00:00    KGJT          4        925.0    934.0    M    M    M   
2006-12-23 12:00:00    KGJT          4       1000.0    273.0    M    M    M   

                    speed_kts bearing range_sm  
validUTC                                        
2006-12-03 12:00:00         M       M        M  
2006-12-03 12:00:00         M       M        M  
2006-12-27 12:00:00         M       M        M  
2006-12-05 00:00:00         M       M        M  
2006-12-23 12:00:00         M       M        M  
           station  levelcode  pressure_mb height_m tmpc  dwpc drct speed_kts  \
validUTC              

**As there are memory concerns, delete unnecessary columns**

In [4]:
# These columns are not useful for the analysis; removing them in place saves
# memory.
for soundings in (UA_KGJT_df, UA_KDEN_df):
    for unused_column in ('station', 'bearing', 'range_sm'):
        del soundings[unused_column]


**We are going to just keep the mandatory measurement levels.  These are indicated by level code 4.**

In [5]:
# Keep only the rows whose level code is 4 (the mandatory levels).
UA_KGJT_df = UA_KGJT_df.loc[UA_KGJT_df['levelcode'] == 4]
UA_KDEN_df = UA_KDEN_df.loc[UA_KDEN_df['levelcode'] == 4]


**Replace missing data with NaN and make sure data is all numeric.  Also, set to 32 bit datatype to conserve memory.  This is plenty of resolution for the values within this dataset.**

In [6]:
def to_float32(soundings):
    """Return a copy of ``soundings`` with 'M' treated as missing and all columns float32.

    Parameters
    ----------
    soundings : pandas.DataFrame
        Upper air dataframe whose columns may hold the string 'M' where a
        value is missing.

    Returns
    -------
    pandas.DataFrame
        New dataframe in which every 'M' is NaN and every column has been
        parsed with ``pd.to_numeric`` and stored as 32-bit floats.

    Raises
    ------
    ValueError
        Propagated from ``pd.to_numeric`` if a column still contains a string
        that cannot be parsed as a number.
    """
    # np.nan (lowercase) is the spelling that exists in every NumPy release;
    # the np.NaN alias was removed in NumPy 2.0.
    return (
        soundings
        .replace('M', np.nan)
        .apply(pd.to_numeric)
        .astype('float32')
    )


UA_KGJT_df = to_float32(UA_KGJT_df)
UA_KDEN_df = to_float32(UA_KDEN_df)





**Rename the index of both upper air dataframes to `Date_Time` so that it matches the index name of the surface dataframe**

In [7]:
# Give both upper air indexes the same name as the surface dataframe's index.
for soundings in (UA_KGJT_df, UA_KDEN_df):
    soundings.index.name = 'Date_Time'

print(UA_KDEN_df.head())

            levelcode  pressure_mb  height_m       tmpc       dwpc   drct  \
Date_Time                                                                   
2005-04-12        4.0       1000.0      71.0        NaN        NaN    NaN   
2005-04-12        4.0        925.0     735.0        NaN        NaN    NaN   
2005-04-12        4.0        850.0    1443.0        NaN        NaN    NaN   
2005-04-12        4.0        700.0    3017.0  -3.900000 -12.900000  300.0   
2005-04-12        4.0        500.0    5590.0 -18.700001 -38.700001  310.0   

            speed_kts  
Date_Time              
2005-04-12        NaN  
2005-04-12        NaN  
2005-04-12        NaN  
2005-04-12       15.0  
2005-04-12       31.0  


**While not really necessary, localize to UTC**

In [8]:
# Attach the UTC timezone to each sounding index.
for soundings in (UA_KGJT_df, UA_KDEN_df):
    soundings.index = soundings.index.tz_localize('UTC')



**Create copy of the surface data frame.  This new dataframe will be used to join the Upper Air Data to the surface data into one dataframe**

In [9]:
# Work on an independent copy so the surface dataframe itself is left as loaded
# while upper air columns are joined onto the copy.
asos_snotel_UA_df = asos_snotel_df.copy(deep=True)

**Select the most useful pressure levels and join them to the surface dataframe.  The result is called asos_snotel_UA_df**

In [10]:

# Pressure levels (mb) whose measurements are joined onto the surface data.
levels = [200, 250, 300, 400, 500, 700, 850]

# For each level, join KGJT first and then KDEN.  Every joined column is
# prefixed with '<station>_<level>mb_', and the outer join keeps timestamps
# that appear on either side.
for level in levels:
    for station_id, soundings in (('KGJT', UA_KGJT_df), ('KDEN', UA_KDEN_df)):
        at_level = soundings[soundings['pressure_mb'] == level]
        column_prefix = '{}_{}mb_'.format(station_id, level)
        asos_snotel_UA_df = asos_snotel_UA_df.join(
            at_level.add_prefix(column_prefix),
            how='outer',
        )





**Now create features which represent differences in meteorological values between pressure levels.  Add these to asos_snotel_UA_df**

In [11]:

# Each pair [a, b] yields the delta features "value at a mb minus value at b mb";
# e.g. [850, 700] represents 850mb - 700mb.  Every level is paired with each
# level that follows it in this list.
delta_levels = [850, 700, 500, 400, 300, 250, 200]
pairs = [
    [first_mb, second_mb]
    for position, first_mb in enumerate(delta_levels)
    for second_mb in delta_levels[position + 1:]
]


def level_difference(soundings, first_mb, second_mb):
    """Return the rows at ``first_mb`` minus the rows at ``second_mb``.

    The subtraction is a plain dataframe subtraction, so rows are matched on
    the index and every column is differenced.
    """
    first_rows = soundings[soundings['pressure_mb'] == first_mb]
    second_rows = soundings[soundings['pressure_mb'] == second_mb]
    return first_rows - second_rows


# NOTE(review): the drct (wind direction) deltas are ordinary arithmetic
# differences and do not wrap around at 360 degrees - confirm this is intended.
for first_mb, second_mb in pairs:
    for station_id, soundings in (('KGJT', UA_KGJT_df), ('KDEN', UA_KDEN_df)):
        column_prefix = '{}_d{}_{}_'.format(station_id, first_mb, second_mb)
        deltas = level_difference(soundings, first_mb, second_mb)
        asos_snotel_UA_df = asos_snotel_UA_df.join(deltas.add_prefix(column_prefix))



**Remove more unnecessary columns to save memory**

In [12]:

# Discard every column whose name matches one of these patterns, one pattern
# at a time.
# NOTE(review): the 'KDEN' pattern removes all of the KDEN columns that the
# earlier join cells added - confirm that dropping them is intended.
for unwanted_pattern in ('levelcode', 'pressure_mb', 'KDEN'):
    matching_columns = list(asos_snotel_UA_df.filter(regex=unwanted_pattern))
    remaining_columns = asos_snotel_UA_df.columns.drop(matching_columns)
    asos_snotel_UA_df = asos_snotel_UA_df[remaining_columns]


print(asos_snotel_UA_df.head())

                           CMtnSNTL_Temp_degC  CMtnSNTL_SnowDepth_in  \
Date_Time                                                              
2005-04-12 00:00:00+00:00                 NaN                    NaN   
2005-04-12 12:00:00+00:00                 NaN                    NaN   
2006-01-01 00:00:00+00:00                 NaN                    NaN   
2006-01-01 01:00:00+00:00                -1.3                   43.0   
2006-01-01 02:00:00+00:00                -2.7                   43.0   

                           CMtn_Temperature_degC  CMtn_Dewpoint_degC  \
Date_Time                                                              
2005-04-12 00:00:00+00:00                    NaN                 NaN   
2005-04-12 12:00:00+00:00                    NaN                 NaN   
2006-01-01 00:00:00+00:00                    NaN                 NaN   
2006-01-01 01:00:00+00:00                   -1.0                -7.0   
2006-01-01 02:00:00+00:00                   -3.0               

**Finally, write the dataframe to a CSV file.  Resample to 12-hour intervals first so that each 12hr timestamp appears exactly once (no gaps and no duplicates)**

In [13]:
# Reduce to one row per 12-hour bin, taking the first available value of each
# column within the bin, then write the result with two decimal places.
twelve_hourly = asos_snotel_UA_df.resample('12H').first()
twelve_hourly.to_csv(
    'asos_snotel_UA_12hr_df.dat',
    sep=',',
    float_format='%.2f',
)

** **

***