# Preprocessing and EDA - daily, hourly, and location exploration

2016 data is used as an example here.

Official documentation at these links:
  - [WD_Table](http://artefacts.ceda.ac.uk/badc_datadocs/ukmo-midas/WD_Table.html)
  - [WH_Table](http://artefacts.ceda.ac.uk/badc_datadocs/ukmo-midas/WH_Table.html)

First, read the files.
The official documentation says data txt files are actually csvs.

I confirmed that they are.


In [1]:
import numpy as np;
import pandas as pd;

# Both MIDAS extracts are read with the same options: no header row in the
# file, and no column promoted to the index.
# NOTE(review): the cell output shows the tail of a warning; if it is pandas'
# DtypeWarning, consider passing explicit dtype= values — TODO confirm.
midas_read_options = dict(header=None, index_col=False)

data_daily_2016 = pd.read_csv(
    '../data/MIDAS/WD/midas_wxdrnl_201601-201612.txt',
    **midas_read_options,
)
data_hourly_2016 = pd.read_csv(
    '../data/MIDAS/WH/midas_wxhrly_201601-201612.txt',
    **midas_read_options,
)

# Preview the first rows of the daily table.
data_daily_2016.head()


  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,24,25,26,27,28,29,30,31,32,33
0,2016-01-01 09:00,10,DCNN,0,1,NCM,3,1011,,,...,,,,,2016-01-01 13:52,670,,,,
1,2016-01-01 09:00,44,DCNN,0,1,NCM,9,1011,,1.0,...,,,,,2016-01-01 09:07,0,,,,
2,2016-01-01 09:00,154,DCNN,0,1,NCM,23,1011,,,...,,,,,2016-01-02 01:30,0,,,,
3,2016-01-01 09:00,293,DCNN,0,1,NCM,32,1011,,,...,,,,,2016-01-02 01:30,0,,,,
4,2016-01-01 09:00,425,DCNN,0,1,NCM,54,1011,,,...,,,,,2016-01-02 01:30,0,,,,


In [2]:
# Preview the first rows of the hourly table (columns are still unnamed integers here).
data_hourly_2016.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,94,95,96,97,98,99,100,101,102,103
0,2016-01-01 00:00,3002,WMO,SYNOP,1,12,1011,4,7,210,...,,,,,90.6,A,,,,
1,2016-01-01 00:00,3005,WMO,SYNOP,1,9,1011,4,1,240,...,,,,,82.7,A,,,0.0,1.0
2,2016-01-01 00:00,3008,WMO,SYNOP,1,3,1011,4,7,250,...,,,,,68.7,A,,,,
3,2016-01-01 00:00,3010,WMO,SYNOP,1,17,1011,4,6,270,...,,,,,62.7,A,,,,
4,2016-01-01 00:00,3014,WMO,SYNOP,1,2,1011,4,6,240,...,,,,,77.1,A,,,,


The data files have no header row, so the column names need to be read separately.

As mentioned before, for details, refer to these links:
  - [WD_Table](http://artefacts.ceda.ac.uk/badc_datadocs/ukmo-midas/WD_Table.html)
  - [WH_Table](http://badc.nerc.ac.uk/artefacts/badc_datadocs/ukmo-midas/WH_Table.html)



In [3]:
def _read_header_names(path):
    """Return the column names stored in a MIDAS header file.

    The file is parsed with ``pd.read_csv`` so that its first line becomes the
    column labels; each label is then stripped of surrounding whitespace.

    Parameters
    ----------
    path : str
        Location of the comma-separated header file.

    Returns
    -------
    numpy.ndarray
        One stripped name per column, in file order.
    """
    raw_names = pd.read_csv(path).columns.to_numpy()
    # The loop variable is called `name` so the builtin `str` is not shadowed.
    return np.array([name.strip() for name in raw_names])


data_WD_headers = _read_header_names('../data/MIDAS/HEADERS/WD_Column_Headers.txt')
data_WH_headers = _read_header_names('../data/MIDAS/HEADERS/WH_Column_Headers.txt')

# Show both header lists, separated by blank lines.
print('data_WD_headers')
print(data_WD_headers)
print('\n\n')
print('data_WH_headers')
print(data_WH_headers)



data_WD_headers
['OB_END_TIME' 'ID' 'ID_TYPE' 'OB_HOUR_COUNT' 'VERSION_NUM'
 'MET_DOMAIN_NAME' 'SRC_ID' 'REC_ST_IND' 'CS_24HR_SUN_DUR' 'CONC_STATE_ID'
 'LYING_SNOW_FLAG' 'SNOW_DEPTH' 'FRSH_SNOW_AMT' 'SNOW_DAY_ID'
 'HAIL_DAY_ID' 'THUNDER_DAY_FLAG' 'GALE_DAY_FLAG' 'FRSH_MNT_SNWFALL_FLAG'
 'WMO_24HR_SUN_DUR' 'CS_24HR_SUN_DUR_Q' 'CONC_STATE_ID_Q' 'SNOW_DEPTH_Q'
 'FRSH_SNW_AMT_Q' 'SNOW_DAY_ID_Q' 'HAIL_DAY_ID_Q' 'THUNDER_DAY_FLAG_Q'
 'GALE_DAY_FLAG_Q' 'WMO_24HR_SUN_DUR_Q' 'METO_STMP_TIME'
 'MIDAS_STMP_ETIME' 'DRV_24HR_SUN_DUR' 'DRV_24HR_SUN_DUR_Q'
 'LYING_SNOW_HT' 'LYING_SNOW_HT_Q']



data_WH_headers
['OB_TIME' 'ID' 'ID_TYPE' 'MET_DOMAIN_NAME' 'VERSION_NUM' 'SRC_ID'
 'REC_ST_IND' 'WIND_SPEED_UNIT_ID' 'SRC_OPR_TYPE' 'WIND_DIRECTION'
 'WIND_SPEED' 'PRST_WX_ID' 'PAST_WX_ID_1' 'PAST_WX_ID_2' 'CLD_TTL_AMT_ID'
 'LOW_CLD_TYPE_ID' 'MED_CLD_TYPE_ID' 'HI_CLD_TYPE_ID' 'CLD_BASE_AMT_ID'
 'CLD_BASE_HT' 'VISIBILITY' 'MSL_PRESSURE' 'CLD_AMT_ID_1'
 'CLOUD_TYPE_ID_1' 'CLD_BASE_HT_ID_1' 'CLD_AMT_ID_2' 'CLOUD

Attach the headers to the data as column names.


In [4]:


# Check that each header list has exactly one name per data column BEFORE
# assigning it. (Comparing the columns with the headers after the assignment
# is always True and therefore verifies nothing.)
daily_header_matches  = data_daily_2016.shape[1]  == len(data_WD_headers)
hourly_header_matches = data_hourly_2016.shape[1] == len(data_WH_headers)

print(f'daily header matches? {daily_header_matches}');
print(f'hourly header matches? {hourly_header_matches}');

# Stop here with a readable message rather than mislabelling columns.
assert daily_header_matches, (
    f'daily data has {data_daily_2016.shape[1]} columns '
    f'but {len(data_WD_headers)} header names were read'
)
assert hourly_header_matches, (
    f'hourly data has {data_hourly_2016.shape[1]} columns '
    f'but {len(data_WH_headers)} header names were read'
)

# Replace the integer column labels with the MIDAS header names.
data_daily_2016.columns  = data_WD_headers
data_hourly_2016.columns = data_WH_headers


daily header matches? True
hourly header matches? True


now actually looking at the 2 data sets


In [5]:
# Preview the daily table again, now with the header names as column labels.
data_daily_2016.head()

Unnamed: 0,OB_END_TIME,ID,ID_TYPE,OB_HOUR_COUNT,VERSION_NUM,MET_DOMAIN_NAME,SRC_ID,REC_ST_IND,CS_24HR_SUN_DUR,CONC_STATE_ID,...,HAIL_DAY_ID_Q,THUNDER_DAY_FLAG_Q,GALE_DAY_FLAG_Q,WMO_24HR_SUN_DUR_Q,METO_STMP_TIME,MIDAS_STMP_ETIME,DRV_24HR_SUN_DUR,DRV_24HR_SUN_DUR_Q,LYING_SNOW_HT,LYING_SNOW_HT_Q
0,2016-01-01 09:00,10,DCNN,0,1,NCM,3,1011,,,...,,,,,2016-01-01 13:52,670,,,,
1,2016-01-01 09:00,44,DCNN,0,1,NCM,9,1011,,1.0,...,,,,,2016-01-01 09:07,0,,,,
2,2016-01-01 09:00,154,DCNN,0,1,NCM,23,1011,,,...,,,,,2016-01-02 01:30,0,,,,
3,2016-01-01 09:00,293,DCNN,0,1,NCM,32,1011,,,...,,,,,2016-01-02 01:30,0,,,,
4,2016-01-01 09:00,425,DCNN,0,1,NCM,54,1011,,,...,,,,,2016-01-02 01:30,0,,,,


In [6]:
# Display the complete list of daily (WD) column names for reference.
data_WD_headers

array(['OB_END_TIME', 'ID', 'ID_TYPE', 'OB_HOUR_COUNT', 'VERSION_NUM',
       'MET_DOMAIN_NAME', 'SRC_ID', 'REC_ST_IND', 'CS_24HR_SUN_DUR',
       'CONC_STATE_ID', 'LYING_SNOW_FLAG', 'SNOW_DEPTH', 'FRSH_SNOW_AMT',
       'SNOW_DAY_ID', 'HAIL_DAY_ID', 'THUNDER_DAY_FLAG', 'GALE_DAY_FLAG',
       'FRSH_MNT_SNWFALL_FLAG', 'WMO_24HR_SUN_DUR', 'CS_24HR_SUN_DUR_Q',
       'CONC_STATE_ID_Q', 'SNOW_DEPTH_Q', 'FRSH_SNW_AMT_Q',
       'SNOW_DAY_ID_Q', 'HAIL_DAY_ID_Q', 'THUNDER_DAY_FLAG_Q',
       'GALE_DAY_FLAG_Q', 'WMO_24HR_SUN_DUR_Q', 'METO_STMP_TIME',
       'MIDAS_STMP_ETIME', 'DRV_24HR_SUN_DUR', 'DRV_24HR_SUN_DUR_Q',
       'LYING_SNOW_HT', 'LYING_SNOW_HT_Q'], dtype='<U21')

In [7]:
# Preview the hourly table again, now with the header names as column labels.
data_hourly_2016.head()

Unnamed: 0,OB_TIME,ID,ID_TYPE,MET_DOMAIN_NAME,VERSION_NUM,SRC_ID,REC_ST_IND,WIND_SPEED_UNIT_ID,SRC_OPR_TYPE,WIND_DIRECTION,...,VERT_VSBY_J,STN_PRES_J,ALT_PRES_J,Q10MNT_MXGST_SPD_J,RLTV_HUM,RLTV_HUM_J,SNOW_DEPTH,SNOW_DEPTH_Q,DRV_HR_SUN_DUR,DRV_HR_SUN_DUR_Q
0,2016-01-01 00:00,3002,WMO,SYNOP,1,12,1011,4,7,210,...,,,,,90.6,A,,,,
1,2016-01-01 00:00,3005,WMO,SYNOP,1,9,1011,4,1,240,...,,,,,82.7,A,,,0.0,1.0
2,2016-01-01 00:00,3008,WMO,SYNOP,1,3,1011,4,7,250,...,,,,,68.7,A,,,,
3,2016-01-01 00:00,3010,WMO,SYNOP,1,17,1011,4,6,270,...,,,,,62.7,A,,,,
4,2016-01-01 00:00,3014,WMO,SYNOP,1,2,1011,4,6,240,...,,,,,77.1,A,,,,


In [8]:
# Display the complete list of hourly (WH) column names for reference.
data_WH_headers

array(['OB_TIME', 'ID', 'ID_TYPE', 'MET_DOMAIN_NAME', 'VERSION_NUM',
       'SRC_ID', 'REC_ST_IND', 'WIND_SPEED_UNIT_ID', 'SRC_OPR_TYPE',
       'WIND_DIRECTION', 'WIND_SPEED', 'PRST_WX_ID', 'PAST_WX_ID_1',
       'PAST_WX_ID_2', 'CLD_TTL_AMT_ID', 'LOW_CLD_TYPE_ID',
       'MED_CLD_TYPE_ID', 'HI_CLD_TYPE_ID', 'CLD_BASE_AMT_ID',
       'CLD_BASE_HT', 'VISIBILITY', 'MSL_PRESSURE', 'CLD_AMT_ID_1',
       'CLOUD_TYPE_ID_1', 'CLD_BASE_HT_ID_1', 'CLD_AMT_ID_2',
       'CLOUD_TYPE_ID_2', 'CLD_BASE_HT_ID_2', 'CLD_AMT_ID_3',
       'CLOUD_TYPE_ID_3', 'CLD_BASE_HT_ID_3', 'CLD_AMT_ID_4',
       'CLOUD_TYPE_ID_4', 'CLD_BASE_HT_ID_4', 'VERT_VSBY',
       'AIR_TEMPERATURE', 'DEWPOINT', 'WETB_TEMP', 'STN_PRES', 'ALT_PRES',
       'GROUND_STATE_ID', 'Q10MNT_MXGST_SPD', 'CAVOK_FLAG',
       'CS_HR_SUN_DUR', 'WMO_HR_SUN_DUR', 'WIND_DIRECTION_Q',
       'WIND_SPEED_Q', 'PRST_WX_ID_Q', 'PAST_WX_ID_1_Q', 'PAST_WX_ID_2_Q',
       'CLD_TTL_AMT_ID_Q', 'LOW_CLD_TYPE_ID_Q', 'MED_CLD_TYPE_ID_Q',
       'HI_CLD_T

In [9]:

# BOTA   http://archive.ceda.ac.uk/cgi-bin/midas_stations/station_details.cgi.py?id=253&db=midas_stations
# HILVIW http://archive.ceda.ac.uk/cgi-bin/midas_stations/station_details.cgi.py?id=15726&db=midas_stations
# GOGAR  http://archive.ceda.ac.uk/cgi-bin/midas_stations/station_details.cgi.py?id=19260&db=midas_stations
# 253  -> 15726  -> 19260
# BOTA -> HILVIW -> GOGAR
BOTA_SRC_ID, HILVIW_SRC_ID, GOGAR_SRC_ID = 253, 15726, 19260

# Series.sum() counts the True entries in vectorised code; the builtin sum()
# would iterate over every row in Python.
print( (data_daily_2016['SRC_ID'] == BOTA_SRC_ID).sum() )    #0
print( (data_daily_2016['SRC_ID'] == HILVIW_SRC_ID).sum() )  #0
print( (data_daily_2016['SRC_ID'] == GOGAR_SRC_ID).sum() )   #1109

# VERSION_NUM = 1 are reviewed.
# The mask is built once and reused for both the count and the selection.
gogar_daily_reviewed = (
    (data_daily_2016['SRC_ID'] == GOGAR_SRC_ID)
    & (data_daily_2016['VERSION_NUM'] == 1)
)
print( gogar_daily_reviewed.sum() )  #1094

data_gogar_daily_2016 = data_daily_2016[ gogar_daily_reviewed ]

# OB_END_TIME is the only time stamp for WD.
# First five Gogar rows whose OB_END_TIME occurs more than once. The value is
# kept under a name because a bare expression in the middle of a cell is
# evaluated but never displayed.
gogar_daily_repeated_end_times = data_gogar_daily_2016[
    data_gogar_daily_2016.duplicated(subset=['OB_END_TIME'], keep=False)
].iloc[0:5,:].to_numpy()

botanic_hourly_count = (data_hourly_2016['SRC_ID'] == BOTA_SRC_ID).sum()
print( botanic_hourly_count ) #9454

# 2016 is a leap year: 366 days of 24 hourly observations.
HOURS_IN_2016 = 366 * 24
print( HOURS_IN_2016 ) #8784

# Surplus of Botanic Garden rows over one-row-per-hour, derived from the data
# instead of being copied from an earlier output.
print( botanic_hourly_count - HOURS_IN_2016 ) #670

print( (data_hourly_2016['SRC_ID'] == HILVIW_SRC_ID).sum() ) #0
print( (data_hourly_2016['SRC_ID'] == GOGAR_SRC_ID).sum() )  #8836

0
0
1109
1094
9454
8784
670
0
8836



Daily data only available for Gogar bank.

9454 datapoints hourly in Botanic garden for 2016, which is more than expected. 
8836 datapoints hourly in Gogar bank for 2016, which is more than expected. 

### Expectation:
> ( 366 days * 24 hours = 8784 hours ) i.e. there are _**+670 extra**_ data points for the Botanic Garden (and +52 for Gogar Bank)
# 

These are likely due to versioning.  Can be easily removed by filtering `VERSION_NUM == 1`
as suggested by the Met Office ( [link](http://artefacts.ceda.ac.uk/badc_datadocs/ukmo-midas/WH_Table.html) )

use VERSION_NUM = 1 data for quality checked data

In [10]:

# Select (keep) the Botanic Garden rows: SRC_ID 253 with VERSION_NUM == 1.
is_botanic_station  = data_hourly_2016['SRC_ID'] == 253
is_first_version    = data_hourly_2016['VERSION_NUM'] == 1
data_botanic_hourly_2016 = data_hourly_2016[ is_botanic_station & is_first_version ]

# 9454 rows, the same as without the VERSION_NUM condition, so the version
# filter removes nothing here and the surplus rows are still unexplained.
print( data_botanic_hourly_2016.shape[0] )

# peek data
data_botanic_hourly_2016



9454


Unnamed: 0,OB_TIME,ID,ID_TYPE,MET_DOMAIN_NAME,VERSION_NUM,SRC_ID,REC_ST_IND,WIND_SPEED_UNIT_ID,SRC_OPR_TYPE,WIND_DIRECTION,...,VERT_VSBY_J,STN_PRES_J,ALT_PRES_J,Q10MNT_MXGST_SPD_J,RLTV_HUM,RLTV_HUM_J,SNOW_DEPTH,SNOW_DEPTH_Q,DRV_HR_SUN_DUR,DRV_HR_SUN_DUR_Q
184,2016-01-01 00:00,1649,DCNN,AWSHRLY,1,253,1011,4,6,,...,,,,,77.1,A,,,,
522,2016-01-01 01:00,1649,DCNN,AWSHRLY,1,253,1011,4,6,,...,,,,,77.6,A,,,,
859,2016-01-01 02:00,1649,DCNN,AWSHRLY,1,253,1011,4,6,,...,,,,,74.4,A,,,,
1196,2016-01-01 03:00,1649,DCNN,AWSHRLY,1,253,1011,4,6,,...,,,,,69.8,A,,,,
1535,2016-01-01 04:00,1649,DCNN,AWSHRLY,1,253,1011,4,6,,...,,,,,67.8,A,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3216680,2016-12-31 19:00,1649,DCNN,AWSHRLY,1,253,1011,4,6,,...,,,,,90.2,A,,,,
3217024,2016-12-31 20:00,1649,DCNN,AWSHRLY,1,253,1011,4,6,,...,,,,,85.8,A,,,,
3217366,2016-12-31 21:00,1649,DCNN,AWSHRLY,1,253,1011,4,6,,...,,,,,85.4,A,,,,
3217708,2016-12-31 22:00,1649,DCNN,AWSHRLY,1,253,1011,4,6,,...,,,,,80.7,A,,,,


## Look deeper at Botanic Garden data;



In [11]:
print( data_botanic_hourly_2016.shape ) # (9454, 104)

# 2016 is a leap year: 366 days of 24 hourly observations.
HOURS_IN_2016 = 366 * 24

# Number of distinct observation times, derived from the data so the figures
# below cannot go stale if the input changes.
unique_hour_count = len( data_botanic_hourly_2016['OB_TIME'].unique() )
print( unique_hour_count ) # 8782

print( unique_hour_count - HOURS_IN_2016 ) # -2, missing 2 hours

# How many timestamps occur once, twice, ... (a histogram of the per-timestamp
# row counts).
occurrence_histogram = data_botanic_hourly_2016['OB_TIME'].value_counts().value_counts()
print( occurrence_histogram )

# Summing the histogram gives back the number of distinct timestamps
# (8110 + 672 = 8782 for this data).
print( occurrence_histogram.sum() )



(9454, 104)
8782
-2
1    8110
2     672
Name: OB_TIME, dtype: int64
8782


check which datapoints are missing



In [12]:

# Reference series: every hour of 2016 rendered as a string, one entry per hour.
correct_timestamp_array = pd.period_range('2016-01-01 00:00', '2016-12-31 23:00', freq='H').to_series().astype(str).to_numpy()
# Convert the array to a NumPy string dtype so it compares element-wise with the data below.
correct_timestamp_array = correct_timestamp_array.astype(str)

# Distinct observation times present in the Botanic Garden data, in order of first appearance.
# NOTE(review): the positional comparison below assumes these are in chronological order — TODO confirm.
data_timestamp_array    = data_botanic_hourly_2016['OB_TIME'].unique().astype(str);

# check how many paddings to add for element wise comparison
# (these two lengths are evaluated but not displayed: only a cell's last expression is shown)
len( correct_timestamp_array )
len( data_timestamp_array )

# Pad the data with two placeholders so both arrays have equal length, then collect the
# positions where they disagree. From the first missing hour onwards every position
# disagrees, because the data is shifted relative to the reference.
# as the timestamp array is 1 dimensional, using only results at index 0 is enough, and it IS THE ONLY AVAILABLE result
dislocation_indexs = np.where( ( np.append( data_timestamp_array, ['PAD','PAD'] ) == correct_timestamp_array ) == False )[0];

# sanity checks (results are not displayed; the hard-coded range 3748..8783 is the
# expected contiguous run of mismatching positions for this data set)
dislocation_indexs

len( dislocation_indexs )
sum( dislocation_indexs == np.arange(3748, 8783 + 1) )

# Position of the first disagreement = position in the reference of the first missing hour.
first_dislocation_index = dislocation_indexs[0]




# missing 2016-06-05 04:00, one more to go
# (the slice holds the data values just before and just after the gap)
data_timestamp_array[first_dislocation_index - 1 : first_dislocation_index + 1]



# Do the same procedure, will not add repetitive comments on same thing.
# This time the reference starts one position later to skip the hour already found,
# so only one placeholder is needed to equalise the lengths.
len( data_timestamp_array[first_dislocation_index :] )
len( correct_timestamp_array[ first_dislocation_index + 1 : ] )

shifted_dislocation_indexs = np.where( ( np.append( data_timestamp_array[first_dislocation_index :], ['PAD'] ) == correct_timestamp_array[ first_dislocation_index + 1 : ] ) == False )[0];

shifted_dislocation_indexs

len( shifted_dislocation_indexs )
sum( shifted_dislocation_indexs == np.arange(27, 5034 + 1) )

# shifted_dislocation_indexs is shifted by  first_dislocation_index = 3748
# Adding the offset back gives the position within data_timestamp_array.
second_dislocation_index = first_dislocation_index + shifted_dislocation_indexs[0]

# missing 2016-06-06 08:00 as well, that should be all of the missing timestamp.
# This final expression is the cell's displayed output: the data values on either side of the second gap.
data_timestamp_array[second_dislocation_index - 1 : second_dislocation_index + 1]



array(['2016-06-06 07:00', '2016-06-06 09:00'], dtype='<U16')


Major findings on the data so far:

 - Shape: (9454, 104)
 - Of the <u>_9454_</u> rows, **8110** time points occur once and **672** occur twice (8110 + 2 × 672 = 9454)
 - removing duplicates results in **2** missing _time points_ namely:
   - **2016-06-05 04:00**
   - **2016-06-06 08:00**

---------------------------------

Now, let's peek at the _duplicated_ ones



In [13]:

# True for every row whose OB_TIME has already appeared in an earlier row.
duplicated_timepoints_selector = data_botanic_hourly_2016['OB_TIME'].duplicated()

duplicate_row_count = duplicated_timepoints_selector.sum() # 672 as expected

# The repeated timestamps are exactly the hours of 2016-02-01 .. 2016-02-28
# (28 * 24 = 672), i.e. all of February except 2016-02-29 appears twice.
february_hours = pd.period_range('2016-02-01 00:00', '2016-02-28 23:00', freq='H').to_series().astype(str).to_numpy()
duplicated_times = data_botanic_hourly_2016.loc[ duplicated_timepoints_selector, 'OB_TIME' ].to_numpy().astype(str)
# np.array_equal also copes with a length mismatch, where an element-wise
# `==` would not yield a per-element result.
duplicates_cover_february = np.array_equal( duplicated_times, february_hours )

# 0, so some columns have different values for the duplicated time points
fully_identical_row_count = data_botanic_hourly_2016.duplicated().sum()

# Careful with the names: duplicated(keep='last') flags every copy EXCEPT the
# last one, so data_keeplast holds the earlier copies, while data_keepfirst
# (flagging every copy except the first) holds the later copies.
data_keeplast  = data_botanic_hourly_2016[ data_botanic_hourly_2016['OB_TIME'].duplicated(keep='last') ]
data_keepfirst = data_botanic_hourly_2016[ data_botanic_hourly_2016['OB_TIME'].duplicated(keep='first')]

# The only difference seems to be the three ' 1' and ' 6'
sample_earlier_row = data_keeplast.iloc[1:2,:].to_numpy()
sample_later_row   = data_keepfirst.iloc[1:2,:].to_numpy()

# same observation for all entries
# Missing values are replaced by a sentinel first so that two NaNs in the same
# cell compare as equal; the comparison is computed once and reused.
earlier_values = data_keeplast.fillna(-9999).to_numpy()
later_values   = data_keepfirst.fillna(-9999).to_numpy()
mismatch_rows, mismatch_cols = np.where( ~( earlier_values == later_values ) )

# sanity check: three differing cells per duplicated timestamp (672 * 3)
mismatch_count = len( mismatch_rows )

# Names of the columns in which the two copies differ, derived from the
# comparison itself rather than from hard-coded positions (58, 59, 60 for
# this data).
data_WH_headers[ np.unique( mismatch_cols ) ]


array(['AIR_TEMPERATURE_Q', 'DEWPOINT_Q', 'WETB_TEMP_Q'], dtype='<U18')


So, it is found that for all the duplicates, it only differs at index 58, 59, 60

which corresponds to AIR_TEMPERATURE_Q, DEWPOINT_Q and WETB_TEMP_Q respectively

According to documentation ( [<u>link</u>](http://artefacts.ceda.ac.uk/badc_datadocs/ukmo-midas/WH_Table.html) ), these are Quality Control Codes (QC codes) for the corresponding weather recordings.

The QC codes are described [<u>here</u>](http://dap.ceda.ac.uk/thredds/fileServer/badc/ukmo-midas/metadata/doc/QC_J_flags.html)

And according to the MIDAS quick start user guide, available [<u>here</u>](http://cedadocs.ceda.ac.uk/1437/5/midas_quick_start_user_guide.pdf)

The values shall be read from the RIGHT. So it will be the QC stage:

 - **' 1'** means Stage 1 <u>_'Initial climate QC program has run'_</u>
 - **' 6'** means Stage 6 <u>_'Final (or only) areal or buddy job run and queries processed '_</u>
 - Level 0 - no processing; Level 9 - normal processing complete.


 So preferring the later stage (larger number) will be beneficial.

 Export the final data under `./data/preproessed/` directory.


In [14]:

# Distribution of the AIR_TEMPERATURE_Q codes when the first copy of each
# timestamp is kept. (Not displayed: only a cell's final expression is shown.)
qc_counts_keep_first = (
    data_botanic_hourly_2016
    .drop_duplicates(subset=['OB_TIME'], keep='first')
    ['AIR_TEMPERATURE_Q']
    .value_counts()
)

# unique timestamp by keeping latest entry
# NOTE(review): this assumes the last copy of a timestamp carries the later QC
# stage — the value counts computed here are the way to confirm it.
data_botanic_hourly_2016_unique_timestamp = data_botanic_hourly_2016.drop_duplicates(
    subset=['OB_TIME'],
    keep='last',
)

# Same distribution when the last copy is kept instead.
# 696 - 24 = 672 as expected
qc_counts_keep_last = data_botanic_hourly_2016_unique_timestamp['AIR_TEMPERATURE_Q'].value_counts()

# final 2016 hourly data for the botanic garden, with 2 missing entries, 3 missing useful columns;
weather_columns = ['OB_TIME','AIR_TEMPERATURE','RLTV_HUM','MSL_PRESSURE','STN_PRES','ALT_PRES']
botanic_weather_2016 = data_botanic_hourly_2016_unique_timestamp[ weather_columns ]

# cant plot, dtype error
#import seaborn as sns;
#sns.lineplot( data=data_botanic_hourly_2016_unique_timestamp, x='OB_TIME', y='AIR_TEMPERATURE' )

# Print the column dtypes and non-null counts of the selected weather columns.
botanic_weather_2016.info()

# can be fixed by outputting csv, reading again with correct dtype settings
# Uncomment the line below to export the preprocessed csv file
# NOTE(review): the directory below is spelled 'preproessed' while the Gogar export cell uses 'preprocessed' — confirm which one exists.
# data_botanic_hourly_2016_unique_timestamp.to_csv(path_or_buf='./data/preproessed/botanic_hourly_2016_unique_timestamp.csv', index=False);



<class 'pandas.core.frame.DataFrame'>
Int64Index: 8782 entries, 184 to 3218048
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   OB_TIME          8782 non-null   object
 1   AIR_TEMPERATURE  8782 non-null   object
 2   RLTV_HUM         8782 non-null   object
 3   MSL_PRESSURE     8782 non-null   object
 4   STN_PRES         8782 non-null   object
 5   ALT_PRES         8782 non-null   object
dtypes: object(6)
memory usage: 480.3+ KB


# How about Gogar bank? 


In [15]:

# shape is 8784 i.e. PERFECT, no missing entries, and more attributes.
# Gogar Bank rows (SRC_ID 19260) restricted to VERSION_NUM == 1; the mask is
# built once and reused.
# NOTE(review): despite the variable name, no de-duplication by OB_TIME is
# done here; the 8784-row claim (= 366 * 24) relies on the version filter alone.
gogar_station_rows = data_hourly_2016['SRC_ID'] == 19260
first_version_rows = data_hourly_2016['VERSION_NUM'] == 1
data_gogar_hourly_2016_unique_timestamp = data_hourly_2016[ gogar_station_rows & first_version_rows ];

# Display the time stamp together with the weather columns of interest.
gogar_weather_columns = ['OB_TIME','AIR_TEMPERATURE','RLTV_HUM','MSL_PRESSURE','STN_PRES','ALT_PRES']
data_gogar_hourly_2016_unique_timestamp[ gogar_weather_columns ]



Unnamed: 0,OB_TIME,AIR_TEMPERATURE,RLTV_HUM,MSL_PRESSURE,STN_PRES,ALT_PRES
38,2016-01-01 00:00,2.3,81.8,1011.2,1004,
375,2016-01-01 01:00,3,79,1012.4,1005.3,
713,2016-01-01 02:00,2.8,77.1,1013.7,1006.6,
1049,2016-01-01 03:00,3.3,70.8,1014.7,1007.5,
1388,2016-01-01 04:00,3.2,68.9,1014.9,1007.8,
...,...,...,...,...,...,...
3216536,2016-12-31 19:00,8,88.6,1016.5,1009.5,
3216880,2016-12-31 20:00,7.2,89.7,1016.6,1009.6,
3217223,2016-12-31 21:00,6.9,88.1,1016.8,1009.8,
3217565,2016-12-31 22:00,6.2,84.8,1016.9,1009.8,



Since the Gogar Bank hourly data is so complete, and our forum\_electricity data is updated _**every 10 minutes**_,
the finer-grained hourly data is most likely more beneficial than the daily data.

### **So we will not look at the daily data for now**

again, wrong dtype can be fixed by outputting csv, reading again with correct dtype settings

Uncomment the line below to export the preprocessed CSV file




In [16]:
# data_gogar_hourly_2016_unique_timestamp.to_csv(path_or_buf='./data/preprocessed/gogar_hourly_2016_unique_timestamp.csv', index=False);