# Create csv of file list

In [1]:
from os import listdir, path, makedirs
import re
from datetime import datetime, timedelta, time
from glob import glob
import pandas as pd

### Image directory

In [2]:
original_dir = '/n/mickley/users/'
band1_dir = path.join(original_dir, 'ktoshima', '*b1*')
band3_dir = path.join(original_dir, 'ktoshima', '*b3*')
hms_dir = path.join(original_dir, 'HMS', 'HMS_Density')
daynight_dir = path.join(original_dir, 'HMS', 'DayNight')

## Define patterns to extract timestamp from images
### GOES pattern

In [29]:
band1_path = path.join(band1_dir, "**", "*.png")
band3_path = path.join(band3_dir, "**", "*.png")

In [30]:
sample = path.basename(glob(band1_path, recursive=True)[0])
sample

'GOES16_201807010100.png'

In [4]:
goes_pattern = re.compile(r"GOES16_(?P<year>\d{4}?)(?P<month>\d{2}?)(?P<day>\d{2}?)(?P<hour>\d{2}?)(?P<minute>\d{2}?)\.png", re.VERBOSE)

In [7]:
def extract_GOES(s):
    match = goes_pattern.match(s)
    if match:
        year = int(match.group('year'))
        month = int(match.group('month'))
        day = int(match.group('day'))
        hour = int(match.group('hour'))
        minute = 0 if int(match.group('minute')) < 30 else 30
        return datetime(year=year, month=month, day=day, hour=hour, minute=minute)
    else:
        return None

In [8]:
extract_GOES(sample)

datetime.datetime(2018, 7, 1, 1, 0)

### HMS pattern

In [31]:
hms_path = path.join(hms_dir, "**", "*.tif")

In [32]:
hms_sample = path.basename(glob(hms_path, recursive=True)[2])
hms_sample

'HMS_Density_20170701_0100.tif'

In [12]:
hms_pattern = re.compile(r"HMS_Density_(?P<year>\d{4}?)(?P<month>\d{2}?)(?P<day>\d{2}?)_(?P<hour>\d{2}?)(?P<minute>\d{2}?)\.tif", re.VERBOSE)

In [13]:
def extract_HMS(s):
    match = hms_pattern.match(s)
    if match:
        year = int(match.group('year'))
        month = int(match.group('month'))
        day = int(match.group('day'))
        hour = int(match.group('hour'))
        minute = 0 if int(match.group('minute')) < 30 else 30
        return datetime(year=year, month=month, day=day, hour=hour, minute=minute)
    else:
        return ValueError()

In [14]:
extract_HMS(hms_sample)

datetime.datetime(2017, 7, 1, 1, 0)

### Daynight pattern

In [35]:
daynight_path = path.join(daynight_dir, "**", "*.png")

In [40]:
daynight_sample = path.basename(glob(daynight_path, recursive=True)[10])
daynight_sample

'DayNight_001_0500.png'

In [41]:
daynight_pattern = re.compile(r"DayNight_(?P<yday>\d{3}?)_(?P<hour>\d{2}?)(?P<minute>\d{2}?)\.png", re.VERBOSE)

In [42]:
def extract_daynight(s):
    match = daynight_pattern.match(s)
    if match:
        yday = int(match.group('yday'))
        hour = int(match.group('hour'))
        minute = 0 if int(match.group('minute')) < 30 else 30
        return yday, hour, minute
    else:
        return ValueError()

In [43]:
extract_daynight(daynight_sample)

(1, 5, 0)

## Create csv for file list
### GOES

In [44]:
band1_path_list = filter(lambda s: goes_pattern.match(path.basename(s)), glob(band1_path, recursive=True))
band3_path_list = filter(lambda s: goes_pattern.match(path.basename(s)), glob(band3_path, recursive=True))

In [45]:
band1_df = pd.concat([pd.DataFrame([file_path],
                                   columns=['path_band1']) 
                      for file_path in band1_path_list],
                     ignore_index=True)
band3_df = pd.concat([pd.DataFrame([file_path],
                                   columns=['path_band3']) 
                      for file_path in band3_path_list],
                     ignore_index=True)

In [46]:
band1_df['timestamp'] = band1_df['path_band1'].apply(lambda filepath: extract_GOES(path.basename(filepath)))
band1_df = band1_df.set_index('timestamp').sort_index()

In [65]:
band1_df

Unnamed: 0_level_0,path_band1
timestamp,Unnamed: 1_level_1
2017-07-11 00:00:00,/n/mickley/users/ktoshima/goes16_b1_2017/GOES1...
2017-07-11 00:30:00,/n/mickley/users/ktoshima/goes16_b1_2017/GOES1...
2017-07-11 01:00:00,/n/mickley/users/ktoshima/goes16_b1_2017/GOES1...
2017-07-11 01:30:00,/n/mickley/users/ktoshima/goes16_b1_2017/GOES1...
2017-07-11 02:00:00,/n/mickley/users/ktoshima/goes16_b1_2017/GOES1...
...,...
2020-11-30 17:00:00,/n/mickley/users/ktoshima/goes16_b1_2020/GOES1...
2020-11-30 17:30:00,/n/mickley/users/ktoshima/goes16_b1_2020/GOES1...
2020-11-30 18:00:00,/n/mickley/users/ktoshima/goes16_b1_2020/GOES1...
2020-11-30 18:30:00,/n/mickley/users/ktoshima/goes16_b1_2020/GOES1...


In [66]:
len(band1_df)

11117

In [48]:
band3_df['timestamp'] = band3_df['path_band3'].apply(lambda filepath: extract_GOES(path.basename(filepath)))
band3_df = band3_df.set_index('timestamp').sort_index()

In [67]:
band3_df

Unnamed: 0_level_0,path_band3
timestamp,Unnamed: 1_level_1
2017-07-11 00:00:00,/n/mickley/users/ktoshima/goes16_b3_2017/GOES1...
2017-07-11 00:30:00,/n/mickley/users/ktoshima/goes16_b3_2017/GOES1...
2017-07-11 01:00:00,/n/mickley/users/ktoshima/goes16_b3_2017/GOES1...
2017-07-11 01:30:00,/n/mickley/users/ktoshima/goes16_b3_2017/GOES1...
2017-07-11 02:00:00,/n/mickley/users/ktoshima/goes16_b3_2017/GOES1...
...,...
2020-11-30 17:00:00,/n/mickley/users/ktoshima/goes16_b3_2020/GOES1...
2020-11-30 17:30:00,/n/mickley/users/ktoshima/goes16_b3_2020/GOES1...
2020-11-30 18:00:00,/n/mickley/users/ktoshima/goes16_b3_2020/GOES1...
2020-11-30 18:30:00,/n/mickley/users/ktoshima/goes16_b3_2020/GOES1...


In [68]:
len(band3_df)

11117

### HMS

In [50]:
hms_path_list = filter(lambda s: hms_pattern.match(path.basename(s)), glob(hms_path, recursive=True))

In [51]:
hms_df = pd.concat([pd.DataFrame([file_path],
                                 columns=['path_hms']) 
                    for file_path in hms_path_list],
                   ignore_index=True)

In [52]:
hms_df['timestamp'] = hms_df['path_hms'].apply(lambda filepath: extract_HMS(path.basename(filepath)))
hms_df = hms_df.set_index('timestamp').sort_index()

In [69]:
hms_df

Unnamed: 0_level_0,path_hms
timestamp,Unnamed: 1_level_1
2017-07-01 00:00:00,/n/mickley/users/HMS/HMS_Density/2017/HMS_Dens...
2017-07-01 00:30:00,/n/mickley/users/HMS/HMS_Density/2017/HMS_Dens...
2017-07-01 01:00:00,/n/mickley/users/HMS/HMS_Density/2017/HMS_Dens...
2017-07-01 01:30:00,/n/mickley/users/HMS/HMS_Density/2017/HMS_Dens...
2017-07-01 02:00:00,/n/mickley/users/HMS/HMS_Density/2017/HMS_Dens...
...,...
2020-11-30 17:00:00,/n/mickley/users/HMS/HMS_Density/2020/HMS_Dens...
2020-11-30 17:30:00,/n/mickley/users/HMS/HMS_Density/2020/HMS_Dens...
2020-11-30 18:00:00,/n/mickley/users/HMS/HMS_Density/2020/HMS_Dens...
2020-11-30 18:30:00,/n/mickley/users/HMS/HMS_Density/2020/HMS_Dens...


In [70]:
len(hms_df)

11317

### Daynight

In [54]:
daynight_path_list = filter(lambda s: daynight_pattern.match(path.basename(s)), glob(daynight_path, recursive=True))

In [55]:
daynight_df = pd.concat([pd.DataFrame([file_path],
                                 columns=['path_daynight']) 
                    for file_path in daynight_path_list],
                   ignore_index=True)

In [56]:
def extract_daynight_timestamp(daynight_filename):
    yday, hour, minute = extract_daynight(path.basename(daynight_filename))
    return pd.Series({'yday':yday, 'hour':hour, 'minute': minute})
daynight_df = daynight_df.merge(daynight_df.path_daynight.apply(extract_daynight_timestamp), left_index=True, right_index=True)

In [57]:
daynight_df = daynight_df.set_index(['yday', 'hour', 'minute']).sort_index()

In [58]:
daynight_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,path_daynight
yday,hour,minute,Unnamed: 3_level_1
1,0,0,/n/mickley/users/HMS/DayNight/DayNight_001_000...
1,0,30,/n/mickley/users/HMS/DayNight/DayNight_001_003...
1,1,0,/n/mickley/users/HMS/DayNight/DayNight_001_010...
1,1,30,/n/mickley/users/HMS/DayNight/DayNight_001_013...
1,2,0,/n/mickley/users/HMS/DayNight/DayNight_001_020...
...,...,...,...
366,21,30,/n/mickley/users/HMS/DayNight/DayNight_366_213...
366,22,0,/n/mickley/users/HMS/DayNight/DayNight_366_220...
366,22,30,/n/mickley/users/HMS/DayNight/DayNight_366_223...
366,23,0,/n/mickley/users/HMS/DayNight/DayNight_366_230...


## Remove duplicates
### GOES

In [59]:
band1_df_mod = band1_df.reset_index()
duplicated_index = band1_df_mod.timestamp.duplicated()
print(duplicated_index.sum())

0


In [60]:
duplicated_elements = band1_df_mod[band1_df_mod.timestamp == band1_df_mod[duplicated_index].timestamp.iat[0]]
print(duplicated_elements)
print(duplicated_elements.values)

IndexError: index 0 is out of bounds for axis 0 with size 0

In [37]:
remove_idx = duplicated_elements.iloc[0].name
band1_df_mod = band1_df_mod.drop(index=remove_idx)
band1_df = band1_df_mod.set_index('timestamp')

In [38]:
band1_df

Unnamed: 0_level_0,path_band1
timestamp,Unnamed: 1_level_1
2018-01-01 20:00:00,/n/mickley/lab/HMS_vision/original/band1/RadF-...
2018-01-01 20:30:00,/n/mickley/lab/HMS_vision/original/band1/RadF-...
2018-01-01 21:00:00,/n/mickley/lab/HMS_vision/original/band1/RadF-...
2018-01-01 21:30:00,/n/mickley/lab/HMS_vision/original/band1/RadF-...
2018-01-01 22:00:00,/n/mickley/lab/HMS_vision/original/band1/RadF-...
...,...
2018-12-29 23:30:00,/n/mickley/lab/HMS_vision/original/band1/RadF-...
2018-12-30 00:00:00,/n/mickley/lab/HMS_vision/original/band1/RadF-...
2018-12-30 00:30:00,/n/mickley/lab/HMS_vision/original/band1/RadF-...
2018-12-30 01:00:00,/n/mickley/lab/HMS_vision/original/band1/RadF-...


In [61]:
band3_df_mod = band3_df.reset_index()
duplicated_index = band3_df_mod.timestamp.duplicated()
print(duplicated_index.sum())

0


### HMS

In [62]:
hms_df_mod = hms_df.reset_index()
duplicated_index = hms_df_mod.timestamp.duplicated()
print(duplicated_index.sum())

0


### Save csv

In [64]:
band1_df.to_csv('csv/band1.csv')
band3_df.to_csv('csv/band3.csv')
hms_df.to_csv('csv/hms.csv')
daynight_df.to_csv('csv/daynight.csv')