# Create `Stations_Unique.csv`

##  Configuration

In [1]:
# Import dependencies
import os
import glob
import pandas as pd

from plots import stations_plot, gdf_epicentre

## User input

### Indicate event and list `Stations_REF.csv` files

In [None]:
# Event name (it is used as a folder name in the search pattern below)
event = '20150425_M7.8_Gorkha'

# Collect every `Stations_*.csv` file stored in the `Recording_Stations`
# folder of the event, searching recursively from the parent directory.
# `glob` returns matches in arbitrary order, so they are sorted to make the
# printed list and the selected `folder` reproducible between runs.
pattern = os.path.join('..', '**', event, 'Recording_Stations',
                       'Stations_*.csv')
stations = sorted(glob.glob(pattern, recursive=True))

# Fail with an explicit message instead of an opaque IndexError on
# `stations[0]` when the event name is wrong or no file exists yet.
if not stations:
    raise FileNotFoundError(
        f'No station files found for event {event} (pattern: {pattern})')

# Folder that contains the first matched file
folder = os.path.dirname(stations[0])

# Raise error if 'Stations_Unique.csv' in stations
unique = [item for item in stations
          if 'Stations_Unique.csv' in item]
assert len(unique) == 0, '`Stations_Unique.csv` already exists'

print('Sources of data:')
print(*stations, sep='\n')

### List unique references

In [None]:
# Station files used to create the `Stations_Unique.csv` file, listed from
# highest to lowest priority: the files are concatenated in this order and,
# when a station is flagged as duplicated, the first occurrence is the one kept.
# Each entry is the path of a CSV file (it is read with `pd.read_csv`); use the
# paths printed above, which include the event name.
# The README file includes the list and priority of reference data.

priority = ['file_1', 'file_2']

## Create unique file

### Add all sources of info based on priority

In [None]:
# Raise error if the files are not for the specified event.
# Every path is checked, not only the first one, so a file from another
# event cannot slip in further down the priority list.
assert len(priority) != 0, 'Indicate list of files as source data'
assert all(event in path for path in priority), (
    f'The priority files are not associated with the event {event}')

# Read all sources of information, keeping the priority order
dfs = [pd.read_csv(x) for x in priority]

# Report the sources that cannot be sorted because they have no PGA_VALUE
missing_pga = [path for path, frame in zip(priority, dfs)
               if 'PGA_VALUE' not in frame.columns]
if missing_pga:
    print('PGA_VALUE not in columns')
    print(*missing_pga, sep='\n')

# Sort each source by descending PGA_VALUE, if available.
# The decision is made per file: a source without PGA_VALUE is left in its
# original order and does not prevent the other sources from being sorted.
dfs = [frame.sort_values('PGA_VALUE', ascending=False)
       if 'PGA_VALUE' in frame.columns else frame
       for frame in dfs]

# Stack the sources in priority order (row labels of each file are kept)
df = pd.concat(dfs)
df

### Check for duplicated rows

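We check for two different types of duplicates:
- `dup1`: rows whose `STATION_ID` repeats an earlier row
- `dup2`: rows whose coordinates (`LONGITUDE`, `LATITUDE`, rounded to 5 decimal places) repeat an earlier row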

In [None]:
# Print duplicated values

def find_duplicates(df, cols):
    """Flag the rows of `df` that repeat an earlier row on `cols`.

    A short report is printed: the number of duplicated rows followed by
    their STATION_ID and REFERENCES values (only the columns that exist in
    `df` are shown), or `  None` when nothing is duplicated.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to inspect.
    cols : list of str
        Columns that define when two rows are considered the same.

    Returns
    -------
    pandas.Series
        Boolean mask with the index of `df`; True marks every occurrence
        after the first one.
    """
    dup = df[cols].duplicated()
    n_dup = int(dup.sum())

    if n_dup != 0:
        print(f'\x1b[0;31m \n  Found {n_dup} duplicated values\x1b[0m')
        # The report is only informative: show the identifying columns that
        # are present instead of raising KeyError when one is missing.
        shown = [col for col in ('STATION_ID', 'REFERENCES')
                 if col in df.columns]
        print(df.loc[dup, shown] if shown else df.loc[dup])
    else:
        print('  None')

    return dup

# First check: the same STATION_ID appearing more than once in the
# concatenated sources
print('Check duplicates for STATION_ID')
dup1 = find_duplicates(df=df, cols=['STATION_ID'])

# Second check: the same location appearing more than once. The comparison
# runs on a copy rounded to 5 decimal places.
print('\nCheck duplicates for COORDINATES')
dup2 = find_duplicates(df=df.round(decimals=5),
                       cols=['LONGITUDE', 'LATITUDE'])


### Remove duplicated stations

In [None]:
# Rows are discarded according to their position within the files: the
# masks flag the later occurrences, so the first one is kept.
# Choose how the two checks are combined:
#    dup = dup1 | dup2 --> union of dup1 and dup2
#    dup = dup1 & dup2 --> intersection of dup1 and dup2
dup = dup1 | dup2

# Report only when there is something to remove
if dup.any():
    print('\n Total recording stations including duplicates:', len(df))
    print(f'\nRemoving {len(df.loc[dup])} duplicated rows')

# Keep the rows that are not flagged, then discard the columns that hold
# no data at all (if any)
df2 = df.loc[~dup].dropna(how='all', axis='columns').copy()

print(f'Station recordings: {len(df2.index)}')


## Plot recording stations in PGA

In [None]:
# Draw the recording stations; any 'DRAFT_' marker in the event name is
# left out of the title
title = 'Recording stations ' + event.replace('DRAFT_', '')
fig, ax = stations_plot(df2, title=title)

# Mark the epicentre with a star. Its position is read from the USGS
# rupture model stored in the `Rupture` folder next to the stations folder.
ref_rupture = os.path.join(
    folder, '..', 'Rupture', 'earthquake_rupture_model_USGS.xml')
epi = gdf_epicentre(ref_rupture)
ax.scatter(
    epi.geometry.x,
    epi.geometry.y,
    marker='*',
    s=500,
    color='gold',
    edgecolor='grey',
    zorder=2,
)

# # Manually adjust limits to include epicenter in figure
# import contextily as ctx
# ax.set_xlim(xmin, xmax)
# ax.set_ylim(ymin, ymax)
# ctx.add_basemap(ax, source=ctx.providers.Stamen.TonerLite, crs='EPSG:4326', alpha=0.8)
 

## Save files

In [None]:
# Write `Stations_Unique.csv` next to the source files.
# NOTE(review): `stations` is the list of source CSV paths found by the
# glob search, so this guard counts source files, while the message talks
# about stations. Confirm which of the two is intended.
if len(stations) <= 1:
    print('BE CAREFUL!!!!!',
          'There is only 1 station. At least 2 stations needed',
          'File not saved.', sep='\n')
else:
    file_path = os.path.join(folder, 'Stations_Unique.csv')
    df2.to_csv(file_path, index=False, encoding='utf-8')
    print('\n Saving updated file in:\n', file_path)

# The figure is written in every case, with a white background
output_path = os.path.join(folder, 'recording_stations.png')
fig.savefig(
    output_path,
    dpi=200,
    facecolor="w",
    bbox_inches='tight',
)
print('figure saved in', output_path)