# Synthetic Dataset Verification

Goal: Create a synthetic dataset with heteroscedastic (magnitude-dependent) aleatory variability.

## Set Up

In [None]:
#standard libraries
library(assertthat)
library(stringr)
library(pracma)
library(plyr)
library(dplyr)
library(tidyverse)
#regression libraries
library(lme4)
#plotting libraries
library(ggplot2)
library(ggpubr)
library(RColorBrewer)
library(scales)
library(repr)
#user functions
#source(file.path('..','..','r_lib','auxiliary_functions.r'))

options(warn=-1)

### Plotting Setup

In [None]:
#factory for log-axis break functions: the returned closure takes the data
#values x and returns axisTicks() positions for their log10 range
# n: target number of ticks forwarded to axisTicks()
base_breaks <- function(n = 10){
  function(x) {
    #log10 of the finite data limits (NAs ignored)
    log_limits <- log10(range(x, na.rm = TRUE))
    axisTicks(log_limits, log = TRUE, n = n)
  }
}
#major log breaks: one per decade, 1e-10 ... 1e10
breaks_log       <- 10^(-10:10)
#minor log breaks: mantissas 1..9 within every decade (9 x 21 values)
breaks_log_minor <- rep(1:9, times=21) * rep(10^(-10:10), each=9)
#default ggplot theme for every figure in the notebook
theme_set(theme_linedraw())

#font sizes passed to element_text(): titles / axis and legend text
size_title <- 15
size_text  <- 12

#color palette, one color per residual / random-effect term
color_dW   <- "#666666"
color_dB   <- "#00BFC4"
color_dBP  <- "#4DAF4A"
color_dS2S <- "#F781BF"
color_dSP  <- "#FFFF33"
color_dT   <- "#F8766D"

### Define Variables

In [None]:
#EAS flatfile with outliers removed
#NOTE(review): the original comment dates the outlier removal 2023-11-10,
#              the file name is stamped 20231130 - confirm which one is right
fname_flatfile <- file.path('..','..','..','Raw_files','flatfiles','fas_flatfile_v2no_outliers_20231130.csv')

#factor applied to rrup to build the random-effect distance column (reff_rrup)
scl_rrup_rref <- 0.01

#UTM Zone
utm_zone <- '11S'

#Coefficients of the synthetic median model
c_intrcp <-  2.3    #intercept
c_mag    <-  1.0    #linear magnitude term
c_mag2   <- -0.2    #quadratic magnitude term
c_gs     <- -1.0    #geometrical spreading term
c_rrup   <- -0.01   #linear rupture-distance term
c_lnvs30 <- -0.65   #ln(vs30) term

#Aleatory Variability (standard deviations of the random draws)
tau    <- 0.6
phi    <- 0.5
phiS2S <- 0.3
#Distance Dependent Variability
tauP    <- 0.002
phiP    <- 0.002

#maximum rupture distance kept in the dataset (Inf keeps every record)
max_rrup <- Inf
# max_rrup <- 50

#empirical std correction
flag_emp_sd_correct <- TRUE

#output directories (dir_fig carries a trailing path separator)
dir_out <- file.path('..','..','..','Data','gmm_ergodic','syndata')
dir_fig <- file.path(dir_out, 'figures','')

## Load Data

In [None]:
#read flatfile
df_syn_ds <- read_csv(fname_flatfile, show_col_types = FALSE)

#retain the identifier and predictor columns only
df_syn_ds <- df_syn_ds[, c('motion_id','event_id','station_id','path_id','user_id',
                           'event_name','comcat_id',
                           'magnitude','ztor','vs30','rrup')]

#discard records beyond the maximum rupture distance
df_syn_ds <- subset(df_syn_ds, rrup <= max_rrup)

#show the retained columns and the first records
colnames(df_syn_ds)
head(df_syn_ds)

#### Compute Scaling Term

In [None]:
#compute the scaling terms of the ground-motion model and append them to the flatfile
# df_flt:           data.frame or tibble with the input columns
# mag, rrup, vs30:  names of the magnitude, rupture distance and vs30 columns in df_flt
# scl_rrup_rref:    factor multiplying rrup to obtain the random-effect distance column
# fun_mag_quad:     function(mag) returning the quadratic magnitude scaling term
# fun_pseudo_depth: function(mag) returning the pseudo depth added to rrup
# returns df_flt with the columns rrup, mag, mag2, pdepth, gs, vs30, lnvs30 and
#         reff_rrup added (or overwritten if they already exist)
calc_scl_terms <- function(df_flt, mag='magnitude', rrup='rrup', vs30='vs30', scl_rrup_rref=1,
                           fun_mag_quad, fun_pseudo_depth){

    #fail early with a readable message when an input column is absent
    cols_missing <- setdiff(c(mag, rrup, vs30), colnames(df_flt))
    if (length(cols_missing) > 0) {
        stop('calc_scl_terms: missing column(s): ', paste(cols_missing, collapse=', '), call.=FALSE)
    }

    #extract the inputs with [[ ]] so that they are plain vectors for both
    #data.frames and tibbles; df[, col] on a tibble returns a one-column tibble,
    #which would hand the callbacks different types depending on the input class
    v_mag  <- df_flt[[mag]]
    v_rrup <- df_flt[[rrup]]
    v_vs30 <- df_flt[[vs30]]

    #rupture distance scaling
    df_flt[['rrup']]      <- v_rrup
    #magnitude scaling
    df_flt[['mag']]       <- v_mag
    #quadratic magnitude scaling
    df_flt[['mag2']]      <- fun_mag_quad(v_mag)
    #pseudo depth
    df_flt[['pdepth']]    <- fun_pseudo_depth(v_mag)
    #geometrical spreading
    df_flt[['gs']]        <- log(v_rrup + df_flt[['pdepth']])
    #site scaling
    df_flt[['vs30']]      <- v_vs30
    df_flt[['lnvs30']]    <- log(v_vs30)

    #distance scaling for random effects
    df_flt[['reff_rrup']] <- scl_rrup_rref * v_rrup

    return(df_flt)
}

#median scaling from the globally defined coefficients (c_intrcp, c_mag, c_mag2,
#c_gs, c_rrup, c_lnvs30); df must hold the columns mag, mag2, gs, rrup and lnvs30
calc_med_scl <- function(df){
    #accumulate the terms one at a time, in the order of the model equation
    f_med <- c_intrcp
    f_med <- f_med + c_mag    * df$mag
    f_med <- f_med + c_mag2   * df$mag2
    f_med <- f_med + c_gs     * df$gs
    f_med <- f_med + c_rrup   * df$rrup
    f_med <- f_med + c_lnvs30 * df$lnvs30
    return(f_med)
}

#add utm coordinates

#custom scaling functions
#quadratic magnitude term: squared offset of mag from 8.5
fun_mag_quad <- function(mag) {
    (8.5-mag)^2
}
#pseudo depth: equals 5 at mag 6 and scales with exp(0.4*(mag-6))
fun_pseudo_depth <- function(mag) {
    5 * exp(0.4*(mag-6))
}

#append the scaling terms (mag, mag2, pdepth, gs, lnvs30, reff_rrup) to the dataset
df_syn_ds <- calc_scl_terms(df_syn_ds,
                            scl_rrup_rref=scl_rrup_rref,
                            fun_mag_quad=fun_mag_quad,
                            fun_pseudo_depth=fun_pseudo_depth)

#### Initialize Dataset

In [None]:
#initialize random effects
#event table: one row per unique event_id, with the metadata of its first record
df_syn_ds_eq <- left_join(data.frame(event_id=unique(df_syn_ds$event_id)),
                          df_syn_ds[,c('event_id','event_name','comcat_id','magnitude')],
                          by='event_id', multiple='first')
#station table: one row per unique station_id, with the vs30 of its first record
df_syn_ds_st <- left_join(data.frame(station_id=unique(df_syn_ds$station_id)),
                          df_syn_ds[,c('station_id','vs30')],
                          by='station_id', multiple='first')

#initialize coefficient dataframe: a single row ('gmm_syn') holding the
#coefficients and standard deviations used to generate the synthetic data
df_gmm_coeffs <- as.data.frame(t(c(c_intrcp, c_mag, c_mag2, c_gs, c_rrup, c_lnvs30,
                                   tau, tauP, phi, phiS2S, phiP)),
                               row.names='gmm_syn')
names(df_gmm_coeffs) <- c('(Intercept)','mag','mag2','gs','rrup','lnvs30',
                          'tau','tauP','phi','phiS2S','phiP')

### Synthetic Dataset

In [None]:
#median ground motion, with and without the magnitude scaling terms
df_syn_ds$med_w_mag  <- c_intrcp + c_mag * df_syn_ds$mag + c_mag2 * df_syn_ds$mag2 + c_gs * df_syn_ds$gs + c_rrup * df_syn_ds$rrup + c_lnvs30 * df_syn_ds$lnvs30
df_syn_ds$med_wo_mag <- c_intrcp +                                                   c_gs * df_syn_ds$gs + c_rrup * df_syn_ds$rrup + c_lnvs30 * df_syn_ds$lnvs30

#hetero-scedasticity scaling (multipliers applied to the phi and tau draws)
#earlier, milder variant kept for reference:
# df_syn_ds$scl_dW    <- sapply( df_syn_ds$magnitude,    function(x) 1.4 - min(0.2*max(x-4,0),0.4) )
# df_syn_ds_eq$scl_dB <- sapply( df_syn_ds_eq$magnitude, function(x) 1.2 - min(0.1*max(x-5,0),0.2) )
#scl_dW: 1.8 for M <= 4, decreasing linearly to 1.0 at M >= 6
#scl_dB: 1.5 for M <= 5, decreasing linearly to 1.0 at M >= 7
#pmin/pmax are the vectorised forms of the scalar min/max clamp and always
#return a numeric vector (sapply returns list() on empty input)
df_syn_ds$scl_dW    <- 1.8 - pmin(0.40 * pmax(df_syn_ds$magnitude    - 4, 0), 0.8)
df_syn_ds_eq$scl_dB <- 1.5 - pmin(0.25 * pmax(df_syn_ds_eq$magnitude - 5, 0), 0.5)

#NOTE(review): no random seed is set, so every run yields a different realization
#within event variability
df_syn_ds$dW <- df_syn_ds$scl_dW * rnorm(nrow(df_syn_ds), sd=phi)

#random effects conditioned on event id
df_syn_ds_eq$dB   <-  df_syn_ds_eq$scl_dB * rnorm(nrow(df_syn_ds_eq), sd=tau)
df_syn_ds_eq$dBP  <-                        rnorm(nrow(df_syn_ds_eq), sd=tauP)
#random effects conditioned on station id
df_syn_ds_st$dS2S <- rnorm(nrow(df_syn_ds_st), sd=phiS2S)
df_syn_ds_st$dSP  <- rnorm(nrow(df_syn_ds_st), sd=phiP)

#drop random-effect columns left over from a previous run of this cell, so the
#joins below do not create suffixed duplicates (dB.x, dB.y, ...)
df_syn_ds <- df_syn_ds[, setdiff(colnames(df_syn_ds), c('dB','dBP','scl_dB','dS2S','dSP')), drop=FALSE]

#merge random effects on synthetic dataset
df_syn_ds <- df_syn_ds %>% left_join(df_syn_ds_eq[c('event_id','dB','dBP','scl_dB')], by='event_id')
df_syn_ds <- df_syn_ds %>% left_join(df_syn_ds_st[c('station_id','dS2S','dSP')],      by='station_id')

#total residuals (erg: without the site term, ss: with the site term dS2S)
#NOTE(review): dSP is generated and merged but enters neither total, and the
#              distance-dependent term uses rrup rather than reff_rrup - confirm intent
df_syn_ds$syn_erg_req_dT <- df_syn_ds$dW + df_syn_ds$dB +                   df_syn_ds$rrup * df_syn_ds$dBP
df_syn_ds$syn_ss_req_dT  <- df_syn_ds$dW + df_syn_ds$dB + df_syn_ds$dS2S +  df_syn_ds$rrup * df_syn_ds$dBP

#compute response variable: median plus total residual, for each combination of
#median (with / without magnitude scaling) and residual (erg / ss)
df_syn_ds$syn_erg_w_mag_req_y  <- df_syn_ds$med_w_mag  + df_syn_ds$syn_erg_req_dT
df_syn_ds$syn_ss_w_mag_req_y   <- df_syn_ds$med_w_mag  + df_syn_ds$syn_ss_req_dT
df_syn_ds$syn_erg_wo_mag_req_y <- df_syn_ds$med_wo_mag + df_syn_ds$syn_erg_req_dT
df_syn_ds$syn_ss_wo_mag_req_y  <- df_syn_ds$med_wo_mag + df_syn_ds$syn_ss_req_dT

In [None]:
title_name <- 'Tau and Phi Scaling'

#top panel: scl_dW (multiplier of the phi draws) versus magnitude
#NOTE: the object is called *_tau_scl although it shows the phi scaling; name kept as is
pl_gmm_tau_scl <- ggplot(data=df_syn_ds, aes(x=mag, y=scl_dW)) +
    geom_point(aes(color=as.factor(1)), size=2) +
    #dashed reference line at a scaling of one
    geom_abline(intercept=1, slope=0, linetype="dashed") +
    labs(x='Magnitude', y='phi scaling', title=title_name) +
    ylim(0, 2) +
    theme(plot.title=element_text(size=size_title),
          axis.title=element_text(size=size_title),
          axis.text.y=element_text(size=size_text),
          axis.text.x=element_text(size=size_text),
          legend.title=element_text(size=size_text),
          legend.text=element_text(size=size_text)) +
    theme(legend.position = "none")

#bottom panel: scl_dB (multiplier of the tau draws) versus magnitude
#NOTE: the object is called *_phi_scl although it shows the tau scaling; name kept as is
pl_gmm_phi_scl <- ggplot(data=df_syn_ds, aes(x=mag, y=scl_dB)) +
    geom_point(aes(color=as.factor(1)), size=2) +
    #dashed reference line at a scaling of one
    geom_abline(intercept=1, slope=0, linetype="dashed") +
    labs(x='Magnitude', y='tau scaling', title='') +
    ylim(0, 2) +
    theme(plot.title=element_text(size=size_title),
          axis.title=element_text(size=size_title),
          axis.text.y=element_text(size=size_text),
          axis.text.x=element_text(size=size_text),
          legend.title=element_text(size=size_text),
          legend.text=element_text(size=size_text)) +
    theme(legend.position = "none")

#stack the two panels in a single column
options(repr.plot.width=10, repr.plot.height=2*3)
ggarrange(pl_gmm_tau_scl, pl_gmm_phi_scl, ncol=1, nrow=2)

### Save Data

In [None]:
#create output directories
#recursive=TRUE also creates missing parent directories; without it a missing
#parent makes dir.create fail silently (warnings are suppressed) and the
#write below errors
dir.create(dir_out, showWarnings = FALSE, recursive = TRUE)
dir.create(dir_fig, showWarnings = FALSE, recursive = TRUE)

#reorganize and keep relevant columns
df_syn_ds <- df_syn_ds[,c('motion_id','event_id','station_id','path_id','user_id','event_name','comcat_id','magnitude','ztor','vs30','rrup',
                          'mag','mag2','pdepth','gs','lnvs30','reff_rrup','med_w_mag','med_wo_mag',
                          'scl_dW','scl_dB','dW','dB','dBP','dS2S','dSP',
                          'syn_erg_w_mag_req_y','syn_ss_w_mag_req_y','syn_erg_wo_mag_req_y','syn_ss_wo_mag_req_y',
                          'syn_erg_req_dT','syn_ss_req_dT')]
#write out the synthetic dataset realization
write_csv(df_syn_ds, file=file.path(dir_out, 'synds_realiz.csv'))