#### Synthetic example where input U-Pb dates are derived from multiple Gaussian distributions. Mixture modeling is required to estimate the original Gaussian distributions prior to convolution with the Pb loss function.
By: Glenn R. Sharman, Department of Geosciences, University of Arkansas

In [None]:
# Import required modules
import convFuncs as convFunc

import matplotlib.pyplot as plt
import numpy as np

from scipy.interpolate import interp1d
from scipy.signal import convolve
from scipy.stats import norm
from scipy.optimize import minimize
from scipy.stats import kstest

from sklearn.mixture import GaussianMixture
from astropy.stats import kuiper

import pathlib

import xlsxwriter

from importlib import reload

import matplotlib
%matplotlib inline
%config InlineBackend.figure_format = 'retina' # For improving matplotlib figure resolution
# Font settings for vector output (PDF and PostScript) so that text in saved figures stays usable in Adobe Illustrator
matplotlib.rcParams['pdf.fonttype'] = 42 # For allowing preservation of fonts upon importing into Adobe Illustrator
matplotlib.rcParams['ps.fonttype'] = 42 # Same font type for PostScript output

In [None]:
# Parameters of the synthetic age population (one list entry per Gaussian component)
ages = [50, 52, 55]  # Component mean ages, in Ma
omega_Mas = [1, 0.8, 1.2]  # Component standard deviations, in Myr

ns = [500, 350, 150]  # Number of analyses drawn from each component
weightings = np.asarray(ns) / np.sum(ns)  # Fraction of all analyses contributed by each component

n_x = 20001  # Number of grid points used for both the Ma and the percent x-axes

In [None]:
# Percent-offset axis shared by all components
# Note: a U-Pb date cannot be < -100% from its true age, as this would result in a negative age
x1, x2 = -100, 100
x = np.linspace(x1, x2, n_x)

# Per-component results: drawn dates (in Ma and as % offset), gridded pdfs, and the age grid of each pdf
norm_Ma_n = []
norm_pct_n = []
norm_Ma_pdfs = []
xages = []
for age, omega_Ma, n_draw in zip(ages, omega_Mas, ns):
    # Age grid for this component runs from 0 Ma to twice its mean age
    xage = np.linspace(0, age*2, n_x)
    xages.append(xage)

    # Evaluate the Gaussian on its grid and store it rescaled so the gridded values sum to 1
    rv_norm_Ma = norm(loc=age, scale=omega_Ma)
    norm_Ma_pdf = rv_norm_Ma.pdf(xage)
    norm_Ma_pdfs.append(norm_Ma_pdf/np.sum(norm_Ma_pdf))

    # Draw random dates from the Gaussian and also express them as % offset from the component mean
    drawn_Ma = rv_norm_Ma.rvs(size=n_draw)
    norm_Ma_n.append(drawn_Ma)
    norm_pct_n.append((drawn_Ma-age)/age*100)

In [None]:
# The component pdfs were evaluated on different age grids, so resample them onto one shared grid.
# The shared grid ends at twice the minimum mean age so it never extends past any component's own grid
# (so the interpolation does not raise an error). Note, this will not work if the peaks are widely spaced from each other.
xage_comb = np.linspace(0, np.min(ages)*2, n_x)

norm_Ma_pdfs_comb = []
for xage_i, pdf_i in zip(xages, norm_Ma_pdfs):
    pdf_on_comb = interp1d(xage_i, pdf_i)(xage_comb)
    # Each input pdf sums to 1 on its OWN grid. The shared grid has a different spacing, so the
    # resampled values no longer sum to 1 (the sum scales with the ratio of grid spacings).
    # Renormalize so every component carries the same total mass; otherwise the weightings
    # applied to these pdfs later would be skewed toward the older components.
    norm_Ma_pdfs_comb.append(pdf_on_comb/np.sum(pdf_on_comb))

In [None]:
# Build the overall distribution as the weighted sum of the regridded component pdfs,
# then rescale it so the gridded values sum to 1
norm_Ma_pdf_comb = np.zeros_like(norm_Ma_pdfs_comb[0])
for pdf_i, weight_i in zip(norm_Ma_pdfs_comb, weightings):
    norm_Ma_pdf_comb += pdf_i*weight_i
norm_Ma_pdf_comb /= np.sum(norm_Ma_pdf_comb)

In [None]:
# Compare the drawn dates (histogram, right-hand axis) with the overall probability distribution (line, left-hand axis)
fig_cmp, ax_pdf = plt.subplots()
ax_pdf.plot(xage_comb, norm_Ma_pdf_comb, color='black')
ax_pdf.set_ylim(0,)

# Histogram of each component's samples on a twin y-axis that shares the age axis
ax_hist = ax_pdf.twinx()
ax_hist.hist(norm_Ma_n, bins=25, histtype='bar', stacked=False, rwidth=1/len(ages)*100, alpha=1);

ax_hist.set_xlim(40,60)

#### Create a Pb loss distribution and convolve it with each of the sub-distributions

In [None]:
reload(convFunc);

# Pb loss distribution used to perturb the synthetic dates
dist_type_original = 'weibull'
params = [5, 1.5] # Scale, shape

Pb_loss_pct_pdf = convFunc.Pb_loss_fun(params, dist_type_original, x)

# Convolve the Pb loss distribution with each regridded Gaussian component
conv_Ma_pdfs = [convolve(Pb_loss_pct_pdf, pdf_i, mode='same') for pdf_i in norm_Ma_pdfs_comb]

In [None]:
# Weighted sum of the convolved component pdfs (each scaled by its component's weighting)
conv_Ma_pdf_comb = np.zeros_like(conv_Ma_pdfs[0])
for conv_pdf_i, weight_i in zip(conv_Ma_pdfs, weightings):
    conv_Ma_pdf_comb += conv_pdf_i*weight_i

In [None]:
# Set any slightly negative values to 0, then rescale so the values sum to 1
conv_Ma_pdf_comb = np.where(conv_Ma_pdf_comb < 0, 0, conv_Ma_pdf_comb)
conv_Ma_pdf_comb = conv_Ma_pdf_comb/np.sum(conv_Ma_pdf_comb)

In [None]:
# Draw one date per analysis from the shared age grid, with replacement,
# using the combined convolved distribution as the selection probabilities
dates_nonCA = np.random.choice(
    xage_comb,
    size=np.sum(ns),
    replace=True,
    p=conv_Ma_pdf_comb,
)

In [None]:
# Overlay the unperturbed (navy, left-hand axis) and Pb loss perturbed (red, right-hand axis) date distributions
fig, ax = plt.subplots()

ax.plot(xage_comb, norm_Ma_pdf_comb, color='navy')
ax.set_ylim(0,)

# The perturbed distribution gets its own y-axis; the age axis is shared
ax_conv = ax.twinx()
ax_conv.plot(xage_comb, conv_Ma_pdf_comb, color='red')
ax_conv.set_ylim(0,)
ax_conv.set_xlim(35,60)

In [None]:
# Small standalone plot of the Pb loss distribution between -20% and 0% offset
fig, ax = plt.subplots(1, figsize=(3,3))
ax.plot(x, Pb_loss_pct_pdf)
ax.set(xlim=(-20,0))
ax.set_ylim(0,)

##### Conduct Gaussian mixture modeling

In [None]:
# Fit a Gaussian mixture with one component per input Gaussian to the unperturbed dates
gmm = GaussianMixture(n_components=len(ages))
X = np.concatenate(norm_Ma_n).ravel() # Flatten the list of arrays
gmm.fit(X.reshape(-1,1))

# Mixture components are returned in no particular order; sort them by mean so that
# component indices are stable from run to run (youngest component first)
gmm_order = np.argsort(gmm.means_.ravel())

# covariances_ holds VARIANCES (one 1x1 matrix per component for this 1-D fit), so take the
# square root to obtain the standard deviations that are later passed as `scale` to scipy.stats.norm.
# The builtin float is used because the np.float alias was removed from NumPy (1.24).
gmm_std_devs = [float(s) for s in np.sqrt(gmm.covariances_).ravel()[gmm_order]]
gmm_means = [float(m) for m in gmm.means_.ravel()[gmm_order]]
gmm_weights = [float(w) for w in gmm.weights_[gmm_order]]

In [None]:
# Build the modeled (GMM) distribution on the shared age grid, with and without Pb loss
gmm_Ma_pdf = np.zeros_like(conv_Ma_pdfs[0])       # Weighted sum of the fitted Gaussian components
gmm_Ma_pdf_conv = np.zeros_like(conv_Ma_pdfs[0])  # Same, after convolution with the Pb loss distribution
for gmm_mean, gmm_std_dev, gmm_weight in zip(gmm_means, gmm_std_devs, gmm_weights):
    # Evaluate this fitted component on the shared grid
    rv_norm_Ma = norm(loc=gmm_mean, scale=gmm_std_dev)
    norm_Ma_pdf = rv_norm_Ma.pdf(xage_comb)
    norm_Ma_pdf = norm_Ma_pdf/np.sum(norm_Ma_pdf) # Normalize so the gridded values sum to 1
    gmm_Ma_pdf += norm_Ma_pdf*gmm_weight

    # Convolve the FITTED component (not the true input Gaussian) with the Pb loss distribution,
    # so the pdf and the weight applied to it belong to the same mixture component
    gmm_conv = convolve(Pb_loss_pct_pdf, norm_Ma_pdf, mode='same')
    gmm_Ma_pdf_conv += gmm_conv*gmm_weight

In [None]:
# Compare the actual pdf with the GMM-modeled pdf, with the Pb loss perturbed pdf for reference
fig_gmm, ax_pdf = plt.subplots()
curves = [
    (conv_Ma_pdf_comb, '-', 'red', 'Pb loss perturbed'),
    (gmm_Ma_pdf, '--', 'navy', 'Modeled (no Pb loss)'),
    (norm_Ma_pdf_comb, '-', 'navy', 'Actual (no Pb loss)'),
]
for curve_pdf, line_style, line_color, curve_label in curves:
    ax_pdf.plot(xage_comb, curve_pdf, line_style, color=line_color, label=curve_label)
ax_pdf.legend()

ax_pdf.set_ylim(0,)

# Histogram of all unperturbed dates on a twin y-axis that shares the age axis
ax_hist = ax_pdf.twinx()
ax_hist.hist(np.concatenate(norm_Ma_n).ravel(), bins=50, alpha=0.5, color='gray');
ax_hist.set_xlim(40, 60)

#### Final step is to model Pb loss for all three Gaussians

In [None]:
reload(convFunc);

# ---- User settings ----
dist_types = ['none','constant','isolated','uniform','gamma','expon','rayleigh','weibull','pareto','halfnorm','lognorm'] # Select which form(s) of Pb loss you want to model

method = 'ss' # 'ss' is sum of squared residuals between ECDF and modeled CDF

label = 'TestMultiModalMixture'

xlim = (40, 60)
xlim_Pb_loss = (-20, 0)
dates_CA = np.concatenate(norm_Ma_n).ravel()

plot_ref_age = False

# The synthetic dates carry no analytical uncertainty
errors_nonCA = np.zeros_like(dates_nonCA)
errors_CA = np.zeros_like(dates_CA)

pathlib.Path(str(label)).mkdir(parents=True, exist_ok=True) # Recursively creates the directory and does not raise an exception if the directory already exists

file_name = str(label)+'/'+'model_results_'+label+'.xlsx'

plot_fig = True

# Most negative offset (in %) considered for the shift-type models: from the oldest mean age down to the youngest perturbed date
max_offset = (np.max(ages)-np.min(dates_nonCA))/np.max(ages)*-100

# Starting values and bounds of the g(t) (Pb loss) parameters for each supported distribution.
# f(t) is fixed to the GMM result (gmm_Ma_pdf), so only g(t) parameters are optimized.
# Parameter meanings follow the notebook's original notes -- confirm against convFunc.Pb_loss_fun.
fit_setups = {
    'none':     ([0],             [(0,0)]),                        # No free parameter
    'constant': ([-2.0],          [(max_offset,0)]),               # Shift in %
    'isolated': ([-3, 0.8],       [(max_offset,0), (0,1)]),        # Shift in %, proportion of grains with shift (0-1)
    'expon':    ([1.0],           [(0,None)]),                     # Shape
    'rayleigh': ([1.0],           [(0,None)]),                     # Shape
    'halfnorm': ([1.0],           [(0,None)]),                     # Shape
    'lognorm':  ([1.0, 1.0],      [(0,None), (0,None)]),           # Shape, scale
    'weibull':  ([1.0, 1.0],      [(0,None),(0,None)]),            # Scale, shape
    'gamma':    ([0.5, 1.0],      [(0,None),(0,None)]),            # Scale, shape
    'gengamma': ([1.0, 1.0, 1.0], [(0,None),(0,None),(0,None)]),   # Three parameters (meaning not noted in the original)
    'uniform':  ([1.0, 1.0],      [(0,None),(0,None)]),            # u_min, u_max
    'pareto':   ([1],             [(0,None)]),                     # Shape
}

# Fail early on a misspelled/unsupported distribution instead of silently reusing the previous fit
unknown_dist_types = [d for d in dist_types if d not in fit_setups]
if unknown_dist_types:
    raise ValueError('Unsupported dist_type(s): {}'.format(unknown_dist_types))

has_CA = dates_CA is not None

# Modeled f(t): the GMM pdf, rescaled so the gridded values sum to 1. This is the distribution
# handed to the optimizer below, so it is also the one the CA dates are tested against.
norm_Ma_pdf_comb_fit = gmm_Ma_pdf/np.sum(gmm_Ma_pdf)

workbook = xlsxwriter.Workbook(file_name)
try:
    bold_format = workbook.add_format({'bold' : True})

    # Record model parameters
    worksheet = workbook.add_worksheet('Model_parameters')
    worksheet.write(0, 0, 'Sample', bold_format)
    worksheet.write(0, 1, label)
    worksheet.write(1, 0, 'N (non-CA)', bold_format)
    worksheet.write(1, 1, len(dates_nonCA))
    misfit_row = 2
    if has_CA:
        worksheet.write(2, 0, 'N (CA)', bold_format)
        worksheet.write(2, 1, len(dates_CA))
        misfit_row = 3
    worksheet.write(misfit_row, 0, 'Misfit function', bold_format)
    worksheet.write(misfit_row, 1, method)

    # Header row of the results sheet. The 'f(t) age' and 'f(t) 1 s.d.' columns keep the sheet
    # layout but are not filled in below, because no f(t) parameters are optimized here.
    headers = ['fun', 'KS Dmax (f*g)', 'KS p-value (f*g)', 'Kuiper Vmax (f*g)', 'Kuiper p-value (f*g)']
    if has_CA:
        headers += ['KS Dmax (f)', 'KS p-value (f)', 'Kuiper Vmax (f)', 'Kuiper p-value (f)', 'f(t) age']
    headers += ['f(t) 1 s.d.', 'g(t) params[0]', 'g(t) params[1]', 'g(t) params[2]']

    worksheet = workbook.add_worksheet('Model_results')
    for col, header in enumerate(headers, start=1):
        worksheet.write(0, col, header, bold_format)
    first_param_col = headers.index('g(t) params[0]') + 1 # Column 0 holds the distribution name

    for row, dist_type in enumerate(dist_types, start=1):
        print('Starting ',dist_type)

        # Optimize the g(t) parameters for this distribution
        params_0, bounds = fit_setups[dist_type]
        result = minimize(convFunc.misfit_poly, params_0, args=(dist_type, gmm_Ma_pdf, x, xage_comb, dates_nonCA, method),
                          bounds=bounds, tol=1e-20, method='Powell', options={'maxiter' : 1e6, 'disp' : False})
        fitted_params = np.atleast_1d(result.x)

        # Best-fit Pb loss distribution and the resulting modeled (f*g) date distribution
        Pb_loss_pct_pdf_fit = convFunc.Pb_loss_fun(result.x, dist_type, x)
        conv_pdf_fit = convolve(Pb_loss_pct_pdf_fit, gmm_Ma_pdf, mode='same')

        # Goodness of fit of the perturbed (non-CA) dates against the modeled f*g distribution
        ks_results = kstest(rvs=dates_nonCA, cdf=convFunc.cdf_fun(xage_comb, conv_pdf_fit))
        kuiper_results = kuiper(data=dates_nonCA, cdf=convFunc.cdf_fun(xage_comb, conv_pdf_fit))
        row_values = [result.fun, ks_results[0], ks_results[1], kuiper_results[0], kuiper_results[1]]

        # Goodness of fit of the unperturbed (CA) dates against the modeled f distribution
        if has_CA:
            ks_results_f = kstest(rvs=dates_CA, cdf=convFunc.cdf_fun(xage_comb, norm_Ma_pdf_comb_fit))
            kuiper_results_f = kuiper(data=dates_CA, cdf=convFunc.cdf_fun(xage_comb, norm_Ma_pdf_comb_fit))
            row_values += [ks_results_f[0], ks_results_f[1], kuiper_results_f[0], kuiper_results_f[1]]

        # One spreadsheet row per distribution: name, statistics, then the fitted g(t) parameters
        worksheet.write(row, 0, dist_type, bold_format)
        for col, value in enumerate(row_values, start=1):
            worksheet.write(row, col, value)
        for i, param in enumerate(fitted_params):
            worksheet.write(row, first_param_col+i, param)

        print('---{}: '.format(method), np.round(result.fun,6))

        # Every entry of result.x is a g(t) parameter here, so print all of them
        for i, param in enumerate(fitted_params):
            print('---g(t) params[{}]'.format(i),np.round(param,2))

        if plot_fig:
            fig = convFunc.plot_Pb_loss_model_poly(norm_Ma_pdf = norm_Ma_pdf_comb, conv_Ma_pdf=conv_pdf_fit, params_Pb_loss=result.x,
                                                   fit=result.fun, dates_input=dates_nonCA, errors_1s_input=errors_nonCA,
                                                   xage=xage_comb, x=x, xlim=xlim, xlim_Pb_loss=xlim_Pb_loss, dist_type=dist_type,
                                                   plot_ref_age=plot_ref_age, ref_age=None, ref_age_2s_uncert=None, dates_input_CA=dates_CA,
                                                   errors_1s_input_CA=errors_CA);
            fig.savefig(str(label)+'/'+'fig_'+str(dist_type)+'.pdf')
finally:
    # Always close the workbook so the file handle is released even if a fit or a plot fails
    workbook.close()

In [None]:
# Compare the true pdfs (solid) with the modeled pdfs (dashed):
# red = with Pb loss, navy = without Pb loss
fig_fit, ax_fit = plt.subplots()
curves = [
    (conv_pdf_fit, '--', 'Modeled', 'red'),
    (conv_Ma_pdf_comb, '-', 'Actual', 'red'),
    (gmm_Ma_pdf, '--', 'Modeled', 'navy'),
    (norm_Ma_pdf_comb, '-', 'Actual', 'navy'),
]
for curve_pdf, line_style, curve_label, line_color in curves:
    ax_fit.plot(xage_comb, curve_pdf, line_style, label=curve_label, color=line_color)

ax_fit.legend()

ax_fit.set_xlim(40,60)

In [None]:
# Make a summary figure that illustrates the process of multi-modal modeling:
#   panel 1 - weighted Gaussian components and their combined distribution (no Pb loss)
#   panel 2 - the same components after convolution with the Pb loss distribution
#   panel 3 - actual (solid) vs modeled (dashed) distributions, with an inset of cumulative Pb loss

bins = np.linspace(40,60,50) # Not used by the plotting below; kept in case other cells reference it
colors = ['lightgray','slategray','black']
xlim = [40,65]

fig, axs = plt.subplots(3, 1, figsize=(5,10))

# Recompute the Pb loss distribution that was used to generate the synthetic data
Pb_loss_pct_pdf = convFunc.Pb_loss_fun(params, dist_type_original, x)

# Panels 1 and 2: shaded weighted components plus their combined curve.
# The combined curve and the axis limits are applied once per panel, after all components are
# drawn, so no duplicate line artists are created and the y-limits account for every curve.
panels = (
    (axs[0], norm_Ma_pdfs_comb, norm_Ma_pdf_comb, 'navy'),
    (axs[1], conv_Ma_pdfs, conv_Ma_pdf_comb, 'red'),
)
for ax_panel, component_pdfs, combined_pdf, combined_color in panels:
    for component_pdf, weight, fill_color in zip(component_pdfs, weightings, colors):
        ax_panel.plot(xage_comb, component_pdf*weight, color='black', alpha=1)
        ax_panel.fill_between(xage_comb, component_pdf*weight, y2=0, color=fill_color, alpha=0.5)
    ax_panel.plot(xage_comb, combined_pdf, color=combined_color)
    ax_panel.set_xlim(xlim[0],xlim[1])
    ax_panel.set_ylim(0,)

# Panel 3: actual vs modeled distributions. conv_pdf_fit (and Pb_loss_pct_pdf_fit below) hold
# whatever the model-fitting loop produced last, i.e. the final distribution type it was run with.
axs[2].plot(xage_comb, norm_Ma_pdf_comb, color='navy')
axs[2].plot(xage_comb, gmm_Ma_pdf, '--', color='navy', label='Modeled')

axs[2].plot(xage_comb, conv_Ma_pdf_comb, color='red')
axs[2].plot(xage_comb, conv_pdf_fit, '--', label='Modeled', color='red')

axs[2].set_ylim(0,)
axs[2].set_xlim(xlim[0],xlim[1])
axs[2].set_xlabel('Age (Ma)')

# Create a smaller subplot to show the distribution of Pb loss in the sample
ax_sub = axs[2].inset_axes([0.65, 0.5, 0.3, 0.4], transform=axs[2].transAxes)

# Running sums of the actual (solid) and fitted (dashed) Pb loss distributions;
# these read as cumulative distributions assuming each pdf sums to 1 -- TODO confirm in convFunc.Pb_loss_fun
ax_sub.plot(x, np.cumsum(Pb_loss_pct_pdf), color='black')
ax_sub.plot(x, np.cumsum(Pb_loss_pct_pdf_fit), '--', color='black')

ax_sub.set_xlim(xlim_Pb_loss[0],xlim_Pb_loss[1])
ax_sub.set_ylim(0,1)
ax_sub.set_xlabel('Age offset (%)')