# run the initial stuff

In [None]:
import os
import sys
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
import base_waves_class_library as bwl
import moments_function_library as mfl
import polyfit_function_library as pfl
import scatter_function_library as sfl
import gmm_library              as gmm

# Databases

Creates both the delta and the GMM scatter-distribution datasets.

In [None]:
# Column layout: six delta parameters, then (GMM only) the four mixture
# parameters, then the eight moment columns.
moment_names    = ['mean', 'variance', 'std', 'skew', 'kurtosis', 'inv_kurt', 'hyperskewness', 'hyperkurtosis']
delta_names     = ['loc_1', 'loc_2', 'loc_3', 'amp_1', 'amp_2', 'amp_3']
gmm_names       = delta_names + ['mu1', 'mu2', 'sigma', 'w1'] + moment_names
cols            = delta_names + moment_names
print(cols)

mid_low         = -0.1
mid_high        = 0.1
mid_step        = 0.02
extent_low      = 0.02
extent_high     = 0.401
extent_step     = 0.02

# Parameter grids shared by the record count and by the generation loops.
mid_grid        = np.round(np.arange(mid_low, mid_high, mid_step), 2)
extent_grid     = np.round(np.arange(extent_low, extent_high, extent_step), 2)
pair_amps       = np.round(np.arange(.05, .96, .05), 2)     # amp_1 values, two deltas
triple_amps     = np.round(np.arange(.05, .91, .05), 2)     # amp_1 values, three deltas


def _spot_grid(extent):
    """Offsets of the middle delta from loc_1 for a given half-extent."""
    return np.round(np.arange(.02, extent*2-.01, .02), 2)


def _second_amp_grid(amp_1):
    """amp_2 values available once amp_1 has been chosen (three deltas)."""
    return np.round(np.arange(.05, .96-amp_1, .05), 2)


# find out how long the database will need to be so we can reserve the whole array in memory up front.
# this will speed up the process
# The amplitude grids do not depend on mid/extent/spot, so the nested-loop
# count factorises into products of grid lengths.
num_records     = len(mid_grid) * len(extent_grid) * len(pair_amps)
num_records    += (len(mid_grid)
                   * sum(len(_spot_grid(extent)) for extent in extent_grid)
                   * sum(len(_second_amp_grid(amp_1)) for amp_1 in triple_amps))

print(num_records)

SampleRate  = 1000

x           = np.round(np.linspace(-2, 2, 4*int(SampleRate), endpoint=False), 3)
data_array  = np.zeros((num_records, len(cols)))
gmm_array   = np.zeros((num_records, len(gmm_names)))


def _store_record(row, vals, scatter):
    """Fill row `row` of data_array and gmm_array for one delta scatterer.

    `vals` holds the six delta parameters and `scatter` is the delta
    distribution sampled on `x`. The delta moments go to data_array; a GMM
    is then matched to the first five of them and its parameters and
    moments go to gmm_array.
    """
    ems             = mfl.Moments(x, scatter)
    data_array[row] = np.concatenate((vals, ems.moments))
    target_moments  = [ems.mean, ems.variance, ems.std, ems.skew, ems.kurtosis]
    results, _, _   = gmm.match_moments(target_moments)
    gmm_ems         = mfl.Moments(x, gmm.gmm_pdf(x, results))
    gmm_array[row]  = np.concatenate((vals, results, gmm_ems.moments))


start       = time.time()
idx         = 0

# create the database
# two deltas loop
for mid in mid_grid:
    for extent in extent_grid:
        loc_1 = np.round(mid - extent, 2)
        loc_2 = np.round(mid + extent, 2)
        for amp_1 in pair_amps:
            amp_2 = np.round(1.0 - amp_1, 2)
            delta = sfl.Delta(x, loc1=loc_1, loc2=loc_2, amp1=amp_1, amp2=amp_2)
            # loc_3 / amp_3 slots are filled with .49 and 0. for two-delta records
            _store_record(idx, [loc_1, loc_2, .49, amp_1, amp_2, 0.], delta.scatter)
            idx += 1
            if idx % 1000 == 0:
                print(idx)

# three deltas loop
for mid in mid_grid:
    for extent in extent_grid:
        loc_1 = np.round(mid - extent, 2)
        loc_3 = np.round(mid + extent, 2)
        for spot in _spot_grid(extent):
            loc_2 = np.round(spot + loc_1, 2)
            for amp_1 in triple_amps:
                for amp_2 in _second_amp_grid(amp_1):
                    amp_3 = np.round(1.0 - amp_1 - amp_2, 2)
                    delta = sfl.Delta(x, loc1=loc_1, loc2=loc_2, loc3=loc_3,
                                      amp1=amp_1, amp2=amp_2, amp3=amp_3)
                    _store_record(idx, [loc_1, loc_2, loc_3, amp_1, amp_2, amp_3], delta.scatter)
                    idx += 1
                    if idx % 1000 == 0:
                        print(idx)

database    = pd.DataFrame(data_array, columns=cols)
database2   = pd.DataFrame(gmm_array, columns=gmm_names)
end         = time.time()
print("Time taken: ", (end - start)/60)

database.to_csv('delta_database.csv', index=False)
database2.to_csv('gmm_database.csv', index=False)

##### clean up gmm database

The GMM may not have fit every delta scatterer perfectly. Such records should be removed when they fall outside the parameter space of the delta dataset.

In [None]:
# Flag and drop GMM records whose moments do not reproduce the delta record
# at the same row position (compared after rounding to two decimals).
#
# No earlier cell defines `db` / `db2` (the creation cell only leaves
# `database` / `database2` behind), so load both tables from the CSV files
# that cell wrote. This keeps the cell runnable top-to-bottom and after a
# kernel restart.
db      = pd.read_csv('delta_database.csv')
db2     = pd.read_csv('gmm_database.csv')

# Records are matched by row position. Once gmm_database.csv has been
# filtered and re-saved, its rows no longer line up with delta_database.csv
# and every comparison below would be against the wrong record.
if len(db) != len(db2):
    raise ValueError(
        f"delta_database.csv has {len(db)} rows but gmm_database.csv has {len(db2)}; "
        "the GMM database looks already cleaned - regenerate it before cleaning again"
    )

SampleRate      = 100
ScatterLen      = 4
x               = np.round(np.linspace(-ScatterLen/2, ScatterLen/2, ScatterLen*SampleRate, endpoint=False), 3)

check_cols      = ['mean', 'std', 'skew', 'kurtosis']


def _mismatch(a, b):
    """Return True when a and b differ after rounding to 2 decimals.

    NaN compares unequal to everything, so a NaN moment is always a mismatch.
    """
    return np.round(a, 2) != np.round(b, 2)


keep = np.ones(len(db2), dtype=bool)

for pos, (idx, row) in enumerate(db2.iterrows()):
    if idx % 1000 == 0:
        print(idx)
    # Re-evaluate the fitted mixture on this cell's grid and recompute its moments.
    gmm_scatter = sfl.GMM(x, row['mu1'], row['mu2'], row['sigma'], row['w1'])
    mnts        = mfl.Moments(gmm_scatter.x, gmm_scatter.scatter)

    refit       = (mnts.mean, mnts.std, mnts.skew, mnts.kurtosis)
    stored      = [row[col] for col in check_cols]
    target      = [db.loc[idx, col] for col in check_cols]

    # A record is bad if the recomputed moments disagree with the stored GMM
    # moments, or the stored GMM moments disagree with the delta moments.
    if any(_mismatch(r, s) or _mismatch(s, t) for r, s, t in zip(refit, stored, target)):
        keep[pos] = False

# The 'good' column is kept on the filtered frame, as before.
db2['good'] = keep
db2         = db2[keep]

print(db.shape)
print(db2.shape)

In [None]:
# Write the current db2 frame to gmm_database.csv without the index column.
db2.to_csv(path_or_buf='gmm_database.csv', index=False)

# Polynomial Fits
(and new, complete database for ML training)

In [None]:
# Load the delta table into db and the GMM table into db2.
db, db2 = (pd.read_csv(path) for path in ('delta_database.csv', 'gmm_database.csv'))

### reports (optional)

In [None]:
# Quick-look histograms (100 bins each) of the delta-database moments.
plt.rcParams['figure.figsize'] = [4, 2]
plt.rcParams['figure.dpi'] = 100

# One figure per moment column, followed by the reciprocal of the kurtosis.
histogram_inputs = [db[name] for name in ('mean', 'std', 'variance', 'skew', 'kurtosis')]
histogram_inputs.append(1/db['kurtosis'])

for values in histogram_inputs:
    plt.hist(values, bins=100)
    plt.show()

## tests

In [None]:
# Two-panel figure: the signal's real/imaginary parts over time (top) and
# the normalised real part of its FFT (bottom).
# NOTE(review): `sr1` is not defined in any visible cell - it must already
# exist in the session and expose `.t` and `.signal`; confirm where it is created.
plt.rcParams['figure.dpi'] = 400
fig, axs = plt.subplots(2, 1, figsize=(4, 4))
ax_time, ax_freq = axs

# --- time domain ---
# ax_time.plot(sr1.t, np.abs(sr1.signal), label='abs')
for part, part_label in ((np.real, 'real'), (np.imag, 'imag')):
    ax_time.plot(sr1.t, part(sr1.signal), label=part_label)
ax_time.set_xlim(-2.5, 2.5)
ax_time.set_ylim(-6.2, 5)
ax_time.legend(loc='lower right')
ax_time.set_xlabel(r'Time (units of $B^{-1}$)')

# dashed red lines at the four interior boundaries, labels t_1..t_5 between them
ax_time.vlines([-1.5, -0.5, 0.5, 1.5], -6.5, 5.5, linestyles='dashed', colors='r')
for number, x_pos in enumerate((-2.1, -1.1, -0.1, 0.9, 1.9), start=1):
    ax_time.text(x_pos, 3.5, rf'$t_{number}$', fontsize=12)

# --- frequency domain ---
freq_axis = np.fft.fftshift(np.fft.fftfreq(len(sr1.signal), 1/SampleRate))
spectrum = np.fft.fftshift(np.real(np.fft.fft(sr1.signal)))

ax_freq.plot(freq_axis, spectrum/np.max(np.abs(spectrum)))
ax_freq.set_xlim(-.5, 1.5)
ax_freq.set_xlabel('Frequency (units of $B$)')

plt.tight_layout()
plt.show()

## final database creation

### base functions and variable dump

In [None]:
# must run this cell every time you create a new final database
# Shared settings, sample grids and output column names for the
# final-database cells.

# fit / noise settings
power           = 4
snr             = 0
segments        = 5
chunk_size      = 1000
real            = False
add_noise       = snr != 0          # noise is only added for a non-zero snr
bw              = 2*np.pi

# sampling grids
SampleRate      = 100
SignalLen       = 10
ScatterLen      = 2


def _centered_axis(length):
    """Axis of length*SampleRate points on [-length/2, length/2), rounded to 3 decimals."""
    return np.round(np.linspace(-length/2, length/2, length*SampleRate, endpoint=False), 3)


t               = _centered_axis(SignalLen)
x               = _centered_axis(ScatterLen)

# base signal
coefs           = [1, 1, -1, -1, -1, -1]
f_low           = 0
f_high          = f_low + 1
sr              = bwl.SuperRandom(f_low=f_low*bw, f_high=f_high*bw, t=t, coefs=coefs)
use_base        = False

# polynomial-coefficient columns: every segment's real coefficients first,
# then every segment's imaginary coefficients
poly_columns    = []
for part in ('real', 'imag'):
    for seg in range(segments):
        poly_columns.extend(f'segment{seg}_{coef}_{part}' for coef in range(power + 1))

corr_columns    = ['0_lag_corr_base', 'best_corr_base', 'best_lag_base']

### delta version

In [None]:
# Build one delta output CSV per SNR, in chunks, resuming from an existing
# output file when one is present.
moment_columns  = db.columns.tolist()
all_columns     = moment_columns + poly_columns + corr_columns

for snr in (10, 100):
    add_noise   = snr != 0
    output_file = f'db_delta_snr{snr}_complex_{f_low}to{f_high}.csv'

    # Rows already present in the output file are skipped.
    if not os.path.exists(output_file):
        start_idx = 0
        print("Starting processing from the beginning")
    else:
        processed_df = pd.read_csv(output_file)
        start_idx = len(processed_df)
        print(f"Resuming from record {start_idx}")

    def _fit_row(row):
        """Run pfl.add_poly_coefs on one delta record with the current settings."""
        return pfl.add_poly_coefs(row, x, sr, power, add_noise=add_noise,
                                  snr=snr, segments=segments, real=real, f_low=f_low*bw,
                                  SampleRate=SampleRate, use_base=use_base)

    st = time.time()
    total_rows = len(db)

    # Process in chunks to deal with running out of processing time
    for start in range(start_idx, total_rows, chunk_size):
        end     = min(start + chunk_size, total_rows)
        chunk   = db.iloc[start:end].copy()

        print(f"Processing records {start} to {end - 1}")

        start_time  = time.time()
        fitted_rows = chunk.apply(_fit_row, axis=1).tolist()
        chunk_df    = pd.DataFrame(fitted_rows, columns=all_columns)

        # First chunk creates the file with a header; later chunks append.
        file_exists = os.path.exists(output_file)
        chunk_df.to_csv(output_file, mode='a' if file_exists else 'w',
                        header=not file_exists, index=False)

        end_time = time.time()

        print((end_time - start_time)/60)

    et = time.time()
    print("Processing complete!")
    print("Time taken: ", (et - st)/60)

### GMM version

In [None]:
# Build one GMM output CSV per SNR, in chunks, resuming from an existing
# output file when one is present.
moment_columns2 = ['mu1', 'mu2', 'sigma', 'w1', 'mean', 'variance', 'std', 'skew',
                   'kurtosis', 'inv_kurt', 'hyperskewness', 'hyperkurtosis']
corr_columns2   = ['0_lag_corr_delta', 'best_corr_delta', 'best_lag_delta']
all_columns     = moment_columns2 + poly_columns + corr_columns + corr_columns2

for snr in (10, 100):
    add_noise   = snr != 0
    output_file = f'db_gmm_snr{snr}_complex_{f_low}to{f_high}.csv'

    # Rows already present in the output file are skipped.
    if not os.path.exists(output_file):
        start_idx = 0
        print("Starting processing from the beginning")
    else:
        processed_df = pd.read_csv(output_file)
        start_idx = len(processed_df)
        print(f"Resuming from record {start_idx}")

    def _fit_row(row):
        """Run pfl.add_poly_coefs_gmm on one GMM record with the current settings."""
        return pfl.add_poly_coefs_gmm(row, x, sr, power, add_noise=add_noise, snr=snr,
                                      segments=segments, real=real, f_low=f_low*bw,
                                      SampleRate=SampleRate, use_base=use_base)

    total_rows = len(db2)

    # Process in chunks to deal with running out of processing time
    for start in range(start_idx, total_rows, chunk_size):
        end     = min(start + chunk_size, total_rows)
        chunk   = db2.iloc[start:end].copy()

        print(f"Processing records {start} to {end - 1}")

        start_time      = time.time()
        processed_chunk = chunk.apply(_fit_row, axis=1).tolist()
        chunk_df        = pd.DataFrame(processed_chunk, columns=all_columns)

        # First chunk creates the file with a header; later chunks append.
        file_exists = os.path.exists(output_file)
        chunk_df.to_csv(output_file, mode='a' if file_exists else 'w',
                        header=not file_exists, index=False)

        end_time        = time.time()

        print((end_time - start_time)/60)

    print("Processing complete!")