# Example

- Check that the `.hscfg` file in your home folder is correctly configured (see hsdsaccess_example.ipynb).


In [None]:
# Top-level HSDS domain (folder) that is traversed with visit_domain in a later cell.
study = "/SILICON_STUDY/"

In [None]:
from ramanchada2.spectrum import from_chada
from ramanchada2.io.HSDS import visit_domain, filter_dataset, read_cha
import math
import h5pyd


def load_dataset(parentdomain,domain,results=None,h5module=h5pyd):
    """Read the study/sample annotations of one domain and append them to ``results``.

    Used in this notebook as the ``process_dataset`` callback of ``visit_domain``.

    Args:
        parentdomain: Accepted for the callback signature; not used here.
        domain: Path of the domain (file) opened with ``h5module.File``.
        results: List that receives one
            ``(domain, provider, instrument, wavelength, laser_power, sample)``
            tuple. When omitted, a fresh list is created for this call only,
            so the record is not retained anywhere.
        h5module: Module providing ``File``; defaults to ``h5pyd``.

    Returns:
        None. The outcome is the tuple appended to ``results``.
    """
    # A literal ``[]`` default is created once and shared by every call that
    # omits ``results``; build the list per call instead.
    if results is None:
        results = []
    with h5module.File(domain) as f:
        study_attrs = f["annotation_study"].attrs
        wavelength = study_attrs["wavelength"]
        instrument = study_attrs["instrument"]
        laser_power = study_attrs["laser_power"]
        provider = study_attrs["provider"]
        sample = f["annotation_sample"].attrs["sample"]
    results.append((domain,provider,instrument,wavelength,laser_power,sample))

# Sample name for the optional filtered query; only referenced by the
# commented-out visit_domain call below.
query_sample = "S0N"

# Filled in place by load_dataset: one metadata tuple per visited domain.
results = []
# Option 1 - query by sample (presumably filter_dataset forwards only the
# domains whose sample matches to load_dataset; confirm against ramanchada2):
#visit_domain(study, process_dataset=filter_dataset,kwargs={"process_file" : load_dataset,"sample": query_sample,
#                            "kwargs" : {"results" : results}})
# Option 2 - retrieve everything under study:
visit_domain(study, process_dataset=load_dataset, kwargs = {"results" : results})

In [None]:
import pandas as pd
# One row per collected tuple; the column order matches the tuples appended by load_dataset.
df = pd.DataFrame(results,columns=["domain","provider","instrument","wavelength","laser_power","sample"])
# Preview the first rows.
df.head()

In [None]:
import matplotlib.pyplot as plt
import scipy.stats as stats
import numpy as np

# The next cells test the peak fitting on a single spectrum; it is applied to all rows afterwards.

In [None]:
# Use the domain of the first row whose sample is "S0B" as the single test spectrum
# (.iloc[0] raises IndexError when no such row exists).
domain = df[df["sample"]=="S0B"].iloc[0]["domain"]


def find_peaks_bgm(spe,n_components=3,max_iter=10000,n_samples=5000):
    """Fit a Bayesian Gaussian mixture to ``spe`` and list its significant components.

    Args:
        spe: Object exposing ``bayesian_gaussian_mixture``.
        n_components: Forwarded to ``spe.bayesian_gaussian_mixture``.
        max_iter: Forwarded to ``spe.bayesian_gaussian_mixture``.
        n_samples: Forwarded to ``spe.bayesian_gaussian_mixture``.

    Returns:
        ``(bgm, peaks)``. ``bgm`` is the object returned by the fit and
        ``peaks`` is a list of ``[mean, sigma, weight]`` entries ordered by
        decreasing weight, where ``sigma`` is the square root of the first
        covariance entry. Only as many entries are kept as there are
        components whose weight, rounded to two decimals, is above zero.
    """
    fit_options = dict(
        n_samples=n_samples,
        n_components=n_components,
        max_iter=max_iter,
        moving_minimum_window=16,
        random_state=42,
        trim_range=None,
    )
    bgm = spe.bayesian_gaussian_mixture(**fit_options)

    # One [mean, sigma, weight] triple per mixture component.
    components = []
    for center, covariance, share in zip(bgm.means_, bgm.covariances_, bgm.weights_):
        components.append([center[0], np.sqrt(covariance[0][0]), share])
    # Heaviest component first.
    components.sort(key=lambda item: item[2], reverse=True)

    # Count the components whose weight does not round to zero.
    significant = (np.round(bgm.weights_, 2) > 0).sum()
    return bgm, components[:significant]

# x-axis range passed to trim_axes below.
crop_range=[450,600]
# Load the test spectrum from the selected domain via h5pyd.
spe = from_chada(domain,h5module=h5pyd)
# Keep only the part of the spectrum inside crop_range.
spe = spe.trim_axes("x-axis",crop_range)
spe.plot()
# Fit the mixture with the default settings of find_peaks_bgm.
bgm,bgm_peaks = find_peaks_bgm(spe)


In [None]:

def normalize_area(spe):
    """Return ``spe.y`` scaled so that its rectangle-rule area equals one.

    The area is computed as ``sum(y) * dx`` with ``dx = spe.x[1] - spe.x[0]``,
    so the x-axis is assumed to be uniformly spaced (not checked here).

    Args:
        spe: Object exposing array-like ``x`` and ``y`` attributes.

    Returns:
        A new array with the scaled intensities; ``spe`` is left untouched.
    """
    y = spe.y # - np.min(spe.y)
    step = spe.x[1] - spe.x[0]
    # Divide out of place: ``/=`` would overwrite the array handed out by
    # ``spe.y`` and raises for integer-typed data.
    return y / (np.sum(y) * step)

def plotdist(bgm,bgm_peaks,spe,threshold=0.00001):
    """Plot the baseline-subtracted, area-normalised spectrum against the fitted Gaussians.

    Left axis: the spectrum (dotted), a marker at the apex of every retained
    component, and the sum of the retained components evaluated on ``spe.x``.
    Right axis: the spectrum together with each retained component drawn
    separately on a 1000-point grid.

    Args:
        bgm: Fitted mixture object; only used to draw 100 samples whose
            labels are printed.
        bgm_peaks: Sequence of ``[mean, sigma, weight]`` entries.
        spe: Spectrum supporting ``spe - spe.moving_minimum(16)`` and
            exposing ``x`` / ``y``.
        threshold: Components with a weight below this value are skipped.

    Returns:
        ``(ax1, ax2)``, the two matplotlib axes.
    """
    #new_spe = spe.normalize('unity')
    # Subtract a moving-minimum baseline (window 16) before normalising.
    spe = spe - spe.moving_minimum(16)
    y_norm = normalize_area(spe)
    _, (ax1, ax2) = plt.subplots(1,2, figsize=(12,2))
    ax1.plot(spe.x,y_norm,':',label='spe')

    _xrange = np.linspace(min(spe.x),max(spe.x),1000)
    mixture = None  # running sum of the retained weighted components on spe.x
    for e in bgm_peaks:
        mu, sigma, weight = e[0], e[1], e[2]
        if weight < threshold:
            continue
        gm = stats.norm(mu, sigma)
        component = weight*gm.pdf(spe.x)
        mixture = component if mixture is None else mixture + component
        ax1.scatter(mu, weight*gm.pdf(mu),label=sigma)
        # The spectrum is redrawn for every component, as before, so the
        # resulting figure (including the colour cycle) is unchanged.
        ax2.plot(spe.x,y_norm,':')
        ax2.plot(_xrange, weight*gm.pdf(_xrange),'-')
    # Without any retained component there is nothing to sum; plotting
    # ``None`` would raise, so skip the mixture curve in that case.
    if mixture is not None:
        ax1.plot(spe.x,mixture,'.')
    #ax1.plot(_xrange,bgm.predict_proba(list(map(lambda x: [x], _xrange))),'.')
    _, y = bgm.sample(100)
    print(y)
    return (ax1,ax2)

# Show the fitted [mean, sigma, weight] entries, then plot them against the test spectrum.
print(bgm_peaks)
plotdist(bgm,bgm_peaks,spe)

In [None]:
def process_row(domain):
    """Fit the peaks of one spectrum and return them as a flat, fixed-length Series.

    The spectrum is loaded from ``domain`` via ``h5pyd``, cropped to the
    x-axis range 450-600 and fitted with ``find_peaks_bgm``.

    Args:
        domain: Domain path of the spectrum to load.

    Returns:
        ``pd.Series`` with nine entries: ``mean, sigma, weight`` for up to
        three peaks in the order returned by ``find_peaks_bgm``. Slots of
        missing peaks are left empty, and any failure is printed and yields
        nine empty values (best effort, so one bad file does not stop the run).
    """
    # 3 peaks x (mean, sigma, weight); must match the columns the result is assigned to.
    n_values = 9
    try:
        print(domain)
        crop_range=[450,600]
        spe = from_chada(domain,h5module=h5pyd)
        spe = spe.trim_axes("x-axis",crop_range)
        #spe.plot()
        bgm,bgm_peaks = find_peaks_bgm(spe)
        #apply peak finding tbd
        res = [value for peak in bgm_peaks for value in peak[:3]]
        # Fewer than three peaks may be returned; pad so every row has the
        # same length and the column assignment never sees a narrower frame.
        res.extend([None] * (n_values - len(res)))
        return pd.Series(res)

    except Exception as err:
        print(domain,err)
        return pd.Series([None] * n_values)

# Output columns: (mean, sigma, weight) for up to three peaks, in the order produced by process_row.
cols = ["peak1_mean","peak1_sigma","peak1_weight","peak2_mean","peak2_sigma","peak2_weight","peak3_mean","peak3_sigma","peak3_weight"]
# Single-file variant, kept for debugging:
#df[cols] = df[df["domain"]=="/SILICON_STUDY/ELODIZ/ELODIZ_NEEGALA/532/S0B-1_SEX139-532_Day1.cha"].apply(lambda row: process_row(row["domain"]),axis=1)

# Fit every spectrum except those whose sample is "Neon".
# NOTE(review): rows excluded by the filter presumably end up as NaN in `cols` through index alignment - confirm.
df[cols] = df[df["sample"]!="Neon"].apply(lambda row: process_row(row["domain"]),axis=1)

In [None]:
# Save the table, including the peak columns, to "silica.csv" (relative path).
df.to_csv("silica.csv")


In [None]:
import plotly.express as px


# Histogram of the first peak's mean over the rows without missing values:
# one facet row per sample, coloured by provider.
fig = px.histogram(df.dropna(),x="peak1_mean",nbins=100,facet_row="sample",color="provider",width=1200, height=600,template="simple_white")
fig.update_traces( textfont_size=8)
fig.show()