In [None]:
import pickle
import torch

import numpy as np
import pandas as pd
import plotly.graph_objects as go
import math

import os
import gc

import random

In [None]:
# Pickled datasets read by this notebook.
# NOTE(review): these are absolute paths on one specific machine; adjust before running elsewhere.
TRAINING_DATASET="/media/eduseiti/bigdata02/unicamp/doutorado/bootstrap.pytorch/data/mixedSpectraCrux/sequences/train_mixedSpectraCrux_v6.2.pkl"

# Not referenced by any cell visible in this notebook.
ONE_EXPERIMENT="/media/eduseiti/bigdata02/unicamp/doutorado/bootstrap.pytorch/data/mixedSpectraCrux/sequences/Fetal_Testis_Gel_Velos_27_crux_q0.01_pvalue-63.91983.pkl"

VALIDATION_DATASET="/media/eduseiti/bigdata02/unicamp/doutorado/bootstrap.pytorch/data/mixedSpectraCrux/sequences/test_mixedSpectraCrux_v6.2.pkl"

TEST_DATASET="/media/eduseiti/bigdata02/unicamp/doutorado/bootstrap.pytorch/data/linfeng_all_q0.01_cell_state/sequences/sample_experiment_v6.2.pkl"

# Folder receiving the exported spectrum plots and the pickled describe_data() results.
OUTPUT_FOLDER="/media/eduseiti/data_storage_1TB/unicamp/clustering_linfeng_sample_pvalues/datasets/analysis"

# File names (joined with OUTPUT_FOLDER by the caller) for pickled statistics.
# Only TEST_LINFENG_DATASET_MZ_INFO is used by a cell visible in this notebook.
TRAIN_DATASET_MZ_INFO="train_6.2_mz_info.pkl"
TRAIN_DATASET_INTENSITY_INFO="train_6.2_intensity_info.pkl"
TRAIN_DATASET_PEPMASS_INFO="train_6.2_pepmass_info.pkl"

VALIDATION_DATASET_MZ_INFO="validation_6.2_mz_info.pkl"
VALIDATION_DATASET_INTENSITY_INFO="validation_6.2_intensity_info.pkl"
VALIDATION_DATASET_PEPMASS_INFO="validation_6.2_pepmass_info.pkl"

TEST_LINFENG_DATASET_MZ_INFO="test_linfeng_6.2_mz_info.pkl"
TEST_LINFENG_DATASET_INTENSITY_INFO="test_linfeng_6.2_intensity_info.pkl"
TEST_LINFENG_DATASET_PEPMASS_INFO="test_linfeng_6.2_pepmass_info.pkl"

## Global parameters

In [None]:
# Default number of nominal (integer-rounded) m/z bins; bin index == rounded m/z value.
NUM_OF_MZ_BINS=2001
# Default number of histogram bins used when describing all intensities of a dataset.
NUM_OF_INTENSITIES_BINS=10000
# Default number of intensity bins per m/z bin used for the 3D count charts.
NUM_OF_INTENSITIES_BINS_FOR_CHART = 1000
# Default upper limit of the count (z) axis of the 3D charts.
MAX_COUNT=10000

In [None]:
# Percentile grid: every 5% from 0 to 100, plus extra points in the upper tail.
# explore_dataset() addresses this list by position (its default indexes are 19
# and 18), so the ascending order produced here matters.
PERCENTILES_TO_CALCULATE = sorted(list(np.arange(0.0, 105.0, 5)) + [99.0, 99.9, 99.99, 99.999])

### Support functions

In [None]:
def describe_data(data_np, percentiles=PERCENTILES_TO_CALCULATE, histogram_bins=1000, output_filename=None, chart=True):
    """Print and return summary statistics of a numeric array.

    Args:
        data_np: array of values; it is passed directly to the numpy
            statistics functions and its first dimension is reported as the
            number of data points.
        percentiles: percentiles (0-100) to evaluate.
        histogram_bins: `bins` argument forwarded to np.histogram.
        output_filename: when truthy, the results dictionary is pickled to
            this path.
        chart: when True, a bar chart of the histogram is displayed.

    Returns:
        dict with the keys 'percentiles' (list of (percentile, value) pairs),
        'mean', 'std', 'max', 'min', 'histogram' and 'bin_edges'.
    """
    histogram, bin_edges = np.histogram(data_np, histogram_bins)

    results = {
        'percentiles': list(zip(percentiles, np.percentile(data_np, percentiles))),
        'mean': np.mean(data_np),
        'std': np.std(data_np),
        'max': np.amax(data_np),
        'min': np.amin(data_np),
        'histogram': histogram,
        'bin_edges': bin_edges,
    }

    for percentile, value in results['percentiles']:
        print("Percentil={}, value={}".format(percentile, value))

    for label, key in (("mean", 'mean'), ("std", 'std'), ("max", 'max'), ("min", 'min')):
        print("Data {}: {}".format(label, results[key]))
    print("Total data points: {}".format(data_np.shape[0]))

    if chart:
        # Each bar is placed at the right edge of its histogram bin.
        fig = go.Figure(data=[go.Bar(x=results['bin_edges'][1:],
                                     y=results['histogram'],
                                     marker_color='red')])
        fig.show()

    if output_filename:
        with open(output_filename, "wb") as output_file:
            pickle.dump(results, output_file, pickle.HIGHEST_PROTOCOL)

    return results

In [None]:
def group_and_describe_dataset(which_dataset, num_of_mz_bins=NUM_OF_MZ_BINS, num_of_intensities_bins=NUM_OF_INTENSITIES_BINS):
    """Denormalize every peak of a dataset and describe m/z, intensity and peak counts.

    Args:
        which_dataset: dict with a 'spectra' mapping (key -> sequence of
            spectrum dicts holding a 2-column 'nzero_peaks' tensor, column 0
            being m/z and column 1 intensity) and a 'normalizationParameters'
            dict providing 'mz_std', 'mz_mean', 'intensity_std' and
            'intensity_mean'.
        num_of_mz_bins: histogram bins used for the m/z description.
        num_of_intensities_bins: histogram bins used for the intensities
            description.

    Returns:
        (all_mz_np, all_intensities_np, mz_results, intensities_results,
        peaks_count_results): the two flattened numpy arrays followed by the
        describe_data() dictionaries.
    """
    normalization = which_dataset['normalizationParameters']

    all_mz = []
    all_intensities = []
    peaks_count = []

    for same_sequence in which_dataset['spectra'].values():
        for spectrum in same_sequence:
            peaks = spectrum['nzero_peaks']

            # Undo the normalization: value * std + mean.
            all_mz.append(peaks[:, 0] * normalization['mz_std'] + normalization['mz_mean'])
            all_intensities.append(peaks[:, 1] * normalization['intensity_std'] + normalization['intensity_mean'])
            peaks_count.append(peaks.shape[0])

    print ("m/z length={}".format(len(all_mz)))
    print ("intensities length={}".format(len(all_intensities)))

    # Flatten the per-spectrum tensors into one numpy array each.
    all_mz_np = torch.cat(all_mz).numpy()
    all_intensities_np = torch.cat(all_intensities).numpy()
    all_peaks_count_np = np.array(peaks_count)

    print("\n\nm/z information\n")
    mz_results = describe_data(all_mz_np, histogram_bins = num_of_mz_bins, chart = False)

    print("\n\nIntensities information\n")
    intensities_results = describe_data(all_intensities_np, histogram_bins = num_of_intensities_bins, chart = False)

    print("\n\nPeaks count information\n")
    peaks_count_results = describe_data(all_peaks_count_np, chart = False)

    return all_mz_np, all_intensities_np, mz_results, intensities_results, peaks_count_results

In [None]:
#
# Group intensities arrays into m/z bins
#
# Receives two main parameters "which_mz" and "which_intensities" arrays, which for each index contains the
# m/z and the corresponding intensity reading for an entire dataset.
#
# Outputs a list of "num_of_mz_bins" length, which corresponds to m/z nominal masses; each list position contains an
# array with all the intensities reading at that given m/z.
#

def allocate_intensities_to_mz_bins(which_mz, which_intensities, num_of_mz_bins=NUM_OF_MZ_BINS, chart=True):
    """Group intensity readings by nominal (integer-rounded) m/z.

    Args:
        which_mz: 1-D array with one m/z value per peak.
        which_intensities: intensities, index-aligned with `which_mz`.
        num_of_mz_bins: number of nominal m/z bins; valid bin indexes are
            0 .. num_of_mz_bins - 1.
        chart: when True, a bar chart with the number of readings per bin is
            displayed.

    Returns:
        (mz_bins, bin_intensities_count): a list with `num_of_mz_bins` lists of
        intensities, and the list with the length of each of them.

    Raises:
        IndexError: if a rounded m/z falls outside [0, num_of_mz_bins). This
            includes negative values, which plain list indexing would
            otherwise silently store in bins counted from the end.
    """
    mz_bins = [[] for _ in range(num_of_mz_bins)]

    for i in range(which_mz.shape[0]):
        bin_index = int(round(which_mz[i]))

        if not 0 <= bin_index < num_of_mz_bins:
            raise IndexError("m/z value {} rounds to bin {}, outside the valid range [0, {})".format(
                which_mz[i], bin_index, num_of_mz_bins))

        mz_bins[bin_index].append(which_intensities[i])

    bin_intensities_count = [len(bin_intensities) for bin_intensities in mz_bins]

    if chart:
        fig = go.Figure()

        fig.add_trace(go.Bar(y=bin_intensities_count,
                             x=list(range(num_of_mz_bins)),
                             marker_color='red'))

        fig.show()

    return mz_bins, bin_intensities_count

In [None]:
#
# For each discrete m/z, discretize the existing intensities
#

def discretize_histograms(mz_bins, intensity_cut_off, analyze_tail=False, max_intensity=None, num_of_intensities_bins=NUM_OF_INTENSITIES_BINS_FOR_CHART):
    """Build one intensity histogram per m/z bin over a common intensity range.

    Args:
        mz_bins: list in which each position holds the intensities observed at
            that nominal m/z.
        intensity_cut_off: upper limit of the histogram range (lower limit
            when `analyze_tail` is True).
        analyze_tail: False -> range is (0.0, intensity_cut_off);
            True -> range is (intensity_cut_off, max_intensity).
        max_intensity: upper limit of the range when `analyze_tail` is True.
        num_of_intensities_bins: number of intensity bins of every histogram.

    Returns:
        (expanded_data, histogram, bin_edges): a float matrix with one row per
        entry of `mz_bins` and one column per intensity bin, the histogram of
        the last m/z bin, and the shared bin edges.

    Raises:
        ValueError: if `analyze_tail` is True and `max_intensity` is None.
    """
    if analyze_tail:
        if max_intensity is None:
            raise ValueError("max_intensity is required when analyze_tail=True")
        range_description = (intensity_cut_off, max_intensity)
    else:
        range_description = (0.0, intensity_cut_off)

    print("analyze_tail={}, defined range={}".format(analyze_tail, range_description))

    # Start from an empty histogram so that the returned `histogram` and
    # `bin_edges` are defined even when `mz_bins` is empty.
    histogram, bin_edges = np.histogram([], bins=num_of_intensities_bins, range=range_description)

    # One row per received m/z bin (not per global NUM_OF_MZ_BINS), so callers
    # using a non-default number of m/z bins get a correctly sized matrix.
    expanded_data = np.zeros((len(mz_bins), num_of_intensities_bins))

    for i, bin_intensities in enumerate(mz_bins):
        histogram, bin_edges = np.histogram(bin_intensities, bins=num_of_intensities_bins, range=range_description)
        expanded_data[i] = histogram

    return expanded_data, histogram, bin_edges

In [None]:
def create_mz_intensity_count_chart(expanded_data, intensity_bin_edges, intensity_cut_off=None, max_mz=NUM_OF_MZ_BINS, max_count=MAX_COUNT, chart_title=None):
    """Show a 3D surface of peak counts over nominal m/z (y) and intensity (x).

    Args:
        expanded_data: 2-D matrix of counts; rows are m/z bins and columns are
            intensity bins.
        intensity_bin_edges: numpy array with the intensity bin edges used as
            x coordinates.
        intensity_cut_off: when truthy, only the edges below this value are
            used for the x axis.
        max_mz: number of y coordinates and upper limit of the m/z axis.
        max_count: upper limit of the count (z) axis.
        chart_title: title of the figure.
    """
    if intensity_cut_off:
        selected_intensity_bin_edges = intensity_bin_edges[intensity_bin_edges < intensity_cut_off]
    else:
        selected_intensity_bin_edges = intensity_bin_edges

    fig = go.Figure(data=[go.Surface(y=list(range(max_mz)), x=selected_intensity_bin_edges, z=expanded_data)])

    fig.update_traces(contours_z=dict(show=True, usecolormap=True,
                                      highlightcolor="limegreen", project_z=True))

    max_row, max_column = np.unravel_index(np.argmax(expanded_data), expanded_data.shape)
    max_value = expanded_data[max_row][max_column]

    print("Maximum value={}; position x={}, y={}".format(max_value, max_column, max_row))

    # The annotation must be given in data coordinates of the 3D scene:
    # column index -> intensity value on the x axis, row index -> m/z on y.
    if max_column < len(selected_intensity_bin_edges):
        annotation_x = float(selected_intensity_bin_edges[max_column])
    else:
        annotation_x = float(selected_intensity_bin_edges[-1])

    fig.update_layout(title=chart_title,
                      width=1500,
                      height=1000,
                      scene=dict(
                          yaxis=dict(nticks=20, range=[0, max_mz], title="m/z"),
                          xaxis=dict(nticks=50, range=[selected_intensity_bin_edges[0], selected_intensity_bin_edges[-1]], title="intensities"),
                          zaxis=dict(nticks=50, range=[0, max_count], title="count"),
                          # 3D charts take their annotations inside `scene`,
                          # with x, y and z coordinates.
                          annotations=[dict(
                              x=annotation_x,
                              y=int(max_row),
                              z=float(max_value),
                              showarrow=True,
                              text="maximum",
                              visible=True
                          )]
                      )
                     )

    fig.show()

In [None]:
def explore_dataset(dataset_filename, intensity_percentile_cut_off_index=19, focused_percentile_index=18, analyze_tail=False, dataset_type="train"):
    """Load a pickled dataset and chart the peak counts per nominal m/z and intensity.

    Two 3D charts are displayed: one over the whole selected intensity range
    and one focused on the range delimited by a second percentile.

    Args:
        dataset_filename: path of the pickle file to load.
        intensity_percentile_cut_off_index: position, in the percentile list
            computed by describe_data(), of the percentile used as intensity
            cut-off.
        focused_percentile_index: position, in the same list, of the
            percentile delimiting the second (focused) chart.
        analyze_tail: False -> keep peaks with intensity <= cut-off;
            True -> keep peaks with intensity above the cut-off.
        dataset_type: label used in the printed message and chart titles.
    """
    # NOTE(review): pickle.load can execute arbitrary code from the file;
    # only use it with trusted dataset files.
    with open(dataset_filename, "rb") as inputFile:
        dataset = pickle.load(inputFile)

    # Analyze mz x intensity values

    all_mz_np, all_intensities_np, mz_results, intensities_results, peaks_count_results = group_and_describe_dataset(dataset)

    intensity_cut_off_percentage, intensity_cut_off = intensities_results['percentiles'][intensity_percentile_cut_off_index]

    # The message is labelled with the dataset actually being explored.
    print("\n{} dataset intensity at percentile {}={}\n".format(dataset_type, intensity_cut_off_percentage, intensity_cut_off))

    if analyze_tail:
        intensities_indexes = all_intensities_np > intensity_cut_off
    else:
        intensities_indexes = all_intensities_np <= intensity_cut_off

    selected_mz = all_mz_np[intensities_indexes]
    selected_intensities = all_intensities_np[intensities_indexes]


    # Group all the intensities in a given discretized m/z ― m/z discretization based only on the rounded m/z value (nominal mass)

    mz_bins, bin_intensities_count = allocate_intensities_to_mz_bins(selected_mz, selected_intensities)


    # Graph the count of available intensities per discrete m/z

    expanded_data, _, intensities_bin_edges = discretize_histograms(mz_bins, intensity_cut_off, analyze_tail, intensities_results['max'])

    print("intensities_results max={}".format(intensities_results['max']))
    print("intensities_bin_edges={}".format(intensities_bin_edges))

    create_mz_intensity_count_chart(expanded_data,
                                    intensities_bin_edges,
                                    max_count=np.max(expanded_data),
                                    chart_title="Count of m/z (all) x intensity ({} {}) ― {} dataset (PXD000561, q < 0.01, pvalue 10%)".format("after" if analyze_tail else "up to",
                                                                                                                                               intensity_cut_off_percentage,
                                                                                                                                               dataset_type))

    focused_percentile = intensities_results['percentiles'][focused_percentile_index][0]

    if analyze_tail:
        # In tail mode the focus limits are percentiles of the selected (tail) peaks only.
        focused_intensity_cut_off = np.percentile(selected_intensities, focused_percentile)
        focused_mz_cut_off = int(np.percentile(selected_mz, focused_percentile))

        print("focused_percentile={}, focused_intensity_cut_off={}, focused_mz_cut_off={}".format(focused_percentile, focused_intensity_cut_off, focused_mz_cut_off))
        print("filtered bins edges={}".format(intensities_bin_edges[intensities_bin_edges < focused_intensity_cut_off]))
    else:
        # Otherwise they are the whole-dataset percentiles already computed.
        focused_intensity_cut_off = intensities_results['percentiles'][focused_percentile_index][1]
        focused_mz_cut_off = int(mz_results['percentiles'][focused_percentile_index][1])


    expanded_data, _, intensities_bin_edges = discretize_histograms(mz_bins, intensity_cut_off, analyze_tail, focused_intensity_cut_off)


    create_mz_intensity_count_chart(expanded_data,
                                    intensities_bin_edges,
                                    max_mz=focused_mz_cut_off,
                                    max_count=np.max(expanded_data),
                                    chart_title="Count of m/z (up to {}) x intensity (up to {}) ― {} dataset (PXD000561, q < 0.01, pvalue 10%)".format(focused_percentile,
                                                                                                                                                       focused_percentile,
                                                                                                                                                       dataset_type))

In [None]:
# Training dataset: peaks with intensity up to the cut-off percentile.
explore_dataset(TRAINING_DATASET)

In [None]:
# Training dataset: peaks with intensity above the cut-off percentile (tail).
explore_dataset(TRAINING_DATASET, analyze_tail=True)

In [None]:
# Validation dataset: peaks with intensity up to the cut-off percentile.
explore_dataset(VALIDATION_DATASET, dataset_type="validation")

In [None]:
# Validation dataset: tail.
explore_dataset(VALIDATION_DATASET, analyze_tail=True, dataset_type="validation")

In [None]:
# Test dataset: peaks with intensity up to the cut-off percentile.
explore_dataset(TEST_DATASET, dataset_type="test")

In [None]:
# Test dataset: tail.
explore_dataset(TEST_DATASET, analyze_tail=True, dataset_type="test")

## Visualize some spectra

In [None]:
# Load the training dataset at notebook level; the cells below read `dataset`.
# NOTE(review): pickle.load can execute arbitrary code; only open trusted files.
with open(TRAINING_DATASET, "rb") as inputFile:
    dataset = pickle.load(inputFile)

In [None]:
# Notebook-level lists read by plot_spectrum(): one entry per spectrum with its
# denormalized m/z values, denormalized intensities and peptide sequence (the
# key of dataset['spectra']), plus the number of peaks of each spectrum.
all_mz, all_intensities, all_sequences, peaks_count = [], [], [], []

normalization = dataset['normalizationParameters']

for key, same_sequence in dataset['spectra'].items():
    for spectrum in same_sequence:
        peaks = spectrum['nzero_peaks']

        # Undo the normalization: value * std + mean.
        all_mz.append(peaks[:, 0] * normalization['mz_std'] + normalization['mz_mean'])
        all_intensities.append(peaks[:, 1] * normalization['intensity_std'] + normalization['intensity_mean'])
        peaks_count.append(peaks.shape[0])
        all_sequences.append(key)

print ("m/z length={}".format(len(all_mz)))
print ("intensities length={}".format(len(all_intensities)))

In [None]:
# Print tensors with 7 decimal digits from here on (global torch setting).
torch.set_printoptions(precision=7)

In [None]:
def plot_spectrum(spectrum_index, add_peak_intensities=False, output_folder=None):
    """Plot one spectrum as a bar chart of intensities (scaled to max 1) over m/z.

    The data is read from the notebook-level lists `all_mz`, `all_intensities`
    and `all_sequences`, which must be index-aligned.

    Args:
        spectrum_index: position of the spectrum in the notebook-level lists.
        add_peak_intensities: when True, every peak receives a text label. The
            label text is built from the peak m/z value rounded to 7 decimals.
        output_folder: when truthy, the figure is also written to this folder
            as .html and .png; the folder is created if it does not exist.
    """
    spectrum_mz = all_mz[spectrum_index]
    spectrum_intensities = all_intensities[spectrum_index]
    peptide = all_sequences[spectrum_index]

    # Scale so that the most intense peak has height 1.0.
    normalized_intensities = spectrum_intensities / spectrum_intensities.max()

    print("Number of peaks of spectrum={}: {}".format(spectrum_index, len(normalized_intensities)))

    fig = go.Figure()

    fig.add_trace(go.Bar(y=normalized_intensities,
                         x=spectrum_mz,
                         width=0.2,
                         marker_color='black'))

    fig.update_layout(title = "Spectrum {} ― Peptide: {}".format(spectrum_index, peptide),
                      width = 10000,
                      height = 500,
                      xaxis = dict(
                          range = [spectrum_mz.min() - 10, spectrum_mz.max() + 10],
                          tickmode = "linear",
                          tick0 = 0.0,
                          dtick = 2.0
                      )
                     )

    if add_peak_intensities:
        # Alternate the vertical offset of the labels so that neighbouring
        # peaks do not print on top of each other.
        label_offsets = [0.02 if position % 2 == 0 else 0.04
                         for position in range(len(normalized_intensities))]

        annotations = [dict(
                           x = xi,
                           y = yi + ydelta,
                           text = str(round(xi.item(), 7)),
                           xanchor = "auto",
                           yanchor = "auto",
                           font = dict(size=8),
                           showarrow = False,
                       ) for xi, yi, ydelta in zip(spectrum_mz, normalized_intensities, label_offsets)]

        fig.update_layout(annotations = annotations)

    fig.show()

    if output_folder:
        # Writing fails with FileNotFoundError when the folder is missing.
        os.makedirs(output_folder, exist_ok=True)

        base_name = "spectra_{}_{}".format(spectrum_index, peptide)

        fig.write_html(os.path.join(output_folder, base_name + ".html"))
        fig.write_image(os.path.join(output_folder, base_name + ".png"))

In [None]:
# Randomly pick up to 10 spectra to plot. The sample size is capped at the
# number of available spectra, since random.sample raises ValueError when asked
# for more items than the population holds.
# NOTE(review): no random seed is set, so the selection changes on every run.
spectra_to_plot = random.sample(range(len(all_mz)), min(10, len(all_mz)))

In [None]:
# Plot every selected spectrum and export it to OUTPUT_FOLDER.
for i in spectra_to_plot:
    plot_spectrum(i, output_folder=OUTPUT_FOLDER) 

### Explore intermediate .pkl file information

In [None]:
# Pickle file inspected by the cells below.
BACKUP="/media/eduseiti/bigdata02/unicamp/doutorado/bootstrap.pytorch/data/mixedSpectraCrux/sequences/Adult_Frontalcortex_bRP_Elite_85_crux_q0.01_pvalue-63.91983.pkl"

In [None]:
# NOTE(review): pickle.load can execute arbitrary code; only open trusted files.
with open(BACKUP, "rb") as inputFile:
    testBackup = pickle.load(inputFile)

In [None]:
# Top-level keys of the loaded object.
testBackup.keys()

In [None]:
# Normalization parameters stored with this file.
testBackup['normalizationParameters']

### Analyze linfeng intensities, to look for winsorizing effects

In [None]:
# Same path as TEST_DATASET.
LINFENG_62="/media/eduseiti/bigdata02/unicamp/doutorado/bootstrap.pytorch/data/linfeng_all_q0.01_cell_state/sequences/sample_experiment_v6.2.pkl"

In [None]:
# NOTE(review): pickle.load can execute arbitrary code; only open trusted files.
with open(LINFENG_62, "rb") as inputFile:
    linfeng_62 = pickle.load(inputFile)

In [None]:
# Top-level keys of the loaded object.
linfeng_62.keys()

In [None]:
# Normalization parameters stored with this file.
linfeng_62['normalizationParameters']

### Analyze mz values

In [None]:
# Denormalized m/z values (value * std + mean) of every spectrum of linfeng_62.
# NOTE: this rebinds the notebook-level `all_mz` that plot_spectrum() reads.
linfeng_normalization = linfeng_62['normalizationParameters']

all_mz = [spectrum['nzero_peaks'][:, 0] * linfeng_normalization['mz_std'] + linfeng_normalization['mz_mean']
          for same_sequence in linfeng_62['spectra'].values()
          for spectrum in same_sequence]
        

In [None]:
# Number of spectra collected.
print (len(all_mz))

# Flatten the per-spectrum tensors into a single tensor / numpy array.
all_mz_concatenated = torch.cat(all_mz)

all_mz_np = all_mz_concatenated.numpy()

In [None]:
# Describe the m/z values and pickle the statistics into OUTPUT_FOLDER.
mz_results = describe_data(all_mz_np, histogram_bins = 2000, output_filename = os.path.join(OUTPUT_FOLDER, TEST_LINFENG_DATASET_MZ_INFO))

In [None]:
# Scratch: the m/z values converted to integers.
all_mz_concatenated.int()

In [None]:
# Scratch: float tensor later converted to long to be used as an index.
indexes = torch.tensor([1.0, 2.0, 3.0])

In [None]:
indexes.type()

In [None]:
# Print tensors with 20 decimal digits from here on (global torch setting).
torch.set_printoptions(precision=20)

In [None]:
# Index with the float tensor converted to long.
all_mz_concatenated[indexes.long()]

### Analyzing the validation results

In [None]:
# Alternative ("pepmass") set of result files, currently disabled:
# EPOCH_DATA_FILENAME="/media/eduseiti/data_storage_1TB/unicamp/clustering_linfeng_sample_pvalues/pepmass/last_epoch_data.pkl"
# EPOCH_DATA_RANKS_FILENAME="/media/eduseiti/data_storage_1TB/unicamp/clustering_linfeng_sample_pvalues/pepmass/last_epoch_ranks_data.pkl"
# EPOCH_DATA_EMBEDDINGS_FILENAME="/media/eduseiti/data_storage_1TB/unicamp/clustering_linfeng_sample_pvalues/pepmass/last_epoch_embeddings_data.pkl"

# Result files loaded by the cells below.
EPOCH_DATA_FILENAME="/media/eduseiti/data_storage_1TB/unicamp/clustering_linfeng_sample_pvalues/identifications_fix/last_epoch_data.pkl"
EPOCH_DATA_RANKS_FILENAME="/media/eduseiti/data_storage_1TB/unicamp/clustering_linfeng_sample_pvalues/identifications_fix/last_epoch_ranks_data.pkl"
EPOCH_DATA_EMBEDDINGS_FILENAME="/media/eduseiti/data_storage_1TB/unicamp/clustering_linfeng_sample_pvalues/identifications_fix/last_epoch_embeddings_data.pkl"

In [None]:
# NOTE(review): pickle.load can execute arbitrary code; only open trusted files.
with open(EPOCH_DATA_FILENAME, "rb") as inputFile:
    epoch_data = pickle.load(inputFile)

In [None]:
with open(EPOCH_DATA_RANKS_FILENAME, "rb") as inputFile:
    epoch_ranks_data = pickle.load(inputFile)

In [None]:
with open(EPOCH_DATA_EMBEDDINGS_FILENAME, "rb") as inputFile:
    epoch_embeddings_data = pickle.load(inputFile)

In [None]:
# Build one record per (anchor, positive) pair found in the last 43 entries of
# epoch_data. Entries 2*i and 2*i + 1 of each 'sequence' / 'index' list form a
# pair; ranks are consumed one per pair and embeddings one per list entry.
# NOTE(review): the meaning of the constant 43 is not visible here — confirm.
samples = []

rank_index = 0
embeddings_starting_index = 0

for epoch in epoch_data[-43:]:
    for i in range(len(epoch['sequence']) // 2):
        sample = {
            'sequence': epoch['sequence'][i * 2],
            'anchor': epoch['index'][i * 2],
            'positive': epoch['index'][i * 2 + 1],
            'rank': epoch_ranks_data[rank_index],
            'anchor_embeddings': epoch_embeddings_data[embeddings_starting_index + i * 2],
            # NOTE(review): the key is spelled 'postive_embeddings'; kept as is
            # because it is part of the produced data.
            'postive_embeddings': epoch_embeddings_data[embeddings_starting_index + i * 2 + 1],
        }

        samples.append(sample)
        rank_index += 1

    # The embeddings of the next entry start after all items of this one.
    embeddings_starting_index += len(epoch['sequence'])
    

In [None]:
# Number of (anchor, positive) pairs collected.
len(samples)

In [None]:
# Show the pairs whose rank is above 7000.
for sample in samples:
    if int(sample['rank']) > 7000:
        print(sample)

In [None]:
# NOTE(review): displays the whole list, embeddings included; the output can be very large.
samples

In [None]:
# Previously inspected version of the file (disabled):
# TEST_DATASET_FILE_NEW="/media/eduseiti/bigdata02/unicamp/doutorado/bootstrap.pytorch/data/mixedSpectraCrux/sequences/test_mixedSpectraCrux_v5.1.pkl"

TEST_DATASET_FILE_NEW="/media/eduseiti/bigdata02/unicamp/doutorado/bootstrap.pytorch/data/mixedSpectraCrux/sequences/test_mixedSpectraCrux_v6.0.pkl"

In [None]:
# NOTE(review): pickle.load can execute arbitrary code; only open trusted files.
with open(TEST_DATASET_FILE_NEW, "rb") as inputLog:
    testNew = pickle.load(inputLog)

In [None]:
# Top-level keys of the loaded object.
testNew.keys()

In [None]:
testNew['spectraCount']

In [None]:
# Keys available in one spectrum entry.
testNew['spectra']['HIHPELR'][0].keys()

In [None]:
# Manual comparison of two spectra stored under the same peptide key.
testNew['spectra']['ILGIPVIVTEQYPK'][6]['nzero_peaks'].shape

In [None]:
testNew['spectra']['ILGIPVIVTEQYPK'][1]['nzero_peaks'].shape

In [None]:
testNew['spectra']['ILGIPVIVTEQYPK'][6]

In [None]:
testNew['spectra']['ILGIPVIVTEQYPK'][1]

In [None]:
# NOTE(review): the arithmetic cells below look like manual `value * std + mean`
# denormalization checks; the origin of the constants is not visible here — confirm.
13.68177641947343 * 57.113074883189604 + 4.060123643775214

In [None]:
13.681745427979195 * 57.113074883189604 + 4.060123643775214

In [None]:
3.3978 * 3893.8025116844333 - 2321.0634765625

In [None]:
# Manual comparison of two spectra stored under the same peptide key.
testNew['spectra']['ANDTTFGLAAGVFTR'][2]

In [None]:
len(testNew['spectra']['ANDTTFGLAAGVFTR'][2]['nzero_peaks'])

In [None]:
testNew['spectra']['ANDTTFGLAAGVFTR'][0]

In [None]:
len(testNew['spectra']['ANDTTFGLAAGVFTR'][0]['nzero_peaks'])

In [None]:
13.426558258393413 * 57.113074883189604 + 4.060123643775214

In [None]:
13.426592455904299 * 57.113074883189604 + 4.060123643775214

In [None]:
# Manual comparison of two spectra stored under the same peptide key.
testNew['spectra']['VLTSLGDAIK'][3]['nzero_peaks'].shape

In [None]:
testNew['spectra']['VLTSLGDAIK'][11]['nzero_peaks'].shape

In [None]:
# Number of spectra stored under this peptide key.
len(testNew['spectra']['VLTSLGDAIK'])

In [None]:
(14.725179189788623 * 59.29983008885746) + 4.244749552710952

In [None]:
(8.508590219375996 * 59.29983008885746) + 4.244749552710952

In [None]:
# Number of keys in the 'spectra' mapping.
len(testNew['spectra'])

In [None]:
# Training dataset (v5.1) inspected by the cells below.
TRAIN_DATASET_NEW="/media/eduseiti/bigdata02/unicamp/doutorado/bootstrap.pytorch/data/mixedSpectraCrux/sequences/train_mixedSpectraCrux_v5.1.pkl"

In [None]:
# NOTE(review): pickle.load can execute arbitrary code; only open trusted files.
with open(TRAIN_DATASET_NEW, "rb") as inputLog:
    trainNew = pickle.load(inputLog)

In [None]:
# Third entry from the end of the stored intensity percentiles.
trainNew['normalizationParameters']['intensity_percentiles'][-3]

In [None]:
# WARNING: in-place change of the loaded data — the intensities of this spectrum
# that are below -0.039 are set to 0. Every later cell reading `trainNew` sees
# the modified values; re-load the file to get the original ones back.
trainNew['spectra']['SGPFGQIFRPDNFVFGQSGAGNNWAK'][0]['nzero_peaks'][:,1][trainNew['spectra']['SGPFGQIFRPDNFVFGQSGAGNNWAK'][0]['nzero_peaks'][:,1] < -0.039] = 0

In [None]:
# Intensity column after the in-place change.
trainNew['spectra']['SGPFGQIFRPDNFVFGQSGAGNNWAK'][0]['nzero_peaks'][:,1]

In [None]:
# Number of keys in the 'spectra' mapping.
len(trainNew['spectra'])

In [None]:
trainNew['spectra']['SGPFGQIFRPDNFVFGQSGAGNNWAK'][0]['nzero_peaks'][:,1]

In [None]:
trainNew['spectra']['SGPFGQIFRPDNFVFGQSGAGNNWAK'][0]['nzero_peaks']

In [None]:
# NOTE: rebinds the notebook-level `all_intensities` (read by plot_spectrum())
# to the intensity column of a single spectrum.
all_intensities = trainNew['spectra']['SGPFGQIFRPDNFVFGQSGAGNNWAK'][0]['nzero_peaks'][:,1]

In [None]:
all_intensities

In [None]:
# Scratch: length of two intensity columns concatenated.
len(torch.cat((all_intensities, trainNew['spectra']['SGPFGQIFRPDNFVFGQSGAGNNWAK'][1]['nzero_peaks'][:,1])))

In [None]:
type(trainNew['spectra']['SGPFGQIFRPDNFVFGQSGAGNNWAK'][0]['nzero_peaks'][:,1])

#### Explore training data intensities

In [None]:
# Reset before gathering the intensities of the whole training dataset.
all_intensities = None

In [None]:
# Gather the (still normalized) intensity column of every training spectrum.
# The columns are collected in a list and concatenated once: calling torch.cat
# inside the loop copies the whole accumulated tensor at every step (quadratic
# cost) and makes the result depend on the previous value of `all_intensities`.
intensity_columns = [spectrum['nzero_peaks'][:, 1]
                     for same_sequence_spectra in trainNew['spectra'].values()
                     for spectrum in same_sequence_spectra]

# With no spectra at all there is nothing to concatenate; keep None in that case.
all_intensities = torch.cat(intensity_columns) if intensity_columns else None

In [None]:
# Histogram of all gathered intensities over 100000 bins; only the first 100
# bins are charted, each bar placed at the left edge of its bin.
intensities_histogram, intensities_bin_edges = np.histogram(all_intensities, bins=100000)

fig = go.Figure(data=[go.Bar(x=intensities_bin_edges[:100],
                             y=intensities_histogram[:100],
                             marker_color='red')])

fig.show()

In [None]:
# Edges of the 100000 histogram bins.
intensities_bin_edges

In [None]:
trainNew['normalizationParameters']

In [None]:
# Total number of gathered intensity values and their range.
all_intensities.shape

In [None]:
all_intensities.min()

In [None]:
all_intensities.max()

In [None]:
from scipy import stats

In [None]:
# Summary statistics computed by scipy.
stats.describe(all_intensities)

In [None]:
# Intensity percentiles on a 0.05 grid: 0.00, 0.05, ..., 99.95.
result = np.percentile(all_intensities, np.round(np.arange(0.0, 100.0, 0.05), 2))

In [None]:
len(result)

In [None]:
# NOTE(review): 2000 is hard-coded and must match the length of the grid above.
for i in range(2000):
    print("Percentile={}, intensity={}".format(np.round(i * 0.05, 2), result[i]))

In [None]:
# Chart of intensity value per percentile (0.05 grid).
fig = go.Figure()

fig.add_trace(go.Bar(y=result,
                     x=np.round(np.arange(0.0, 100.0, 0.05), 2),
                     marker_color='red'))

fig.show()

In [None]:
# Same computation on a 1.0 grid: 0, 1, ..., 99.
result2 = np.percentile(all_intensities, np.round(np.arange(0.0, 100.0, 1.0), 2))

In [None]:
len(result2)

In [None]:
# NOTE(review): 100 is hard-coded and must match the length of the grid above.
for i in range(100):
    print("Percentile={}, intensity={}".format(np.round(i, 2), result2[i]))

In [None]:
# Same computation on a 0.01 grid: 0.00, 0.01, ..., 99.99.
result3 = np.percentile(all_intensities, np.round(np.arange(0.0, 100.0, 0.01), 2))

In [None]:
len(result3)

In [None]:
# NOTE(review): 10000 is hard-coded and must match the length of the grid above;
# this prints 10000 lines.
for i in range(10000):
    print("Percentile={}, intensity={}".format(np.round(i * 0.01, 2), result3[i]))

In [None]:
# Scratch cells: experiments with indexing a 4-D tensor of shape (2, 3, 2, 4).
t1 = torch.zeros([2, 3, 2, 4])

In [None]:
t1.shape

In [None]:
# Same shape, filled with a distinct value per innermost row to make the result
# of the indexing below easy to read.
t1 = torch.tensor([[[[0, 0, 0, 0], [1, 1, 1, 1]], [[2, 2, 2, 2], [3, 3, 3, 3]], [[4, 4, 4, 4], [5, 5, 5, 5]]], [[[6, 6, 6, 6], [7, 7, 7, 7]], [[8, 8, 8, 8], [9, 9, 9, 9]], [[10, 10, 10, 10], [11, 11, 11, 11]]]])

In [None]:
t1

In [None]:
tam = torch.tensor([1, 2])

In [None]:
# Entry 0 of the second dimension, for every entry of the first dimension.
t1[range(t1.shape[0]), 0]

In [None]:
tam - 1

In [None]:
t1.shape

In [None]:
type(t1)