In [None]:
import struct
import os
import numpy as np
import pandas as pd
import random
import plotly.graph_objects as go
import math

In [None]:
# Root directory that holds every input file used by this notebook.
# The default is the original machine-specific mount point; set the
# CLUSTERING_DATA_ROOT environment variable to run the notebook elsewhere
# without editing this cell.
DATA_ROOT = os.environ.get(
    "CLUSTERING_DATA_ROOT",
    "/mnt/f633ac7c-3153-4566-a009-229a0ae5f8a1/unicamp/doutorado",
)
SAMPLE_DIR = os.path.join(DATA_ROOT, "clustering", "linfeng", "sample")

# Binary file of fixed-size p-value records (decoded by decode_pvalues_file).
PVALUES_FILE = os.path.join(SAMPLE_DIR, "test_pvalues_fix", "400.dat_allSpectraPvalues.bin")

# Tab-separated identifications table (read with pandas further down).
IDS_FILE = os.path.join(
    SAMPLE_DIR,
    "identifications_sample_0.1_nterm",
    "sample_experiment_identifications",
    "percolator.target.psms.txt",
)

# Tab-separated, two-column table read as ("file", "spectra count").
COUNTS_FILE = os.path.join(DATA_ROOT, "spectra_count.tsv")

In [None]:
# `struct` format of one record in the p-values binary file:
#   B -> unsigned char, I -> unsigned int, d -> double.
# plot_pvalues_histogram labels these three columns "file", "scannr", "pvalue".
# There is no byte-order prefix, so `struct` uses the platform's native byte
# order, sizes and alignment (padding may be inserted between fields).
# NOTE(review): this must match how the file was written — confirm against the producer.
STRUCT_FIELDS = "BId"

In [None]:
def decode_pvalues_file(pvalues_filename, struct_fields=None):
    """Decode a binary file of fixed-size records into a 2-D array.

    The file is read as a sequence of back-to-back records, each laid out
    according to a ``struct`` format string. Records whose third field is
    NaN are reported on stdout, followed by a summary line with the number
    of decoded records.

    Args:
        pvalues_filename: Path of the binary file to decode.
        struct_fields: ``struct`` format string describing one record.
            Defaults to the module-level ``STRUCT_FIELDS``.

    Returns:
        ``numpy.ndarray`` with one row per record and one column per struct
        field. An empty file yields an array of shape ``(0, n_fields)`` so
        that column slicing such as ``result[:, 2]`` keeps working.

    Raises:
        struct.error: If the file ends with an incomplete record.
    """
    if struct_fields is None:
        struct_fields = STRUCT_FIELDS

    # Compile the format once instead of re-parsing it for every record.
    record_struct = struct.Struct(struct_fields)
    record_size = record_struct.size
    # Unpacking a zero-filled buffer is a cheap way to count the fields.
    n_fields = len(record_struct.unpack(bytes(record_size)))

    pvalues = []

    with open(pvalues_filename, "rb") as inputFile:
        while True:
            record = inputFile.read(record_size)

            if not record:
                break

            if len(record) < record_size:
                # A short read on a regular file only happens at EOF, i.e.
                # the file was truncated in the middle of a record.
                raise struct.error(
                    "truncated record at end of {}: expected {} bytes, got {}".format(
                        pvalues_filename, record_size, len(record)))

            unpacked = record_struct.unpack(record)
            pvalues.append(unpacked)

            # The NaN report only applies when the third field is a float.
            if (len(unpacked) > 2
                    and isinstance(unpacked[2], float)
                    and math.isnan(unpacked[2])):
                print("nan: {}".format(record))

    print("Decoded {} pvalues from {}".format(len(pvalues), pvalues_filename))

    if not pvalues:
        # np.array([]) would be 1-D; keep the result 2-D for column indexing.
        return np.empty((0, n_fields))

    return np.array(pvalues)

In [None]:
def plot_pvalues_histogram(pvalues, bins=1000):
    """Summarise and plot the distribution of the p-value column.

    Prints ``describe()`` statistics (with percentiles every 5%) for the
    p-value column, then shows a plotly bar chart of its histogram.

    Args:
        pvalues: 2-D array-like with three columns, labelled here as
            ``file``, ``scannr`` and ``pvalue``.
        bins: Number of histogram bins (or any ``bins`` value accepted by
            ``numpy.histogram``). Defaults to 1000.

    Returns:
        Tuple ``(pvalues_df, pvalues_histogram, pvalues_bin_edges)``: the
        labelled DataFrame, the per-bin counts, and the ``len(counts) + 1``
        bin edges returned by ``numpy.histogram``.
    """
    pvalues_df = pd.DataFrame(pvalues, columns=["file", "scannr", "pvalue"])

    print(pvalues_df['pvalue'].describe(percentiles=list(np.round(np.arange(0.0, 1.0, 0.05), 2))))

    # np.histogram cannot auto-detect a range when NaN/inf values are present,
    # so bin only the finite p-values (the DataFrame itself is left untouched).
    pvalue_column = pvalues_df['pvalue']
    finite_pvalues = pvalue_column[np.isfinite(pvalue_column)]

    pvalues_histogram, pvalues_bin_edges = np.histogram(finite_pvalues, bins)

    # go.Bar centres each bar on its x value, so use the bin centres; plotting
    # at the right edges would shift the whole histogram by half a bin.
    bin_centers = (pvalues_bin_edges[:-1] + pvalues_bin_edges[1:]) / 2

    fig = go.Figure()

    fig.add_trace(go.Bar(y=pvalues_histogram,
                         x=bin_centers,
                         marker_color='red'))

    fig.update_layout(xaxis_title="pvalue", yaxis_title="count")

    fig.show()

    return pvalues_df, pvalues_histogram, pvalues_bin_edges

In [None]:
pvalues = decode_pvalues_file(PVALUES_FILE)

In [None]:
pvalues

In [None]:
pvalues[:,2]

In [None]:
pvalues[pvalues[:, 2] <= -50].shape

In [None]:
min(pvalues[:, 2])

In [None]:
max(pvalues[:, 2])

In [None]:
pvalues_df, _, _ = plot_pvalues_histogram(pvalues)

In [None]:
pvalues_df.sort_values(['pvalue', 'file', 'scannr'])

In [None]:
pvalues_df.groupby('file').count()

In [None]:
spectra_count = pd.read_csv(COUNTS_FILE, sep='\t', names=["file", "spectra count"])

In [None]:
spectra_count

In [None]:
ids = pd.read_csv(IDS_FILE, sep='\t')

In [None]:
ids

In [None]:
ids['file_idx'].min()

In [None]:
pvalues_df['file'].min()

In [None]:
subdf = ids[ids['file_idx'] == 15]

In [None]:
subdf[subdf['scan'] == 8991]

In [None]:
sequence = subdf[subdf['scan'] == 8991]['sequence']
sequence

In [None]:
sum(ids['sequence'] == sequence.values[0])

In [None]:
subdf = ids[ids['file_idx'] == 19]
sequence = subdf[subdf['scan'] == 8267]['sequence']
sum(ids['sequence'] == sequence.values[0])

In [None]:
sequence

In [None]:
pvalues_df['scannr'].max()

In [None]:
pvalues_df['scannr'].max()

In [None]:
ids['scan'].max()

In [None]:
# Index the identifications once by (file_idx, scan) so each p-value row is a
# dictionary lookup instead of two full scans of `ids`. setdefault keeps the
# first row seen for a key, matching the previous `.values[0]` behaviour.
ids_lookup = {}
for ids_file, ids_scan, ids_matches in zip(ids['file_idx'].values,
                                           ids['scan'].values,
                                           ids['distinct matches/spectrum'].values):
    ids_lookup.setdefault((ids_file, ids_scan), ids_matches)

for index, row in pvalues_df.sort_values(['pvalue']).iterrows():
    # NOTE(review): the +1 presumably maps 0-based `scannr` onto 1-based
    # `scan` values in the identifications table — confirm.
    match_label = ids_lookup.get((row['file'], row['scannr'] + 1), "not identified")

    print("file:{}, scan:{}, pvalue:{}, spectra count:{}".format(row['file'],
                                                                 row['scannr'],
                                                                 row['pvalue'],
                                                                 match_label))