In [1]:
import pandas as pd
import anndata as ad
import numpy as np
from scipy import sparse
from sklearn.preprocessing import MultiLabelBinarizer

from scviz import pAnnData as pAnnData
from scviz import plotting as scplt
from scviz import utils as scutils
import scanpy as sc

from typing import (  # Meta  # Generic ABCs  # Generic
    TYPE_CHECKING,
    Any,
    Literal,
    Optional,
    List
)

# Functions to prepare pAnnData

In [161]:
# TODO: review these draft helpers and keep only the ones that are actually useful
from typing import List

# check if pep_obs_names and prot_obs_names are the same
def check_obs_names(pdata: "pAnnData.pAnnData") -> bool:
    """Tell whether the protein and peptide layers carry identical obs names.

    Args:
        pdata: Object exposing ``prot`` and ``pep`` layers, each with an
            ``obs_names`` index.

    Returns:
        ``True`` only when both layers are present and their ``obs_names``
        hold the same labels in the same order; ``False`` otherwise
        (including when the two indexes differ in length).
    """
    prot_layer, pep_layer = pdata.prot, pdata.pep
    if prot_layer is None or pep_layer is None:
        return False

    prot_names, pep_names = prot_layer.obs_names, pep_layer.obs_names
    if prot_names is None or pep_names is None:
        return False

    # `Index == Index` is element-wise (and raises on a length mismatch);
    # `Index.equals` yields the single order-sensitive bool this function promises.
    return bool(pd.Index(prot_names).equals(pd.Index(pep_names)))
        
def merge(pdata1: "pAnnData.pAnnData", pdata2: "pAnnData.pAnnData") -> "pAnnData.pAnnData":
    """Combine two pAnnData objects into a new one.

    Both inputs must pass ``check_obs_names`` (protein and peptide layers
    sharing the same obs names). The protein layers and the peptide layers are
    concatenated with an outer join, and the two ``rs`` matrices are stacked
    vertically.

    Args:
        pdata1: First object to merge.
        pdata2: Second object to merge.

    Returns:
        A new pAnnData built from the concatenated layers and stacked ``rs``.

    Raises:
        ValueError: If either input fails the obs-name consistency check.
    """
    # np.all collapses the check to a single bool in case the helper hands
    # back an element-wise comparison instead of a scalar.
    names_consistent = all(
        bool(np.all(check_obs_names(candidate))) for candidate in (pdata1, pdata2)
    )
    if not names_consistent:
        raise ValueError("Protein and peptide obs names must be the same.")

    prot = pdata1.prot.concatenate(pdata2.prot, join="outer")
    pep = pdata1.pep.concatenate(pdata2.pep, join="outer")
    # NOTE(review): rs is reported as proteins x peptides, so stacking rows
    # assumes both inputs share the same peptide columns, while the outer join
    # above produces the union of variables — confirm this is the intended rs.
    rs = sparse.vstack([pdata1.rs, pdata2.rs])

    # `pAnnData` is the scviz module in this notebook; the class lives inside
    # it, so the module itself must not be called.
    return pAnnData.pAnnData(prot, pep, rs)
    
def filter(
    pdata: "pAnnData.pAnnData",
    prot: Optional[List[str]] = None,
    pep: Optional[List[str]] = None,
) -> "pAnnData.pAnnData":
    """Build a pAnnData restricted to the requested proteins and/or peptides.

    Args:
        pdata: Source object; it is not modified.
        prot: Variable names to keep in the protein layer, or ``None`` to keep
            the protein layer as is.
        pep: Variable names to keep in the peptide layer, or ``None`` to keep
            the peptide layer as is.

    Returns:
        A new pAnnData holding the selected layers and the original ``rs``.
    """
    prot_layer = pdata.prot if prot is None else pdata.prot[:, prot]
    pep_layer = pdata.pep if pep is None else pdata.pep[:, pep]

    # NOTE(review): rs is forwarded unchanged, so after a selection its
    # dimensions no longer match the filtered layers — confirm whether it
    # should be subset here as well.
    # `pAnnData` is the scviz module in this notebook; call the class inside it.
    return pAnnData.pAnnData(prot_layer, pep_layer, pdata.rs)

def _scale_columns_to_unit_sum(X):
    """Divide every column of ``X`` by its column sum, keeping sparse input sparse."""
    col_sums = np.asarray(X.sum(axis=0), dtype=float).ravel()
    # A column that sums to zero cannot be rescaled; leave it untouched
    # instead of producing inf/NaN through a division by zero.
    scale = np.ones_like(col_sums)
    rescalable = col_sums != 0
    scale[rescalable] = 1.0 / col_sums[rescalable]
    if sparse.issparse(X):
        # Right-multiplying by a diagonal matrix scales columns without densifying.
        return (X @ sparse.diags(scale)).asformat(X.format)
    return np.asarray(X, dtype=float) * scale


def normalize(pdata: "pAnnData.pAnnData", axis: Literal['protein', 'peptide', 'both']) -> "pAnnData.pAnnData":
    """Return a pAnnData whose selected layer(s) have sum-normalized columns.

    Each column of the chosen layer's ``X`` is divided by that column's sum
    over all rows (``sum(axis=0)``). The input object is left unmodified: the
    affected layers are copied before their ``X`` is replaced, so their
    annotations travel with the normalized matrix.

    NOTE(review): this scales each variable across rows; a per-sample
    normalization would sum over ``axis=1`` instead — confirm the intent.

    Args:
        pdata: Source object with ``prot``, ``pep`` and ``rs``.
        axis: Which layer to normalize: ``'protein'``, ``'peptide'`` or ``'both'``.

    Returns:
        A new pAnnData with the normalized layer(s) and the original ``rs``.

    Raises:
        ValueError: If ``axis`` is not one of the three accepted values.
    """
    if axis not in ('protein', 'peptide', 'both'):
        raise ValueError("axis must be 'protein', 'peptide', or 'both'.")

    prot_layer, pep_layer = pdata.prot, pdata.pep
    if axis in ('protein', 'both'):
        prot_layer = pdata.prot.copy()
        prot_layer.X = _scale_columns_to_unit_sum(prot_layer.X)
    if axis in ('peptide', 'both'):
        pep_layer = pdata.pep.copy()
        pep_layer.X = _scale_columns_to_unit_sum(pep_layer.X)

    # `pAnnData` is the scviz module in this notebook; call the class inside it.
    return pAnnData.pAnnData(prot_layer, pep_layer, pdata.rs)

# Actual import

In [2]:
# Exercise import_proteomeDiscoverer on the protein and peptide export files
pdata = pAnnData.import_proteomeDiscoverer(
    prot_file='pd_prot.txt',
    pep_file='pd_pep.txt',
    obs_columns=['Sample', 'method', 'duration', 'type'],
)

--------------------------
Starting import...
--------------------------
Importing from pd_prot.txt
Number of files: 60
Number of proteins: 1571
Importing from pd_pep.txt
Number of files: 60
Number of peptides: 6352
RS matrix successfully computed
Setting rs matrix with dimensions (6352, 1571)
Transposing rs matrix to protein x peptide format
pAnnData object created. Use `print(pdata)` to view the object.


In [4]:
# Inspect the data matrix (X) of the protein layer
pdata.prot.X

<60x1571 sparse matrix of type '<class 'numpy.float64'>'
	with 94260 stored elements in Compressed Sparse Column format>

In [3]:
# Inspect the per-file annotations (obs) attached to the protein layer
pdata.prot.obs

Unnamed: 0,Sample,method,duration,type,quant,protein_count,mbr_count,high_count
F13,Sample,dda,60min,hct116,0.369828,581,318,282
F14,Sample,dda,60min,hct116,0.488224,767,262,565
F15,Sample,dda,60min,hct116,0.503501,791,320,517
F16,Sample,dda,60min,hct116,0.41184,647,332,352
F17,Sample,dda,60min,hct116,0.44112,693,312,407
F18,Sample,dda,60min,hct116,0.413749,650,295,392
F19,Sample,dda,60min,hct116,0.536601,843,303,612
F20,Sample,dda,60min,hct116,0.521961,820,296,594
F21,Sample,dda,60min,hct116,0.567791,892,310,642
F22,Sample,dda,60min,hct116,0.478676,752,335,476


In [3]:
# Display the summary of the pAnnData object (protein layer, peptide layer, RS matrix)
pdata

pAnnData object
Protein (shape: 60 files by 1571 proteins)
obs: Sample, method, duration, type, quant...
var: Protein FDR Confidence: Combined, Master, Accession, Description, Exp. q-value: Combined...
obsm: 
layers: X_mbr, X_raw

Peptide (shape: 60 files by 6352 peptides)
obs: Sample, method, duration, type, quant...
var: Modifications, Qvality PEP, Qvality q-value, # Protein Groups, # Proteins...
layers: X_mbr, X_raw

RS (shape: 1571 proteins by 6352 peptides)


In [6]:
# Names given to the per-file annotation columns of the DIA-NN import
obs_columns = [
    'Name',
    'Amt',
    'Enzyme',
    'Date',
    'Instrument',
    'Acquisition',
    'Method',
    'Duration',
    'Replicate',
]

# NOTE: this rebinds `pdata`, replacing whatever object the name held before
pdata = pAnnData.import_diann(
    report_file='report.tsv',
    obs_columns=obs_columns,
)

Setting rs matrix with dimensions (51510, 7372)
Transposing rs matrix to protein x peptide format


In [7]:
# Display the summary of the current pAnnData object
pdata

pAnnData object
Protein (shape: 10 files by 7372 proteins)
obs: Name, Amt, Enzyme, Date, Instrument...
var: First.Protein.Description, Genes
obsm: 
layers: 

Peptide (shape: 10 files by 51510 peptides)
obs: Name, Amt, Enzyme, Date, Instrument...
var: Modified.Sequence, Stripped.Sequence
layers: 

RS (shape: 7372 proteins by 51510 peptides)
