# Pre-Assembling

In [2]:
import pandas as pd
import numpy as np
import anndata as ad
import re
import csv

from scipy.sparse import coo_matrix
from scipy.io import mmwrite
from pathlib import Path

import sctoolbox
import sctoolbox.utils as utils
import sctoolbox.utils.assemblers as assembler

### --- Datei einlesen ---

In [3]:
file_path_pat = "/mnt/workspace_stud/napkon_data/wp2_rna/out.txt"
file_path_gene = "/mnt/workspace_stud/napkon_data/wp2_rna/combined_rna_napkon_4_all.matrix.raw.anno"


path_mtx = str(Path.cwd()) # Aktueller Speicherort/Verzeichnis mit .mtx, barcodes.tsv und genes.tsv
print(path_mtx)

/home/stud9/notebooks/rna_analysis/notebooks/ScRNA-analysis


### 1. --- Patienten-Metadaten lesen und vorbereiten für Assemblierung ---

In [4]:
pat_df = pd.read_csv(file_path_pat, decimal=",", sep="\t")
pat_df = pat_df.fillna(0)
pat_df.tail()

Unnamed: 0,TubeID,meta TubeID,meta pop export_psn,meta pop visit_1_date,meta pop visit_2_date,meta pop visit_3_date,meta pop visit_4_date,meta pop age,meta pop age_dec,meta pop sex,...,rna unassigned reads due to bad mapping quality,rna unassigned reads due to low fragment length,rna unassigned chimeric reads,rna usable sample: mapped reads > 2000000,rna project.run id,rna DDX3Y,rna XIST,rna gender,rna sample id original,rna plate id original
3771,0,0,0,0,0,0,0,0.0,0,0,...,0.0,0.0,0.0,False,plt33.run1 pat_2384,0.0,0.0,0,Sample,Plate 33
3772,0,0,0,0,0,0,0,0.0,0,0,...,0.0,0.0,0.0,False,plt38.run1 pat_2741,0.0,0.0,0,Sample 1,Plate 38
3773,0,0,0,0,0,0,0,0.0,0,0,...,0.0,0.0,0.0,True,plt40.run1 pat_3006,809.0,4.0,M,Sample 27,Plate 40
3774,0,0,0,0,0,0,0,0.0,0,0,...,0.0,0.0,0.0,False,plt40.run1 pat_3033,0.0,0.0,0,Sample 32,Plate 40
3775,0,0,0,0,0,0,0,0.0,0,0,...,0.0,0.0,0.0,False,plt1.run1 pat_146,0.0,0.0,0,Sample 1,Plate 1


In [5]:
pat_df["rna project.run id"] # ID überprüfen

0                         0
1          plt5.run1 pat_26
2                         0
3                         0
4         plt2.run1 pat_195
               ...         
3771    plt33.run1 pat_2384
3772    plt38.run1 pat_2741
3773    plt40.run1 pat_3006
3774    plt40.run1 pat_3033
3775      plt1.run1 pat_146
Name: rna project.run id, Length: 3776, dtype: object

In [5]:
# optional: Patienten-Datei im lokalen Speicherort abspeichern
# pat_df.to_csv('pat_meta.csv', index=False) 

### 2. --- Gen-Metadaten lesen und vorbereiten für Assemblierung ---

In [6]:
data = []

with open(file_path_gene, 'r') as file:
    content = file.read()
    lines = content.splitlines()

    header = lines[0].split('\t')

    for line in lines[1:]:
        elements = line.split('\t')
        data.append(elements)

gene_df = pd.DataFrame(data, columns=header)
gene_df.head()

Unnamed: 0,Ensembl gene id,Ensembl gene,Ensembl biotype,UniProt proteins,UniProt genes,UniProt accessions,UniProt names,UniProt Ensembl transcripts,UniProt Ensembl proteins,UniProt Ensembl gene ids,...,plt49.run1 pat_3563,plt49.run1 pat_3564,plt49.run1 pat_3565,plt49.run1 pat_3566,plt49.run1 pat_3567,plt49.run1 pat_3568,plt49.run1 pat_3569,plt49.run1 pat_3570,plt49.run1 pat_3571,plt49.run1 pat_3572
0,ENSG00000146555,SDK1,protein_coding,Protein sidekick-1,SDK1,Q7Z5N4; F8W6X9; A0A087WTQ6,SDK1_HUMAN; F8W6X9_HUMAN; A0A087WTQ6_HUMAN,ENST00000404826; ENST00000389531; ENST00000615806,ENSP00000385899; ENSP00000374182; ENSP00000478062,ENSG00000146555,...,5,2,2,0,2,0,1,2,1,0
1,ENSG00000283537,ENSG00000283537,transcribed_unprocessed_pseudogene,,,,,,,ENSG00000283537,...,0,0,1,0,0,0,0,0,0,0
2,ENSG00000166523,CLEC4E,protein_coding,C-type lectin domain family 4 member E,"CLEC4E, CLECSF9, MINCLE, UNQ218/PRO244; CLEC4E",Q9ULY5; F8WFA1; F5H5X7,CLC4E_HUMAN; F8WFA1_HUMAN; F5H5X7_HUMAN,"ENST00000299663; ENST00000446457, ENST00000450...","ENSP00000299663; ENSP00000387737, ENSP00000404...",ENSG00000166523,...,198,96,90,287,309,94,62,215,261,131
3,ENSG00000125945,ZNF436,protein_coding,Zinc finger protein 436,"ZNF436, KIAA1710",Q9C0F3,ZN436_HUMAN,"ENST00000314011, ENST00000374608, ENST00000635...","ENSP00000313582, ENSP00000363736, ENSP00000489...",ENSG00000125945,...,40,21,70,39,71,15,26,66,41,17
4,ENSG00000206145,P2RX6P,transcribed_unprocessed_pseudogene,,,,,,,ENSG00000206145,...,0,0,0,0,0,1,0,1,0,0


In [7]:
gene_df.fillna(0)
gene_df.head()

Unnamed: 0,Ensembl gene id,Ensembl gene,Ensembl biotype,UniProt proteins,UniProt genes,UniProt accessions,UniProt names,UniProt Ensembl transcripts,UniProt Ensembl proteins,UniProt Ensembl gene ids,...,plt49.run1 pat_3563,plt49.run1 pat_3564,plt49.run1 pat_3565,plt49.run1 pat_3566,plt49.run1 pat_3567,plt49.run1 pat_3568,plt49.run1 pat_3569,plt49.run1 pat_3570,plt49.run1 pat_3571,plt49.run1 pat_3572
0,ENSG00000146555,SDK1,protein_coding,Protein sidekick-1,SDK1,Q7Z5N4; F8W6X9; A0A087WTQ6,SDK1_HUMAN; F8W6X9_HUMAN; A0A087WTQ6_HUMAN,ENST00000404826; ENST00000389531; ENST00000615806,ENSP00000385899; ENSP00000374182; ENSP00000478062,ENSG00000146555,...,5,2,2,0,2,0,1,2,1,0
1,ENSG00000283537,ENSG00000283537,transcribed_unprocessed_pseudogene,,,,,,,ENSG00000283537,...,0,0,1,0,0,0,0,0,0,0
2,ENSG00000166523,CLEC4E,protein_coding,C-type lectin domain family 4 member E,"CLEC4E, CLECSF9, MINCLE, UNQ218/PRO244; CLEC4E",Q9ULY5; F8WFA1; F5H5X7,CLC4E_HUMAN; F8WFA1_HUMAN; F5H5X7_HUMAN,"ENST00000299663; ENST00000446457, ENST00000450...","ENSP00000299663; ENSP00000387737, ENSP00000404...",ENSG00000166523,...,198,96,90,287,309,94,62,215,261,131
3,ENSG00000125945,ZNF436,protein_coding,Zinc finger protein 436,"ZNF436, KIAA1710",Q9C0F3,ZN436_HUMAN,"ENST00000314011, ENST00000374608, ENST00000635...","ENSP00000313582, ENSP00000363736, ENSP00000489...",ENSG00000125945,...,40,21,70,39,71,15,26,66,41,17
4,ENSG00000206145,P2RX6P,transcribed_unprocessed_pseudogene,,,,,,,ENSG00000206145,...,0,0,0,0,0,1,0,1,0,0


In [8]:
gene_df.to_csv('gene_meta.csv', index =False) # Gen-Datei speichern

In [9]:
features = gene_df.loc[:,['Ensembl gene id', 'Ensembl gene', 'Ensembl biotype', 'UniProt proteins', 'UniProt genes', 'UniProt accessions', 'UniProt names', 'UniProt Ensembl transcripts', 'UniProt Ensembl proteins', 'UniProt Ensembl gene ids', 'Ensembl chr', 'Ensembl start', 'Ensembl stop', 'Ensembl strand', 'KEGG PATHWAY terms', 'KEGG PATHWAY ids', 'Gene Ontology terms', 'Gene Ontology ids']] # Auswahl der ersten 18 Spalten -> die Gene ids und Spalten mit Metadaten
features.head()

Unnamed: 0,Ensembl gene id,Ensembl gene,Ensembl biotype,UniProt proteins,UniProt genes,UniProt accessions,UniProt names,UniProt Ensembl transcripts,UniProt Ensembl proteins,UniProt Ensembl gene ids,Ensembl chr,Ensembl start,Ensembl stop,Ensembl strand,KEGG PATHWAY terms,KEGG PATHWAY ids,Gene Ontology terms,Gene Ontology ids
0,ENSG00000146555,SDK1,protein_coding,Protein sidekick-1,SDK1,Q7Z5N4; F8W6X9; A0A087WTQ6,SDK1_HUMAN; F8W6X9_HUMAN; A0A087WTQ6_HUMAN,ENST00000404826; ENST00000389531; ENST00000615806,ENSP00000385899; ENSP00000374182; ENSP00000478062,ENSG00000146555,chr7,3301252,4269000,+,,,"eye development, neural retina development, bi...","GO:0001654, GO:0003407, GO:0005488, GO:0005515..."
1,ENSG00000283537,ENSG00000283537,transcribed_unprocessed_pseudogene,,,,,,,ENSG00000283537,chr7,143620943,143645675,+,,,,
2,ENSG00000166523,CLEC4E,protein_coding,C-type lectin domain family 4 member E,"CLEC4E, CLECSF9, MINCLE, UNQ218/PRO244; CLEC4E",Q9ULY5; F8WFA1; F5H5X7,CLC4E_HUMAN; F8WFA1_HUMAN; F5H5X7_HUMAN,"ENST00000299663; ENST00000446457, ENST00000450...","ENSP00000299663; ENSP00000387737, ENSP00000404...",ENSG00000166523,chr12,8533305,8540905,-,Tuberculosis,hsa05152,"cell activation, cytokine production, regulati...","GO:0001775, GO:0001816, GO:0001817, GO:0001819..."
3,ENSG00000125945,ZNF436,protein_coding,Zinc finger protein 436,"ZNF436, KIAA1710",Q9C0F3,ZN436_HUMAN,"ENST00000314011, ENST00000374608, ENST00000635...","ENSP00000313582, ENSP00000363736, ENSP00000489...",ENSG00000125945,chr1,23359448,23369836,-,,,nucleic acid binding transcription factor acti...,"GO:0001071, GO:0003676, GO:0003677, GO:0003700..."
4,ENSG00000206145,P2RX6P,transcribed_unprocessed_pseudogene,,,,,,,ENSG00000206145,chr22,21035243,21045017,-,,,,


In [10]:
features.to_csv("variables.tsv", sep="\t", index=False, header=False) # Variables-Datei speichern

### 3. --- Barcodes Datei aus Patienten-Metadaten erstellen ---

In [11]:
# Barcodes-Datei (Patienten + Metadaten)

patient_ids = gene_df.columns[18:]  # Patienten-Spalten starten ab Spalte 18 
patient_ids

Index(['plt1.run1 pat_88', 'plt1.run1 pat_89', 'plt1.run1 pat_90',
       'plt1.run1 pat_91', 'plt1.run1 pat_92', 'plt1.run1 pat_93',
       'plt1.run1 pat_94', 'plt1.run1 pat_95', 'plt1.run1 pat_96',
       'plt1.run1 pat_97',
       ...
       'plt49.run1 pat_3563', 'plt49.run1 pat_3564', 'plt49.run1 pat_3565',
       'plt49.run1 pat_3566', 'plt49.run1 pat_3567', 'plt49.run1 pat_3568',
       'plt49.run1 pat_3569', 'plt49.run1 pat_3570', 'plt49.run1 pat_3571',
       'plt49.run1 pat_3572'],
      dtype='object', length=3661)

In [12]:
barcodes = pd.DataFrame({
    "rna project.run id": patient_ids # Benennung der Spalte in barcodes.tsv mit "rna project.run id"
}).merge(pat_df, left_on="rna project.run id", right_on="rna project.run id", how="left")  # Metadaten anfügen
barcodes = barcodes.fillna(0)
barcodes.to_csv("barcodes.tsv", sep="\t", index=False, header=False)
barcodes.tail()

Unnamed: 0,rna project.run id,TubeID,meta TubeID,meta pop export_psn,meta pop visit_1_date,meta pop visit_2_date,meta pop visit_3_date,meta pop visit_4_date,meta pop age,meta pop age_dec,...,rna unassigned reads not mapping,rna unassigned reads due to bad mapping quality,rna unassigned reads due to low fragment length,rna unassigned chimeric reads,rna usable sample: mapped reads > 2000000,rna DDX3Y,rna XIST,rna gender,rna sample id original,rna plate id original
3656,plt49.run1 pat_3568,0,0,0,0,0,0,0,0.0,0,...,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0,0
3657,plt49.run1 pat_3569,0,0,0,0,0,0,0,0.0,0,...,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0,0
3658,plt49.run1 pat_3570,0,0,0,0,0,0,0,0.0,0,...,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0,0
3659,plt49.run1 pat_3571,0,0,0,0,0,0,0,0.0,0,...,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0,0
3660,plt49.run1 pat_3572,0,0,0,0,0,0,0,0.0,0,...,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0,0


In [13]:
barcodes = pd.read_csv("barcodes.tsv", decimal=".", sep="\t") # Barcodes-Datei speichern

### 4. --- Count-Matrix erstellen aus Gen-Metadaten ---

In [14]:
counts = gene_df.iloc[:, 18:].fillna(0) # Zählwerte extrahieren und NaN durch 0 ersetzen
counts

Unnamed: 0,plt1.run1 pat_88,plt1.run1 pat_89,plt1.run1 pat_90,plt1.run1 pat_91,plt1.run1 pat_92,plt1.run1 pat_93,plt1.run1 pat_94,plt1.run1 pat_95,plt1.run1 pat_96,plt1.run1 pat_97,...,plt49.run1 pat_3563,plt49.run1 pat_3564,plt49.run1 pat_3565,plt49.run1 pat_3566,plt49.run1 pat_3567,plt49.run1 pat_3568,plt49.run1 pat_3569,plt49.run1 pat_3570,plt49.run1 pat_3571,plt49.run1 pat_3572
0,2,0,0,3,3,0,1,1,16,11,...,5,2,2,0,2,0,1,2,1,0
1,0,0,0,0,0,0,0,0,0,3,...,0,0,1,0,0,0,0,0,0,0
2,225,34,13,102,156,175,389,450,224,157,...,198,96,90,287,309,94,62,215,261,131
3,73,17,2,48,102,27,65,62,90,73,...,40,21,70,39,71,15,26,66,41,17
4,0,0,0,0,4,0,2,1,0,0,...,0,0,0,0,0,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57026,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
57027,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
57028,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
57029,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
sparse_matrix = coo_matrix(counts.values) # Sparse-Matrix erstellen
sparse_matrix

<COOrdinate sparse matrix of dtype 'object'
	with 208790491 stored elements and shape (57031, 3661)>

In [16]:
mmwrite("matrix.mtx", sparse_matrix) # MTX-Datei speichern

### 5. --- Anndata-Objekt aus .mtx, barcodes.tsv und genes.tsv erstellen ---

In [17]:
mtx_tsv = 'matrix.mtx' # Datei, die die Counts enthält
barcodes_tsv = 'barcodes.tsv' # Datei, die Barcode-Informationen enthält
variables_tsv = 'variables.tsv' # Datei, die variable Informationen enthält

In [18]:
print(pat_df.shape, barcodes.shape, gene_df.shape, features.shape) # Shape überprüfen

(3776, 75) (3660, 75) (57031, 3679) (57031, 18)


In [19]:
if path_mtx:
    adata = assembler.from_mtx(path_mtx, mtx=mtx_tsv, barcodes=barcodes_tsv, variables=variables_tsv)

[INFO] Reading files: 1 of 1 


### 6. --- AnnData vorbereiten ---

In [37]:
# Adata Obs einen Header hinzufügren
adata.obs_names.name = 'rna project.run id'
obs_columns_list = pat_df.columns.tolist()
obs_columns_list.remove('rna project.run id')
obs_columns_list.append('filename') # möglicherweise muss die komplette Spalte raus
obs_columns_list.append('rel_path') # möglicherweise muss die komplette Spalte raus
obs_header = obs_columns_list
adata.obs.columns = obs_header
adata

AnnData object with n_obs × n_vars = 3661 × 57031
    obs: 'TubeID', 'meta TubeID', 'meta pop export_psn', 'meta pop visit_1_date', 'meta pop visit_2_date', 'meta pop visit_3_date', 'meta pop visit_4_date', 'meta pop age', 'meta pop age_dec', 'meta pop sex', 'meta pop c19_severity', 'meta pop cohort', 'meta suep_hap export_psn', 'meta suep_hap baseline_date', 'meta suep_hap end_acute_date', 'meta suep_hap end_acute_visit_date', 'meta suep_hap m3_fu_date', 'meta suep_hap m6_fu_date', 'meta suep_hap m12_fu_date', 'meta suep_hap m24_fu_date', 'meta suep_hap age', 'meta suep_hap age_dec', 'meta suep_hap sex', 'meta suep_hap c19_severity', 'meta suep_hap cohort', 'meta export_psn', 'meta age', 'meta age_dec', 'meta sex', 'meta c19_severity', 'meta cohort', 'rna TubeID', 'rna id', 'rna fastq1', 'rna project', 'rna run', 'rna plate', 'rna organism', 'rna assembly', 'rna release', 'rna parameters', 'rna reads processed', 'rna reads with exactly 1 alignment', 'rna reads with multiple alignments

In [38]:
var_header = features.columns.tolist()
var_header.remove("Ensembl gene id")
adata.var_names.name = "Ensembl gene id"
adata.var.columns = var_header

In [39]:
with pd.option_context('display.max_rows', 5,'display.max_columns', None):
    display(adata.obs)
    display(adata.var)

Unnamed: 0_level_0,TubeID,meta TubeID,meta pop export_psn,meta pop visit_1_date,meta pop visit_2_date,meta pop visit_3_date,meta pop visit_4_date,meta pop age,meta pop age_dec,meta pop sex,meta pop c19_severity,meta pop cohort,meta suep_hap export_psn,meta suep_hap baseline_date,meta suep_hap end_acute_date,meta suep_hap end_acute_visit_date,meta suep_hap m3_fu_date,meta suep_hap m6_fu_date,meta suep_hap m12_fu_date,meta suep_hap m24_fu_date,meta suep_hap age,meta suep_hap age_dec,meta suep_hap sex,meta suep_hap c19_severity,meta suep_hap cohort,meta export_psn,meta age,meta age_dec,meta sex,meta c19_severity,meta cohort,rna TubeID,rna id,rna fastq1,rna project,rna run,rna plate,rna organism,rna assembly,rna release,rna parameters,rna reads processed,rna reads with exactly 1 alignment,rna reads with multiple alignments,rna reads with no alignment,rna reads aligned,rna % reads aligned,rna duplicate reads,rna % duplicate reads,rna mitochondrial reads,rna % mitochondrial reads,rna rrna subunit reads,rna % rrna subunit reads,rna pcr bottleneck coefficient,rna mean fragment size (only for paired-end),"rna reads used for further steps (after optional filters for multimap, duplicate, mitochondria, rrna)",rna parameters.1,rna reads processed.1,rna reads assigned,rna % reads assigned total,rna % features with cpm >= 0.5,rna unassigned reads overlapping multiple genes,rna unassigned reads mapping to multiple locations,rna unassigned reads mapping to no feature,rna unassigned reads not mapping,rna unassigned reads due to bad mapping quality,rna unassigned reads due to low fragment length,rna unassigned chimeric reads,rna usable sample: mapped reads > 2000000,rna DDX3Y,rna XIST,rna gender,rna sample id original,rna plate id original,filename,rel_path
rna project.run id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1
plt1.run1 pat_88,LV1007699778,LV1007699778,0,0,0,0,0,0.0,0,0,0,0,hap_0178,17/11/2021,24/11/2021,24/11/2021,0,0,0,0,65.0,60-69,M,severe,hap,hap_0178,65.0,60-69,M,severe,hap,LV1007699778,pat_88,AAAVY52HV_NAPKON_Plate1_2_22s004137-1-1_Valasa...,plt1,run1,plate1,human,hg38,104.0,"mode: single-end, keep duplicates: no, keep mu...",29112743,23152813,4044454,1915476,27197267,93.0,20093139,73.0,1714158,6.0,1868907,6.0,0.28,0.0,6644895,-t exon -g gene_id -s 2; multi-mapping: no; du...,6644895,4810861,16.0,49.0,216.360,0.0,1617674,0.0,0.0,0.0,0.0,True,844.0,8.0,M,Sample 49,Plate 1,matrix.mtx,.
plt1.run1 pat_89,LV1007699743,LV1007699743,0,0,0,0,0,0.0,0,0,0,0,suep_0425,08/11/2021,17/11/2021,17/11/2021,0,0,0,0,61.0,60-69,M,mild,suep,suep_0425,61.0,60-69,M,mild,suep,LV1007699743,pat_89,AAAVY52HV_NAPKON_Plate1_2_22s004137-1-1_Valasa...,plt1,run1,plate1,human,hg38,104.0,"mode: single-end, keep duplicates: no, keep mu...",24355220,13253462,7713379,3388379,20966841,86.0,17678467,84.0,1475002,7.0,6372269,30.0,0.22,0.0,2989995,-t exon -g gene_id -s 2; multi-mapping: no; du...,2989995,2155768,8.0,45.0,97.961,0.0,736266,0.0,0.0,0.0,0.0,True,670.0,4.0,M,Sample 50,Plate 1,matrix.mtx,.
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
plt49.run1 pat_3571,0,0,0,0,0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0,0,0,0.0,0,0.0,0,0.0,0,0.0,0.00,0.0,0,0,0,0,0.0,0.0,0.000,0.0,0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0,0,matrix.mtx,.
plt49.run1 pat_3572,0,0,0,0,0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0,0,0,0.0,0,0.0,0,0.0,0,0.0,0.00,0.0,0,0,0,0,0.0,0.0,0.000,0.0,0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0,0,matrix.mtx,.


Unnamed: 0_level_0,Ensembl gene,Ensembl biotype,UniProt proteins,UniProt genes,UniProt accessions,UniProt names,UniProt Ensembl transcripts,UniProt Ensembl proteins,UniProt Ensembl gene ids,Ensembl chr,Ensembl start,Ensembl stop,Ensembl strand,KEGG PATHWAY terms,KEGG PATHWAY ids,Gene Ontology terms,Gene Ontology ids
Ensembl gene id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
ENSG00000146555,SDK1,protein_coding,Protein sidekick-1,SDK1,Q7Z5N4; F8W6X9; A0A087WTQ6,SDK1_HUMAN; F8W6X9_HUMAN; A0A087WTQ6_HUMAN,ENST00000404826; ENST00000389531; ENST00000615806,ENSP00000385899; ENSP00000374182; ENSP00000478062,ENSG00000146555,chr7,3301252,4269000,+,,,"eye development, neural retina development, bi...","GO:0001654, GO:0003407, GO:0005488, GO:0005515..."
ENSG00000283537,ENSG00000283537,transcribed_unprocessed_pseudogene,,,,,,,ENSG00000283537,chr7,143620943,143645675,+,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ENSG00000270696,ENSG00000270696,lncRNA,,,,,,,ENSG00000270696,chr2,75660462,75662208,+,,,,
ENSG00000229906,SNRPGP11,processed_pseudogene,,,,,,,ENSG00000229906,chr13,51496020,51496251,-,,,,


In [40]:
pat_df.columns.tolist()

['TubeID',
 'meta TubeID',
 'meta pop export_psn',
 'meta pop visit_1_date',
 'meta pop visit_2_date',
 'meta pop visit_3_date',
 'meta pop visit_4_date',
 'meta pop age',
 'meta pop age_dec',
 'meta pop sex',
 'meta pop c19_severity',
 'meta pop cohort',
 'meta suep_hap export_psn',
 'meta suep_hap baseline_date',
 'meta suep_hap end_acute_date',
 'meta suep_hap end_acute_visit_date',
 'meta suep_hap m3_fu_date',
 'meta suep_hap m6_fu_date',
 'meta suep_hap m12_fu_date',
 'meta suep_hap m24_fu_date',
 'meta suep_hap age',
 'meta suep_hap age_dec',
 'meta suep_hap sex',
 'meta suep_hap c19_severity',
 'meta suep_hap cohort',
 'meta export_psn',
 'meta age',
 'meta age_dec',
 'meta sex',
 'meta c19_severity',
 'meta cohort',
 'rna TubeID',
 'rna id',
 'rna fastq1',
 'rna project',
 'rna run',
 'rna plate',
 'rna organism',
 'rna assembly',
 'rna release',
 'rna parameters',
 'rna reads processed',
 'rna reads with exactly 1 alignment',
 'rna reads with multiple alignments',
 'rna reads 

In [41]:
adata.obs['rna usable sample: mapped reads > 2000000'] = adata.obs['rna usable sample: mapped reads > 2000000'].astype(str) 

In [42]:
# .obs-Spaltennamen, die gelöscht werden sollen
drop_obs = []

# .obs-Spaltennamen, die geändert werden sollen. z.B. "old_name": "new_name"
# Beispiel: "old_name" = "rna usable sample: mapped reads > 2000000" -> Regex bevorzugt
rename_obs = {'rna reads used for further steps (after optional filters for multimap, duplicate, mitochondria, rrna)':'rna reads used after filters', 'rna usable sample: mapped reads > 2000000':'mapped reads > 2000000'} 

#Namen ändern mithilfe von Regex
replace_obs = {'unassigned':'unass.','due to':'',' ':'_','%':'pct',':':'_of','>=':'at_least','>':'over'}
replace_var = {' ':'_'}

# .var-Spaltennamen, die gelöscht werden sollen
drop_var = []

# .var-Spaltennamen, die geändert werden sollen. z.B. "old_name": "new_name"
rename_var = {}


In [44]:
# Änderungen an .obs-Spaltennamen in eine Kopie absspeichern
obs = adata.obs.copy()

obs.drop(columns=drop_obs, inplace=True)
obs.rename(columns=rename_obs, errors='raise', inplace=True)

#replace
for x, y in replace_obs.items():
    obs.columns= obs.columns.str.replace(x, y,regex=True)
    
    
# Änderungen an .var-Spaltennamen in eine Kopie absspeichern
var = adata.var.copy()

var.drop(columns=drop_var, inplace=True)
var.rename(columns=rename_var, errors='raise', inplace=True)

for x, y in replace_var.items():
    var.columns= var.columns.str.replace(x, y,regex=True) 

# Änderungen an AnnData anwenden
adata.obs = obs
adata.var = var


### 7. --- AnnData-Objekt speichern ---

In [45]:
display(adata) # Übersicht
with pd.option_context('display.max_rows', 5,'display.max_columns', None):
    display(adata.obs)
    display(adata.var)

AnnData object with n_obs × n_vars = 3661 × 57031
    obs: 'TubeID', 'meta_TubeID', 'meta_pop_export_psn', 'meta_pop_visit_1_date', 'meta_pop_visit_2_date', 'meta_pop_visit_3_date', 'meta_pop_visit_4_date', 'meta_pop_age', 'meta_pop_age_dec', 'meta_pop_sex', 'meta_pop_c19_severity', 'meta_pop_cohort', 'meta_suep_hap_export_psn', 'meta_suep_hap_baseline_date', 'meta_suep_hap_end_acute_date', 'meta_suep_hap_end_acute_visit_date', 'meta_suep_hap_m3_fu_date', 'meta_suep_hap_m6_fu_date', 'meta_suep_hap_m12_fu_date', 'meta_suep_hap_m24_fu_date', 'meta_suep_hap_age', 'meta_suep_hap_age_dec', 'meta_suep_hap_sex', 'meta_suep_hap_c19_severity', 'meta_suep_hap_cohort', 'meta_export_psn', 'meta_age', 'meta_age_dec', 'meta_sex', 'meta_c19_severity', 'meta_cohort', 'rna_TubeID', 'rna_id', 'rna_fastq1', 'rna_project', 'rna_run', 'rna_plate', 'rna_organism', 'rna_assembly', 'rna_release', 'rna_parameters', 'rna_reads_processed', 'rna_reads_with_exactly_1_alignment', 'rna_reads_with_multiple_alignments

Unnamed: 0_level_0,TubeID,meta_TubeID,meta_pop_export_psn,meta_pop_visit_1_date,meta_pop_visit_2_date,meta_pop_visit_3_date,meta_pop_visit_4_date,meta_pop_age,meta_pop_age_dec,meta_pop_sex,meta_pop_c19_severity,meta_pop_cohort,meta_suep_hap_export_psn,meta_suep_hap_baseline_date,meta_suep_hap_end_acute_date,meta_suep_hap_end_acute_visit_date,meta_suep_hap_m3_fu_date,meta_suep_hap_m6_fu_date,meta_suep_hap_m12_fu_date,meta_suep_hap_m24_fu_date,meta_suep_hap_age,meta_suep_hap_age_dec,meta_suep_hap_sex,meta_suep_hap_c19_severity,meta_suep_hap_cohort,meta_export_psn,meta_age,meta_age_dec,meta_sex,meta_c19_severity,meta_cohort,rna_TubeID,rna_id,rna_fastq1,rna_project,rna_run,rna_plate,rna_organism,rna_assembly,rna_release,rna_parameters,rna_reads_processed,rna_reads_with_exactly_1_alignment,rna_reads_with_multiple_alignments,rna_reads_with_no_alignment,rna_reads_aligned,rna_pct_reads_aligned,rna_duplicate_reads,rna_pct_duplicate_reads,rna_mitochondrial_reads,rna_pct_mitochondrial_reads,rna_rrna_subunit_reads,rna_pct_rrna_subunit_reads,rna_pcr_bottleneck_coefficient,rna_mean_fragment_size_(only_for_paired-end),rna_reads_used_after_filters,rna_parameters.1,rna_reads_processed.1,rna_reads_assigned,rna_pct_reads_assigned_total,rna_pct_features_with_cpm_at_least_0.5,rna_unass._reads_overlapping_multiple_genes,rna_unass._reads_mapping_to_multiple_locations,rna_unass._reads_mapping_to_no_feature,rna_unass._reads_not_mapping,rna_unass._reads__bad_mapping_quality,rna_unass._reads__low_fragment_length,rna_unass._chimeric_reads,mapped_reads_over_2000000,rna_DDX3Y,rna_XIST,rna_gender,rna_sample_id_original,rna_plate_id_original,filename,rel_path
rna project.run id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1
plt1.run1 pat_88,LV1007699778,LV1007699778,0,0,0,0,0,0.0,0,0,0,0,hap_0178,17/11/2021,24/11/2021,24/11/2021,0,0,0,0,65.0,60-69,M,severe,hap,hap_0178,65.0,60-69,M,severe,hap,LV1007699778,pat_88,AAAVY52HV_NAPKON_Plate1_2_22s004137-1-1_Valasa...,plt1,run1,plate1,human,hg38,104.0,"mode: single-end, keep duplicates: no, keep mu...",29112743,23152813,4044454,1915476,27197267,93.0,20093139,73.0,1714158,6.0,1868907,6.0,0.28,0.0,6644895,-t exon -g gene_id -s 2; multi-mapping: no; du...,6644895,4810861,16.0,49.0,216.360,0.0,1617674,0.0,0.0,0.0,0.0,True,844.0,8.0,M,Sample 49,Plate 1,matrix.mtx,.
plt1.run1 pat_89,LV1007699743,LV1007699743,0,0,0,0,0,0.0,0,0,0,0,suep_0425,08/11/2021,17/11/2021,17/11/2021,0,0,0,0,61.0,60-69,M,mild,suep,suep_0425,61.0,60-69,M,mild,suep,LV1007699743,pat_89,AAAVY52HV_NAPKON_Plate1_2_22s004137-1-1_Valasa...,plt1,run1,plate1,human,hg38,104.0,"mode: single-end, keep duplicates: no, keep mu...",24355220,13253462,7713379,3388379,20966841,86.0,17678467,84.0,1475002,7.0,6372269,30.0,0.22,0.0,2989995,-t exon -g gene_id -s 2; multi-mapping: no; du...,2989995,2155768,8.0,45.0,97.961,0.0,736266,0.0,0.0,0.0,0.0,True,670.0,4.0,M,Sample 50,Plate 1,matrix.mtx,.
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
plt49.run1 pat_3571,0,0,0,0,0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0,0,0,0.0,0,0.0,0,0.0,0,0.0,0.00,0.0,0,0,0,0,0.0,0.0,0.000,0.0,0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0,0,matrix.mtx,.
plt49.run1 pat_3572,0,0,0,0,0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0,0,0,0.0,0,0.0,0,0.0,0,0.0,0.00,0.0,0,0,0,0,0.0,0.0,0.000,0.0,0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0,0,matrix.mtx,.


Unnamed: 0_level_0,Ensembl_gene,Ensembl_biotype,UniProt_proteins,UniProt_genes,UniProt_accessions,UniProt_names,UniProt_Ensembl_transcripts,UniProt_Ensembl_proteins,UniProt_Ensembl_gene_ids,Ensembl_chr,Ensembl_start,Ensembl_stop,Ensembl_strand,KEGG_PATHWAY_terms,KEGG_PATHWAY_ids,Gene_Ontology_terms,Gene_Ontology_ids
Ensembl gene id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
ENSG00000146555,SDK1,protein_coding,Protein sidekick-1,SDK1,Q7Z5N4; F8W6X9; A0A087WTQ6,SDK1_HUMAN; F8W6X9_HUMAN; A0A087WTQ6_HUMAN,ENST00000404826; ENST00000389531; ENST00000615806,ENSP00000385899; ENSP00000374182; ENSP00000478062,ENSG00000146555,chr7,3301252,4269000,+,,,"eye development, neural retina development, bi...","GO:0001654, GO:0003407, GO:0005488, GO:0005515..."
ENSG00000283537,ENSG00000283537,transcribed_unprocessed_pseudogene,,,,,,,ENSG00000283537,chr7,143620943,143645675,+,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ENSG00000270696,ENSG00000270696,lncRNA,,,,,,,ENSG00000270696,chr2,75660462,75662208,+,,,,
ENSG00000229906,SNRPGP11,processed_pseudogene,,,,,,,ENSG00000229906,chr13,51496020,51496251,-,,,,


In [46]:
adata_output = "anndata_1.h5ad"
utils.adata.save_h5ad(adata, adata_output) # speichern

[INFO] The adata object was saved to: anndata_1.h5ad


In [47]:
sctoolbox.settings.close_logfile()