# Datenexploration & QC

## Benötigte Python-Module laden

In [None]:
import pandas as pd
import scanpy as sc
import anndata as ad
import matplotlib.pyplot as plt

## Daten einlesen und ins richtige Format bringen

In [None]:
# Expression table: the first CSV column is used as the row index.
data = pd.read_csv("GSE2034_data.csv", index_col=0)

# Metadata table, loaded the same way and then re-ordered so that its rows
# follow the column order of `data`.
metadata = pd.read_csv("GSE2034_metadata.csv", index_col=0)
metadata = metadata.reindex(data.columns)

In [None]:
# Build the AnnData container: the expression table is transposed so that
# the columns of `data` become rows, matching the rows of `metadata`
# (which is passed as the obs annotation).
dds = ad.AnnData(X=data.transpose(), obs=metadata)

In [None]:
# Display the DataFrame that was read from GSE2034_data.csv.
data

## Wie sieht dieses Format dann aus?

In [None]:
# Display the data matrix stored in the AnnData object.
dds.X

In [None]:
# Display the per-observation annotation table (the metadata passed as obs).
dds.obs

In [None]:
# Display the per-variable annotation table of the AnnData object.
dds.var

## Berechnen der QC-Metriken und Visualisierung

In [None]:
# Compute scanpy's QC metrics; with inplace=True the results are stored on
# dds (in dds.obs and dds.var) instead of being returned.
sc.pp.calculate_qc_metrics(dds, inplace=True)

In [None]:
# Display dds.var again to inspect the per-variable columns it now holds.
dds.var

In [None]:
# Histogram of the total_counts column of dds.var; the y-axis counts genes,
# as its label states.
dds.var["total_counts"].hist(figsize=(5, 5))
plt.xlabel("total_counts")
plt.ylabel("Anzahl Gene")

In [None]:
# Histogram of the n_genes_by_counts column of dds.obs.
dds.obs["n_genes_by_counts"].hist()

## Normalisierung der Daten

In [None]:
# Keep the untouched input matrix in a layer before X is modified. The
# explicit copy guarantees that the layer can never alias X.
dds.layers["counts"] = dds.X.copy()

# Cast X to float before the values are rescaled.
dds.X = dds.X.astype(float)

# sc.pp.normalize_per_cell is deprecated; sc.pp.normalize_total is its
# documented replacement. With the default target_sum every observation is
# scaled to the median of the per-observation totals, as before.
# key_added="n_counts" keeps writing those totals to dds.obs["n_counts"],
# which the old call did implicitly. Unlike the old call, observations are
# not filtered by a minimum total.
sc.pp.normalize_total(dds, key_added="n_counts")

In [None]:
# Apply scanpy's log1p transform to dds.
sc.pp.log1p(dds)

# Store a copy of dds, as it is at this point, in dds.raw.
dds.raw = dds.copy()

In [None]:
# Histogram of the log1p_total_counts column of dds.var, drawn with a
# logarithmic count axis (log=True).
dds.var["log1p_total_counts"].hist(log=True)

## Visualisierung der Daten & Überprüfung der Metadaten

In [None]:
# Run PCA on dds using all variables rather than a highly-variable subset.
# NOTE(review): use_highly_variable is reportedly deprecated in newer scanpy
# releases in favour of mask_var - confirm against the installed version.
sc.pp.pca(dds, use_highly_variable=False)

In [None]:
# Heatmap of the variables XIST and UTY, with observations grouped by the
# "relapse" column of dds.obs.
# NOTE(review): the matching cell in the snippet section groups by
# "ER Status" instead - confirm which grouping is intended.
sc.pl.heatmap(dds, groupby="relapse", var_names=["XIST", "UTY"])

In [None]:
# PCA scatter plots, one per listed dds.obs column used for colouring.
sc.pl.pca(
    dds,
    color=[
        "ER Status",
        "time to relapse or last follow-up (months)",
        "relapse",
    ],
)

## Speichern der Daten für den nächsten Schritt

In [None]:
# Save the AnnData object to processed_data.h5ad.
dds.write_h5ad("processed_data.h5ad")

# Benötigte Code-Schnipsel in zufälliger Reihenfolge

In [None]:
# Histogram of the n_genes_by_counts column of dds.obs.
dds.obs["n_genes_by_counts"].hist()

In [None]:
# Display the per-observation annotation table of the AnnData object.
dds.obs

In [None]:
# PCA scatter plots, one per listed dds.obs column used for colouring.
sc.pl.pca(
    dds,
    color=[
        "ER Status",
        "time to relapse or last follow-up (months)",
        "relapse",
    ],
)

In [None]:
# Keep the untouched input matrix in a layer before X is modified. The
# explicit copy guarantees that the layer can never alias X.
dds.layers["counts"] = dds.X.copy()

# Cast X to float before the values are rescaled.
dds.X = dds.X.astype(float)

# sc.pp.normalize_per_cell is deprecated; sc.pp.normalize_total is its
# documented replacement. With the default target_sum every observation is
# scaled to the median of the per-observation totals, as before.
# key_added="n_counts" keeps writing those totals to dds.obs["n_counts"],
# which the old call did implicitly. Unlike the old call, observations are
# not filtered by a minimum total.
sc.pp.normalize_total(dds, key_added="n_counts")

In [None]:
# Apply scanpy's log1p transform to dds.
sc.pp.log1p(dds)

# Store a copy of dds, as it is at this point, in dds.raw.
dds.raw = dds.copy()

In [None]:
# Apply scanpy's log1p transform to dds.
# NOTE(review): this snippet is identical to the one directly before it,
# while the worked section above applies log1p only once - confirm the
# duplicate is intended.
sc.pp.log1p(dds)

# Store a copy of dds, as it is at this point, in dds.raw.
dds.raw = dds.copy()

In [None]:
# Display the per-variable annotation table of the AnnData object.
dds.var

In [None]:
# Histogram of the total_counts column of dds.var.
dds.var["total_counts"].hist()

In [None]:
# Save the AnnData object to processed_data.h5ad.
dds.write_h5ad("processed_data.h5ad")

In [None]:
# Histogram of the total_counts column of dds.obs.
dds.obs["total_counts"].hist(figsize=(5, 5))

In [None]:
# Build the AnnData container from the transposed expression table, with
# the metadata table as the obs annotation.
dds = ad.AnnData(X=data.transpose(), obs=metadata)

In [None]:
# Display the data matrix stored in the AnnData object.
dds.X

In [None]:
# Heatmap of the variables XIST and UTY, with observations grouped by the
# "ER Status" column of dds.obs.
# NOTE(review): the worked section above groups the same heatmap by
# "relapse" - confirm which grouping is intended.
sc.pl.heatmap(dds, groupby="ER Status", var_names=["XIST", "UTY"])

In [None]:
# Run PCA on dds using all variables rather than a highly-variable subset.
# NOTE(review): use_highly_variable is reportedly deprecated in newer scanpy
# releases in favour of mask_var - confirm against the installed version.
sc.pp.pca(dds, use_highly_variable=False)

In [None]:
# Box plot of the total_counts column of dds.obs.
dds.obs["total_counts"].plot.box(figsize=(5, 5))

In [None]:
# Expression table: the first CSV column is used as the row index.
data = pd.read_csv("GSE2034_data.csv", index_col=0)

# Metadata table, loaded the same way and then re-ordered so that its rows
# follow the column order of `data`.
metadata = pd.read_csv("GSE2034_metadata.csv", index_col=0)
metadata = metadata.reindex(data.columns)

In [None]:
# Compute scanpy's QC metrics; with inplace=True the results are stored on
# dds (in dds.obs and dds.var) instead of being returned.
sc.pp.calculate_qc_metrics(dds, inplace=True)