# Data Modeling

In [None]:
import scanpy as sc
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from scipy.cluster.hierarchy import fcluster
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import tree

In [None]:
def train_test_split(adata, fraction: float, random_state: int = 42):
    """Split a data object into a training and a test subset.

    The test observations are drawn at random from ``adata.obs``; every
    observation that was not drawn ends up in the training subset.

    Args:
    -----
        adata: The input dataset.
        fraction (float): The size of the test set as fraction of the total dataset.
        random_state (int): Seed for the random draw. The default (42)
            reproduces the split this function has always produced.

    Returns:
    --------
        training data, test data (in this order), each as a copy.
    """
    test_idx = adata.obs.sample(frac=fraction, random_state=random_state).index
    # Everything that was not sampled into the test set is training data.
    is_train = ~adata.obs_names.isin(test_idx)
    return adata[is_train].copy(), adata[test_idx].copy()

In [None]:
def agglomerative_clustering(
    adata, nclusters: int, groupby: str = "Sample", n_pcs: int = None
):
    """Conduct agglomerative clustering.

    Result is saved in adata.obs['Cluster'] (``adata`` is modified in place;
    nothing is returned). The dendrogram computed along the way is stored by
    scanpy in ``adata.uns[f"dendrogram_{groupby}"]``.

    Args:
    -----
        adata: The input dataset.
        nclusters (int): The number of expected clusters.
        groupby (str): Average information by `groupby`.
        n_pcs (int): Number of principal components to use.
            ``None`` (default) leaves the choice to ``sc.tl.dendrogram``.
    """
    if groupby == "Sample" and "Sample" not in adata.obs.columns:
        # No explicit sample column: treat every observation as its own group.
        adata.obs["Sample"] = adata.obs_names.astype("category")
    # Forward n_pcs -- it used to be accepted but silently ignored.
    sc.tl.dendrogram(adata, groupby=groupby, n_pcs=n_pcs)
    labels = adata.obs[groupby].cat.categories
    # Cut the hierarchical tree so that at most `nclusters` flat clusters remain.
    clusters = fcluster(
        adata.uns[f"dendrogram_{groupby}"]["linkage"], t=nclusters, criterion="maxclust"
    )
    # One cluster id per group label (assumes the linkage leaves are ordered
    # like the categories of `groupby`), then mapped back to each observation.
    clusters = pd.Series(clusters, index=labels).astype("category")
    clusters = clusters.reindex(adata.obs[groupby].values).values
    adata.obs["Cluster"] = clusters

# Daten einlesen

In [None]:
# Read the data -- which format is it?

data = sc.read("processed_data.h5ad")

## Differentielle Expressionsanalyse

In [None]:
# DE Analyse durchführen



In [None]:
# Extrahieren der Ergebnisse

In [None]:
# Darstellen der Ergebnisse

## Clustering

In [None]:
# Clustering

In [None]:
# Darstellen des Dendrograms

In [None]:
# Darstellen als PCA

In [None]:
# Darstellen als Heatmap

## Random Forest

In [None]:
import pandas as pd

In [None]:
# Display data.obs (the table the `relapse` labels are read from further down).
data.obs

In [None]:
# Wrap data.obs in a pandas DataFrame.
# NOTE(review): `df` is not referenced again in the visible cells.
df = pd.DataFrame(data.obs)

In [None]:
# Split the data into training & test data.
#
# This calls the train_test_split helper defined in this notebook (not
# sklearn.model_selection.train_test_split). The helper returns
# (training data, test data) and `fraction` is the size of the TEST set,
# so 0.1 yields a 90 % training / 10 % test split.
train, test = train_test_split(data, 0.1)

In [None]:
# Size of the test split.
len(test)

In [None]:
# Size of the training split.
len(train)

In [None]:
# Random forest classifier with 30 trees; every hyper-parameter is spelled
# out explicitly so that it can be tuned in place.
model = RandomForestClassifier(
    # ensemble
    n_estimators=30,
    bootstrap=True,
    oob_score=False,
    max_samples=None,
    warm_start=False,
    # individual trees
    criterion='gini',
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    min_weight_fraction_leaf=0.0,
    max_features='sqrt',
    max_leaf_nodes=None,
    min_impurity_decrease=0.0,
    ccp_alpha=0.0,
    class_weight=None,
    # runtime
    n_jobs=None,
    random_state=None,
    verbose=0,
)

In [None]:
# Fit the forest on train.X with the `relapse` column of train.obs as target labels.
model.fit(train.X, train.obs.relapse)

In [None]:
# Random Forest: https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html

In [None]:
# Fitten des Random Forest

In [None]:
# Predict on the training data (pred1) and on the test data (pred2)

pred1 = model.predict(train.X)
pred2 = model.predict(test.X)

In [None]:
# Show as confusion matrix: true `relapse` labels of the test data vs. the
# predictions made on the test data (pred2)

from sklearn.metrics import confusion_matrix
cm = confusion_matrix(test.obs.relapse.values, pred2)
cm

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sn
plt.figure
sn.heatmap (cm, annot=True)
plt.xlabel("predicted")
plt.ylabel("truth")

In [None]:
# How good is the performance? Which metrics do we know?
# Scores the fitted model on the held-out test split.

model.score(test.X, test.obs.relapse)

In [None]:
# Wiederholen für die Test Daten?

In [None]:
# Was ist der Unterschied? Wie unterscheidet sich die Performance?

# Code-Snippets, die hilfreich sein könnten

In [None]:
# https://scanpy.readthedocs.io/en/stable/generated/scanpy.tl.rank_genes_groups.html
# Template: replace the "<...>" placeholders (data object, obs column name,
# group to test and reference group) before running.
sc.tl.rank_genes_groups(
    "<data>", groupby="<column name>", groups=["<diseased>"], reference="<reference>", method="t-test"
)

In [None]:
# https://scanpy.readthedocs.io/en/stable/generated/scanpy.get.rank_genes_groups_df.html
# Template: fetch the result table for one group (pval_cutoff=0.05) and keep
# only the rows whose absolute log fold change exceeds 0.585.
results = (
    sc.get.rank_genes_groups_df("<data>", group="<diseased>", pval_cutoff=0.05)
    .query("abs(logfoldchanges) > 0.585")
)

In [None]:
# https://scanpy.readthedocs.io/en/stable/generated/scanpy.pl.clustermap.html
# Template: replace the "<...>" placeholders (data object and the obs column
# used to annotate the rows) before running.
sc.pl.clustermap(
    "<data>",
    obs_keys="<column name>",
    use_raw=False,
)

In [None]:
# Computing the dendrogram: https://scanpy.readthedocs.io/en/stable/generated/scanpy.tl.dendrogram.html
# Plotting it (the call below): https://scanpy.readthedocs.io/en/stable/generated/scanpy.pl.dendrogram.html
# Template: replace "<data>" with the data object before running.
sc.pl.dendrogram("<data>", groupby="Sample")

In [None]:
# https://scanpy.readthedocs.io/en/stable/generated/scanpy.pl.pca.html
# Template: replace the "<...>" placeholders (data object and the obs column
# used for coloring) before running.
sc.pl.pca("<data>", color="<column name>")

In [None]:
# https://scanpy.readthedocs.io/en/latest/generated/scanpy.pl.clustermap.html
# NOTE(review): same call as the multi-line clustermap snippet a few cells up.
# Template: replace the "<...>" placeholders before running.
sc.pl.clustermap("<data>", obs_keys="<column name>", use_raw=False)

In [None]:
# https://www.pycm.io/doc/index.html#Direct-CM
import pycm  # used below but not imported in any of the visible cells

# Template: replace the "<...>" placeholders (the data split and the matching
# vector of predictions) before running.
traincm = pycm.ConfusionMatrix(actual_vector="<training data>".obs.relapse.values.astype(str), predict_vector="<predicted data>")
traincm.plot(number_label=True)

In [None]:
# Template: draw a single decision tree of the fitted random forest.
# Replace the "<...>" placeholders (tree index, feature names, class names)
# before running.
fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(4, 4), dpi=800)
tree.plot_tree(
    # `model` is the RandomForestClassifier fitted in this notebook; the
    # snippet used to reference an undefined name `rf`.
    model.estimators_["<tree index>"],
    feature_names="<feature_names>",
    class_names="<class_names>",
    impurity=False,
    filled=True,
    ax=axes,  # draw into the axes created above
);