# Preprocessing

In [None]:
import pandas as pd
import seaborn as sns

Clean up data. Classify samples into supergroups.

In [None]:
# Directory holding the input CSV and the cleaned CSVs written by this notebook.
data_dir = "../data/"

path = data_dir + "tissue_filtered_neuroplastoma.csv"

# Read `disease` as text; rows without a disease value are labelled "control".
df = pd.read_csv(path, dtype={"disease": str})
df = df.fillna(value={"disease": "control"})
# Drop the first two columns (selected by position).
df = df.iloc[:, 2:]
# Exclude the "hOB" source and the two "Cells - ..." tissue categories.
df = df[df.source != "hOB"]
df = df[(df.tissue != "Cells - EBV-transformed lymphocytes")
        & (df.tissue != "Cells - Cultured fibroblasts")]
# Materialise the filtered rows as an independent frame so that the column
# assignments below do not write to a slice (avoids SettingWithCopyWarning).
df = df.copy()

# Supergroup label: starts as the disease label, is overridden with "brain"
# for every tissue whose name starts with "Brain", and the remaining
# "control" samples are renamed to "other".
df["group"] = df["disease"]
# na=False: a missing tissue name is treated as "not brain" instead of
# producing a NaN in the mask, which .loc would reject.
df.loc[df.tissue.str.startswith("Brain", na=False), "group"] = "brain"
df["group"] = df["group"].replace("control", "other")
# Disabled: separate "nerve" supergroup.
#df.loc[df.tissue.str.startswith("Nerve -"), "group"] = "nerve"
# From here on `disease` is a boolean flag: True for any non-control sample.
df["disease"] = df["disease"] != "control"
df = df.sort_values("tissue").reset_index(drop=True)
display(df.head())
df.to_csv(data_dir + "data_scaled.csv", index=False)

The data is stored on a log scale: `log_2(data + 1)`. Bring it back to its original scale by applying the inverse, `2^x - 1`.

In [None]:
# Invert the log2(x + 1) scaling: x = 2**y - 1.
# The slice selects columns 3 .. second-to-last by position
# (presumably the measurement columns -- TODO confirm against the CSV layout).
# NOTE: `df` is overwritten, so re-running this cell without re-running the
# preprocessing cell applies the inverse transform a second time.
scaled_values = df.iloc[:, 3:-1]
df.iloc[:, 3:-1] = (2 ** scaled_values) - 1
display(df.head())
df.to_csv(data_dir + "data_unscaled.csv", index=False)

# Exploration

In [None]:
# Apply seaborn's "ticks" style to every figure created after this cell.
sns.set_style(style="ticks")

In [None]:
# Number of samples per tissue, most frequent tissue first.
plot = sns.countplot(
    data=df,
    x="tissue",
    order=df.tissue.value_counts().index,
    color="skyblue",
)
# Write the count above each bar.
plot.bar_label(plot.containers[0])
# Restyle the existing tick labels in place. Re-setting them through
# set_xticklabels() without fixing the tick locations first makes recent
# matplotlib warn about using tick labels without a fixed locator.
for tick_label in plot.get_xticklabels():
    tick_label.set_rotation(40)
    tick_label.set_horizontalalignment("right")
plot.get_figure().set_size_inches(18, 6)
sns.despine()

plot.get_figure().savefig("../misc/countplot.svg", format="svg", bbox_inches="tight")

In [None]:
# Samples per tissue, with each bar split (stacked) by supergroup.
plot = sns.displot(
    data=df,
    x="tissue",
    hue="group",
    multiple="stack",
    aspect=3,
)
# Tilt the tissue names so that they do not overlap.
plot.set_xticklabels(rotation=40, ha="right")
sns.despine()
plot.savefig("../misc/countplot2.svg", format="svg", bbox_inches="tight")