In [1]:
# Notebook-wide extensions: `lab_black` (cell formatter extension) and
# `autoreload` in mode 2, which re-imports modules before each cell runs so
# edits to the project modules are picked up without restarting the kernel.
%load_ext lab_black
%load_ext autoreload
%autoreload 2

In [None]:
import load

# Full table as returned by the project loader.
df = load.load_df()

df
# Independent copy of the loaded table; used by the scoring/plotting helpers.
df_unscaled = df.copy()

# Gene columns: everything from the fourth column up to, but excluding, the last one.
df_genes = df.iloc[:, 3:-1]
genes = df_genes.columns


In [None]:
import seaborn as sns

sns.set_style("ticks")

# Order the bars by descending number of rows per tissue.
tissue_order = df_unscaled.tissue.value_counts().index
plot = sns.countplot(data=df_unscaled, x="tissue", order=tissue_order, color="skyblue")

# Write the count on top of every bar and tilt the tissue names so they stay readable.
_ = plot.bar_label(plot.containers[0])
_ = plot.set_xticklabels(plot.get_xticklabels(), rotation=40, ha="right")

fig = plot.get_figure()
fig.set_size_inches(18, 6)
sns.despine()

fig.savefig("../misc/countplot.svg", format="svg", bbox_inches="tight")

In [None]:
from scoring import Scoring, linkage_map
from plotting import Plotting

# Scorer and plotter shared by the rest of the notebook; both are built on
# `df_unscaled`, the copy of `df` taken in the loading cell.
su = Scoring(df_unscaled)
plotting = Plotting(df_unscaled, su)

## GA

In [None]:
from ga import GA

# Search driver from the project's `ga` module, wired to the gene columns, the
# shared scorer and the shared plotter.
# NOTE(review): presumably a genetic algorithm — confirm against ga.py.
ga = GA(genes, su, plotting)

In [None]:
# Optional override, currently disabled:
# ga.num_generations = 20

# Five repeated runs for every (method, log) pair and every solution size.
# Only ("single", True) with 5 genes is enabled at the moment.
# NOTE(review): each run is presumably persisted by `run_and_save_instance` —
# confirm in ga.py.
for meth, log in [("single", True)]:
    for num_genes in [5]:
        for rep in range(5):
            ga.run_and_save_instance(num_genes, meth, log)

## ML

In [None]:
from ml import ML

# Feature matrix: the gene columns selected in the loading cell.
X = df_genes
# Target: the "disease" column of the full table.
y = df["disease"]
# NOTE(review): 42 is presumably the random seed — confirm against ml.ML.
# A separate Scoring instance is built on `df` here instead of reusing `su`.
ml = ML(42, X, y, Scoring(df))

In [None]:
# Same repetition count for every feature-set size.
n_rep = 20
feature_counts = [5, 10, 20]
for n_feat in feature_counts:
    ml.fit_and_save(n_feat, n_rep)

## Collect results

In [None]:
import load

# Collector object from the project's `load` module, bound to the shared scorer.
results = load.Results(su)

# Earlier collection runs, kept for reference only.
# (A stray bare `_` used to sit here; it was the last statement of the cell and
# therefore echoed whatever `_` happened to hold from a previous cell.)
# %%time
# for n in [5, 10, 20]:
#   sols = results.collect_and_save(n, linkage_map.keys())
# for n in [5, 10, 20]:
#   sols = results.collect_and_save(n, ["rf"])

In [None]:
%%time

from scoring import linkage_map_log
for n in [2,3,4,5,6,7,8,9,10,20]:
    sols = results.collect_and_save(n, linkage_map_log.keys())

In [None]:
%%time

# Collects size 5 only; size 5 is also part of the full sweep over all sizes
# earlier in this section, so this cell repeats that piece of work.
from scoring import linkage_map_log
for n in [5]:
    sols = results.collect_and_save(n, linkage_map_log.keys())

In [None]:
# Show whatever the most recent `collect_and_save` call returned.
sols

In [None]:
%%time

# Collects size 10 only (also covered by the full sweep over all sizes earlier
# in this section) and displays the returned solutions.
from scoring import linkage_map_log
for n in [10]:
    sols = results.collect_and_save(n, linkage_map_log.keys())
sols

## Load collected results

In [None]:
# Load the saved solutions for every size.
# NOTE(review): later cells index `dfs` positionally, so this presumably returns
# one frame per requested size, in the order given — confirm in load.py.
dfs = load.load_solutions([2, 3, 4, 5, 6, 7, 8, 9, 10, 20])
dfs[0].head()

In [None]:
# NOTE(review): this rebinds `dfs` with a request for sizes 5 and 10 only. Cells
# further down that index `dfs[idx]` for idx up to 9 need the ten-size load
# instead, so the result depends on which of the two load cells ran last.
dfs = load.load_solutions([5, 10])
a = dfs[0]  # .sort_values("quantile", ascending=False)
a

In [None]:
# Box plot for the first five columns of the first row of `a`, with log=True.
plotting.boxplot(a.iloc[0, :5], log=True)

In [None]:
# Distinct values of the `approach` column in the first loaded frame.
dfs[0].approach.unique()

In [None]:
import pandas as pd


def x_best(idx, n):
    """Return the best-scoring solutions of ``dfs[idx]`` together with their scores.

    Every row of ``dfs[idx]`` is treated as one solution whose members are held
    in the first ``n`` columns.

    Parameters
    ----------
    idx : int
        Position of the solutions frame inside the notebook-level ``dfs`` list.
    n : int
        Number of leading columns that make up a solution.

    Returns
    -------
    pandas.DataFrame
        Up to ten rows, ordered by descending
        ``su.score(row, "single_log", True)``. Columns are the ``n`` solution
        columns, followed by the outputs of ``su.score(row)`` and
        ``su.score(row, log=True)``, followed by ``run_id``.
    """
    solutions = dfs[idx]
    candidates = solutions.iloc[:, :n]

    # Rank all candidates once and keep the index labels of the ten best.
    ranking = candidates.apply(lambda x: su.score(x, "single_log", True), axis=1)
    sel = solutions.loc[ranking.nlargest(10).index].iloc[:, :n]

    sel = pd.concat(
        [
            sel,
            sel.apply(lambda x: su.score(x), axis=1),
            sel.apply(lambda x: su.score(x, log=True), axis=1),
        ],
        axis=1,
    )
    # Take run_id from the frame the rows came from. This used to join against
    # dfs[0] unconditionally, which attached run ids of unrelated rows (or NaN)
    # whenever idx != 0.
    return sel.join(solutions.run_id)


def best(idx, n):
    """Return the first ``n`` entries of the top row produced by ``x_best(idx, n)``."""
    return x_best(idx, n).iloc[0, :n]


def best_score(idx, n):
    """Return the ``single_log`` value of the top row produced by ``x_best(idx, n)``."""
    top_row = x_best(idx, n).iloc[0]
    return top_row["single_log"]

In [None]:
# `single_log` value of the top-ranked row of dfs[1], using its first 2 columns.
best_score(1, 2)

In [None]:
# Box plot for the top-ranked row of dfs[8], using its first 9 columns.
plotting.boxplot(best(8, 9))

In [None]:
# Best `single_log` value for dfs[0]..dfs[9], reading idx + 1 columns from dfs[idx].
# NOTE(review): this needs `dfs` to hold at least ten frames, and it assumes
# dfs[idx] contains solutions of size idx + 1. Neither `load_solutions` call in
# this notebook requests sizes 1..10 — confirm which load this cell expects.
scores = [best_score(idx, idx + 1) for idx in range(10)]
scores

In [None]:
import matplotlib.pyplot as plt

# Line plot of the collected scores against solution sizes 1..10.
sizes = range(1, 11)
fig, ax = plt.subplots()
ax.set_xticks(sizes)
ax.grid(True)
ax.set_xlabel("Solution size")
ax.plot(sizes, scores)

In [None]:
import pandas as pd

s = Scoring(df_unscaled, "neuroblastoma")


def _with_scores(selection):
    """Return `selection` with the outputs of `s.score(row)` and `s.score(row, log=True)` appended as columns."""
    return pd.concat(
        [
            selection,
            selection.apply(lambda row: s.score(row), axis=1),
            selection.apply(lambda row: s.score(row, log=True), axis=1),
        ],
        axis=1,
    )


# NOTE(review): `sel1` is not assigned anywhere in the visible notebook, and the
# only visible `sel2` is a plain list assigned further down; both must already
# exist as DataFrames in the kernel for this cell to run.
_with_scores(sel1).join(dfs[0].run_id)
_with_scores(sel2)

In [None]:
# Box plot for the first row of `sel1`.
# NOTE(review): `sel1` is not assigned anywhere in the visible notebook.
plotting.boxplot(sel1.iloc[0])

In [None]:
# Rows labelled "Nerve - Tibial", restricted to the tissue label plus the
# entries of the best solution from dfs[4] (first 5 columns), in long format.
nerve_mask = df_unscaled.tissue == "Nerve - Tibial"
gene_cols = ["tissue"] + list(best(4, 5))
d = df_unscaled.loc[nerve_mask, gene_cols].melt(id_vars="tissue")
d

In [None]:
# Rows of the long-format table `d` whose `variable` is "KCNQ2".
d[d.variable == "KCNQ2"]

In [None]:
import seaborn as sns

# One box per `variable` of the long-format table `d`, on a tall 5x7 inch figure.
fig, ax = plt.subplots(figsize=(5, 7))
sns.boxplot(data=d, x="variable", y="value", ax=ax)

In [None]:
# Hand-written selections (lists of column names).
# NOTE(review): an earlier cell calls `sel2.apply(...)`, which a plain list does
# not support, so that cell expects a different `sel2` than the one bound here.
sel = ["CHRNA7", "CLN3", "DCC", "SLC24A1", "VEZT"]
sel2 = ["CD99", "KCNQ2", "SMPD4", "VEZT"]

In [None]:
# The statement was cut off mid-identifier in the saved notebook (`Scoring(df_un`),
# which is a syntax error. It is completed with `df_unscaled`, the only name in
# this notebook that starts with `df_un`.
# NOTE(review): any further constructor arguments that were lost cannot be recovered.
s = Scoring(df_unscaled)

In [None]:
# First five columns of the first row of dfs[0], as an array.
sel = dfs[0].iloc[0, :5].values
# Bare expression in the middle of the cell: the score is computed but only
# shown if the kernel is configured to echo every expression.
Scoring(df_unscaled).score(sel, log=True)
plotting.boxplot(sel)

In [None]:
import numpy as np

# NOTE(review): `ld` is only ever assigned in the two commented-out lines below,
# so the `Scoring(ld)` calls further down need it to exist in the kernel already.
# ld = df_unscaled.copy()
# ld.iloc[:, 3:] = np.log2(ld.iloc[:, 3:])
sel

# The trailing comma turns the first expression into a one-element tuple.
Scoring(ld).score(sel[0:5], log=False).single,
Scoring(ld).score(sel, log=False).single
# Plotting(ld, Scoring(ld)).boxplot(sel)

In [None]:
# NOTE(review): this rebinds the notebook-wide `plotting` (originally built on
# `df_unscaled` and `su`) to one built on `df` with a fresh Scoring instance;
# every later `plotting.boxplot(...)` call uses this new object.
plotting = Plotting(df, Scoring(df))
# Box plot over every gene column.
plotting.boxplot(df_genes.columns)

## Experimental

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# First five columns of the first row of dfs[1], reshaped to long format
# (one row per sample and gene) for plotting.
sel = dfs[1].iloc[0, :5].values
data = df[["tissue"] + list(sel)]
data = data.melt(id_vars="tissue")
# data

plt.figure(dpi=120, figsize=(40, 5))

# `flierprops` is a boxplot-only option and is not a violinplot parameter, so it
# is no longer passed here; it never affected the violins.
plot = sns.violinplot(
    data=data,
    hue="variable",
    x="tissue",
    y="value",
    linewidth=1,
    dodge=False,
    inner="point",
)



In [None]:
%%time
sns.catplot(
    y="value",
    x="variable",
    col="tissue",
    col_wrap=8,
    data=data,
    kind="violin",
    bw=.1,
    cut=0
)

In [None]:
# Violins for the rows labelled "Nerve" only.
# NOTE(review): another cell filters on "Nerve - Tibial"; confirm which label the
# `tissue` column actually uses, otherwise this selection is empty.
# `flierprops` is a boxplot-only option and is not a violinplot parameter, so it
# is no longer passed here.
plot = sns.violinplot(
    data=data[data.tissue == "Nerve"],
    x="variable",
    y="value",
    linewidth=1,
    dodge=False,
    inner="point",
)

In [None]:
# For each loaded frame, evaluate its first ten rows, keep the result and write
# it to `top10_size{n}.csv`.
# NOTE(review): `clean_and_eval` is not defined or imported anywhere in the
# visible notebook. `zip` stops at the shorter input, so the hard-coded sizes
# must line up with the frames currently held in `dfs`.
best_sols = []
for d, n in zip(dfs, [5, 10, 20]):
    s = clean_and_eval([d.head(10)], n)
    best_sols.append(s)
    s.to_csv(f"top10_size{n}.csv")
    # A bare expression inside a loop is never rendered, so show it explicitly
    # (`display` is provided by IPython in notebook kernels).
    display(s.head(5)["single"])

In [None]:
# Evaluated top rows for the first entry of `best_sols`.
best_sols[0]

In [None]:
import math

# First five columns of the top row of the first `best_sols` entry.
vs = best_sols[0].iloc[0, :5].values
# The name `scoring` is never bound in this notebook (only `Scoring` and the
# linkage maps are imported from that module), so score through a Scoring
# instance built on `df`, as other cells do.
Scoring(df).score(vs)

# Per-sample sum over the selected columns. It is kept as a standalone Series
# instead of being written into `df` as a new "s" column, so the shared frame
# keeps its original column layout.
summed = df[vs].sum(axis=1)
n = summed[df.tissue == "Nerve"].min()
h = summed[df.tissue != "Nerve"].max()
n
h

# Ratio between the smallest "Nerve" sum and the largest sum elsewhere, and the
# same comparison as a difference of base-2 logarithms.
n / h
math.log(n, 2) - math.log(h, 2)

In [None]:
# Box plot for the first five columns of the sixth row of `best_sols[0]`.
plotting.boxplot(best_sols[0].iloc[5, :5].values)

In [None]:
# Per-sample sum over the top solution's first five columns, placed next to the
# tissue label.
top_genes = list(best_sols[0].iloc[0, :5].values)
s = pd.concat([df["tissue"], df[top_genes].sum(axis=1)], axis=1)

# Column-wise minimum over the "Nerve" rows and maximum over all other rows.
nerve_min = s[s.tissue == "Nerve"].min()
other_max = s[s.tissue != "Nerve"].max()
nerve_min
other_max

# Position 1 holds the summed column; position 0 is the tissue label.
nerve_min.values[1] / other_max.values[1]