In [None]:
!pip install tqdm

In [None]:
# lab_black: formats code cells with Black when they are executed.
%load_ext lab_black
# autoreload mode 2: reload imported modules before running each cell, so edits
# to the local helper modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
import io_utils
import pygad
import os
from tqdm import tqdm
import numpy as np
import sys

In [None]:
# Relative path of the directory that contains the input data.
data_dir = "../data/"

In [None]:
%%time

df = pd.read_csv(data_dir + "data_unscaled.csv")
df
display(df.head())

df_genes = df.iloc[:, 3:-1]
display(df_genes.head())

In [None]:
# Shift every gene column (same 3:-1 slice as df_genes) by machine epsilon.
# NOTE(review): presumably this keeps log-transformed scores finite for zero
# values — confirm. The shift is applied again each time this cell is re-run.
eps = sys.float_info.epsilon
df.iloc[:, 3:-1] = df.iloc[:, 3:-1].add(eps)
df

In [None]:
from scoring import Scoring
from plotting import Plotting

# Shared helper instances built on the loaded frame:
#   S - scores a selection of genes (S.score is used by the GA fitness function)
#   P - plots a selection of genes (P.boxplot is used when saving GA results)
S = Scoring(df)
P = Plotting(df, S)

## Genetic algorithm (GA)

In [None]:
# Gene names as a NumPy array, so an integer-encoded GA solution can be turned
# into gene names by fancy indexing (GENES[solution]).
GENES = np.array(df_genes.columns)

In [None]:
def _fitness_func(solution: [int], idx, key: str, log: bool):
    """Score a single GA solution.

    ``solution`` holds positions into ``GENES``; the gene names at those
    positions are handed to ``S.score`` together with ``log`` and ``key``.
    ``idx`` is accepted but not used.
    """
    return S.score(GENES[solution], log, key)

In [None]:
def _initial_population(num_genes: int, init_pop_weights):
    sols = []
    for _ in range(sol_per_pop):
        sel = list(init_pop_weights.sample(num_genes, weights=init_pop_weights).index)
        enc = np.isin(genes, sel).nonzero()[0]
        sols.append(enc)
    return sols


def _on_generation(instance, meth, log, run_dir, pbar):
    pbar.update(1)
    if instance.generations_completed % 100 == 0:
        _save_results(instance, meth, log, run_dir)


def _save_best_sols(instance, run_dir):
    best_sols = instance.best_solutions
    if len(best_sols) == 0:
        return
    sols = pd.DataFrame(best_sols).apply(lambda x: GENES[x])
    sols = pd.concat([sols, sols.apply(lambda x: S.score(x, log=log), axis=1)], axis=1)
    instance.best_solutions = []

    path = os.path.join(run_dir, "best_sols.csv")
    if os.path.exists(path):
        sols.to_csv(path, index=False, mode="a", header=False)
    else:
        sols.to_csv(path, index=False)


def _save_results(instance, meth: str, log: bool, run_dir: str):
    _save_best_sols(instance, run_dir)
    instance.plot_fitness().savefig(
        os.path.join(run_dir, "fit_vs_gen.svg"), format="svg"
    )

    solution, solution_fitness, solution_idx = instance.best_solution()
    path = os.path.join(run_dir, f"best_sol_{instance.generations_completed}.svg")
    P.boxplot(GENES[solution], log, path)

    np.save(os.path.join(run_dir, "last_pop"), instance.population)

In [None]:
# GA settings read by run_and_save_instance.
num_generations = 20000  # upper bound; stop_criteria below may end a run earlier
num_parents_mating = 50
# No explicit initial population is configured here (see init_pop_weights).
sol_per_pop = 500  # ignored by pygad when an explicit initial_population is supplied
gene_type = int

parent_selection_type = "sss"  # pygad: steady-state selection
keep_parents = -1  # pygad: keep all parents in the next population
# NOTE(review): crossover_type and random_mutation_min_val are defined here but
# are not passed to pygad.GA in run_and_save_instance — confirm whether intended.
crossover_type = "scattered"
random_mutation_min_val = 0
mutation_type = "adaptive"
# Adaptive mutation takes two probabilities: (low-fitness, high-fitness) solutions.
mutation_probability = (0.15, 0.08)
# Each gene value is a position into GENES.
gene_space = range(0, len(GENES))
# Stop when the best fitness has not changed for 300 consecutive generations.
stop_criteria = ["saturate_300"]
# Optional sampling weights for seeding the initial population
# (consumed by _initial_population); None leaves initialisation to pygad.
init_pop_weights = None

In [None]:
def run_and_save_instance(num_genes: int, meth: str, log: bool):
    """Run one pygad GA search and persist its results.

    A fresh run directory is created through ``io_utils.create_run_dir``, a
    ``pygad.GA`` instance is configured from the notebook-level settings and
    run with a tqdm progress bar, and the final state is written with
    ``_save_results``.

    Parameters
    ----------
    num_genes : int
        Number of genes per solution.
    meth : str
        Scoring key forwarded to the fitness function and the save helpers.
    log : bool
        Log flag forwarded to the fitness function and the save helpers.
    """

    def fitness_func(solution, solution_idx):
        return _fitness_func(solution, solution_idx, meth, log)

    run_dir = io_utils.create_run_dir(meth, num_genes)

    # ``init_pop_weights`` is sampled with pandas' ``.sample`` when set, so it is
    # a pandas object whose truth value is ambiguous: test against None instead.
    # The population is handed to the constructor because pygad builds its
    # working population there; assigning ``initial_population`` afterwards
    # does not change what is evolved.
    initial_population = (
        _initial_population(num_genes, init_pop_weights)
        if init_pop_weights is not None
        else None
    )

    with tqdm(total=num_generations) as pbar:
        instance = pygad.GA(
            num_generations=num_generations,
            num_parents_mating=num_parents_mating,
            fitness_func=fitness_func,
            initial_population=initial_population,
            sol_per_pop=sol_per_pop,
            num_genes=num_genes,
            parent_selection_type=parent_selection_type,
            keep_parents=keep_parents,
            mutation_type=mutation_type,
            mutation_probability=mutation_probability,
            gene_space=gene_space,
            gene_type=gene_type,
            allow_duplicate_genes=False,
            on_generation=lambda x: _on_generation(x, meth, log, run_dir, pbar),
            save_best_solutions=True,
            stop_criteria=stop_criteria,
            suppress_warnings=True,
        )
        instance.run()

    _save_results(instance, meth, log, run_dir)

In [None]:
# Launch the GA runs: for every (scoring key, log flag) pair and every solution
# size from 14 to 20 genes, run the search five times. The repetition index
# `rep` is not used inside the loop.
for meth, log in [("single", True)]:
    for num_genes in [14, 15, 16, 17, 18, 19, 20]:
        for rep in range(5):
            run_and_save_instance(num_genes, meth, log)

## Machine learning (ML)

In [None]:
from ml import ML

# Features: the gene columns; target: the "disease" column.
X = df_genes
y = df["disease"]
# NOTE(review): 42 is presumably a random seed, and a new Scoring(df) is built
# here instead of reusing S — confirm both.
ml = ML(42, X, y, Scoring(df))

In [None]:
# Call ml.fit_and_save once per feature count, with the same repetition count.
n_rep = 20
for n_feat in (5, 10, 20):
    ml.fit_and_save(n_feat, n_rep)

## Experimental

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# NOTE(review): `dfs` is not defined in any visible cell, so this cell fails on a
# fresh kernel — confirm where it is supposed to come from.
# The first five values of the first row of dfs[1] are used as column names of df.
sel = dfs[1].iloc[0, :5].values
data = df[["tissue"] + list(sel)]
# Long format: one row per (tissue, variable, value).
data = data.melt(id_vars="tissue")
# data

plt.figure(dpi=120, figsize=(40, 5))

# Violin plot of the values per tissue, coloured by selected column.
# NOTE(review): `flierprops` is a box-plot option — confirm violinplot accepts it.
plot = sns.violinplot(
    data=data,
    hue="variable",
    x="tissue",
    y="value",
    linewidth=1,
    flierprops=dict(markersize=1),
    dodge=False,
    inner="point",
)



In [None]:
%%time
# One violin panel per tissue (8 panels per row), one violin per selected column.
# Relies on `data` built in the previous cell.
sns.catplot(
    y="value",
    x="variable",
    col="tissue",
    col_wrap=8,
    data=data,
    kind="violin",
    bw=.1,
    cut=0
)

In [None]:
# Same violin plot restricted to the rows whose tissue is "Nerve",
# one violin per selected column.
plot = sns.violinplot(
    data=data[data.tissue == "Nerve"],
    x="variable",
    y="value",
    linewidth=1,
    flierprops=dict(markersize=1),
    dodge=False,
    inner="point",
)

In [None]:
# For each frame in dfs (paired with sizes 5, 10, 20): evaluate its first ten
# rows with clean_and_eval, keep the result and write it to top10_size<n>.csv.
# NOTE(review): neither `dfs` nor `clean_and_eval` is defined in any visible
# cell — confirm where they come from.
best_sols = []
for d, n in zip(dfs, [5, 10, 20]):
    s = clean_and_eval([d.head(10)], n)
    best_sols.append(s)
    s.to_csv(f"top10_size{n}.csv")
    # Bare expression inside a loop: evaluated but not displayed.
    s.head(5)["single"]

In [None]:
# Display the first collected result (the one produced with n = 5).
best_sols[0]

In [None]:
import math

# First five values of the first row of the n = 5 result, used as df columns.
vs = best_sols[0].iloc[0, :5].values
# NOTE(review): `scoring` is not defined in any visible cell (the Scoring
# instance created earlier is named `S`) — confirm which object is meant.
display(scoring.score(vs))

# Per-sample sum over the selected columns. It is kept in a local Series rather
# than stored as a new df column, because an extra trailing column changes what
# df.iloc[:, 3:-1] selects when earlier cells are re-run.
panel_sum = df[vs].sum(axis=1)
n = panel_sum[df.tissue == "Nerve"].min()  # smallest sum among "Nerve" rows
h = panel_sum[df.tissue != "Nerve"].max()  # largest sum among all other rows
# Only the last expression of a cell is shown, so display the rest explicitly.
display(n, h, n / h)

# log2 of the n / h ratio.
math.log(n, 2) - math.log(h, 2)

In [None]:
# Box plot for the first five values of row 5 of the n = 5 result.
# NOTE(review): `plotting` is not defined in any visible cell (the Plotting
# instance created earlier is named `P`) — confirm which object is meant.
plotting.boxplot(best_sols[0].iloc[5, :5].values)

In [None]:
# Per-sample sum over the first five columns named in row 0 of the n = 5 result,
# paired with each sample's tissue label.
s = df[list(best_sols[0].iloc[0, :5].values)].sum(axis=1)
s = pd.concat([df["tissue"], s], axis=1)
# The next two bare expressions are evaluated but not displayed
# (only the last expression of a cell is shown).
s[s.tissue == "Nerve"].min()
s[s.tissue != "Nerve"].max()

# Ratio of the column-wise minimum over "Nerve" rows to the column-wise maximum
# over all other rows, taken from the second column (the sum).
s[s.tissue == "Nerve"].min().values[1] / s[s.tissue != "Nerve"].max().values[1]