# Description

TODO: this notebook analyzes all clusters from the selected partitions

# Modules loading

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
# import re
import shutil
import subprocess
from pathlib import Path
from concurrent.futures import ProcessPoolExecutor, as_completed

# import numpy as np
import pandas as pd
import papermill as pm
# import matplotlib.pyplot as plt
# import seaborn as sns
# from IPython.display import HTML

# from clustering.methods import ClusterInterpreter
# from data.recount2 import LVAnalysis
# from data.cache import read_data
# from utils import generate_result_set_name
import conf

# Settings

In [None]:
# select which partitions' clusters will be analyzed
# NOTE(review): none of the cells visible in this notebook read PARTITION_Ks;
# the partitions actually processed come from `selected_partition_ks`, which is
# derived from the "selected" column of the loaded table. Confirm whether this
# setting is still needed or should drive that selection.
PARTITION_Ks = [45, 41, 38, 28]

# Load best partitions

In [None]:
# folder with the consensus clustering results; the partitions file loaded
# below is read from here
CONSENSUS_CLUSTERING_DIR = (
    Path(conf.RESULTS["CLUSTERING_DIR"]) / "consensus_clustering"
).resolve()

display(CONSENSUS_CLUSTERING_DIR)

In [None]:
# location of the pickled partitions table that is loaded in the next cell
input_file = (CONSENSUS_CLUSTERING_DIR / "best_partitions_by_k.pkl").resolve()
display(input_file)

In [None]:
# NOTE: unpickling can execute arbitrary code; only load files that were
# produced by this project's own pipeline.
best_partitions = pd.read_pickle(input_file)

In [None]:
# each index value must identify exactly one row: rows are looked up later
# with `.loc[part_k, "partition"]`, which only yields a single value when the
# index has no duplicates
assert best_partitions.index.is_unique

In [None]:
# size of the loaded table
best_partitions.shape

In [None]:
# preview of the first rows of the loaded table
best_partitions.head()

# Select top k partitions

In [None]:
# keep only the rows flagged in the "selected" column and, among those, take
# the 4 largest index values (the index is used below as the partition's k)
selected_partition_ks = (
    best_partitions.loc[best_partitions["selected"]]
    .index.sort_values(ascending=False)[:4]
)
display(selected_partition_ks)

# Run interpretation

In [None]:
# root folder under which the executed per-cluster notebooks are written
CLUSTER_ANALYSIS_OUTPUT_DIR = (
    Path(conf.RESULTS["CLUSTERING_INTERPRETATION_OUTPUT_DIR"]) / "cluster_analyses"
).resolve()
display(CLUSTER_ANALYSIS_OUTPUT_DIR)

In [None]:
# create the folder if it does not exist yet; only the last path component is
# created (no `parents=True`), so its parent directory must already exist
CLUSTER_ANALYSIS_OUTPUT_DIR.mkdir(exist_ok=True)

In [None]:
def run_notebook(input_nb, output_nb, parameters):
    """
    Execute a notebook with papermill and save the executed copy.

    Args:
        input_nb: path of the notebook to execute.
        output_nb: path where the executed notebook is written.
        parameters: dictionary with the parameters handed to papermill for
            this run.
    """
    papermill_options = dict(parameters=parameters, progress_bar=False)
    pm.execute_notebook(input_nb, output_nb, **papermill_options)

    # Disabled post-processing step, kept for reference: convert the executed
    # notebook to HTML and then delete the .ipynb file.
    #     subprocess.run(
    #         ["jupyter", "nbconvert", output_nb, "--to=html"],
    #         check=True,
    #     )
    #     output_nb.unlink()

In [None]:
# The notebook executed for every cluster does not depend on the partition or
# the cluster, so its path is built once instead of on every inner iteration.
input_nb = Path(
    conf.RESULTS["CLUSTERING_INTERPRETATION_OUTPUT_DIR"],
    "interpret_cluster.out.ipynb",
).resolve()

# Fail before any previous results are deleted if the notebook to run is
# missing; otherwise the first output folder would be wiped and only then
# would the execution fail.
if not input_nb.is_file():
    raise FileNotFoundError(f"Notebook to execute not found: {input_nb}")

for part_k in selected_partition_ks:
    print(f"Partition k:{part_k}", flush=True)

    # start from an empty output folder for this partition
    output_folder = Path(CLUSTER_ANALYSIS_OUTPUT_DIR, f"part{part_k}").resolve()
    shutil.rmtree(output_folder, ignore_errors=True)
    output_folder.mkdir()

    part = best_partitions.loc[part_k, "partition"]
    # value_counts() sorts by frequency in descending order, so the first
    # index entry is the most frequent cluster label
    part_clusters = pd.Series(part).value_counts()

    # always skip the biggest cluster in each partition
    for c_size_idx, c in enumerate(part_clusters.index[1:]):
        print(f"  Cluster: {c}", flush=True)

        # the zero-padded prefix is the cluster's position in the size ranking
        # (after dropping the biggest one)
        output_nb = Path(
            output_folder, f"{c_size_idx:02}-part{part_k}_k{c}.ipynb"
        ).resolve()
        parameters = dict(PARTITION_K=part_k, PARTITION_CLUSTER_ID=c)

        run_notebook(input_nb, output_nb, parameters)