### Set parameters and names

In [None]:
import os

from CustomObjects import *
%matplotlib inline
# plot images in retina
%config InlineBackend.figure_format = 'retina'
# Run configuration. The four values below are combined into ID, which is
# used to build the input and output paths of this notebook.
INPUT_GENES    = 'ALL'
INPUT_FEATURES = 'X_FC'
INPUT_NORM     = 'z'   # presumably z-score normalisation -- TODO confirm
CODINGS_SIZE = 6

ID     = f'{CODINGS_SIZE}D_{INPUT_GENES}_{INPUT_FEATURES}_{INPUT_NORM}'

# Output locations for figures and derived data of this analysis.
DIR_FIG  = f'../figures/{ID}_analysis/'
DIR_DATA = f'../data/{ID}_analysis/'
# Create both directories (including parents); a no-op when they already
# exist. os.makedirs is used instead of a shell `mkdir -p` so the cell does
# not depend on a POSIX shell and is safe for paths containing spaces.
os.makedirs(DIR_FIG, exist_ok=True)
os.makedirs(DIR_DATA, exist_ok=True)
# set colors


## Import original (No Z-score norm.)

In [None]:
# Load the two feature matrices (not z-scored, per the section heading),
# each indexed by the GENE column.
ALL_X = pd.read_csv('../data/matrices/ALL/ALL_X.csv').set_index('GENE')
#ALL_X = calculate_mean_features(ALL_X, PREFIXES)
ALL_FC = pd.read_csv('../data/matrices/ALL/ALL_FC.csv').set_index('GENE')
# Both matrices must list the same genes in the same order before they are
# placed side by side. Index.equals returns False on a length mismatch,
# whereas an element-wise `==` raises ValueError before the assertion can
# report the problem.
assert ALL_X.index.equals(ALL_FC.index), 'ALL_X and ALL_FC have different GENE indexes'
ORIGINAL = pd.concat([ALL_X,ALL_FC],axis=1)

## Import original and reconstruction for all methods (AE, VAE, PCA, ...) (Z-score)

In [None]:
# Read the feature table that belongs to this run and key it by gene.
DF = pd.read_csv(f'../data/AE_{ID}/DF.csv')
DF = DF.set_index('GENE')
# Stop here if any column contains a missing value.
assert not DF.isna().any().any()
# Names of the numeric columns; used below to select the feature matrix.
FEATURE_NAMES = DF.select_dtypes(include=['number']).columns

In [None]:
# Methods whose results are clustered and compared in the cells below.
METHODS = ['VAE', 'AE', 'PCA', 'UMAP']
# Per-gene table read from the analysis data directory, keyed by gene.
CODE = pd.read_csv(f'{DIR_DATA}CODE.csv')
CODE = CODE.set_index('GENE')
CODE

# Clustering

### GMM hyperparameter optimization

In [None]:
# Slow cell: about 1 h on the original features and 30 min for all other methods.
DF_DICT = GMM_grid_search(
    CODE,
    DF[FEATURE_NAMES],
    METHODS=METHODS,
    max_k=100,
)

In [None]:
#for METHOD in METHODS:
    #plot_scores(DF_DICT[METHOD],METHOD)

-----






----

### Train with best k and cov

# k=40

In [None]:
# Per-method setting handed to GMM_train as best_cov_dict
# (presumably the GMM covariance type -- TODO confirm against GMM_train).
best_cov_dict = dict(
    VAE='spherical',
    UMAP='full',
    PCA='full',
    AE='full',
    original='full',
)
# Every method uses the same k in this section (k = 40).
best_k_dict = {method: 40 for method in best_cov_dict}
#METHODS.append('original')
GMM_train(
    CODE,
    METHODS=METHODS,
    ORIGINAL_DF=DF[FEATURE_NAMES],
    best_k_dict=best_k_dict,
    best_cov_dict=best_cov_dict,
)

In [None]:
# Call umap_2d_clusters once per method, writing into a per-method figure folder.
for METHOD in METHODS:
    DIR_FIG_METHOD = f'{DIR_FIG}{METHOD}/'
    # Portable replacement for the shell `mkdir -p`; a no-op if the folder exists.
    os.makedirs(DIR_FIG_METHOD, exist_ok=True)
    umap_2d_clusters(CODE, METHOD, best_k_dict=best_k_dict, DIR=DIR_FIG_METHOD, CMAP='gist_ncar', SIZE=10)


In [None]:
# One cluster plot per method, saved as a PNG in that method's figure folder.
for METHOD in METHODS:
    DIR_FIG_METHOD = f'{DIR_FIG}{METHOD}/'
    # The label column name is also used as the plot title.
    cluster_col = f'GMM_{METHOD}_{best_k_dict[METHOD]}'
    plot_boxplots_clusters(
        CODE,
        cluster_col,
        PREFIXES,
        HM_COL_DICT=HM_COL_DICT,
        X_LINE=0,
        TITLE=cluster_col,
    )
    plt.savefig(
        f'{DIR_FIG_METHOD}GMM_violins_{METHOD}_{best_k_dict[METHOD]}_k.png',
        format="png",
        bbox_inches="tight",
        dpi=250,
    )
    
    

#### Save clusters in a dictionary

In [None]:
import pickle

# Export the gene membership of every cluster, once per method:
#   <DIR_DATA>GMM_<METHOD>_<k>/cluster_<label>.list    one gene per line
#   <DIR_DATA>GMM_<METHOD>_<k>/gene_clusters_dict.pkl  {label: {'gene_list', 'len'}}
# CODE is never re-indexed in this cell, so an error part-way through cannot
# leave it without its GENE index (re-running would then add a stray
# 'index' column to CODE).
for METHOD in METHODS:
    LABELS_COL=f'GMM_{METHOD}_{best_k_dict[METHOD]}'
    DIR= f'{DIR_DATA}{LABELS_COL}/'
    # Portable replacement for the shell `mkdir -p`; a no-op if the folder exists.
    os.makedirs(DIR, exist_ok=True)
    # GENE is the index of CODE; expose it as a column on a temporary frame only.
    gene_clusters = CODE.reset_index().groupby(LABELS_COL)['GENE'].agg(list)
    GENE_CLUSTERS = {}
    for label, gene_list in gene_clusters.items():
        GENE_CLUSTERS[label] = {'gene_list': gene_list, 'len': len(gene_list)}
        pd.Series(gene_list).to_csv(f'{DIR}cluster_{label}.list',index=False,header=False)
    with open(f'{DIR}gene_clusters_dict.pkl', 'wb') as f:
        pickle.dump(GENE_CLUSTERS, f)
# Every gene in CODE, one per line, written once to background.list.
pd.Series(CODE.index).to_csv(f'{DIR_DATA}background.list',index=False,header=False)

CODE

# k = 80

In [None]:
# Display the list of methods that the k = 80 cells below iterate over.
METHODS


In [None]:
# Per-method setting handed to GMM_train as best_cov_dict
# (presumably the GMM covariance type -- TODO confirm against GMM_train).
best_cov_dict = dict(
    VAE='spherical',
    UMAP='full',
    PCA='full',
    AE='full',
    original='full',
)

# Every method uses the same k in this section (k = 80).
best_k_dict = {method: 80 for method in best_cov_dict}

GMM_train(
    CODE,
    METHODS=METHODS,
    ORIGINAL_DF=DF[FEATURE_NAMES],
    best_k_dict=best_k_dict,
    best_cov_dict=best_cov_dict,
)

In [None]:
# Call umap_2d_clusters once per method, writing into a per-method figure folder.
for METHOD in METHODS:
    DIR_FIG_METHOD = f'{DIR_FIG}{METHOD}/'
    # Make sure the per-method folder exists so this section can be run on
    # its own, without relying on the k = 40 cells having created it.
    os.makedirs(DIR_FIG_METHOD, exist_ok=True)
    umap_2d_clusters(CODE, METHOD, best_k_dict=best_k_dict, DIR=DIR_FIG_METHOD, CMAP='gist_ncar', SIZE=10)


In [None]:
# One cluster plot per method, saved as a PNG in that method's figure folder.
for METHOD in METHODS:
    DIR_FIG_METHOD = f'{DIR_FIG}{METHOD}/'
    # The label column name is also used as the plot title.
    cluster_col = f'GMM_{METHOD}_{best_k_dict[METHOD]}'
    plot_boxplots_clusters(
        CODE,
        cluster_col,
        PREFIXES,
        HM_COL_DICT=HM_COL_DICT,
        X_LINE=0,
        TITLE=cluster_col,
    )
    plt.savefig(
        f'{DIR_FIG_METHOD}GMM_violins_{METHOD}_{best_k_dict[METHOD]}_k.png',
        format="png",
        bbox_inches="tight",
        dpi=250,
    )
    

#### Save clusters in a dictionary

In [None]:
import pickle

# Export the gene membership of every cluster, once per method:
#   <DIR_DATA>GMM_<METHOD>_<k>/cluster_<label>.list    one gene per line
#   <DIR_DATA>GMM_<METHOD>_<k>/gene_clusters_dict.pkl  {label: {'gene_list', 'len'}}
# CODE is never re-indexed in this cell, so an error part-way through cannot
# leave it without its GENE index (re-running would then add a stray
# 'index' column to CODE).
CLUST='GMM'
for METHOD in METHODS:

    LABELS_COL=f'GMM_{METHOD}_{best_k_dict[METHOD]}'
    DIR= f'{DIR_DATA}{LABELS_COL}/'
    # Portable replacement for the shell `mkdir -p`; a no-op if the folder exists.
    os.makedirs(DIR, exist_ok=True)
    # GENE is the index of CODE; expose it as a column on a temporary frame only.
    gene_clusters = CODE.reset_index().groupby(LABELS_COL)['GENE'].agg(list)
    GENE_CLUSTERS = {}
    for label, gene_list in gene_clusters.items():
        GENE_CLUSTERS[label] = {'gene_list': gene_list, 'len': len(gene_list)}
        pd.Series(gene_list).to_csv(f'{DIR}cluster_{label}.list',index=False,header=False)
    with open(f'{DIR}gene_clusters_dict.pkl', 'wb') as f:
        pickle.dump(GENE_CLUSTERS, f)
# Every gene in CODE, one per line, written once to background.list.
pd.Series(CODE.index).to_csv(f'{DIR_DATA}background.list',index=False,header=False)
CODE