In [1]:
import pandas as pd
import numpy as np
import pickle

from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.impute import KNNImputer

import ugtm
from ugtm import eGTM
import altair as alt

from outlier import set_bounds, bounds_iqr

### Put together dataframe to generate dataset.

In [2]:
# --- Run configuration: these pieces select which processed CSV is loaded. ---
dataset_date = '21032023'
lc_date = '25032023'
outlier = 'F'
label_scheme = 'labels_1'  # column holding the class label used as the target

pd.set_option('display.max_columns', None)

# --- Load the feature table and keep only rows labelled under label_scheme. ---
csv_path = f'../processed_data/dataset{dataset_date}_allfeatures_inc_labels_{lc_date}ol{outlier}.csv'
dataset = pd.read_csv(csv_path, low_memory=False)
dataset = dataset.dropna(subset=[label_scheme]).reset_index(drop=True)

# --- Outlier handling (limits chosen in the unsupervised-analysis notebook). ---
# Fixed [lower, upper] limits for the period-fit columns of both bands.
bounds = {f'Period_fit_{band}': [0, 1] for band in ('g', 'r')}
dataset = set_bounds(dataset, bounds)

# IQR-based limits with k=3 for the per-band CAR/Eta features, colours and distance.
iqr_cols = [
    f'{feature}_{band}'
    for band in ('g', 'r')
    for feature in ('CAR_mean', 'CAR_sigma', 'Eta_e')
] + ['clr_mean', 'clr_median', 'clr_bright', 'clr_faint', 'distance']
dataset = bounds_iqr(dataset, iqr_cols, k=3)

# Looser IQR limits (k=10) for every harmonic amplitude:
# 3 frequencies x 4 harmonics x 2 bands, g before r for each harmonic.
iqr_cols2 = [
    f'Freq{freq}_harmonics_amplitude_{harmonic}_{band}'
    for freq in (1, 2, 3)
    for harmonic in range(4)
    for band in ('g', 'r')
]
dataset = bounds_iqr(dataset, iqr_cols2, k=10)

# Drop Gaia features (per the original note these are the trailing 34 columns --
# TODO confirm if the CSV column layout changes).
dataset = dataset.iloc[:, :-34]

# --- Candidate sample selections. ---
# At least 20 observations in each of the g and r bands.
enough_points = (dataset['n_obs_g'] >= 20) & (dataset['n_obs_r'] >= 20)
dataset_pts_threshold = dataset[enough_points].reset_index(drop=True)
# g-band temporal baseline of at least 365 (computed but not used below).
long_baseline = dataset['temporal_baseline_g'] >= 365
dataset_timespan_threshold = dataset[long_baseline].reset_index(drop=True)

# Dataset going forward
dataset_final = dataset_pts_threshold

print(dataset_final[label_scheme].value_counts())

# Features and labels: everything from column 14 onwards is treated as a feature
# (the first 14 columns are presumably identifiers/labels -- verify against the CSV).
X = dataset_final.iloc[:, 14:]
X_cols = list(X.columns)
y_names = dataset_final[label_scheme]

# pd.options.display.max_rows = 10
# print(X.isnull().sum())

dataset_final.shape

labels_1
dwarf_nova_SU_UMa    325
dwarf_nova_Z_Cam     152
nova_like            120
nova_like_VY_Scl     109
dwarf_nova_U_Gem     107
polar                101
int_polar             42
dwarf_nova_WZ_Sge     25
AMCVn                 22
nova                  14
Name: count, dtype: int64


(1017, 236)

In [3]:
# Encode the string class names as integer codes (enc keeps the mapping).
enc = LabelEncoder()
enc.fit(y_names)
y = enc.transform(y_names)

# Stratified 70/30 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    stratify=y,
    random_state=1,
)

# Train/test branch: standardise, then KNN-impute missing values.
# Both transformers are fitted on the training split only and re-used on the test split.
scaler = StandardScaler()
imputer = KNNImputer(n_neighbors=5)
X_train_proc = imputer.fit_transform(scaler.fit_transform(X_train))
X_test_proc = imputer.transform(scaler.transform(X_test))

# Full-data branch: a separate scaler/imputer pair fitted on all of X.
scaler2 = StandardScaler()
imputer2 = KNNImputer(n_neighbors=5)
X_proc = imputer2.fit_transform(scaler2.fit_transform(X))

# Optional step, currently disabled: rescale the imputed arrays with MinMaxScaler
# (scaler3 fitted on X_train_proc and applied to X_test_proc; scaler4 fitted on X_proc).


### eGTM: GTM transformer

eGTM is a sklearn-compatible GTM transformer. Similarly to PCA or t-SNE, eGTM reduces the dimensionality from n_dimensions to 2 dimensions. To generate mean GTM 2D projections:

In [4]:
from ugtm import eGTM

# GTM size parameter handed to eGTM (the same value is used for every model below).
k = 16

# Model 1: fitted on the training split, then used to project both splits to 2D.
model1 = eGTM(k=k)
model1.fit(X_train_proc)
X_train_gtm, X_test_gtm = (
    model1.transform(split) for split in (X_train_proc, X_test_proc)
)

# Model 2: fitted on, and used to project, the complete data set.
model2 = eGTM(k=k)
X_gtm = model2.fit(X_proc).transform(X_proc)


In [5]:
size = 30


def _gtm_projection_chart(frame, title, tooltip, selection, marker_size):
    """Scatter chart of a GTM 2D projection, coloured by class label.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain the columns 'GTM1', 'GTM2' and 'label', plus every
        column named in ``tooltip``.
    title : str
        Chart title.
    tooltip : list of str
        Columns shown when hovering over a point.
    selection : Altair selection parameter
        Legend-bound selection; selected classes are drawn at opacity 0.8,
        the others at 0.1.
    marker_size : int
        Constant marker size.
    """
    return (
        alt.Chart(frame)
        .mark_square()
        .encode(
            x='GTM1',
            y='GTM2',
            color='label',
            size=alt.value(marker_size),
            tooltip=tooltip,
            opacity=alt.condition(selection, alt.value(0.8), alt.value(0.1)),
        )
        .add_params(selection)
        .properties(title=title)
        .interactive()
    )


# Projection of the full data set, with identifying columns for the tooltips.
df_gtm_X = pd.DataFrame(X_gtm, columns=['GTM1', 'GTM2'])
df_gtm_X = pd.concat([df_gtm_X,
                      pd.Series(dataset_final.index, name='index'),
                      dataset_final['oid_ztf'],
                      dataset_final['type_aavso'],
                      pd.Series(y_names, name='label')
                      ], axis=1)

# Projections of the train and test splits, labelled with the decoded class names.
df_gtm_train = pd.DataFrame(X_train_gtm, columns=['GTM1', 'GTM2'])
df_gtm_train['label'] = enc.inverse_transform(y_train)

df_gtm_test = pd.DataFrame(X_test_gtm, columns=['GTM1', 'GTM2'])
df_gtm_test['label'] = enc.inverse_transform(y_test)

# Clicking a legend entry highlights that class in all three charts.
# selection_point is the Altair 5 replacement for the deprecated selection_multi
# and matches the add_params API used on the charts.
selection = alt.selection_point(fields=['label'], bind='legend')

projection_full = _gtm_projection_chart(
    df_gtm_X, "GTM projection of X",
    ["label", "index", "oid_ztf", "type_aavso"], selection, size)
projection_train = _gtm_projection_chart(
    df_gtm_train, "GTM projection of X_train",
    ["label"], selection, size)
projection_test = _gtm_projection_chart(
    df_gtm_test, "GTM projection of X_test",
    ["GTM1", "GTM2", "label"], selection, size)

alt.hconcat(projection_full, projection_train, projection_test)



### eGTC: GTM classifier

eGTC is a sklearn-compatible GTM classifier. Like PCA or t-SNE, GTM reduces the dimensionality from n_dimensions to 2 dimensions. GTC uses a GTM class map to predict labels for new data (cf. classMap()). Two algorithms are available: the Bayesian classifier GTC (uGTC) or the nearest-node classifier (uGTCnn). The following example applies eGTC to the scaled and imputed train/test split prepared above:

In [6]:
from ugtm import eGTC
from sklearn import datasets
from sklearn import preprocessing
from sklearn import decomposition
from sklearn import metrics
from sklearn import model_selection

k = 16
size = 200

# Fit the GTM classifier on the training split and predict the held-out split.
gtc = eGTC(k=k).fit(X_train_proc, y_train)
y_pred = gtc.predict(X_test_proc)

print('Classification report for X_test')
print(metrics.classification_report(y_test,y_pred))

# Second classifier, fitted on and applied to the complete data set.
gtc2 = eGTC(k=k).fit(X_proc, y)
y_pred_all = gtc2.predict(X_proc)

# print('Classification report for X')
# print(metrics.classification_report(y,y_pred_all))

# Sorted class names; the integer node labels are used as positions into this array.
class_names = np.unique(enc.inverse_transform(y))


def _class_map(classifier, title, node_size):
    """Build the class-map table and chart for a fitted eGTC classifier.

    Returns a ``(frame, chart)`` pair. ``frame`` has one row per entry of
    ``classifier.optimizedModel.matX`` with its coordinates (x1, x2), the
    class name looked up from ``classifier.node_label`` and the row-wise
    maximum of ``classifier.node_probabilities``.
    """
    frame = pd.DataFrame(classifier.optimizedModel.matX, columns=["x1", "x2"])
    frame["predicted_node_label"] = class_names[classifier.node_label]
    frame["probability_of_predominant_class"] = np.max(classifier.node_probabilities, axis=1)

    encoded = alt.Chart(frame).mark_square().encode(
        x='x1',
        y='x2',
        color='predicted_node_label:N',
        size=alt.value(node_size),
        # opacity='probability_of_predominant_class:Q',
        tooltip=['x1', 'x2', 'predicted_node_label:N', 'probability_of_predominant_class:Q'],
    )
    return frame, encoded.properties(title=title, width=300, height=300).interactive()


# Classification map X_train
dfclassmap, chart4 = _class_map(gtc, "Class map X_train", size)

# Classification map X
dfclassmap2, chart5 = _class_map(gtc2, "Class map X", size)

alt.hconcat(chart4, chart5)

Classification report for X_test
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         7
           1       0.62      0.71      0.66        98
           2       0.12      0.06      0.08        32
           3       0.20      0.14      0.17         7
           4       0.33      0.39      0.36        46
           5       0.00      0.00      0.00        13
           6       0.00      0.00      0.00         4
           7       0.38      0.50      0.43        36
           8       0.44      0.45      0.45        33
           9       0.29      0.23      0.26        30

    accuracy                           0.43       306
   macro avg       0.24      0.25      0.24       306
weighted avg       0.39      0.43      0.40       306



In [7]:
# 2D GTM coordinates of X_test from a GTM fitted on X_train.
# NOTE(review): this refits a GTM with the same k as model1 in the earlier cell;
# X_test_gtm may already hold the same projection -- confirm before reusing it.
transformed = eGTM(k=16).fit(X_train_proc).transform(X_test_proc)

# Sorted class names, indexed by the integer class codes.
class_names = np.unique(enc.inverse_transform(y))

df4 = pd.DataFrame(transformed, columns=["x1", "x2"]).assign(
    predicted_label=class_names[y_pred],
    true_label=class_names[y_test],
    # Largest posterior per test sample, taken from the fitted classifier.
    probability_of_predominant_class=np.max(gtc.posteriors, axis=1),
)

# Everything the two charts have in common.
base_chart = alt.Chart(df4).mark_circle().encode(
    x='x1',
    y='x2',
    size=alt.value(100),
    tooltip=["x1", "x2", "predicted_label:N",
             "true_label:N", "probability_of_predominant_class:Q"],
)

# Projection of X_test colored by predicted label; opacity follows the posterior.
chart6 = base_chart.encode(
    color=alt.Color("predicted_label:N", legend=alt.Legend(title="label")),
    opacity="probability_of_predominant_class:Q",
).properties(title="Predicted labels", width=300, height=300).interactive()

# Projection of X_test colored by true_label
chart7 = base_chart.encode(
    color=alt.Color("true_label:N", legend=alt.Legend(title="label")),
).properties(title="True_labels", width=300, height=300).interactive()

alt.hconcat(chart6, chart7)

### Using class probabilities as input

In [8]:
# Preliminaries
# Altair raises MaxRowsError when a chart would embed a dataset with more than
# 5000 rows, because embedding large datasets makes notebooks very large.
# If you are certain you want the data embedded, disable the check with:
# alt.data_transformers.disable_max_rows()

# An earlier variant of this cell built the latent space from classifier class
# probabilities (mod.predict_proba on the train/test splits, one column per
# entry of enc.classes_). That path is disabled; the GTM below is fitted on the
# scaled and imputed full feature matrix X_proc, labelled with y.
labels = y

gtm_model = ugtm.runGTM(X_proc, k=16)

# To project other data onto this latent space instead, use
# transformed = ugtm.transform(optimizedModel=gtm_model, train=train, test=test)
# and read transformed.matMeans in place of gtm_model.matMeans.

# Mean projection: mean position of each data point in latent space.
# Further information located here: https://ugtm.readthedocs.io/en/latest/ugtm.html?highlight=ugtm.matY#module-ugtm.ugtm_classes/
mean_u = pd.DataFrame(gtm_model.matMeans, columns=['U1', 'U2'])

# Attach the decoded class names and the row position within X_proc.
mean_u_labels = mean_u.copy()
mean_u_labels['y'] = enc.inverse_transform(labels)
mean_u_labels['index'] = mean_u_labels.index

# Plot the latent space with a combination of different shapes and colours.
# The legend selection is bound to the 'y' column the chart actually encodes
# (it previously referenced a non-existent 'series' field and was never
# attached to the chart, so it had no effect).
selection = alt.selection_point(fields=['y'], bind='legend')

alt.Chart(mean_u_labels, width=500, height=500).mark_point(size=100).encode(
    x='U1',
    y='U2',
    color='y',
    shape='y',
    tooltip=['y', 'index'],
    opacity=alt.condition(selection, alt.value(1), alt.value(0.2)),
    ).add_params(selection
    ).interactive()




### Construct reference maps

In [9]:
# Reference vectors.
# Per the ugtm documentation, matY has shape n_dimensions (number of features)
# x n_nodes: column matY[:, i] is the centre of Gaussian component i on the
# manifold in data space (Y = W * Phi^T), i.e. the location of each latent-space
# node in the high-dimensional feature space.
#
# MinMaxScaler works column-wise, so it is applied to the transpose: each row of
# matY (one input dimension) is rescaled to [0, 1] across the nodes. This shows
# how strongly a node is associated with a dimension relative to the other nodes.
refvect = MinMaxScaler().fit_transform(gtm_model.matY.T).T
# refvect = scaler.inverse_transform(refvect.T).T


def plot_ref_vect(gtm_matX,label,title,fig_size=(200,200)):
    """Draw the GTM node grid as squares coloured by a per-node value.

    Parameters
    ----------
    gtm_matX : array-like with two columns
        Coordinates of the nodes in the 2D latent space (e.g. ``gtm_model.matX``).
    label : array-like
        One quantitative value per node; mapped to colour with the 'turbo' scheme.
    title : str
        Chart title.
    fig_size : tuple, default (200, 200)
        ``(width, height)`` of the chart.

    Returns
    -------
    alt.Chart
    """
    node_frame = pd.DataFrame(gtm_matX, columns=["x1", "x2"]).assign(label=label)
    node_colour = alt.Color(
        'label:Q',
        #scale=alt.Scale(scheme='viridis')),
        scale=alt.Scale(scheme='turbo'),
    )
    squares = alt.Chart(node_frame).mark_square().encode(
        x='x1',
        y='x2',
        color=node_colour,
        size=alt.value(150),
        tooltip=['x1', 'x2', 'label:Q'],
        #opacity='density'
    )
    return squares.properties(title=title, width=fig_size[0], height=fig_size[1])
# %%

labels2 = X_cols

# Number of reference maps to draw (the first n input dimensions) and grid width.
n_ref_maps = min(14, len(labels2))
maps_per_row = 5

# One map per input dimension. The colour coding is based on the scaled matY:
# row i of refvect holds, for every node, the (0-1 scaled) position of that
# node's Gaussian centre along input dimension i.
gtm_refvects = [
    plot_ref_vect(gtm_model.matX, label=refvect[i, :], title=labels2[i])
    for i in range(n_ref_maps)
]

# Lay the maps out row by row so that every map appears exactly once
# (the hand-written grid repeated maps 4 and 9).
ref_rows = [
    alt.hconcat(*gtm_refvects[start:start + maps_per_row])
    for start in range(0, n_ref_maps, maps_per_row)
]
class_maps = alt.vconcat(*ref_rows)
class_maps = class_maps.configure_title(fontSize=20,fontWeight='normal')
class_maps

# %%


In [10]:
from sklearn.preprocessing import minmax_scale

def factor_map(gtm_model, Xfact):
    """Average one feature over the samples assigned to each GTM node.

    Parameters
    ----------
    gtm_model : fitted GTM object
        Must expose ``matR`` (responsibilities; one row per sample, one column
        per node) and ``matX`` (2D coordinates of the nodes, one row per node).
    Xfact : pandas.Series or 1-D array-like
        Feature value of every sample, in the same row order as ``matR``.

    Returns
    -------
    pandas.DataFrame
        One row per node with the columns ``x1``, ``x2`` (node coordinates),
        ``membership`` (node number), ``scale`` (mean feature value of the
        samples assigned to the node, min-max scaled to [0, 1]; NaN for nodes
        with no assigned samples) and ``size`` (constant 1).
    """
    # Each sample is assigned to the node with the largest responsibility,
    # i.e. the Gaussian most likely to have generated it.
    dfclus = pd.DataFrame({
        'scale': np.asarray(Xfact).ravel(),
        'membership': np.argmax(gtm_model.matR, axis=1),
    })
    # Mean feature value per node, then rescaled from 0 to 1 across the nodes
    # that received at least one sample.
    dfclus = dfclus.groupby('membership', as_index=False)['scale'].mean()
    dfclus['scale'] = minmax_scale(dfclus['scale'])

    # matX is the 2D grid of node positions; number the nodes in row order so
    # they line up with the argmax column indices used for 'membership'.
    # The node count comes from the model rather than being fixed at 256,
    # so any grid size works.
    df_map = pd.DataFrame(gtm_model.matX, columns=["x1", "x2"])
    df_map['membership'] = np.arange(len(df_map))

    # Left merge keeps every node; nodes without samples get NaN for 'scale'.
    df_map = df_map.merge(dfclus, how='left', on='membership')
    # df_map.fillna(0,inplace=True)
    # Constant column kept for the marker size in the plot.
    df_map['size'] = 1
    return df_map

def plot_factor_map(df_map, title='Factor Map',fig_size=(115,115),node_size=1):
    """Plot a factor map: one square per GTM node, coloured by the 'scale' column.

    Parameters
    ----------
    df_map : pandas.DataFrame
        Needs the columns 'x1', 'x2' and 'scale' (as produced by ``factor_map``).
    title : str or list of str, default 'Factor Map'
        Chart title; a list is passed straight through to Altair.
    fig_size : tuple, default (115, 115)
        ``(width, height)`` of the chart.
    node_size : int, default 1
        Currently unused; the marker size is fixed at 100.

    Returns
    -------
    alt.Chart
    """
    #df_map['size']=df_map['size']*node_size
    turbo_colour = alt.Color('scale:Q', scale=alt.Scale(scheme='turbo'))
    node_squares = alt.Chart(df_map).mark_square().encode(
        x=alt.X('x1', axis=None),
        y=alt.Y('x2', axis=None),
        color=turbo_colour,
        size=alt.value(100),
        tooltip=['x1', 'x2', 'scale:Q'],
        #opacity='density'
    )
    return node_squares.properties(title=title, width=fig_size[0], height=fig_size[1])

In [11]:
varnames = X_cols
gtm_model_for_plot = gtm_model # transformed, gtm_model
# X_set = X_test_fnl # X_train_fnl, X_test_fnl, X_train_imp
# varnames = selected

maps_per_row = 6

# Build the frame once instead of on every loop iteration.
X_proc_df = pd.DataFrame(X_proc)

# One factor map per feature. The title is split into two lines of 17
# characters so long feature names fit above the small charts.
factor_charts = [
    plot_factor_map(
        factor_map(gtm_model_for_plot, X_proc_df.iloc[:, X_cols.index(name)]),
        title=[name[0:17], name[17:34]])
    for name in varnames
]

# Arrange the maps in rows of maps_per_row. Chunking the list produces exactly
# as many rows as needed: no empty trailing rows and no cap on the feature count.
chart = alt.vconcat(*[
    alt.hconcat(*factor_charts[start:start + maps_per_row])
    for start in range(0, len(factor_charts), maps_per_row)
])

chart




In [16]:
X_cols

['Amplitude_g',
 'AndersonDarling_g',
 'Autocor_length_g',
 'Beyond1Std_g',
 'CAR_mean_g',
 'CAR_sigma_g',
 'CAR_tau_g',
 'Con_g',
 'Eta_e_g',
 'FluxPercentileRatioMid20_g',
 'FluxPercentileRatioMid35_g',
 'FluxPercentileRatioMid50_g',
 'FluxPercentileRatioMid65_g',
 'FluxPercentileRatioMid80_g',
 'Freq1_harmonics_amplitude_0_g',
 'Freq1_harmonics_amplitude_1_g',
 'Freq1_harmonics_amplitude_2_g',
 'Freq1_harmonics_amplitude_3_g',
 'Freq1_harmonics_rel_phase_1_g',
 'Freq1_harmonics_rel_phase_2_g',
 'Freq1_harmonics_rel_phase_3_g',
 'Freq2_harmonics_amplitude_0_g',
 'Freq2_harmonics_amplitude_1_g',
 'Freq2_harmonics_amplitude_2_g',
 'Freq2_harmonics_amplitude_3_g',
 'Freq2_harmonics_rel_phase_1_g',
 'Freq2_harmonics_rel_phase_2_g',
 'Freq2_harmonics_rel_phase_3_g',
 'Freq3_harmonics_amplitude_0_g',
 'Freq3_harmonics_amplitude_1_g',
 'Freq3_harmonics_amplitude_2_g',
 'Freq3_harmonics_amplitude_3_g',
 'Freq3_harmonics_rel_phase_1_g',
 'Freq3_harmonics_rel_phase_2_g',
 'Freq3_harmonics_rel_

In [125]:
def plot_factor_map2(df_map, title='Factor Map',fig_size=(150,150),node_size=1):
    """Variant of ``plot_factor_map`` with a larger default chart size.

    The chart is identical to the one built by ``plot_factor_map`` (defined in
    an earlier cell alongside ``factor_map``); only the default ``fig_size``
    differs, so this simply delegates instead of repeating the chart spec.

    Parameters
    ----------
    df_map : pandas.DataFrame
        Needs the columns 'x1', 'x2' and 'scale'.
    title : str or list of str, default 'Factor Map'
    fig_size : tuple, default (150, 150)
        ``(width, height)`` of the chart.
    node_size : int, default 1
        Passed through; currently unused by ``plot_factor_map``.

    Returns
    -------
    alt.Chart
    """
    return plot_factor_map(df_map, title=title, fig_size=fig_size, node_size=node_size)

# Select subset of columns

# NOTE(review): 'dif_min_median_g' is listed twice below -- confirm whether a
# different feature was intended for one of the entries.
varnames_g = ['Amplitude_g','dif_min_median_g','dif_min_median_g','npeaks_1to2_g','npeaks_2to5_g',
            'npeaks_above5_g','Eta_e_g','CAR_sigma_g','Freq1_harmonics_amplitude_0_g','Skew_g',
            'LinearTrend_g','freq_pwr_max_g','Std_g','MedianAbsDev_g','stdstilllev_t20s10_g',
            'Mean_g','min_mag_g','n_obs_g'
            ]
# Swap the trailing band suffix _g for _r (suffix only, so a '_g' elsewhere in
# a name is left untouched).
varnames_r = [v[:-2] + '_r' if v.endswith('_g') else v for v in varnames_g]

# Colour and Gaia features
varnames_gaia = ['parallax','pm','clr_mean','clr_bright','bp_rp','bp_g','g_rp','StetsonJ','StetsonL']

# NAM varnames
varnames_nam = ['bp_rp','nu_eff_used_in_astrometry','parallax','absmag_g',
                'PeriodLS_g','npeaks_1to2_g','npeaks_2to5_g','npeaks_above5_g',
                'pnts_leq_rollMedWin20-5mag_r','stdstilllev_t20s10_g','Gskew_g','n_obs_g',
                'dif_min_median_g','kurtosis_r','LinearTrend_r','StetsonJ']


# varnames = selected
feature_list = varnames_nam
maps_per_row = 4

# Fail early with the complete list of unknown names instead of stopping at the
# first one with "'<name>' is not in list".
missing_features = [name for name in feature_list if name not in X_cols]
if missing_features:
    raise ValueError(f"Features not present in X_cols: {missing_features}")

# Build the frame once instead of on every loop iteration.
X_proc_df = pd.DataFrame(X_proc)

# One factor map per selected feature; titles are split into two lines
# (20 + 14 characters) so long names fit above the charts.
factor_charts = [
    plot_factor_map2(
        factor_map(gtm_model, X_proc_df.iloc[:, X_cols.index(name)]),
        title=[name[0:20], name[20:34]])
    for name in feature_list
]

# Rows of maps_per_row charts; chunking avoids empty trailing rows.
chart = alt.vconcat(*[
    alt.hconcat(*factor_charts[start:start + maps_per_row])
    for start in range(0, len(factor_charts), maps_per_row)
]).configure_title(fontSize=16,fontWeight='bold')

chart