In [None]:
# load data and metadata from saved files, combine them, and display unique labels
import numpy as np
import pandas as pd
import pathlib
import matplotlib.pyplot as plt

from glitchstream.glitch_downloader import GlitchDownloader
from glitchstream.deepextractor import DeepExtractor

base_dir = pathlib.Path('./sample_glitches')
# Number of saved shards. Shard i consists of `random_glitches_{i}_g_hats.npy`
# and `random_glitches_{i}_metadataframe.csv`; a single constant drives both
# loaders so the two lists cannot get out of step.
N_SHARDS = 10

# NOTE(security): allow_pickle=True lets np.load unpickle arbitrary objects,
# which can execute code. Only use it on trusted, locally generated files;
# it is only required when the saved arrays have object dtype.
data_subs = [
	np.load(base_dir/f'random_glitches_{i}_g_hats.npy',allow_pickle=True)
	for i in range(N_SHARDS)
]
# Stack the shards along the sample axis: one row per glitch.
data = np.concatenate(data_subs,axis=0)

print(data.shape)

metadatas = [
	pd.read_csv(base_dir/f'random_glitches_{i}_metadataframe.csv')
	for i in range(N_SHARDS)
]
# pd.concat already returns a DataFrame; ignore_index gives a clean 0..n-1 index
# that lines up positionally with the rows of `data`.
metadata_df = pd.concat(metadatas,ignore_index=True)

# Fail early with a clear message if the arrays and the metadata disagree,
# instead of a generic length error from the DataFrame constructor below.
if len(data) != len(metadata_df):
	raise ValueError(
		f"data has {len(data)} rows but metadata has {len(metadata_df)} rows; "
		"the .npy and .csv shards are out of sync"
	)

ghat_labels = metadata_df['ml_label'].to_numpy()
ghat_labels_uq = np.unique(ghat_labels)
print(ghat_labels_uq)

# One row per glitch: the waveform array in 'ghat' and its label in 'label'.
ghat_df = pd.DataFrame({'ghat':list(data),'label':ghat_labels})

ghat_df.head()

In [None]:
# Normalize the data and plot one random example from every class
from sklearn.preprocessing import normalize

# Scale every row (one glitch) to unit L2 norm.
data_normalized = normalize(data,norm="l2",axis=1)
ghat_df_normalized = pd.DataFrame({'ghat':list(data_normalized),'label':ghat_labels})

# One sub-frame per unique label, in the order of ghat_labels_uq.
df_subs = [
	ghat_df_normalized[ghat_df_normalized['label']==label]
	for label in ghat_labels_uq
]

# Debug peek at the labels of the second class; guarded so a single-class
# dataset does not raise IndexError.
if len(df_subs) > 1:
	print(df_subs[1]['label'].values)

# np.random.seed() always returns None, so `seed` was never a usable seed value.
# Seeding with None re-seeds from OS entropy: a different example on every run.
seed = None
np.random.seed(seed)

n_classes = len(df_subs)
# Shared x axis, sized from the data instead of a hard-coded 8192.
# The [-1, 1] range is kept from the original; presumably seconds around the
# glitch centre — TODO confirm.
t = np.linspace(-1,1,data_normalized.shape[1])

# One row of axes per class (3 in of height each, i.e. 15x12 for four classes).
# squeeze=False + ravel keeps `axs` a 1-D array even when there is one class.
_,axs = plt.subplots(n_classes,1,figsize=(15,3*n_classes),squeeze=False)
axs = axs.ravel()
for ax,df in zip(axs,df_subs):
	randrow = np.random.randint(len(df))
	randglitch = df.iloc[randrow].ghat
	ax.plot(t,randglitch)
	# Title shows the positional row inside the class sub-frame and the class label.
	ax.set_title(f'{randrow}:{df.iloc[0].label}')

plt.tight_layout()
plt.show()

In [None]:
# direct PCA and t-SNE visualization of the raw 8192-d data with Omicron labels
from sklearn.decomposition import PCA
from sklearn.preprocessing import normalize
from sklearn.manifold import TSNE
from sklearn.preprocessing import LabelEncoder
import seaborn as sns
import matplotlib.patches as mpatches


X_raw = data
# Row-wise L2 normalization: every sample is scaled to unit norm.
X_normalized = normalize(X_raw,axis=1,norm='l2')

# random_state pins the result when PCA's default 'auto' solver picks the
# randomized SVD (which it may do for wide inputs like this one), so the
# projection is reproducible across kernel restarts.
pca = PCA(n_components=20,random_state=42)
X_pca = pca.fit_transform(X_normalized)
pca_prime = PCA(n_components=20,random_state=42)
X_pca_unormalized = pca_prime.fit_transform(X_raw)

print(f"shape of data after PCA: {X_pca.shape}")
print(f"shape of unormalized data after PCA: {X_pca_unormalized.shape}")

le = LabelEncoder()
labels_coded = le.fit_transform(ghat_labels)
palette = sns.color_palette("husl",len(le.classes_))
# Explicit label -> colour mapping (classes_ is sorted). Passing this dict to
# seaborn, rather than the bare list, stops colours from being assigned by
# order of appearance in the data.
colors_remap = dict(zip(le.classes_,palette))

df_sns_pca1 = pd.DataFrame({'x':X_pca[:,0],'y':X_pca[:,1],'labels':ghat_labels})
plt.figure(figsize=(12,9))
sca = sns.scatterplot(
	data=df_sns_pca1,x='x',y='y',
	hue='labels',hue_order=list(le.classes_),palette=colors_remap,
	s=70,alpha=0.6,
)
sca.legend(bbox_to_anchor=(1.02, 1), loc='upper left', borderaxespad=0, title="Omicron labels")
plt.title("normalized PCA first 2 components")
plt.show()

# Same projection on the un-normalized data, drawn without label colours.
plt.figure(figsize=(12,9))
plt.scatter(X_pca_unormalized[:,0],X_pca_unormalized[:,1],alpha=0.5,s=10)
plt.title("UNormalized PCA first 2 components")
plt.show()


def plot_tsne(X,labels=None,perplexity=25,random_state=42):
	"""Embed X in 2-D with t-SNE and show a scatter plot coloured by label.

	Parameters
	----------
	X : array-like of shape (n_samples, n_features)
		Data handed to ``TSNE.fit_transform``.
	labels : array-like of length n_samples, optional
		Per-sample class labels. Defaults to the notebook-level ``ghat_labels``.
	perplexity : float, default 25
		Forwarded to ``TSNE``.
	random_state : int or None, default 42
		Forwarded to ``TSNE`` for a reproducible embedding.

	Returns
	-------
	None
		The figure is displayed with ``plt.show()``.

	Raises
	------
	ValueError
		If ``labels`` and ``X`` do not have the same number of samples.
	"""
	if labels is None:
		labels = ghat_labels
	labels = np.asarray(labels)
	# Check before the (slow) t-SNE fit rather than failing afterwards.
	if len(labels) != len(X):
		raise ValueError(f"X has {len(X)} samples but labels has {len(labels)}")

	tsne_1 = TSNE(n_components=2,perplexity=perplexity,random_state=random_state)
	X_tsne_1 = tsne_1.fit_transform(X)

	# Sorted unique labels -> colour. The SAME dict drives both the scatter and
	# the legend patches; passing a bare list palette to seaborn would colour
	# points by order of appearance and could disagree with the legend.
	classes = np.unique(labels)
	palette = sns.color_palette("husl",len(classes))
	colors_remap = dict(zip(classes,palette))
	legend_handles = [
		mpatches.Patch(color=colors_remap[label],label=label)
		for label in classes
	]
	df_sns = pd.DataFrame({
		'x':X_tsne_1[:,0],
		'y':X_tsne_1[:,1],
		'labels': labels
	})

	# Figure is created only after the embedding succeeded.
	plt.figure(figsize=(12,9))
	sca = sns.scatterplot(
		data=df_sns,x='x',y='y',
		hue='labels',hue_order=list(classes),palette=colors_remap,
		s=70,alpha=0.5,edgecolor='w',
	)
	sca.legend(bbox_to_anchor=(1.02, 1), loc='upper left', borderaxespad=0, title="Omicron labels",handles=legend_handles)
	plt.title("t-SNE with Omicron labels")
	plt.show()

# Run the t-SNE plot on the row-wise L2-normalized data (X_normalized).
plot_tsne(X_normalized)



In [None]:
# transform data into train and test sets | in tslearn format for K-Shape clustering
import tslearn

from sklearn.model_selection import train_test_split
from tslearn.utils import to_time_series_dataset
from tslearn.preprocessing import TimeSeriesScalerMeanVariance
from tslearn.clustering import KShape

# Seed NumPy's global RNG for this cell.
np.random.seed(42)

# Shuffled 80/20 split, stratified on the label column.
df_train, df_test = train_test_split(
	ghat_df,
	train_size=0.8,
	shuffle=True,
	stratify=ghat_df['label'],
	random_state=42,
)

# Pull the waveform and label columns out of each split as NumPy arrays.
ghat_train, label_train = (np.array(df_train[col].tolist()) for col in ('ghat','label'))
ghat_test, label_test = (np.array(df_test[col].tolist()) for col in ('ghat','label'))

# Convert both splits to tslearn's time-series dataset format.
X_train_ts, X_test_ts = map(to_time_series_dataset,(ghat_train,ghat_test))

# Scale each training series to zero mean and unit variance.
X_train_ts_scaled = TimeSeriesScalerMeanVariance().fit_transform(X_train_ts)
sz = X_train_ts_scaled.shape[1]

# K-Shape with one cluster per unique label; fit on the scaled training set
# and keep the cluster index assigned to every training series.
ks = KShape(n_clusters=len(ghat_labels_uq),random_state=42)
y_pred_ks = ks.fit_predict(X_train_ts_scaled)

# Stratification sanity check (disabled): compare df_train.label.value_counts()
# with df_test.label.value_counts(), e.g. the 'Scattered_Light' share of each split.


OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


In [151]:
# Size of the second axis of the scaled training set (same expression as in the
# K-Shape cell); the recorded output is 8192. Presumably the number of samples
# per time series, given tslearn's (n_series, length, dim) layout — TODO confirm.
sz = X_train_ts_scaled.shape[1]
print(sz)

8192
