# Animate
Old animation notebook (experimenting with different visualization techniques)

In [None]:
import umap
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
import scimap as sm
import anndata as ad

%matplotlib inline

# --- Analysis constants -------------------------------------------------------
RHO_CUTOFF = 0.4  # upper bound applied to the 'Rho' column when filtering the loaded data
FRAC = 0.05       # fraction of the filtered rows kept by the random subsample
sigma = 2         # bound used to build the PCA group ranges in `groups`

# --- Plot defaults ------------------------------------------------------------
plt.rcParams.update({
    'font.size': 15,
    'figure.figsize': [16, 16],
})

# --- Inputs -------------------------------------------------------------------
# NOTE(review): absolute, machine-specific location -- the notebook only runs
# where this exact file exists; consider a configurable data directory.
path = '/Users/fraserking/Development/pca/data/pca_inputs/all_sites_pip.csv'
# Colour list; not referenced by the cells visible in this notebook.
palette = ['red', '#0f9adb', 'blue', '#8d2c8d', '#e8b400', '#007e58', 'gray', 'black']
# Columns that are standardized and passed to PCA / UMAP.
features = ['n0_log', 'lambda_log', 'Fs', 'Rho_log', 'Sr_log', 'Nt_log']

In [8]:
# Load data
# Read the input table and keep only rows whose 'Rho' is at or below the cutoff.
# .copy() makes the filtered frame independent, so the column assignments below
# never write into a slice of the frame returned by read_csv.
df = pd.read_csv(path)
df = df[df['Rho'] <= RHO_CUTOFF].copy()

# Base-10 log transforms. np.log10 returns -inf for zero inputs (and emits the
# "divide by zero encountered in log10" RuntimeWarning).
df['n0_log'] = np.log10(df['n0'])
df['lambda_log'] = np.log10(df['lambda'])
df['Nt_log'] = np.log10(df['Nt'])
df['Rho_log'] = np.log10(df['Rho'])
df['Sr_log'] = np.log10(df['Sr'])
# Only Sr_log has its -inf entries replaced (with 0); the other log columns are
# left exactly as computed.
df['Sr_log'] = df['Sr_log'].replace(-np.inf, 0)

# Carry D0 forward under the name Dm, then remove the original column.
df['Dm'] = df['D0']
df['Dm_log'] = np.log10(df['Dm'])
df = df.drop(columns=['D0'])
df = df.drop_duplicates(keep='first')

# Seeded random subsample so reruns select the same rows.
df_all = df.sample(frac=FRAC, random_state=42)

# Z-score every feature using the mean and standard deviation of the subsample.
for feature in features:
    mean_value = df_all[feature].mean()
    std_value = df_all[feature].std()

    df_all[f"{feature}_norm"] = (df_all[feature] - mean_value) / std_value

# Derive the normalized column names from `features` so the matrix handed to
# PCA / UMAP cannot drift out of sync with the feature list.
norm_columns = [f"{feature}_norm" for feature in features]
scaled_data = df_all.loc[:, norm_columns].values
print(scaled_data.shape)
print(df_all.shape)



divide by zero encountered in log10



(31018, 6)
(31018, 23)


In [9]:
# Project the standardized features onto principal components.
# n_components=0.94 asks scikit-learn to keep as many components as are needed
# to explain 94% of the variance, so the number of pca_eof* columns is data-dependent.
pca = PCA(n_components=0.94)
pca.fit(scaled_data)
pca_embedding = pca.transform(scaled_data)

# One column per retained component: pca_eof1, pca_eof2, ...
cols = [f'pca_eof{i + 1}' for i in range(pca_embedding.shape[1])]
principalDf = pd.DataFrame(data=pca_embedding, columns=cols)

# Positional index so the column-wise concat lines up row-for-row with principalDf.
df_all = df_all.reset_index(drop=True)
# Drop scores left by a previous run of this cell; otherwise re-running would
# append duplicate pca_eof* columns and row['pca_eof1'] would stop being a scalar.
df_all = df_all.drop(columns=cols, errors='ignore')
df_all = pd.concat([df_all, principalDf], axis=1)

(31018, 3)


In [10]:
# Group definitions keyed by integer label. Each label maps to a list of
# conditions; a condition is three [low, high] ranges that assign_group checks
# (inclusively) against pca_eof1, pca_eof2 and pca_eof3, in that order.
# Labels are tested in the order written here and the first match wins.
_INF = np.inf
groups = {
    1: [[[-_INF, -sigma], [-_INF, _INF], [-sigma, sigma]]],
    2: [[[-sigma, sigma], [sigma, _INF], [-_INF, _INF]]],
    3: [[[sigma, _INF], [-_INF, _INF], [-sigma, sigma]]],
    4: [[[-sigma, sigma], [-_INF, -sigma], [-_INF, _INF]]],
    5: [[[-_INF, _INF], [-sigma, sigma], [sigma, _INF]]],
    6: [[[-_INF, _INF], [-sigma, sigma], [-_INF, -sigma]]],
}

def assign_group(row, group_ranges=None, default=7,
                 columns=('pca_eof1', 'pca_eof2', 'pca_eof3')):
    """Return the label of the first group whose ranges contain ``row``.

    Parameters
    ----------
    row : mapping
        Anything indexable by column name (e.g. a DataFrame row or a dict)
        that provides a value for every name in ``columns``.
    group_ranges : dict, optional
        Maps a label to a list of conditions; each condition holds one
        ``[low, high]`` pair per entry of ``columns``. When omitted, the
        notebook-level ``groups`` table is used.
    default : object, optional
        Label returned when no condition matches (7 by default).
    columns : sequence of str, optional
        Names of the values tested, in the same order as the ranges inside
        each condition.

    Returns
    -------
    The matching label, or ``default``.

    Raises
    ------
    ValueError
        If a condition does not hold exactly one range per column.
    """
    if group_ranges is None:
        # Fall back to the notebook-level table so `df.apply(assign_group, axis=1)`
        # keeps working unchanged.
        group_ranges = groups

    for group, conditions in group_ranges.items():
        for condition in conditions:
            if len(condition) != len(columns):
                raise ValueError(
                    f"expected {len(columns)} ranges per condition, got {len(condition)}"
                )
            # Bounds are inclusive on both ends; any comparison involving NaN is
            # False, so such rows fall through to `default`.
            if all(low <= row[column] <= high
                   for column, (low, high) in zip(columns, condition)):
                return group
    return default

# Label every row by calling assign_group row-wise (axis=1), store the labels
# in the 'group' column, then print the resulting frame.
group_labels = df_all.apply(assign_group, axis=1)
df_all['group'] = group_labels
print(df_all)

       Unnamed: 0 site                 time           n0         Nt        Fs  \
0           63103  MQT  2017-11-19 20:50:00     9.077027    352.454  0.512842   
1         3530354  APX  2023-03-25 21:10:00  2377.321730  65128.816  1.193548   
2          229147  MQT  2016-02-16 10:05:00    78.780897   4456.553  1.352688   
3         1642210  FIN  2017-02-16 23:35:00   147.008009   4008.382  0.877391   
4         2733391  FIN  2017-02-06 00:15:00    65.098648   2760.029  0.687049   
...           ...  ...                  ...          ...        ...       ...   
31013     4063554  NSA  2021-11-15 17:20:00    92.255727   2039.439  0.649518   
31014     4685102  YFB  2018-12-05 09:30:00  1750.184026  51886.463  0.806215   
31015     4373961  NSA  2019-12-02 06:45:00  1041.734023  34528.432  1.017316   
31016     2416700  FIN  2015-02-14 13:20:00  1085.418864  35936.810  1.122043   
31017     1201148  MQT  2018-02-04 04:25:00   541.944087  14549.493  0.730933   

            Sr      Ed     

In [11]:
# Embed the standardized features in three dimensions with UMAP using the
# Canberra metric. random_state pins the otherwise stochastic layout so that
# reruns reproduce the same coordinates (the subsample above is seeded the same way).
# Note: umap-learn drops to a single job when a seed is set, so this is slower
# than the unseeded call.
reducer = umap.UMAP(
    n_neighbors=750,
    min_dist=0.22,
    n_components=3,
    metric='canberra',
    random_state=42,
)
umap_embedding = reducer.fit_transform(scaled_data)
print(umap_embedding)

(31018, 3)


In [19]:
# Wrap the standardized matrix in an AnnData object and run scimap's UMAP on it
# with the same settings as the direct umap-learn call.
adata = ad.AnnData(X=scaled_data)
umap2  = sm.tl.umap(adata, n_neighbors=750, min_dist=0.22, n_components=3, metric='canberra')
# Assign by position, not by index: an AnnData built from a bare array labels
# its observations with strings ('0', '1', ...), while df_all carries an integer
# index, so assigning the Series directly would align on nothing and fill the
# column with NaN. scaled_data was taken from df_all, so the row order matches.
umap2.obs['groups'] = df_all['group'].to_numpy()


n_jobs value -1 overridden to 1 by setting random_state. Use no seed for parallelism.



In [21]:
# Colour the animation by the 'groups' column stored on the AnnData object in
# the previous cell; no visible cell creates a 'kmeans' column, so the original
# color='kmeans' cannot work on a fresh Restart & Run All.
# NOTE(review): confirm that scimap's animate has the other inputs it expects
# on this AnnData object (it was built from the feature matrix only).
sm.hl.animate(adata, color='groups')

[[11.900461    1.3339728   4.2116265 ]
 [-0.67619956  6.1479797   6.888573  ]
 [ 8.117449    7.184049    6.475685  ]
 ...
 [ 1.5753034   4.510138    6.1951303 ]
 [ 1.5544438   3.7654612   5.6034875 ]
 [ 1.3552419   4.7970786   2.515463  ]]


In [None]:
# Completion marker: reaching this line means every cell above ran.
print("All done!")