# Analysing the eigenvalues

In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Load the training eigenvalue table from the gzip-compressed, tab-separated file.
train_data_path = os.path.join("2.process-data", "data", "train_eigen_values.tsv.gz")
train_data = pd.read_csv(train_data_path, sep="\t")

In [3]:
# Preview the first rows to check that the cell codes, targets and eigen_value_* columns loaded.
train_data.head()

Unnamed: 0,cell_codes,targets,eigen_value_0000,eigen_value_0001,eigen_value_0002,eigen_value_0003,eigen_value_0004,eigen_value_0005,eigen_value_0006,eigen_value_0007,...,eigen_value_1990,eigen_value_1991,eigen_value_1992,eigen_value_1993,eigen_value_1994,eigen_value_1995,eigen_value_1996,eigen_value_1997,eigen_value_1998,eigen_value_1999
0,OaJHcDs2kh,adrenoceptor,-673.376005,226.999512,19.856234,103.444984,0.996551,12.435002,-34.196611,50.17173,...,0.256675,0.036327,-1.371034,0.52268,0.451433,0.410545,0.766067,-0.489929,-1.232571,-0.181984
1,nwLFF4l070,adrenoceptor,-353.281628,67.610234,40.509354,36.96802,-117.899643,-15.106895,26.007932,16.742913,...,3.430432,3.290681,-0.196751,-3.199812,-1.804525,0.431745,0.362372,-1.270252,0.900353,0.634628
2,JU4SIplWZ7,adrenoceptor,-453.52517,-20.210653,-8.798432,-114.629762,23.357298,-69.57937,183.690544,91.27299,...,0.428241,2.698377,-1.660676,-2.651767,-0.562801,0.601309,-0.641914,1.16277,1.232496,0.530883
3,pqkTwaHa2L,adrenoceptor,-109.195103,-450.929925,10.205896,-232.57144,145.289635,-88.680688,-93.271277,51.418426,...,2.666084,5.704673,0.72076,-3.524865,-0.374867,2.277353,-7.823618,-0.688071,3.111128,0.099316
4,pB2BlQoW94,adrenoceptor,-559.953099,-13.435366,47.59679,64.147829,124.439161,41.770136,-94.237554,11.198108,...,-1.652867,-0.345678,-1.024346,-1.98002,-1.00125,0.668047,1.713858,0.389432,-0.276629,-0.442717


## Scatter plot of the first 2 dimensions

In [4]:
# Draw a small, class-balanced subsample of the training data and encode each
# target label as an integer (usable e.g. as a per-point colour index).
all_targets = train_data.targets.unique()
sample_per_class = 40
random_seed = 42  # fixed seed so Restart & Run All reproduces the same subsample

# Label -> integer code, in order of first appearance in the data.
targets_mapping = {target: idx for idx, target in enumerate(all_targets)}

# Sample every class once and concatenate in a single call. Growing a frame
# with pd.concat inside the loop is quadratic, and seeding it with an empty
# object-dtype frame is deprecated in recent pandas versions.
sampled_frames = [
    train_data.loc[train_data.targets == target].sample(sample_per_class, random_state=random_seed)
    for target in all_targets
]
# pd.concat raises on an empty list, so fall back to an empty frame with the same columns.
small_train_data = pd.concat(sampled_frames) if sampled_frames else train_data.iloc[0:0]

# .map performs the label -> code lookup directly; .replace with a str -> int
# dict depends on silent downcasting, which pandas has deprecated.
color = small_train_data.targets.map(targets_mapping).to_numpy()



In [5]:
from matplotlib import cm

def create_scatter_plot(
    save_location: str,
    eigenvalue1: int = 0,
    eigenvalue2: int = 1,
    sample_per_class: int = 40,
    random_state: "int | None" = None,
    ):
    """Save a per-class scatter plot of two eigenvalue columns.

    Reads the module-level ``train_data`` frame and ``all_targets`` array.
    For every target class a random subsample is drawn and plotted in its
    own colour, with ``eigen_value_<eigenvalue1>`` on the x-axis and
    ``eigen_value_<eigenvalue2>`` on the y-axis.

    Parameters
    ----------
    save_location : str
        Directory the PNG is written to; created if it does not exist.
    eigenvalue1, eigenvalue2 : int
        Eigenvalue numbers; zero-padded to four digits to build the column names.
    sample_per_class : int
        Maximum number of rows drawn per class. Classes with fewer rows are
        plotted in full instead of raising.
    random_state : int or None
        Seed forwarded to ``DataFrame.sample``; ``None`` keeps the sampling random.

    The figure is saved as
    ``scatterplot_eigen_value_XXXX_eigen_value_YYYY.png`` and then closed.
    """
    x_column = "eigen_value_" + str(eigenvalue1).zfill(4)
    y_column = "eigen_value_" + str(eigenvalue2).zfill(4)

    fig, ax = plt.subplots(figsize=(9, 8), dpi=80)
    # pyplot.get_cmap replaces matplotlib.cm.get_cmap, which was deprecated in
    # Matplotlib 3.7 and removed in 3.9.
    colormap = plt.get_cmap('tab20', 20)

    for idx, target in enumerate(all_targets):
        class_rows = train_data.loc[train_data.targets == target]
        # Cap the sample size so small classes do not make .sample() raise.
        class_sample = class_rows.sample(
            min(sample_per_class, len(class_rows)), random_state=random_state
        )
        # One explicit RGBA colour per class; wrap around after the colormap's
        # last entry so extra classes do not all collapse onto the same colour.
        ax.scatter(class_sample[x_column], class_sample[y_column],
                   color=colormap(idx % colormap.N), label=target,
                   alpha=0.7, edgecolors='none')

    ax.set_xlabel(x_column)
    ax.set_ylabel(y_column)
    ax.set_title(f"{x_column} vs {y_column}")
    ax.legend(loc="upper right", title="Classes")
    ax.grid(True)

    # savefig does not create missing directories.
    os.makedirs(save_location, exist_ok=True)
    fig.savefig(os.path.join(save_location, f"scatterplot_{x_column}_{y_column}.png"))
    plt.close(fig)

# F-test
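The F-test (one-way ANOVA) checks, for each eigenvalue separately, whether all classes have the same mean value. A small p-value means the eigenvalue's mean differs between at least two classes.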

In [6]:
from sklearn.feature_selection import f_classif

minimum_p = 0.0005  # significance threshold on the per-eigenvalue ANOVA p-value

# Select the feature columns explicitly by name. Dropping only 'cell_codes'
# and 'targets' would pass any other column to the F-test and shift every
# positional index relative to the eigenvalue number.
# NOTE(review): the preview header lists a leading "Unnamed: 0"; confirm
# whether that is a real column in the file or just the rendered index.
eigen_columns = [column for column in train_data.columns if column.startswith("eigen_value_")]

fvalues, pvalues = f_classif(train_data[eigen_columns], train_data.targets)
significant_positions = np.where(pvalues < minimum_p)[0]

# Report eigenvalue numbers taken from the column names (eigen_value_0042 -> 42)
# rather than positions, so they always match what create_scatter_plot expects.
independed_eigen = np.array(
    [int(eigen_columns[position].rsplit("_", 1)[1]) for position in significant_positions],
    dtype=int,
)
print(independed_eigen)

[  0   1   4   5   8  10  12  20  22  24  25  31  32  33  34  40  41  42
  43  52  53  54  56  73  74  75  76  79  80 119 120 192 295]


In [57]:
# Plot the significant eigenvalues in consecutive pairs: (1st, 2nd), (3rd, 4th), ...
results_dir = os.path.join("2.process-data", "results")
n_significant = len(independed_eigen)

for i in range(0, n_significant, 2):
    if i + 1 < n_significant:
        partner = independed_eigen[i + 1]
    else:
        # Odd count: the last eigenvalue has no successor. Pair it with the
        # preceding one instead of itself, since x-vs-x is only a diagonal line.
        # With a single significant eigenvalue there is no other choice than itself.
        partner = independed_eigen[max(i - 1, 0)]
    create_scatter_plot(results_dir, independed_eigen[i], partner)