# Add correlation matrices to DataFrame

In [1]:
import re
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA

Load *df_parameter* DataFrame:

In [None]:
# Read the pickled parameter table from the working directory, preview its
# first rows, and let the cell's last expression show its (rows, columns) shape.
df_par = pd.read_pickle('df_parameter.p')
display(df_par.head())
df_par.shape

In [None]:
# Show the distinct values of both columns. A notebook cell only renders its
# last expression, so the two results are collected in one dict; written as two
# bare expressions, the 'gBB' values were computed and silently discarded.
{col: df_par.loc[:, col].unique() for col in ('gBB', 'gBC')}

In [None]:
n_grid = 200  # size of grid

# Column labels "x<row>_x<col>" (1-based) for the upper triangle of an
# n_grid x n_grid matrix. They are derived from np.triu_indices itself so that
# label k names exactly the element that `matrix[np.triu_indices(n_grid)]`
# places at position k (row-major: x1_x1, x1_x2, ..., x1_x200, x2_x2, ...).
# A nested loop over "column, then rows up to the diagonal" would instead
# yield x1_x1, x1_x2, x2_x2, x1_x3, ... and mislabel the flattened values.
triu_rows, triu_cols = np.triu_indices(n_grid)
cols_x1x2 = [
    f"x{r + 1}_x{c + 1}"
    for r, c in zip(triu_rows.tolist(), triu_cols.tolist())
]

print(cols_x1x2[:10])

Load npz-files and add pixels as features

1. Create new dataframe with flattened npz-matrices as rows and x1-x2 values as columns
2. Concat parameter df and correlation-matrix df

## First attempt: Reading all at once (not working)

Load npz-files and add pixels as features

1. Create new dataframe with flattened npz-matrices as rows and x1-x2 values as columns
2. Concat parameter df and correlation-matrix df

## Second attempt: Store as pickles and concat

1. Load npz files and change to float32.
2. Correlation functions are symmetric; therefore, store only the upper triangle (including the diagonal).
3. Flatten correlation matrices, they denote the rows of the DataFrames.
4. Store DataFrames in chunks of 1000 as pickles.
5. Restart notebook to drop loaded memory.
6. Reload DataFrames and concat them to a big DataFrame.
7. Store it as a pickle.

1\. First check whether storing the matrices as float16 is still acceptable or whether we should go with float32.

In [None]:
# Select one row of df_par by its five parameter values and load that run's
# 'corrBC' array to compare how much precision is lost at lower float widths.
gBB, gCC, gAB, gAC, gBC = 1.0, 1.0, -1.0, 0.2, 0.2
mask = (
    (df_par.loc[:, 'gBB'] == gBB) &
    (df_par.loc[:, 'gCC'] == gCC) &
    (df_par.loc[:, 'gAB'] == gAB) &
    (df_par.loc[:, 'gAC'] == gAC) &
    (df_par.loc[:, 'gBC'] == gBC)
)
paths_example = df_par.loc[mask, 'path']
if paths_example.empty:
    # Fail with a readable message instead of an IndexError from .iloc[0].
    raise ValueError('No row of df_par matches the requested parameter set.')

# np.load returns an NpzFile that holds the archive open; the context manager
# closes it as soon as the array has been read into memory.
with np.load(paths_example.iloc[0] + '/correlation_fct_BC.npz') as npz_example:
    corrBC_example = npz_example['corrBC']

# Grid coordinates; not used by the comparison below, kept because they are
# notebook globals that other cells may rely on.
xgrid = np.linspace(-2.5, 2.5, 200)
x, y = np.meshgrid(xgrid, xgrid)

# Same matrix at three precisions.
corrBC_float64 = np.array(corrBC_example, dtype=np.float64)
corrBC_float32 = np.array(corrBC_example, dtype=np.float32)
corrBC_float16 = np.array(corrBC_example, dtype=np.float16)

# Largest element-wise deviation between each pair of precisions.
print(np.abs(corrBC_float64 - corrBC_float32).max())
print(np.abs(corrBC_float64 - corrBC_float16).max())
print(np.abs(corrBC_float32 - corrBC_float16).max())


Float32 and float16 both seem to be fine. However, continue with float32.

Steps: 2. - 4.
- flatten upper triangular and store DataFrames in chunks

In [None]:
# Execute only once, to preserve memory: flip to True to (re)write the batches.
RUN_BATCH_EXPORT = False

if RUN_BATCH_EXPORT:
    batch_size = 1000
    batch_dir = Path('data')
    # to_pickle does not create missing directories.
    batch_dir.mkdir(parents=True, exist_ok=True)

    # Loop-invariant: positions of the upper triangle (diagonal included).
    triu_idx = np.triu_indices(n_grid)

    def _write_batch(rows, batch_no):
        """Pickle the flattened matrices in `rows` as batch number `batch_no`."""
        df_batch = pd.DataFrame(rows, columns=cols_x1x2)
        print(batch_no, df_batch.shape)
        df_batch.to_pickle(batch_dir / f'df_corr_batch_{batch_no:02d}.p')
        return df_batch

    corr_list = []
    count = 0
    for i, path_npz in enumerate(df_par.loc[:, 'path']):
        # Close each archive right after reading instead of leaving it to the GC.
        with np.load(path_npz + '/correlation_fct_BC.npz') as npz:
            corrBC = npz['corrBC']

        # flatten the upper triangular and reduce precision
        corr_list.append(corrBC[triu_idx].astype(np.float32))

        if (i + 1) % batch_size == 0:
            count += 1
            df_corr = _write_batch(corr_list, count)
            corr_list = []

    # Write the remainder only if there is one; otherwise a row count that is a
    # multiple of batch_size would produce an extra, empty batch file.
    if corr_list:
        count += 1
        df_corr = _write_batch(corr_list, count)

Steps: 5. - 7.

- restart notebook, reload, concat and save as pickle

In [None]:
# Discover the batch pickles instead of hard-coding how many there are, and
# order them by their numeric suffix so rows keep the order they were written in.
batch_files = sorted(
    Path('data').glob('df_corr_batch_*.p'),
    key=lambda p: int(p.stem.rsplit('_', 1)[1]),
)
if not batch_files:
    raise FileNotFoundError("No 'data/df_corr_batch_*.p' files found.")

df_corr_list = [pd.read_pickle(batch_file) for batch_file in batch_files]

# Every batch was built with a default index starting at 0; ignore_index gives
# the combined frame one unique running index instead of repeated labels.
df_corr = pd.concat(df_corr_list, axis=0, ignore_index=True)

In [None]:
# Sanity check: (rows, columns) of the concatenated correlation frame.
df_corr.shape

In [None]:
# Persist the combined frame (plain string: the path has no placeholders,
# so no f-string is needed).
df_corr.to_pickle('data/df_corr.p')

## Reduce correlation DataFrame further with PCA

1. Restart notebook and skip the above part to only load the df_corr.p into memory (otherwise it can happen that the kernel dies in the following, at least on my local laptop).
2. Standardize data and apply PCA.
3. Check that the applied PCA does not destroy relevant information.

Start with step 1:

In [2]:
# Load only the combined correlation frame written by the cell above.
df_corr = pd.read_pickle('data/df_corr.p')

Continue with step 2: Use Principal Component Analysis (PCA) to reduce the number of columns further.

In [3]:
# Two-stage reduction: scale the columns first, then keep 100 principal
# components (seeded so the fit is repeatable).
scaler_step = ('std', StandardScaler())
pca_step = ('pca', PCA(n_components=100, random_state=42))
pipe_pca = Pipeline(steps=[scaler_step, pca_step])

# Fit both stages on the correlation frame and project it in one call.
arr_corr = pipe_pca.fit_transform(df_corr)

Check results (step 3)

In [6]:
# Per-component share of the variance, taken from the fitted PCA stage.
fitted_pca = pipe_pca.named_steps['pca']
expl_var = fitted_pca.explained_variance_ratio_

# Total share retained by all kept components together.
print('Explained variance:', sum(expl_var))

Explained variance: 0.9999999945681466


Transform back and invert the StandardScaler:

In [9]:
fitted_pca = pipe_pca.named_steps['pca']
fitted_scaler = pipe_pca.named_steps['std']

# Undo the PCA projection with the estimator's own inverse. A bare
# `np.dot(arr_corr, components_)` leaves out the `mean_` offset that PCA
# subtracts before projecting.
corr_back_scaled = fitted_pca.inverse_transform(arr_corr)

# Undo the standardization to get back to the original value scale.
corr_backscaled = fitted_scaler.inverse_transform(corr_back_scaled)

Reconstruct correlation matrix:

In [None]:
def reconstruct_corr_mat(corr_backscaled, index):
    """
    Rebuild the full symmetric correlation matrix of one sample.

    Parameters
    ----------
    corr_backscaled : array-like of shape (n_samples, n * (n + 1) / 2)
        Each row holds the upper triangle (diagonal included) of an n x n
        matrix, flattened in ``np.triu_indices(n)`` order.
    index : int
        Positional row of the sample to rebuild.

    Returns
    -------
    numpy.ndarray of shape (n, n)
        Symmetric matrix whose upper triangle equals the selected row.

    Raises
    ------
    ValueError
        If the selected row is not one-dimensional or its length is not a
        triangular number n * (n + 1) / 2.
    """
    flat = np.asarray(corr_backscaled)[index]
    if flat.ndim != 1:
        raise ValueError('index must select a single flattened matrix (1-D row).')

    # Invert size = n * (n + 1) / 2 for n, then verify the result exactly.
    n_side = int(round((np.sqrt(8 * flat.size + 1) - 1) / 2))
    if n_side * (n_side + 1) // 2 != flat.size:
        raise ValueError(
            f'Row length {flat.size} is not the size of a flattened upper triangle.'
        )

    rows, cols = np.triu_indices(n_side)
    corr_mat = np.empty((n_side, n_side), dtype=flat.dtype)
    corr_mat[rows, cols] = flat  # upper triangle incl. diagonal
    corr_mat[cols, rows] = flat  # mirror into the lower triangle
    return corr_mat

### Merge df_corr and df_par

In [None]:
# NOTE(review): `df_main` is not assigned in any visible cell of this notebook,
# so this raises NameError on a fresh kernel — the cell that merges df_corr and
# df_par appears to be missing; TODO restore it before this check.
df_main.shape