### Profile Comparison
Summary: This notebook is used to explore methods for comparing the outputs of PMF5 to NMF-PY and development of metrics for evaluating the output of NMF-PY.

In [1]:
import os
import sys
import copy
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

import json
import numpy as np
import pandas as pd

In [106]:
pmf_output = os.path.join("D:\\", "projects", "nmf_py", "data", "baton-rouge_4f_profiles.txt")
# nmf_output = os.path.join("D:\\", "projects", "nmf_py", "data", "22-11-2022_092612.json")
# nmf_output = os.path.join("..", "tests", "test-save-04.json")
nmf_output = os.path.join("sklearn-nmf.json")

In [107]:
pmf_profiles_df = None
pmf_profile_p_df = None
pmf_profile_t_df = None

pmf_profiles = []
pmf_profile_p = []
pmf_profile_t = []

factors = 4
profiles = factors + 2
species_n = 41

column_labels = None

with open(pmf_output, 'r') as open_file:
    profile_strings = open_file.read()
    t = profile_strings.split('\n')
    j = 0
    for line in t:
        i = line.split('\t')
        if len(i) == profiles:
            if i[0] == '' and i[1] == '':
                i[0] = "run"
                i[1] = "species"
                column_labels = i
                continue
            if j < species_n:
                pmf_profiles.append(i)
            elif j < 2 * (species_n):
                pmf_profile_p.append(i)
            elif j < 3 * (species_n):
                pmf_profile_t.append(i)
            j += 1
    pmf_profiles_df = pd.DataFrame(pmf_profiles, columns=column_labels)
    pmf_profile_p_df = pd.DataFrame(pmf_profile_p, columns=column_labels)
    pmf_profile_t_df = pd.DataFrame(pmf_profile_t, columns=column_labels)
    pmf_profiles_df.drop('run', axis=1, inplace=True)
    pmf_profile_p_df.drop('run', axis=1, inplace=True)
    pmf_profile_t_df.drop('run', axis=1, inplace=True)
            
df_columns = list(pmf_profiles_df.columns)

factor_columns = df_columns[1:]
factor_types = {}
for f in factor_columns:
    factor_types[f] = 'float'
pmf_profiles_df = pmf_profiles_df.astype(factor_types)
pmf_profile_p_df = pmf_profile_p_df.astype(factor_types)
pmf_profile_t_df = pmf_profile_t_df.astype(factor_types)

In [108]:
# PMF5 G -> NMF W (Factor contributions [factors x samples]), PMF5 F -> NMF H (Factor Profile [species x factor])

epoch_dfs = {}

with open(nmf_output, 'br') as json_file:
    json_data = json.load(json_file)
    species_columns = np.array(pmf_profiles_df["species"])
    for i in range(len(json_data)):
        i = str(i)
        nmf_h_data = np.array(json_data[i]["H"])
        nmf_w_data = np.array(json_data[i]["W"])
        nmf_wh_data = np.array(json_data[i]["wh"])
        nmf_wh_data = nmf_wh_data.reshape(nmf_wh_data.shape[1], nmf_wh_data.shape[0])
        
        nmf_h_df = pd.DataFrame(nmf_h_data, columns=species_columns, index=factor_columns)
        nmf_w_df = pd.DataFrame(nmf_w_data, columns=factor_columns)
        nmf_wh_df = pd.DataFrame(nmf_wh_data.T, columns=species_columns)
        
        epoch_dfs[i] = {"WH": nmf_wh_df, "W": nmf_w_df, "H": nmf_h_df}

In [109]:
# Calculate the best matching of factors between PMF5 and NMF-PY by maximizing R^2
from itertools import permutations

epoch_i = 0  # test on single epoch

factor_permutations = list(permutations(factor_columns, len(factor_columns)))
best_permutation = None
best_value = float("inf")

all_r2 = {}

best_permutation_r = None
best_r = float("-inf")
best_model = None
best_factor_r = None

for e in range(len(epoch_dfs)):
    e = str(e)
    nmf_h = epoch_dfs[e]["H"]
    all_r = []
    for perm in factor_permutations:
        values = []
        for i in range(len(factor_columns)):
            pmf_i = pmf_profiles_df[factor_columns[i]].astype(float)
            nmf_i = nmf_h.loc[perm[i]].astype(float)
            corr_matrix = np.corrcoef(nmf_i, pmf_i)
            corr = corr_matrix[0,1]
            r_sq = corr**2
            values.append(r_sq)
        r_avg = np.mean(values)
        r_max = np.max(values)
        r_min = np.min(values)
        if r_avg > best_r:
            best_r = r_avg
            best_permutation_r = perm
            best_model = e
            best_factor_r = values
        all_r.append((perm, r_avg))
    all_r2[e] = all_r
print(f"R2 - Model: {best_model}, Best permutations: {list(best_permutation_r)}, Average: {best_r}, Factors: {best_factor_r}")

R2 - Model: 6, Best permutations: ['Factor 1', 'Factor 4', 'Factor 2', 'Factor 3'], Average: 0.904150054664094, Factors: [0.839906774150836, 0.8900276900235328, 0.9577537630281903, 0.928911991453817]


In [172]:
# create factor species percentages
f_sums = nmf_df.iloc[:,1:factors+1].sum(axis=1)
norm_nmf_df = nmf_df.iloc[:,1:factors+1].divide(f_sums, axis=0)
norm_nmf_df.insert(0, 'species', nmf_df['species'])
norm_nmf_df

Unnamed: 0,species,Factor 1,Factor 2,Factor 3,Factor 4
0,124-Trimethylbenzene,0.011743,0.926631,0.0,0.061627
1,224-Trimethylpentane,0.133249,0.067491,0.636376,0.162884
2,234-Trimethylpentane,0.307241,0.279501,0.225315,0.187942
3,23-Dimethylbutane,0.365053,0.161562,0.0,0.473385
4,23-Dimethylpentane,0.367453,0.266675,0.365872,0.0
5,2-Methylheptane,0.565114,0.092957,0.0,0.341929
6,3-Methylhexane,0.267219,0.0,0.461249,0.271532
7,3-Methylpentane,0.107113,0.024276,0.704801,0.16381
8,Acetylene,0.069798,0.071819,0.677282,0.181102
9,Benzene,0.091228,0.059573,0.73302,0.116179


In [157]:
# Calculating factor metrics per species per sample
# epoch 0
epoch_0_h = epoch_dfs[0].iloc[:,1:factors+1].to_numpy()
epoch_0_w = epoch_w[0]
epoch_0_h = epoch_0_h.reshape(epoch_0_h.shape[1], epoch_0_h.shape[0])

# sample 0
sample_factor_composition = np.tensordot(epoch_0_h[0], epoch_0_w[3], axes=0) #GOOD: the breakdown of the factor contributions to a species by sample
sample_factor_sum = np.sum(sample_factor_composition, axis=1).astype(np.float32) #GOOD: The sum of all factors for each species by sample
sample_factor_sum_T = sample_factor_sum.reshape(sample_factor_sum.shape[0], 1)
sample_factor_percentage = np.divide(
        sample_factor_composition, 
        sample_factor_sum_T, 
        out=np.zeros_like(sample_factor_composition), 
        where=sample_factor_sum_T!=0
    ) #GOOD: The percentage of each factor for each species by sample


(41, 170)

In [163]:
# factor contributions for all samples
factor_i = 0

factor_wh = {}
total_wh = None

for i_f in range(factors):
    f_h = epoch_0_h[i_f]
    f_0_wt = epoch_0_w.reshape(epoch_0_w.shape[1], epoch_0_w.shape[0])[i_f]
    f_wh = np.tensordot(f_h, f_0_wt, axes=0)
    factor_wh[i_f] = f_wh
    if total_wh is None:
        total_wh = f_wh
    else:
        total_wh = total_wh + f_wh
        
percent_factors = {}
for i, wh in factor_wh.items():
    i_factor_p = np.divide(
        wh, 
        total_wh, 
        out=np.zeros_like(wh), 
        where=total_wh!=0
    )
    percent_factors[i] = i_factor_p
percent_factors[0].shape

In [171]:
factor_0 = percent_factors[0]
test_avg = np.mean(factor_0, axis=1)
test_avg

array([0.32155097949484657, 0.5828886499212484, 0.0, 0.15962164314664415,
       0.5448773208138786, 0.3009502555450457, 0.4026227085084492,
       0.20962912637471065, 0.05999366195567334, 0.036516346651081444,
       0.11310476491901432, 0.08415399187509577, 0.22755679272279702,
       0.19707923165150598, 0.0, 0.1908656393127322, 0.07401712465030555,
       0.08202429670335684, 0.11937328644593055, 0.0, 0.07096923085884044,
       0.019259792854357614, 0.0, 0.05812678323880852,
       0.21371921430754948, 0.0, 0.2778569625008133, 0.379319113072947,
       0.44429010166039873, 0.28669106321511667, 0.3935082075410407,
       0.49400519395598474, 0.1268898579759785, 0.27267111526690857,
       0.7039957319094506, 0.142867879882307, 0.4611798173114283,
       0.09961590742517504, 0.34094621824841087, 0.17211568014389775,
       0.5462689740682877], dtype=object)

In [6]:
norm_pmf_df = pmf_profile_p_df.iloc[:,1:factors+1].divide(pd.Series(np.full(pmf_profile_p_df.shape[0], 100.0)).astype(float), axis=0)
norm_pmf_df.insert(0, 'species', nmf_df['species'])

compare_df = pd.concat([norm_nmf_df, norm_pmf_df], keys=['nmf', 'pmf5'])
compare_df

Unnamed: 0,Unnamed: 1,species,Factor 1,Factor 2,Factor 3,Factor 4
nmf,0,124-Trimethylbenzene,0.011743,0.926631,0.0,0.061627
nmf,1,224-Trimethylpentane,0.133249,0.067491,0.636376,0.162884
nmf,2,234-Trimethylpentane,0.307241,0.279501,0.225315,0.187942
nmf,3,23-Dimethylbutane,0.365053,0.161562,0.0,0.473385
nmf,4,23-Dimethylpentane,0.367453,0.266675,0.365872,0.0
...,...,...,...,...,...,...
pmf5,36,Toluene,0.356919,0.251424,0.314662,0.076994
pmf5,37,Trans-2-Butene,0.0,0.0,0.0,1.0
pmf5,38,Trans-2-Pentene,0.218362,0.058236,0.013302,0.7101
pmf5,39,Unidentified,0.797395,0.109133,0.093472,0.0


In [7]:
# Calculate the sum of differences between the pmf factors and nmf factors
import copy

factor_diff = {}
factor_array = []
for i in range(1, factors+1):
    compare_f_df = copy.copy(norm_nmf_df).astype(factor_types)
    factor_s = compare_df.loc['pmf5'].iloc[:, i].astype(float)
    compare_dict = np.abs(compare_f_df.iloc[:,1:factors+1].sub(factor_s, axis="index").sum(axis=0)).to_dict()
    factor_array.append(list(compare_dict.values()))
    factor_diff[f"PMF-Factor {i}"] = compare_dict
factor_diff_df = pd.DataFrame(factor_diff)
factor_diff_df

Unnamed: 0,PMF-Factor 1,PMF-Factor 2,PMF-Factor 3,PMF-Factor 4
Factor 1,1.99566,0.519003,4.719889,1.484502
Factor 2,4.362891,2.886233,7.087119,3.851732
Factor 3,7.05147,8.528128,4.327242,7.562628
Factor 4,0.043493,1.520151,2.680735,0.554651


In [8]:
# Determine the best matching of factors between PMF5 and NMF-PY by minimizing difference
from itertools import permutations

factor_permutations = list(permutations(factor_columns, len(factor_columns)))

pmf_columns = list(factor_diff_df.columns)

best_permutation = None
best_value = float("inf")

all_values = []

for i in range(len(factor_permutations)):
    p_index = list(zip(factor_permutations[i], pmf_columns))
    value = 0
    for j in range(factors):
        value += factor_diff_df.loc[p_index[j]]
    if value < best_value:
        best_value = value
        best_permutation = p_index
    all_values.append((p_index, value))
print(f"Best permutations: {best_permutation}, value: {best_value}")

Best permutations: [('Factor 4', 'PMF-Factor 1'), ('Factor 2', 'PMF-Factor 2'), ('Factor 3', 'PMF-Factor 3'), ('Factor 1', 'PMF-Factor 4')], value: 8.741468849672584


In [9]:
# Calculate the best matching of factors between PMF5 and NMF-PY by maximizing R^2
best_permutation_r = None
best_r = float("-inf")
all_r = []

for perm in factor_permutations:
    values = []
    for i in range(len(factor_columns)):
        pmf_i = compare_df.loc["pmf5"][factor_columns[i]].astype(float)
        nmf_i = compare_df.loc["nmf"][perm[i]].astype(float)
        corr_matrix = np.corrcoef(nmf_i, pmf_i)
        corr = corr_matrix[0,1]
        r_sq = corr**2
        values.append(r_sq)
    r_avg = np.mean(values)
    if r_avg > best_r:
        best_r = r_avg
        best_permutation_r = perm
    all_r.append((perm, r_avg))
print(f"R2 - Best permutations: {list(best_permutation_r)}, average: {best_r}")

R2 - Best permutations: ['Factor 1', 'Factor 3', 'Factor 2', 'Factor 4'], average: 0.2426222379308552


In [10]:
len(epoch_dfs)

50

In [11]:
pmf_columns = list(factor_diff_df.columns)

for n, _nmf_df in epoch_dfs.items():
    f_sums = _nmf_df.iloc[:,1:factors+1].sum(axis=1)
    norm_nmf_df = _nmf_df.iloc[:,1:factors+1].divide(f_sums, axis=0)
    norm_nmf_df.insert(0, 'species', _nmf_df['species'])
    norm_nmf_df
    
    factor_diff = {}
    factor_array = []
    for i in range(1, factors+1):
        compare_f_df = copy.copy(norm_nmf_df).astype(factor_types)
        factor_s = compare_df.loc['pmf5'].iloc[:, i].astype(float)
        compare_dict = np.abs(compare_f_df.iloc[:,1:factors+1].sub(factor_s, axis="index").sum(axis=0)).to_dict()
        factor_array.append(list(compare_dict.values()))
        factor_diff[f"PMF-Factor {i}"] = compare_dict
    factor_diff_df = pd.DataFrame(factor_diff)
    
    factor_permutations = list(permutations(factor_columns, len(factor_columns)))

    best_permutation = None
    best_value = float("inf")

    all_values = []

    for i in range(len(factor_permutations)):
        p_index = list(zip(factor_permutations[i], pmf_columns))
        value = 0
        for j in range(factors):
            value += factor_diff_df.loc[p_index[j]]
        if value < best_value:
            best_value = value
            best_permutation = p_index
        all_values.append((p_index, value))
    print(f"Epoch: {n+1}, Best permutations: {best_permutation}, value: {best_value}")
    
    best_permutation_r = None
    best_r = float("-inf")
    all_r = []

    for perm in factor_permutations:
        values = []
        for i in range(len(factor_columns)):
            pmf_i = compare_df.loc["pmf5"][factor_columns[i]].astype(float)
            nmf_i = norm_nmf_df[perm[i]].astype(float)
            corr_matrix = np.corrcoef(nmf_i, pmf_i)
            corr = corr_matrix[0,1]
            r_sq = corr**2
            values.append(r_sq)
        r_avg = np.mean(values)
        if r_avg > best_r:
            best_r = r_avg
            best_permutation_r = perm
        all_r.append((perm, r_avg))
    print(f"Epoch: {n+1}, Best R2 permutations: {list(best_permutation_r)}, average: {best_r}")
    
    
    

Epoch: 1, Best permutations: [('Factor 4', 'PMF-Factor 1'), ('Factor 2', 'PMF-Factor 2'), ('Factor 3', 'PMF-Factor 3'), ('Factor 1', 'PMF-Factor 4')], value: 8.741468849672584
Epoch: 1, Best R2 permutations: ['Factor 1', 'Factor 3', 'Factor 2', 'Factor 4'], average: 0.2426222379308552
Epoch: 2, Best permutations: [('Factor 3', 'PMF-Factor 1'), ('Factor 1', 'PMF-Factor 2'), ('Factor 4', 'PMF-Factor 3'), ('Factor 2', 'PMF-Factor 4')], value: 6.102089362764278
Epoch: 2, Best R2 permutations: ['Factor 1', 'Factor 4', 'Factor 2', 'Factor 3'], average: 0.2298473321721479
Epoch: 3, Best permutations: [('Factor 2', 'PMF-Factor 1'), ('Factor 1', 'PMF-Factor 2'), ('Factor 3', 'PMF-Factor 3'), ('Factor 4', 'PMF-Factor 4')], value: 14.051123361433827
Epoch: 3, Best R2 permutations: ['Factor 4', 'Factor 2', 'Factor 3', 'Factor 1'], average: 0.2388586205040999
Epoch: 4, Best permutations: [('Factor 2', 'PMF-Factor 1'), ('Factor 3', 'PMF-Factor 2'), ('Factor 1', 'PMF-Factor 3'), ('Factor 4', 'PMF-Fac