### Profile Comparison
Summary: This notebook is used to explore methods for comparing the outputs of PMF5 to NMF-PY and development of metrics for evaluating the output of NMF-PY.

In [1]:
import os
import sys
import copy
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

import json
import numpy as np
import pandas as pd

In [2]:
pmf_output = os.path.join("D:\\", "projects", "nmf_py", "data", "baton-rouge_4f_profiles.txt")
# nmf_output = os.path.join("D:\\", "projects", "nmf_py", "data", "22-11-2022_092612.json")
nmf_output = os.path.join("..", "tests", "test-save-04.json")
# nmf_output = os.path.join("sklearn-nmf.json")

In [3]:
pmf_profiles_df = None
pmf_profile_p_df = None
pmf_profile_t_df = None

pmf_profiles = []
pmf_profile_p = []
pmf_profile_t = []

factors = 4
profiles = factors + 2
species_n = 41

column_labels = None

with open(pmf_output, 'r') as open_file:
    profile_strings = open_file.read()
    t = profile_strings.split('\n')
    j = 0
    for line in t:
        i = line.split('\t')
        if len(i) == profiles:
            if i[0] == '' and i[1] == '':
                i[0] = "run"
                i[1] = "species"
                column_labels = i
                continue
            if j < species_n:
                pmf_profiles.append(i)
            elif j < 2 * (species_n):
                pmf_profile_p.append(i)
            elif j < 3 * (species_n):
                pmf_profile_t.append(i)
            j += 1
    pmf_profiles_df = pd.DataFrame(pmf_profiles, columns=column_labels)
    pmf_profile_p_df = pd.DataFrame(pmf_profile_p, columns=column_labels)
    pmf_profile_t_df = pd.DataFrame(pmf_profile_t, columns=column_labels)
    pmf_profiles_df.drop('run', axis=1, inplace=True)
    pmf_profile_p_df.drop('run', axis=1, inplace=True)
    pmf_profile_t_df.drop('run', axis=1, inplace=True)
            
df_columns = list(pmf_profiles_df.columns)

factor_columns = df_columns[1:]
factor_types = {}
for f in factor_columns:
    factor_types[f] = 'float'
pmf_profiles_df = pmf_profiles_df.astype(factor_types)
pmf_profile_p_df = pmf_profile_p_df.astype(factor_types)
pmf_profile_t_df = pmf_profile_t_df.astype(factor_types)

In [4]:
# PMF5 G -> NMF W (Factor contributions [factors x samples]), PMF5 F -> NMF H (Factor Profile [species x factor])

epoch_dfs = {}

with open(nmf_output, 'br') as json_file:
    json_data = json.load(json_file)
    species_columns = np.array(pmf_profiles_df["species"])
    for i in range(len(json_data)):
        nmf_h_data = np.array(json_data[i]["H"])
        nmf_w_data = np.array(json_data[i]["W"])
        nmf_wh_data = np.array(json_data[i]["wh"])
        nmf_wh_data = nmf_wh_data.reshape(nmf_wh_data.shape[1], nmf_wh_data.shape[0])
        
        nmf_h_df = pd.DataFrame(nmf_h_data, columns=species_columns, index=factor_columns)
        nmf_w_df = pd.DataFrame(nmf_w_data, columns=factor_columns)
        nmf_wh_df = pd.DataFrame(nmf_wh_data.T, columns=species_columns)
        
        epoch_dfs[i] = {"WH": nmf_wh_df, "W": nmf_w_df, "H": nmf_h_df}

In [5]:
# Calculate the best matching of factors between PMF5 and NMF-PY by maximizing R^2
from itertools import permutations

epoch_i = 0  # test on single epoch

factor_permutations = list(permutations(factor_columns, len(factor_columns)))
best_permutation = None
best_value = float("inf")

all_r2 = {}

best_permutation_r = None
best_r = float("-inf")
best_model = None
best_factor_r = None

for e in range(len(epoch_dfs)):
    nmf_h = epoch_dfs[e]["H"]
    all_r = []
    for perm in factor_permutations:
        values = []
        for i in range(len(factor_columns)):
            pmf_i = pmf_profiles_df[factor_columns[i]].astype(float)
            nmf_i = nmf_h.loc[perm[i]].astype(float)
            corr_matrix = np.corrcoef(nmf_i, pmf_i)
            corr = corr_matrix[0,1]
            r_sq = corr**2
            values.append(r_sq)
        r_avg = np.mean(values)
        r_max = np.max(values)
        r_min = np.min(values)
        if r_avg > best_r:
            best_r = r_avg
            best_permutation_r = perm
            best_model = e
            best_factor_r = values
        all_r.append((perm, r_avg))
    all_r2[e] = all_r
print(f"R2 - Model: {best_model}, Best permutations: {list(best_permutation_r)}, Average: {best_r}, Factors: {best_factor_r}")

R2 - Model: 9, Best permutations: ['Factor 2', 'Factor 3', 'Factor 1', 'Factor 4'], Average: 0.9725850224237742, Factors: [0.927419178508057, 0.9926387149042535, 0.9721656577625054, 0.998116538520281]


In [6]:
# create factor species percentages
f_sums = nmf_df.iloc[:,1:factors+1].sum(axis=1)
norm_nmf_df = nmf_df.iloc[:,1:factors+1].divide(f_sums, axis=0)
norm_nmf_df.insert(0, 'species', nmf_df['species'])
norm_nmf_df

NameError: name 'nmf_df' is not defined

In [7]:
# Calculating factor metrics per species per sample
# epoch 0
epoch_0_h = epoch_dfs[0].iloc[:,1:factors+1].to_numpy()
epoch_0_w = epoch_w[0]
epoch_0_h = epoch_0_h.reshape(epoch_0_h.shape[1], epoch_0_h.shape[0])

# sample 0
sample_factor_composition = np.tensordot(epoch_0_h[0], epoch_0_w[3], axes=0) #GOOD: the breakdown of the factor contributions to a species by sample
sample_factor_sum = np.sum(sample_factor_composition, axis=1).astype(np.float32) #GOOD: The sum of all factors for each species by sample
sample_factor_sum_T = sample_factor_sum.reshape(sample_factor_sum.shape[0], 1)
sample_factor_percentage = np.divide(
        sample_factor_composition, 
        sample_factor_sum_T, 
        out=np.zeros_like(sample_factor_composition), 
        where=sample_factor_sum_T!=0
    ) #GOOD: The percentage of each factor for each species by sample


AttributeError: 'dict' object has no attribute 'iloc'

In [None]:
# factor contributions for all samples
factor_i = 0

factor_wh = {}
total_wh = None

for i_f in range(factors):
    f_h = epoch_0_h[i_f]
    f_0_wt = epoch_0_w.reshape(epoch_0_w.shape[1], epoch_0_w.shape[0])[i_f]
    f_wh = np.tensordot(f_h, f_0_wt, axes=0)
    factor_wh[i_f] = f_wh
    if total_wh is None:
        total_wh = f_wh
    else:
        total_wh = total_wh + f_wh
        
percent_factors = {}
for i, wh in factor_wh.items():
    i_factor_p = np.divide(
        wh, 
        total_wh, 
        out=np.zeros_like(wh), 
        where=total_wh!=0
    )
    percent_factors[i] = i_factor_p
percent_factors[0].shape

In [None]:
factor_0 = percent_factors[0]
test_avg = np.mean(factor_0, axis=1)
test_avg

In [None]:
norm_pmf_df = pmf_profile_p_df.iloc[:,1:factors+1].divide(pd.Series(np.full(pmf_profile_p_df.shape[0], 100.0)).astype(float), axis=0)
norm_pmf_df.insert(0, 'species', nmf_df['species'])

compare_df = pd.concat([norm_nmf_df, norm_pmf_df], keys=['nmf', 'pmf5'])
compare_df

In [None]:
# Calculate the sum of differences between the pmf factors and nmf factors
import copy

factor_diff = {}
factor_array = []
for i in range(1, factors+1):
    compare_f_df = copy.copy(norm_nmf_df).astype(factor_types)
    factor_s = compare_df.loc['pmf5'].iloc[:, i].astype(float)
    compare_dict = np.abs(compare_f_df.iloc[:,1:factors+1].sub(factor_s, axis="index").sum(axis=0)).to_dict()
    factor_array.append(list(compare_dict.values()))
    factor_diff[f"PMF-Factor {i}"] = compare_dict
factor_diff_df = pd.DataFrame(factor_diff)
factor_diff_df

In [None]:
# Determine the best matching of factors between PMF5 and NMF-PY by minimizing difference
from itertools import permutations

factor_permutations = list(permutations(factor_columns, len(factor_columns)))

pmf_columns = list(factor_diff_df.columns)

best_permutation = None
best_value = float("inf")

all_values = []

for i in range(len(factor_permutations)):
    p_index = list(zip(factor_permutations[i], pmf_columns))
    value = 0
    for j in range(factors):
        value += factor_diff_df.loc[p_index[j]]
    if value < best_value:
        best_value = value
        best_permutation = p_index
    all_values.append((p_index, value))
print(f"Best permutations: {best_permutation}, value: {best_value}")

In [None]:
# Calculate the best matching of factors between PMF5 and NMF-PY by maximizing R^2
best_permutation_r = None
best_r = float("-inf")
all_r = []

for perm in factor_permutations:
    values = []
    for i in range(len(factor_columns)):
        pmf_i = compare_df.loc["pmf5"][factor_columns[i]].astype(float)
        nmf_i = compare_df.loc["nmf"][perm[i]].astype(float)
        corr_matrix = np.corrcoef(nmf_i, pmf_i)
        corr = corr_matrix[0,1]
        r_sq = corr**2
        values.append(r_sq)
    r_avg = np.mean(values)
    if r_avg > best_r:
        best_r = r_avg
        best_permutation_r = perm
    all_r.append((perm, r_avg))
print(f"R2 - Best permutations: {list(best_permutation_r)}, average: {best_r}")

In [None]:
len(epoch_dfs)

In [None]:
pmf_columns = list(factor_diff_df.columns)

for n, _nmf_df in epoch_dfs.items():
    f_sums = _nmf_df.iloc[:,1:factors+1].sum(axis=1)
    norm_nmf_df = _nmf_df.iloc[:,1:factors+1].divide(f_sums, axis=0)
    norm_nmf_df.insert(0, 'species', _nmf_df['species'])
    norm_nmf_df
    
    factor_diff = {}
    factor_array = []
    for i in range(1, factors+1):
        compare_f_df = copy.copy(norm_nmf_df).astype(factor_types)
        factor_s = compare_df.loc['pmf5'].iloc[:, i].astype(float)
        compare_dict = np.abs(compare_f_df.iloc[:,1:factors+1].sub(factor_s, axis="index").sum(axis=0)).to_dict()
        factor_array.append(list(compare_dict.values()))
        factor_diff[f"PMF-Factor {i}"] = compare_dict
    factor_diff_df = pd.DataFrame(factor_diff)
    
    factor_permutations = list(permutations(factor_columns, len(factor_columns)))

    best_permutation = None
    best_value = float("inf")

    all_values = []

    for i in range(len(factor_permutations)):
        p_index = list(zip(factor_permutations[i], pmf_columns))
        value = 0
        for j in range(factors):
            value += factor_diff_df.loc[p_index[j]]
        if value < best_value:
            best_value = value
            best_permutation = p_index
        all_values.append((p_index, value))
    print(f"Epoch: {n+1}, Best permutations: {best_permutation}, value: {best_value}")
    
    best_permutation_r = None
    best_r = float("-inf")
    all_r = []

    for perm in factor_permutations:
        values = []
        for i in range(len(factor_columns)):
            pmf_i = compare_df.loc["pmf5"][factor_columns[i]].astype(float)
            nmf_i = norm_nmf_df[perm[i]].astype(float)
            corr_matrix = np.corrcoef(nmf_i, pmf_i)
            corr = corr_matrix[0,1]
            r_sq = corr**2
            values.append(r_sq)
        r_avg = np.mean(values)
        if r_avg > best_r:
            best_r = r_avg
            best_permutation_r = perm
        all_r.append((perm, r_avg))
    print(f"Epoch: {n+1}, Best R2 permutations: {list(best_permutation_r)}, average: {best_r}")
    
    
    

In [16]:
pmf_factor_contributions = os.path.join("D:\\", "projects", "nmf_py", "data", "baton-rouge_4f_contributions.txt")

pmf_contributions_df = None
pmf_contribution_data = []
pmf_contribution_columns = None

factors = 4
column_row = 4
data_start_row = 5

column_labels = None
dates = []

with open(pmf_factor_contributions, 'r') as open_file:
    contribution_strings = open_file.read()
    rows = contribution_strings.split('\n')
    for i, row in enumerate(rows):
        if i == column_row:
            pmf_contribution_columns = row.split('\t')[2:]
        elif i > data_start_row:
            row_cells = row.split('\t')
            if len(row_cells) > 1:
                dates.append(row_cells[1])
                pmf_contribution_data.append(row_cells[2:])
          
pmf_contributions_df = pd.DataFrame(pmf_contribution_data, columns=pmf_contribution_columns)
pmf_contributions_df["Datetime"] = dates

factor_types = {}
for f in pmf_contribution_columns:
    factor_types[f] = 'float'
pmf_contributions_df = pmf_contributions_df.astype(factor_types)
pmf_contributions_df

Unnamed: 0,Factor 1,Factor 2,Factor 3,Factor 4,Datetime
0,1.54410,0.449500,2.406800,1.59610,06/02/05 03:00
1,1.00530,0.990740,1.150000,1.98420,06/02/05 06:00
2,2.84740,0.516180,3.243700,1.23430,06/03/05 03:00
3,0.20761,2.094500,1.468700,0.17409,06/04/05 03:00
4,0.75381,0.903250,0.461180,0.23299,06/04/05 06:00
...,...,...,...,...,...
301,0.25549,0.633920,0.091281,3.89460,09/24/06 06:00
302,0.51440,0.402750,-0.027151,1.69450,09/25/06 06:00
303,0.30866,0.629770,-0.130690,0.10427,09/26/06 03:00
304,2.28460,0.094477,-0.055430,0.15904,09/26/06 06:00
