In [1]:
import pickle
import pprint
import pandas as pd
import numpy as np

In [2]:
# Root folder that holds the PD_PCA3D_site_aside_* prediction directories read below.
output_path = '/work/forkert_lab/erik/MACAW/predictions'
# Folder that contains the all_df_2.csv table read below.
data_path = '/home/erik.ohara/SFCN_PD_scanner'

In [3]:
# Load the pickled likelihood object of every site index 0-9 from the base run folders;
# the position in the list equals the site index.
# NOTE(review): pickle.load can execute arbitrary code - only read files you produced yourself.
likelihood = []
for site in range(10):
    pkl_path = output_path + f'/PD_PCA3D_site_aside_{site}/likehood_for_each_site.pkl'
    with open(pkl_path, 'rb') as f2:
        likelihood.append(pickle.load(f2))

In [4]:
# Load the pickled likelihood object of every site index 0-9 from the "_without" run folders;
# the position in the list equals the site index.
# NOTE(review): pickle.load can execute arbitrary code - only read files you produced yourself.
likelihood_without_site = []
for site in range(10):
    pkl_path = output_path + f'/PD_PCA3D_site_aside_{site}_without/likehood_for_each_site.pkl'
    with open(pkl_path, 'rb') as f2:
        likelihood_without_site.append(pickle.load(f2))

In [5]:
# Load the pickled likelihood object of every site index 0-9 from the "_no_scalers" run folders;
# the position in the list equals the site index.
# NOTE(review): pickle.load can execute arbitrary code - only read files you produced yourself.
likelihood_no_scalers = []
for site in range(10):
    pkl_path = output_path + f'/PD_PCA3D_site_aside_{site}_no_scalers/likehood_for_each_site.pkl'
    with open(pkl_path, 'rb') as f2:
        likelihood_no_scalers.append(pickle.load(f2))

In [6]:
# Load the pickled likelihood object of every site index 0-9 from the "_all" run folders;
# the position in the list equals the site index.
# NOTE(review): pickle.load can execute arbitrary code - only read files you produced yourself.
likelihood_all = []
for site in range(10):
    pkl_path = output_path + f'/PD_PCA3D_site_aside_{site}_all/likehood_for_each_site.pkl'
    with open(pkl_path, 'rb') as f2:
        likelihood_all.append(pickle.load(f2))

In [7]:
# Load the pickled likelihood object of every site index 0-9 from the "_ind" run folders;
# the position in the list equals the site index.
# NOTE(review): pickle.load can execute arbitrary code - only read files you produced yourself.
likelihood_ind = []
for site in range(10):
    pkl_path = output_path + f'/PD_PCA3D_site_aside_{site}_ind/likehood_for_each_site.pkl'
    with open(pkl_path, 'rb') as f2:
        likelihood_ind.append(pickle.load(f2))

In [8]:
# Load the pickled likelihood object of every site index 0-9 from the "_imagelike" run folders;
# the position in the list equals the site index.
# NOTE(review): pickle.load can execute arbitrary code - only read files you produced yourself.
likelihood_imagelike = []
for site in range(10):
    pkl_path = output_path + f'/PD_PCA3D_site_aside_{site}_imagelike/likehood_for_each_site.pkl'
    with open(pkl_path, 'rb') as f2:
        likelihood_imagelike.append(pickle.load(f2))

In [9]:
# Load the pickled likelihood object of every site index 0-9 from the "_ind_obj_rever" run folders;
# the position in the list equals the site index.
# NOTE(review): pickle.load can execute arbitrary code - only read files you produced yourself.
likelihood_rever = []
for site in range(10):
    pkl_path = output_path + f'/PD_PCA3D_site_aside_{site}_ind_obj_rever/likehood_for_each_site.pkl'
    with open(pkl_path, 'rb') as f2:
        likelihood_rever.append(pickle.load(f2))

In [10]:
# Read the table once and split it by the 'Site_3' column into one frame per site index.
# (The CSV used to be re-read inside the loop, i.e. ten times for identical content.)
all_df = pd.read_csv(data_path + '/all_df_2.csv', low_memory=False)

df_site_aside = {}
for site_aside in range(10):
    # reset_index() without drop=True keeps the original row labels in an 'index' column,
    # so each per-site frame has the same columns as before this change.
    df = all_df[all_df['Site_3'] == site_aside].reset_index()
    df_site_aside[site_aside] = df

In [11]:
def predictions_with_closest_site2(likelihood_var, df_site_aside, with_site='with_site'):
    """
    Classify the samples of each site using only the likelihoods of its "closest" site.

    For every site 0-9 the closest candidate is the key of ``likelihood_var[site]``
    whose arrays (all diagnoses, all ``ecomp`` entries stored under ``with_site``)
    have the largest total sum; when sums are equal the first candidate is kept.
    The ``ecomp`` arrays of that candidate are then added per diagnosis, and a
    sample is predicted as 1 only where the '1' total is strictly greater than
    the '0' total.

    Args:
        likelihood_var: Container indexed by site 0-9; each entry maps
            candidate site -> diagnosis ('0' / '1') -> ``with_site`` key -> ecomp -> array.
        df_site_aside: Mapping from site index to a DataFrame with a 'Group_bin' column.
        with_site: Key selecting which likelihood entry to read (default 'with_site').

    Returns:
        tuple: (dict of accuracy per site, accuracy over all samples pooled together)
    """
    site_ids = range(10)

    # Step 1: per site, pick the candidate with the highest summed likelihood
    closest_site = {}
    for site in site_ids:
        best_total = float('-inf')
        best_candidate = None
        for candidate, per_diag in likelihood_var[site].items():
            candidate_total = 0.0
            for diag_entry in per_diag.values():
                for values in diag_entry[with_site].values():
                    # accumulate in float64 regardless of the stored dtype
                    candidate_total += np.sum(values, dtype=np.float64)
            # strict comparison: an equal sum never replaces an earlier candidate
            if candidate_total > best_total:
                best_total, best_candidate = candidate_total, candidate
        closest_site[site] = best_candidate

    print("Closest sites found:", closest_site)

    # Step 2: add up the ecomp arrays of the chosen candidate, per diagnosis
    likelihood_PD = {}
    for site in site_ids:
        chosen = likelihood_var[site][closest_site[site]]
        totals_by_diag = {}
        for diag, diag_entry in chosen.items():
            running = None
            for values in diag_entry[with_site].values():
                if running is None:
                    # work on a copy so the in-place additions leave the input untouched
                    running = values.copy()
                else:
                    running += values
            totals_by_diag[diag] = running
        likelihood_PD[site] = totals_by_diag

    # Step 3: threshold, compare against the labels and report accuracies
    predictions_MACAW = {}
    acc_per_site = {}
    correct_chunks = []
    total_samples = 0
    for site in site_ids:
        totals = likelihood_PD[site]
        predictions_MACAW[site] = (totals['0'] < totals['1']).astype(int)

        true_labels = df_site_aside[site]['Group_bin'].to_numpy()
        site_pred = (predictions_MACAW[site] == true_labels)
        correct_chunks.append(site_pred)

        site_acc = np.mean(site_pred)
        acc_per_site[site] = site_acc
        total_samples += len(site_pred)

        print(f"Accuracy on site {site} is {site_acc * 100:.2f}%")

    print(f"Total size is {total_samples}")
    overall_pred = np.concatenate(correct_chunks)
    overall_acc = np.mean(overall_pred)
    print(f"Overall accuracy is {overall_acc * 100:.2f}%")

    return acc_per_site, overall_acc

In [12]:
def predictions_for_all_sites2(likelihood_var, df_site_aside, with_site='with_site'):
    """
    Calculate predictions for all sites by combining likelihoods across all possible closest sites.

    For every site 0-9 and every diagnosis, the arrays of all candidate sites and
    all ``ecomp`` entries stored under ``with_site`` are added element-wise.  A
    sample is predicted as 1 only where the combined '1' likelihood is strictly
    greater than the combined '0' likelihood; ties are predicted as 0.

    Args:
        likelihood_var: Container indexed by site 0-9; each entry maps
            candidate site -> diagnosis ('0' / '1') -> ``with_site`` key -> ecomp -> array.
        df_site_aside: Mapping from site index to a DataFrame with a 'Group_bin' column.
        with_site: Key for accessing site-specific data (default='with_site')

    Returns:
        tuple: (accuracy per site dictionary, overall accuracy)
    """
    # Combine the likelihoods per site and diagnosis
    likelihood_PD = {}

    for site in range(10):
        likelihood_PD[site] = {'0': None, '1': None}  # Initialize both diagnoses

        for possible_closest in likelihood_var[site].keys():
            for diag in likelihood_var[site][possible_closest].keys():
                per_ecomp = likelihood_var[site][possible_closest][diag][with_site]
                temp_likelihood = None

                # Sum this candidate's ecomp arrays first (copy keeps the input untouched)
                for ecomp in per_ecomp.keys():
                    current_likelihood = per_ecomp[ecomp]

                    if temp_likelihood is None:
                        temp_likelihood = current_likelihood.copy()
                    else:
                        temp_likelihood += current_likelihood

                # A candidate without any ecomp entry contributes nothing
                if temp_likelihood is None:
                    continue

                # Initialize or update the total likelihood for this diagnosis
                if likelihood_PD[site].get(diag) is None:
                    likelihood_PD[site][diag] = temp_likelihood
                else:
                    likelihood_PD[site][diag] += temp_likelihood

    # Threshold the combined likelihoods and score them against the labels
    predictions_MACAW = {}
    acc_per_site = {}
    correct_chunks = []
    total_samples = 0

    for site in range(10):
        pred_0 = np.array(likelihood_PD[site]['0'], dtype=np.float64)
        pred_1 = np.array(likelihood_PD[site]['1'], dtype=np.float64)

        # The rule being vectorized is "0 if l0 >= l1 else 1", i.e. 1 only when l0 < l1.
        # The former `<=` sent ties to class 1, which contradicted that rule.
        predictions_MACAW[site] = (pred_0 < pred_1).astype(int)

        true_labels = df_site_aside[site]['Group_bin'].to_numpy()
        site_pred = (predictions_MACAW[site] == true_labels)
        correct_chunks.append(site_pred)

        site_acc = np.mean(site_pred)
        acc_per_site[site] = site_acc
        total_samples += len(site_pred)

        print(f"Accuracy on site {site} is {site_acc * 100:.2f}%")

    print(f"Total size is {total_samples}")
    overall_pred = np.concatenate(correct_chunks)
    overall_acc = np.mean(overall_pred)
    print(f"Overall accuracy is {overall_acc * 100:.2f}%")

    return acc_per_site, overall_acc

# MACAW with reverse autoregressive

In [34]:
# Closest-site evaluation of likelihood_rever, reading the 'without_site' entries;
# acc receives the per-site accuracies and over the overall accuracy.
acc,over = predictions_with_closest_site2(likelihood_rever,df_site_aside,with_site='without_site')

Closest sites found: {0: '1', 1: '6', 2: '6', 3: '6', 4: '6', 5: '6', 6: '1', 7: '6', 8: '6', 9: '6'}
Accuracy on site 0 is 43.62%
Accuracy on site 1 is 86.84%
Accuracy on site 2 is 64.60%
Accuracy on site 3 is 66.67%
Accuracy on site 4 is 61.90%
Accuracy on site 5 is 65.29%
Accuracy on site 6 is 66.13%
Accuracy on site 7 is 0.00%
Accuracy on site 8 is 45.95%
Accuracy on site 9 is 19.59%
Total size is 913
Overall accuracy is 46.11%


In [35]:
# All-sites evaluation of likelihood_rever, reading the 'without_site' entries;
# acc receives the per-site accuracies and over the overall accuracy.
acc,over = predictions_for_all_sites2(likelihood_rever,df_site_aside,with_site='without_site')

Accuracy on site 0 is 46.81%
Accuracy on site 1 is 86.84%
Accuracy on site 2 is 65.49%
Accuracy on site 3 is 66.67%
Accuracy on site 4 is 61.90%
Accuracy on site 5 is 65.29%
Accuracy on site 6 is 66.13%
Accuracy on site 7 is 0.00%
Accuracy on site 8 is 45.95%
Accuracy on site 9 is 19.59%
Total size is 913
Overall accuracy is 46.55%


# Calculating with likelihood only of the images - old causal graph

In [36]:
# Closest-site evaluation of likelihood_imagelike, reading the 'without_site' entries;
# acc receives the per-site accuracies and over the overall accuracy.
acc,over = predictions_with_closest_site2(likelihood_imagelike,df_site_aside,with_site='without_site')

Closest sites found: {0: '9', 1: '5', 2: '9', 3: '9', 4: '6', 5: '1', 6: '1', 7: '9', 8: '5', 9: '5'}
Accuracy on site 0 is 51.06%
Accuracy on site 1 is 75.00%
Accuracy on site 2 is 41.59%
Accuracy on site 3 is 60.00%
Accuracy on site 4 is 30.95%
Accuracy on site 5 is 50.41%
Accuracy on site 6 is 64.52%
Accuracy on site 7 is 73.08%
Accuracy on site 8 is 59.46%
Accuracy on site 9 is 33.47%
Total size is 913
Overall accuracy is 49.73%


In [37]:
# All-sites evaluation of likelihood_imagelike, reading the 'without_site' entries;
# acc receives the per-site accuracies and over the overall accuracy.
acc,over = predictions_for_all_sites2(likelihood_imagelike,df_site_aside,with_site='without_site')

Accuracy on site 0 is 52.13%
Accuracy on site 1 is 13.16%
Accuracy on site 2 is 34.51%
Accuracy on site 3 is 33.33%
Accuracy on site 4 is 38.10%
Accuracy on site 5 is 34.71%
Accuracy on site 6 is 33.87%
Accuracy on site 7 is 34.62%
Accuracy on site 8 is 54.05%
Accuracy on site 9 is 80.41%
Total size is 913
Overall accuracy is 47.75%


# Calculating the closest site (highest likelihood) with site just affecting the images

In [38]:
# Closest-site evaluation of likelihood_ind, reading the 'without_site' entries;
# acc receives the per-site accuracies and over the overall accuracy.
acc,over = predictions_with_closest_site2(likelihood_ind,df_site_aside,with_site='without_site')

Closest sites found: {0: '9', 1: '6', 2: '9', 3: '5', 4: '6', 5: '7', 6: '1', 7: '9', 8: '5', 9: '5'}
Accuracy on site 0 is 47.87%
Accuracy on site 1 is 21.05%
Accuracy on site 2 is 36.28%
Accuracy on site 3 is 42.22%
Accuracy on site 4 is 47.62%
Accuracy on site 5 is 52.07%
Accuracy on site 6 is 38.71%
Accuracy on site 7 is 42.31%
Accuracy on site 8 is 64.86%
Accuracy on site 9 is 66.94%
Total size is 913
Overall accuracy is 49.18%


# Calculating for all sites with site just affecting the images

In [17]:
# All-sites evaluation of likelihood_ind, reading the 'without_site' entries;
# acc receives the per-site accuracies and over the overall accuracy.
acc,over = predictions_for_all_sites2(likelihood_ind,df_site_aside,with_site='without_site')

Accuracy on site 0 is 50.00%
Accuracy on site 1 is 34.21%
Accuracy on site 2 is 34.51%
Accuracy on site 3 is 44.44%
Accuracy on site 4 is 52.38%
Accuracy on site 5 is 52.07%
Accuracy on site 6 is 43.55%
Accuracy on site 7 is 37.18%
Accuracy on site 8 is 64.86%
Accuracy on site 9 is 59.59%
Total size is 913
Overall accuracy is 48.52%


# Calculating the closest site (highest likelihood) with site

In [18]:
# Closest-site evaluation of the base likelihood list, reading the 'with_site' entries;
# acc receives the per-site accuracies and over the overall accuracy.
acc,over = predictions_with_closest_site2(likelihood,df_site_aside,with_site='with_site')

Closest sites found: {0: '9', 1: '5', 2: '9', 3: '9', 4: '6', 5: '9', 6: '5', 7: '9', 8: '5', 9: '5'}
Accuracy on site 0 is 51.06%
Accuracy on site 1 is 81.58%
Accuracy on site 2 is 35.40%
Accuracy on site 3 is 37.78%
Accuracy on site 4 is 47.62%
Accuracy on site 5 is 36.36%
Accuracy on site 6 is 75.81%
Accuracy on site 7 is 98.72%
Accuracy on site 8 is 56.76%
Accuracy on site 9 is 23.27%
Total size is 913
Overall accuracy is 47.43%


# Calculating likelihood of all sites

In [19]:
# All-sites evaluation of the base likelihood list, reading the 'with_site' entries;
# acc receives the per-site accuracies and over the overall accuracy.
acc,over = predictions_for_all_sites2(likelihood,df_site_aside,with_site='with_site')

Accuracy on site 0 is 52.13%
Accuracy on site 1 is 13.16%
Accuracy on site 2 is 34.51%
Accuracy on site 3 is 33.33%
Accuracy on site 4 is 38.10%
Accuracy on site 5 is 34.71%
Accuracy on site 6 is 33.87%
Accuracy on site 7 is 20.51%
Accuracy on site 8 is 54.05%
Accuracy on site 9 is 80.41%
Total size is 913
Overall accuracy is 46.55%


# Calculating the closest site (highest likelihood) without site

In [20]:
# Closest-site evaluation of likelihood_without_site, reading the 'without_site' entries;
# acc receives the per-site accuracies and over the overall accuracy.
acc,over = predictions_with_closest_site2(likelihood_without_site,df_site_aside,with_site='without_site')

Closest sites found: {0: '9', 1: '5', 2: '9', 3: '9', 4: '6', 5: '2', 6: '5', 7: '9', 8: '5', 9: '5'}
Accuracy on site 0 is 51.06%
Accuracy on site 1 is 81.58%
Accuracy on site 2 is 35.40%
Accuracy on site 3 is 37.78%
Accuracy on site 4 is 47.62%
Accuracy on site 5 is 58.68%
Accuracy on site 6 is 75.81%
Accuracy on site 7 is 98.72%
Accuracy on site 8 is 56.76%
Accuracy on site 9 is 23.27%
Total size is 913
Overall accuracy is 50.38%


# Calculating likelihood of all sites without the site variable

In [21]:
# All-sites evaluation of likelihood_without_site, reading the 'without_site' entries;
# acc receives the per-site accuracies and over the overall accuracy.
acc,over = predictions_for_all_sites2(likelihood_without_site,df_site_aside,with_site='without_site')

Accuracy on site 0 is 52.13%
Accuracy on site 1 is 13.16%
Accuracy on site 2 is 34.51%
Accuracy on site 3 is 33.33%
Accuracy on site 4 is 38.10%
Accuracy on site 5 is 34.71%
Accuracy on site 6 is 33.87%
Accuracy on site 7 is 20.51%
Accuracy on site 8 is 54.05%
Accuracy on site 9 is 80.41%
Total size is 913
Overall accuracy is 46.55%


# Calculating the closest site (highest likelihood) without site no scalers

In [22]:
# Closest-site evaluation of likelihood_no_scalers, reading the 'without_site' entries;
# acc receives the per-site accuracies and over the overall accuracy.
acc,over = predictions_with_closest_site2(likelihood_no_scalers,df_site_aside,with_site='without_site')

Closest sites found: {0: '5', 1: '5', 2: '5', 3: '2', 4: '6', 5: '9', 6: '4', 7: '5', 8: '5', 9: '5'}
Accuracy on site 0 is 50.00%
Accuracy on site 1 is 78.95%
Accuracy on site 2 is 65.49%
Accuracy on site 3 is 57.78%
Accuracy on site 4 is 64.29%
Accuracy on site 5 is 33.88%
Accuracy on site 6 is 54.84%
Accuracy on site 7 is 21.79%
Accuracy on site 8 is 54.05%
Accuracy on site 9 is 27.76%
Total size is 913
Overall accuracy is 45.35%


# Calculating likelihood of all sites without the site variable, no scalers

In [23]:
# All-sites evaluation of likelihood_no_scalers, reading the 'without_site' entries;
# acc receives the per-site accuracies and over the overall accuracy.
acc,over = predictions_for_all_sites2(likelihood_no_scalers,df_site_aside,with_site='without_site')

Accuracy on site 0 is 52.13%
Accuracy on site 1 is 13.16%
Accuracy on site 2 is 34.51%
Accuracy on site 3 is 33.33%
Accuracy on site 4 is 38.10%
Accuracy on site 5 is 34.71%
Accuracy on site 6 is 33.87%
Accuracy on site 7 is 30.77%
Accuracy on site 8 is 54.05%
Accuracy on site 9 is 80.41%
Total size is 913
Overall accuracy is 47.43%


# Calculating likelihood with all sites in the causal prior without the site variable

In [24]:
# Sum the 'without_site' likelihood arrays over all ecomp entries, per site and diagnosis.
likelihood_PD = {}
for site in range(10):
    likelihood_PD[site] = {}
    for diag in likelihood_all[site].keys():
        per_ecomp = likelihood_all[site][diag]['without_site']
        total = None
        for ecomp in per_ecomp.keys():
            if total is None:
                # Start from a copy: binding the stored array itself and then using +=
                # would modify likelihood_all in place, so re-running this cell would
                # add the remaining components a second time.
                total = per_ecomp[ecomp].copy()
            else:
                total += per_ecomp[ecomp]
        # Same as before when there is nothing to sum: the diagnosis key stays absent
        if total is not None:
            likelihood_PD[site][diag] = total

In [25]:
# still calculating if is PD or not PD
predictions_MACAW = {}
for site in range(10):
    predictions_MACAW[site] = []
    for indiv in range(len(likelihood_PD[site]['0'])):
        if likelihood_PD[site]['0'][indiv] >= likelihood_PD[site]['1'][indiv]:
            predictions_MACAW[site].append(0)
        else:
            predictions_MACAW[site].append(1)

In [26]:
# Score predictions_MACAW against the 'Group_bin' labels, per site and pooled over all sites.
acc_per_site = {}
correct_by_site = []
for site in range(10):
    site_pred = np.array(predictions_MACAW[site]) == df_site_aside[site]['Group_bin'].to_numpy()
    correct_by_site.append(site_pred)
    site_acc = np.sum(site_pred)/len(site_pred)
    # Record the value; the dict used to be created but was never filled
    acc_per_site[site] = site_acc
    print(f"Accuracy on site {site} is {site_acc* 100:.2f}%")
# Pool the per-sample hits of all sites for the overall figure
overall_pred = np.concatenate(correct_by_site)
print(f"Total size is {len(overall_pred)}")
overall_acc = np.sum(overall_pred)/len(overall_pred)
print(f"Overall accuracy is {overall_acc* 100:.2f}%")

Accuracy on site 0 is 52.13%
Accuracy on site 1 is 14.47%
Accuracy on site 2 is 37.17%
Accuracy on site 3 is 33.33%
Accuracy on site 4 is 38.10%
Accuracy on site 5 is 36.36%
Accuracy on site 6 is 35.48%
Accuracy on site 7 is 20.51%
Accuracy on site 8 is 54.05%
Accuracy on site 9 is 80.41%
Total size is 913
Overall accuracy is 47.32%


In [27]:
def predictions_with_closest_site(likelihood_var,with_site='with_site'):
    """
    Classify the samples of each site using only the likelihoods of its "closest" site.

    For every site 0-9 the closest candidate is the key of ``likelihood_var[site]``
    whose arrays (all diagnoses, all ``ecomp`` entries under ``with_site``) have the
    largest total sum.  The ``ecomp`` arrays of that candidate are added per
    diagnosis; a sample is predicted as 0 where the '0' total is greater than or
    equal to the '1' total, otherwise as 1.

    The true labels are read from the module-level ``df_site_aside`` mapping
    (site index -> DataFrame with a 'Group_bin' column).

    The arrays inside ``likelihood_var`` are not modified.  Earlier, the first
    ``ecomp`` array was bound directly and then updated with ``+=``, which wrote
    the sums back into the caller's data and made repeated calls give different
    results.

    Args:
        likelihood_var: Container indexed by site 0-9; each entry maps
            candidate site -> diagnosis ('0' / '1') -> ``with_site`` key -> ecomp -> array.
        with_site: Key selecting which likelihood entry to read (default 'with_site').

    Returns:
        tuple: (dict of accuracy per site, accuracy over all samples pooled together)
    """
    # Finding the closest site
    closest_site = {}
    for site in range(10):
        likelihood_max = float('-inf')
        for possible_closest in likelihood_var[site].keys():
            likehood_sum = 0.0
            for diag in likelihood_var[site][possible_closest].keys():
                for ecomp in likelihood_var[site][possible_closest][diag][with_site].keys():
                    likehood_sum += np.sum(likelihood_var[site][possible_closest][diag][with_site][ecomp], dtype=np.float64)
            if likehood_sum > likelihood_max:
                likelihood_max = likehood_sum
                closest_site[site] = possible_closest
    print(closest_site)

    # Sum the ecomp arrays of the chosen candidate, per diagnosis
    likelihood_PD = {}
    for site in range(10):
        likelihood_PD[site] = {}
        chosen = likelihood_var[site][closest_site[site]]
        for diag in chosen.keys():
            total = None
            for ecomp in chosen[diag][with_site].keys():
                current = chosen[diag][with_site][ecomp]
                if total is None:
                    # copy: the in-place additions below must not touch the input
                    total = current.copy()
                else:
                    total += current
            if total is not None:
                likelihood_PD[site][diag] = total

    # 0 where l0 >= l1, else 1 (same rule as the former per-sample loop)
    predictions_MACAW = {}
    for site in range(10):
        lik_0 = np.asarray(likelihood_PD[site]['0'])
        lik_1 = np.asarray(likelihood_PD[site]['1'])
        predictions_MACAW[site] = np.where(lik_0 >= lik_1, 0, 1)

    acc_per_site = {}
    correct_by_site = []
    for site in range(10):
        site_pred = predictions_MACAW[site] == df_site_aside[site]['Group_bin'].to_numpy()
        correct_by_site.append(site_pred)
        site_acc = site_pred.sum()/len(site_pred)
        acc_per_site[site] = site_acc
        print(f"Accuracy on site {site} is {site_acc* 100:.2f}%")
    overall_pred = np.concatenate(correct_by_site)
    print(f"Total size is {len(overall_pred)}")
    overall_acc = overall_pred.sum()/len(overall_pred)
    print(f"Overall accuracy is {overall_acc* 100:.2f}%")
    return acc_per_site,overall_acc

In [28]:
def predictions_for_all_sites(likelihood_var,with_site='with_site'):
    """
    Classify the samples of each site from the likelihoods of all candidate sites combined.

    For every site 0-9 and every diagnosis, the arrays of all candidate sites and
    all ``ecomp`` entries stored under ``with_site`` are added element-wise.  A
    sample is predicted as 0 where the combined '0' likelihood is greater than or
    equal to the combined '1' likelihood, otherwise as 1.

    The true labels are read from the module-level ``df_site_aside`` mapping
    (site index -> DataFrame with a 'Group_bin' column).

    Two defects of the earlier version are fixed here:
    * the running total was re-assigned whenever ``ecomp == 0``, so every
      candidate except the last one iterated was discarded;
    * the first array was bound without a copy and then updated with ``+=``,
      which modified the caller's ``likelihood_var`` in place.

    Args:
        likelihood_var: Container indexed by site 0-9; each entry maps
            candidate site -> diagnosis ('0' / '1') -> ``with_site`` key -> ecomp -> array.
        with_site: Key selecting which likelihood entry to read (default 'with_site').

    Returns:
        tuple: (dict of accuracy per site, accuracy over all samples pooled together)
    """
    # Combine the likelihoods per site and diagnosis
    likelihood_PD = {}
    for site in range(10):
        likelihood_PD[site] = {}
        for possible_closest in likelihood_var[site].keys():
            for diag in likelihood_var[site][possible_closest].keys():
                per_ecomp = likelihood_var[site][possible_closest][diag][with_site]
                for ecomp in per_ecomp.keys():
                    if diag not in likelihood_PD[site]:
                        # copy: the in-place additions below must not touch the input
                        likelihood_PD[site][diag] = per_ecomp[ecomp].copy()
                    else:
                        likelihood_PD[site][diag] += per_ecomp[ecomp]

    # 0 where l0 >= l1, else 1 (same rule as the former per-sample loop)
    predictions_MACAW = {}
    for site in range(10):
        lik_0 = np.asarray(likelihood_PD[site]['0'])
        lik_1 = np.asarray(likelihood_PD[site]['1'])
        predictions_MACAW[site] = np.where(lik_0 >= lik_1, 0, 1)

    acc_per_site = {}
    correct_by_site = []
    for site in range(10):
        site_pred = predictions_MACAW[site] == df_site_aside[site]['Group_bin'].to_numpy()
        correct_by_site.append(site_pred)
        site_acc = np.sum(site_pred)/len(site_pred)
        acc_per_site[site] = site_acc
        print(f"Accuracy on site {site} is {site_acc* 100:.2f}%")
    overall_pred = np.concatenate(correct_by_site)
    print(f"Total size is {len(overall_pred)}")
    overall_acc = np.sum(overall_pred)/len(overall_pred)
    print(f"Overall accuracy is {overall_acc* 100:.2f}%")
    return acc_per_site,overall_acc