# Bayesian Analysis

We want to answer the simple question: Is the CAM created by a participant unique?

To do this, we will construct N random graphs using *networkx.gnm_random_graph* setting the same number of nodes and edges.

In [56]:
import re
import numpy as np
import pandas as pd
import os
from os import walk
import networkx as nx
import random
import pymc3 as pm
import numpy as np
import arviz as az



In [2]:
# Read in CAM to get nodes and edges
file_path = '/home/carterrhea/Dropbox/APLS-CAM-Proposal/DataFinal/Clean' #path to data on local computer

#Function to get unique elements from a list
def list_unique(list1): 
  
    # intilize a null list 
    unique_list = [] 
      
    # traverse for all elements 
    for x in list1: 
        # check if exists in unique_list or not 
        if x not in unique_list: 
            unique_list.append(x) 
            
    return unique_list

def make_id_list(my_file_path):

    #pull all filenames in your data directory
    _, _, filenames = next(walk(my_file_path))

    #grab only the id from the filenames to feed to the function
    id_list = []
    for filename in filenames:
        filename_split = filename.rsplit('_', 1)
        id_list.append(filename_split[0])
    
    unique_id = list_unique(id_list) #drop duplicate ids, since they are the same for _blocks & _links
        
    
    
    return unique_id


#Get an id_list from your specific file path
my_id_list = make_id_list(file_path)

In [33]:
# Grab single to play with
cam_id = my_id_list[10]
df_blocks = pd.read_csv(f'{file_path}/{cam_id}_blocks.csv')
links_df = pd.read_csv(f'{file_path}/{cam_id}_links.csv')
#df_blocks
df_blocks[df_blocks['shape']=='neutral'].count()[0]

3

In [49]:
# Create distance matrix
D = np.zeros((7,7))  # We have seven node types hence 7 (note that we consider ambivalent as the same as neutral in this calculation)
dist_ = np.arange(7)
for i in range(7):
    D[i,i:] = dist_
    D[i:,i] = dist_
    dist_ = dist_[:-1]

def calculate_diversity(D, probs):
    """
    Calculate Stirling Diversity measure with alpha = beta = 1
    Args:
        param: D - Distance Matrix
        param: probs - list of probabilities in order of Strong Negative, Negative, Weak Negative, Neutral, Weak Positive, Positive, Strong Positive
    """
    S = 0
    for i in range(7):
        for j in range(7):
            S += D[i, j]*probs[i]*probs[j]
    return S
# Get probabilities of each type of node -- used for diversity calculation
neg_strong = df_blocks[df_blocks['shape']=='negative strong'].count()[0]
neg = df_blocks[df_blocks['shape']=='negative'].count()[0]
neg_weak = df_blocks[df_blocks['shape']=='negative weak'].count()[0]
neutral = df_blocks[(df_blocks['shape']=='neutral') & (df_blocks['shape']=='ambivalent')].count()[0]
pos_weak = df_blocks[df_blocks['shape']=='positive weak'].count()[0]
pos = df_blocks[df_blocks['shape']=='positive'].count()[0]
pos_strong = df_blocks[df_blocks['shape']=='positive strong'].count()[0]
probs = [neg_strong, neg, neg_weak, neutral, pos_weak, pos, pos_strong]
true_div = calculate_diversity(D, probs)
print(true_div)

88.0


In [51]:
# Create n random graphs with the same number of nodes and edges
random_graphs = []
div_vals = []
n_ = 1000
for i in range(n_):
    random_graph = nx.gnm_random_graph(len(blocks_df), len(links_df))
    random_graphs.append(random_graph)
    # Get probabilities of each type of node -- used for diversity calculation
    neg_strong = df_blocks[df_blocks['shape']=='negative strong'].count()[0]
    neg = df_blocks[df_blocks['shape']=='negative'].count()[0]
    neg_weak = df_blocks[df_blocks['shape']=='negative weak'].count()[0]
    neutral = df_blocks[(df_blocks['shape']=='neutral') & (df_blocks['shape']=='ambivalent')].count()[0]
    pos_weak = df_blocks[df_blocks['shape']=='positive weak'].count()[0]
    pos = df_blocks[df_blocks['shape']=='positive'].count()[0]
    pos_strong = df_blocks[df_blocks['shape']=='positive strong'].count()[0]
    probs = [neg_strong, neg, neg_weak, neutral, pos_weak, pos, pos_strong]
    # Randomly assign valences to each node
    probs_rand = []
    for node in random_graph.nodes(data=True):
        shape_type_rand = random.choices([-3,-2,-1,0,1,2,3], weights=probs, k=1)[0]
        node[1]['shape_num'] = shape_type_rand
        probs_rand.append(shape_type_rand)
    # Calculate random graph node probabilities
    probs_rand = [probs_rand.count(-3), probs_rand.count(-2), probs_rand.count(-1), probs_rand.count(0), probs_rand.count(1), probs_rand.count(2), probs_rand.count(3)]
    # Calculate the diversity of each graph
    div_ = calculate_diversity(D, probs_rand)
    div_vals.append(div_)

In [52]:
# Calculate the number of instances with a similar diversity
insts_div = 0
for div_ in div_vals:
    if div_ - true_div < 100:
        insts_div += 1
    else:
        pass
print(insts_div)

35


In [57]:
# Calculate probability of replicating using Bayesian stats
alphas = np.array([1, 1])
c = np.array([insts_div, n_-insts_div])

# Create model
with pm.Model() as model:
    # Parameters of the Multinomial are from a Dirichlet
    parameters = pm.Dirichlet('parameters', a=alphas, shape=2)
    # Observed data is from a Multinomial distribution
    observed_data = pm.Multinomial(
        'observed_data', n=n_, p=parameters, shape=2, observed=c)    

  variables = ufunc(*ufunc_args, **ufunc_kwargs)


In [58]:
with model:
    # Sample from the posterior
    trace = pm.sample(draws=1000, chains=2, tune=500, 
                      discard_tuned_samples=True)

  trace = pm.sample(draws=1000, chains=2, tune=500,
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (2 chains in 4 jobs)
NUTS: [parameters]


Sampling 2 chains for 500 tune and 1_000 draw iterations (1_000 + 2_000 draws total) took 20 seconds.


In [61]:
print(az.summary(trace, kind="stats").mean)





<bound method NDFrame._add_numeric_operations.<locals>.mean of                 mean     sd  hdi_3%  hdi_97%
parameters[0]  0.036  0.006   0.025    0.048
parameters[1]  0.964  0.006   0.952    0.975>
