# Creation of Bayes Boundary Data (3/3):
This is the last of three scripts meant to be run in order. We use a naive Bayes model for our predictions and, en route, create and store the cutoffs used by our plots.

### 1. Necessary Imports:

In [1]:
import numpy as np
import math 
from scipy import stats
import pandas as pd
import pymysql
import matplotlib as mpl
import matplotlib.pyplot as plt

from __future__ import division # Not necessary in Python 3 and later
import scipy
from math import exp,sqrt

### 2. Functions to Estimate Probability Cutoffs

In [2]:

#Assuming normal distributions, we estimate their moments,
#then calculate the raw probabilities for each subject's data point,
#finally returning the true estimated probability.

def generate_probability(data, adhd):
    """Estimate, per subject, the probability of each class for one feature.

    A normal distribution is fitted to the feature values of each group
    (``adhd == 1`` and ``adhd == 0``). Every data point is then scored under
    both fits and the two densities are normalised so they sum to one.

    Args:
        data: 1-D array-like of feature values, one entry per subject.
        adhd: 1-D array-like of labels aligned with ``data``; entries equal to
            1 form the ADHD group and entries equal to 0 the healthy group.

    Returns:
        Tuple ``(adhd_probs, healthy_probs)`` of 1-D float arrays with the same
        length as ``data``; ``healthy_probs`` is ``1 - adhd_probs``.
    """
    values = np.asarray(data, dtype=float)
    labels = np.asarray(adhd)

    adhd_values = values[labels == 1]
    healthy_values = values[labels == 0]

    # np.mean / np.std replace the scipy.mean / scipy.std aliases, which no
    # longer exist in current SciPy releases. Both use the population
    # standard deviation (ddof=0), so the fitted moments are unchanged.
    adhd_density = scipy.stats.norm.pdf(
        values, loc=np.mean(adhd_values), scale=np.std(adhd_values))
    healthy_density = scipy.stats.norm.pdf(
        values, loc=np.mean(healthy_values), scale=np.std(healthy_values))

    # norm.pdf is vectorised, so the whole column is scored in one call.
    adhd_probs = adhd_density / (adhd_density + healthy_density)
    return adhd_probs, 1 - adhd_probs


    

In [3]:
#intermediary function for finding the Bayes boundary given two different distributions
def get_distribution_cutoff(data):
    """Gather the boundary cutoffs of all five features into one flat array.

    Columns 1-15 of ``data`` are consumed in consecutive groups of three and
    handed to ``find_boundary`` as its ``(x, first, second)`` arguments. The
    groups are, in output order: omission errors, commission errors,
    targets RTV, d-prime and beta. Column 0 is not read.

    Args:
        data: 2-D array with at least 16 columns.

    Returns:
        1-D array holding the concatenated cutoffs of the five features, in
        the order listed above.
    """
    # Index of the x column of each feature's (x, first, second) triple.
    triple_starts = (1, 4, 7, 10, 13)

    cutoffs_per_feature = []
    for start in triple_starts:
        feature_cutoffs = find_boundary(data[:, start],
                                        data[:, start + 1],
                                        data[:, start + 2])
        cutoffs_per_feature.append(np.asarray(feature_cutoffs))

    return np.concatenate(cutoffs_per_feature).flatten()
            

In [4]:
# helper function for finding the boundary
def find_boundary(x, first, second):
    """Return the ``x`` values at which ``first < second`` changes truth value.

    The two curves are compared point by point; whenever the comparison
    result at position ``i`` differs from the one at ``i + 1``, ``x[i + 1]``
    is recorded as a cutoff.

    Args:
        x: Sequence of positions, aligned with ``first`` and ``second``.
        first: Array-like of values of the first curve.
        second: Array-like of values of the second curve.

    Returns:
        List of cutoffs in order of occurrence. The list is padded with
        ``None`` so it always has at least two entries (callers store two
        cutoff slots per feature); more than two crossings are all returned.
    """
    below = np.asarray(first) < np.asarray(second)

    # Plain indexing replaces the removed ``np.int`` cast; ``i`` is already a
    # Python int here.
    cutoffs = [x[i + 1]
               for i in range(len(below) - 1)
               if below[i] != below[i + 1]]

    # Previously a curve pair with no crossing produced a single ``None``,
    # leaving the result one slot short; always fill both slots.
    cutoffs.extend([None] * (2 - len(cutoffs)))
    return cutoffs
        

In [5]:
"""
This is the meat of these three scripts: the function that takes all
we have done up until now and calculates each feature's individual prediction and the cumulative prediction of ADHD.
We use a naive Bayes methodology inspired by this:
https://en.wikipedia.org/wiki/Naive_Bayes_classifier#Constructing_a_classifier_from_the_probability_model

Were we to seek to improve this model, we may want to switch out the normal distributions for beta distributions
for the omission and commission errors, and use a gamma distribution for TargetsRTV.
Some caveats:
    -From Yiming: naive Bayes works poorly with highly correlated features, which several of ours are.
It may be good to return and check the true correlations as we get more data.
    -Naive Bayes is known for not giving particularly accurate probability estimates, although its
classification is usually right (i.e. the probabilities are on the right side of the midpoint). Because
we are advertising the probability as showing where one is on the spectrum, we should revisit this as
we get more data.
"""
def get_probabilities(data):
    """Compute per-feature and combined naive Bayes probabilities of ADHD.

    Column layout read from ``data``:
        0-4: the five features (omission errors, commission errors,
             target RTV, d-prime, beta),
        5:   the ADHD label (1 or 0),
        6:   the value used to derive the priors; it is passed through
             ``generate_probability`` exactly like a feature.

    The number of subjects is taken from ``data`` itself rather than being
    fixed at 20, so the function works for any number of rows.

    Args:
        data: 2-D array with one row per subject and at least 7 columns.

    Returns:
        Array of shape ``(n_subjects, 14)`` whose columns are, in order: the
        ADHD and healthy probability of each of the five features
        (interleaved), the total raw ADHD probability, the total raw healthy
        probability, the ADHD prior, and the final probability of ADHD.

    Side effects:
        Prints the training accuracy (skipped when ``data`` has no rows).
    """
    n_subjects = np.shape(data)[0]
    adhd = data[:, 5]

    def _as_column(values):
        # Column vectors so everything can be concatenated along axis 1.
        return np.asarray(values, dtype=float).reshape(n_subjects, 1)

    # Individual probabilities, one (ADHD, healthy) pair per feature.
    adhd_columns = []
    healthy_columns = []
    for feature in range(5):
        p_adhd, p_healthy = generate_probability(data[:, feature], adhd)
        adhd_columns.append(_as_column(p_adhd))
        healthy_columns.append(_as_column(p_healthy))

    # Total raw probability: product over all features (naive independence).
    totalRawProbabilityADHD = multiply_5_arrays(*adhd_columns)
    totalRawProbabilityHealthy = multiply_5_arrays(*healthy_columns)

    # Priors are found with the same method as the features.
    prior_adhd, prior_healthy = generate_probability(data[:, 6], adhd)
    priorsADHD = _as_column(prior_adhd)
    priorsHealthy = _as_column(prior_healthy)

    # Posterior: prior * likelihood, normalised over both classes.
    pADHD = np.multiply(priorsADHD, totalRawProbabilityADHD)
    pHealthy = np.multiply(priorsHealthy, totalRawProbabilityHealthy)
    totalProbabilityADHD = np.divide(pADHD, pADHD + pHealthy)

    # Just for fun, here is our training accuracy.
    if n_subjects > 0:
        count = sum(1 for i in range(n_subjects)
                    if np.round(totalProbabilityADHD[i, 0]) == adhd[i])
        print("training accuracy: ", 100 * count/n_subjects, "%")

    columns = []
    for adhd_col, healthy_col in zip(adhd_columns, healthy_columns):
        columns.extend((adhd_col, healthy_col))
    columns.extend((totalRawProbabilityADHD,
                    totalRawProbabilityHealthy,
                    priorsADHD,
                    totalProbabilityADHD))
    return np.concatenate(columns, axis=1)
    

In [6]:
#helper function for readability
def multiply_5_arrays(a,b,c,d,e):
    """Return the element-wise product of the five given arrays."""
    product = a
    for factor in (b, c, d, e):
        product = np.multiply(product, factor)
    return product

### 3. DB connecting, pull, & insert

In [7]:
#initializes connection info for database
def connect():
    """Open and return a new pymysql connection to the project database.

    Each setting can be overridden with an environment variable
    (``BAYES_DB_HOST``, ``BAYES_DB_NAME``, ``BAYES_DB_USER``,
    ``BAYES_DB_PASSWORD``). When a variable is unset, the value that used to
    be hard-coded here is used, so existing runs keep working unchanged.

    Returns:
        An open pymysql connection; the caller is responsible for closing it.
    """
    import os  # local import: only this function needs it

    # SECURITY NOTE(review): the fallback values below are real credentials
    # committed to source. Rotate them and drop the fallbacks once every
    # environment supplies the variables above.
    return pymysql.connect(
        host=os.environ.get("BAYES_DB_HOST",
                            "rm-j6cluj4576jdi6n6oo.mysql.rds.aliyuncs.com"),
        database=os.environ.get("BAYES_DB_NAME", 'rnd_test'),
        user=os.environ.get("BAYES_DB_USER", 'cognitiveleap'),
        password=os.environ.get("BAYES_DB_PASSWORD", 'QWE@123456'))

In [8]:
#Getting data from DB

def get_individual_data(CaseIds, raw):
    """Fetch per-case data from the database.

    Args:
        CaseIds: Iterable of integer case ids to query.
        raw: When true, fetch the block-0 CPT results and signal-detection
            values for every case plus the SNAP scores. When false, fetch
            the head-movement features instead.

    Returns:
        raw true: 2-D array whose columns are CasdId, OmissionErrors,
        CommissionErrors, TargetsRtVariability, DPrime, Beta, ADHD and
        SNAPCombined. The three query results are joined side by side, so
        they must yield the same number of rows.
        raw false: array with one row (PathLen, TimeActive, NumRot, TotalDeg,
        CasdId) per case that has head features. This branch previously
        crashed on an undefined variable.
    """
    db = connect()
    try:
        cursor = db.cursor()

        if not raw:
            head_rows = []
            for case_id in CaseIds:
                cursor.execute(
                    "SELECT PathLen, TimeActive, NumRot, TotalDeg, CasdId "
                    "FROM head_features WHERE CasdId = %s",
                    (int(case_id),))
                results = np.asarray(cursor.fetchall())
                if len(results) > 0:
                    head_rows.append(results[0])
            return np.asarray(head_rows)

        dataPatient = []
        dataSignal = []
        for case_id in CaseIds:
            # int() turns numpy integers into plain ints, which the driver
            # can bind as query parameters, and rejects non-numeric ids.
            params = (int(case_id),)

            # NOTE(review): this table's key column is spelled "CasdId" while
            # signal_detection uses "CaseId"; kept exactly as in the
            # original queries - confirm against the schema.
            cursor.execute(
                """SELECT CasdId, OmissionErrors, CommissionErrors, TargetsRtVariability
                     FROM cpt_output_results WHERE Block = 0 AND CasdId = %s""",
                params)
            dataPatient.append(np.asarray(cursor.fetchall()))

            cursor.execute(
                """SELECT DPrime, Beta, ADHD FROM signal_detection WHERE Block = 0 AND CaseId = %s""",
                params)
            dataSignal.append(np.asarray(cursor.fetchall()))

        # The SNAP table is read in full, without filtering by case id; its
        # rows are paired with the per-case rows purely by position.
        cursor.execute("""SELECT SNAPCombined FROM snap_hack""")
        # Sized from the query result instead of a fixed 20 rows.
        dataSNAP = np.asarray(cursor.fetchall(), dtype=float).reshape(-1, 1)
    finally:
        # Close the connection even when a query fails.
        db.close()

    return np.concatenate((np.concatenate(dataPatient),
                           np.concatenate(dataSignal),
                           dataSNAP),
                          axis = 1)



In [9]:
#getting data from plotting vector tables
def get_probability_data(raw):
    """Load the plotting vectors stored in ``bayes_bound_prob_plot``.

    Args:
        raw: Must be true. The false branch of the original code referenced
            an undefined loop variable and could only fail with a
            ``NameError``; that is now reported explicitly.

    Returns:
        Array containing every row and column of ``bayes_bound_prob_plot``.

    Raises:
        NotImplementedError: If ``raw`` is false.
    """
    if not raw:
        # Fail before opening a connection so nothing is leaked.
        raise NotImplementedError(
            "get_probability_data only supports raw=True; the head_features "
            "query needs a case id that this function does not receive")

    db = connect()
    try:
        cursor = db.cursor()
        cursor.execute("""SELECT * FROM bayes_bound_prob_plot""")
        # Fetch all the rows as a 2-D array.
        return np.asarray(cursor.fetchall())
    finally:
        # The original never closed this connection.
        db.close()

In [10]:
#Inserting the created cutoffs for plots
def insert_cutoffs(data):
    """Insert one row of plot cutoffs into ``bayes_cutoffs``.

    Args:
        data: Sequence whose first ten entries are the two cutoffs of each
            feature, in the order omission errors, commission errors,
            target RTV, d-prime, beta. ``None`` entries are stored as SQL
            NULL. ``data`` is no longer modified in place.

    Raises:
        IndexError: If ``data`` has fewer than ten entries.
    """
    # Id is fixed at 0, as before. Values are converted to plain floats so
    # the driver can bind them; None is bound as NULL by the driver.
    values = [0]
    for position in range(10):
        cutoff = data[position]
        values.append(None if cutoff is None else float(cutoff))

    sql = ("INSERT INTO bayes_cutoffs (Id,"
           "OmissionErrorsCutoffOne,"
           "OmissionErrorsCutoffTwo,"
           "CommissionErrorsCutoffOne,"
           "CommissionErrorsCutoffTwo,"
           "TargetRTVCutoffOne, "
           "TargetRTVCutoffTwo, "
           "DPrimeCutoffOne, "
           "DPrimeCutoffTwo, "
           "BetaCutoffOne, "
           "BetaCutoffTwo )"
           " VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)")

    db = connect()
    try:
        cursor = db.cursor()
        # Parameterized execution instead of formatting values into the SQL.
        cursor.execute(sql, values)
        db.commit()
    finally:
        # Close the connection even when the insert fails.
        db.close()

In [11]:

def insert_bayes_probabilities(data):
    """Insert one ``bayes_probabilities`` row per row of ``data``.

    Args:
        data: 2-D array with at least 14 columns, in the order of the column
            list in the INSERT statement below (after ``CaseId``).

    Raises:
        IndexError: If a row has fewer than 14 columns.

    Notes:
        All rows are written in a single transaction and committed once, so
        a failure leaves the table unchanged instead of partially filled.
    """
    sql = ("INSERT INTO bayes_probabilities"
           "(CaseId,"
           "OmissionRawProbabilityADHD, "
           "OmissionRawProbabilityHealthy, "
           "CommissionRawProbabilityADHD, "
           "CommissionRawProbabilityHealthy, "
           "TargetRTVRawProbabilityADHD, "
           "TargetRTVRawProbabilityHealthy, "
           "DPrimeRawProbabilityADHD, "
           "DPrimeRawProbabilityHealthy, "
           "BetaRawProbabilityADHD, "
           "BetaRawProbabilityHealthy, "
           "totalRawProbabilityADHD, "
           "totalRawProbabilityHealthy, "
           "priors, "
           "finalProbabilityOfADHD) "
           " VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)")

    # NOTE(review): CaseId is the zero-based row index, as in the original
    # code, not the case id that was queried (the driver passes ids starting
    # at 1) - confirm this is intended.
    # Values are converted to plain int/float (replacing the removed
    # ``np.int``) so the driver can bind them as parameters.
    rows = []
    for i in range(len(data)):
        rows.append([int(i)] + [float(data[i, col]) for col in range(14)])

    db = connect()
    try:
        cursor = db.cursor()
        cursor.executemany(sql, rows)
        db.commit()
    finally:
        # Close the connection even when an insert fails.
        db.close()

### 4. Main Function:

In [14]:
#main function
def main_bayes_boundary(caseIds):
    """Run the whole pipeline: load data, store cutoffs, store probabilities.

    Args:
        caseIds: Case ids whose individual data is loaded.

    Returns:
        The plotting-vector data the cutoffs were derived from.
    """
    # Both reads happen before anything is written.
    patient_rows = get_individual_data(caseIds, True)
    plot_vectors = get_probability_data(True)

    # Boundary cutoffs come from the stored plotting vectors.
    insert_cutoffs(get_distribution_cutoff(plot_vectors))

    # Column 0 (the case id) is dropped before computing probabilities.
    insert_bayes_probabilities(get_probabilities(patient_rows[:, 1:]))

    return plot_vectors

    

In [15]:
# Entry point: run the pipeline for case ids 1 through 20.
bayesData = main_bayes_boundary(np.arange(1, 21))


training accuracy:  90.0 %
