In [1]:
# Hartigan's dip test
# check for bimodal distribution for PKR functional scores separated by K3 contact sites

# interpretation
# dip - low value = unimodal, high value = multimodality (0.05)
# pval - small indicates strong evidence against null hypothesis of unimodality

# bootstrap 1e6 iterations to have the confidence in a pvalue of < 1e-5
# https://github.com/RUrlus/diptest/issues/45

# PKR functional scores versus K3-WT contact sites have a multimodal distrubution
# PDB Contact sites - Dip: 0.10526623882697403 P-value: 0.0
# AF2 Contact sites - Dip: 0.09081772236391696 P-value: 0.0

In [2]:
import pandas as pd
import numpy as np
import diptest # https://pypi.org/project/diptest/

In [3]:
input_file = '../../results/barseq/pkr-variant-reads_240228.csv'
df = pd.read_csv(input_file)

In [4]:
def pkr_type(x, pkr_stop = 551):
    if x[-1] == "*" and int(x.split('-')[1][1:-1]) < pkr_stop:
        return "Nonsense"
    elif x[-2:] == 'WT':
        return "WT"
    else:
        return 'Variant'
df['pkr_type'] = df['pkr'].apply(pkr_type)

In [6]:
# eIF2a contacts 
eif2a_af2 = [274,275,276,279,335,337,338,339,340,341,342,379,382,451,452,453,483,486,487,488,489,490,491,492,493]
eif2a_pdb = [379,382,450,451,452,453,483,484,486,487,488,489,490,492,493]

# k3 contact info
pdb_contacts = [276,277,278,337,375,382,450,451,452,453,454,483,484,485,486,487,488,489,490,492,493,495,496]
af2_contacts = [275,276,278,304,339,343,345,375,379,382,414,416,435,448,449,450,451,452,453,455,460,485,486,487,488,489,490,492,493,496]

def pdb_contact(row):
    site = row['site']
    if site in pdb_contacts:
        return "K3 Contact"
    else:
        return "No Contact"

def af2_contact(row):
    site = row['site']
    if site in af2_contacts:
        return "K3 Contact"
    else:
        return "No Contact"

df['pdb_contact'] = df.apply(pdb_contact, axis=1)
df['af2_contact'] = df.apply(af2_contact, axis=1)

# differing
#set(pdb_contacts).intersection(set(af2_contacts))
#set(pdb_contacts) ^ set(af2_contacts)

In [17]:
# pdb contacts
for i in [1,1e3,1e6]: # n bootstrap iterations
    print(f"Hartigan dip test ({i} boostrap iterations)\n")
    for k3, k3_name in zip(['K3L-Null','K3L-WT','K3L-H47R'],["K3Δ58","K3-WT","K3-H47R"]):
        temp_df = df.query('k3 == @k3 and pkr_type != "Nonsense"')
    
        print(k3)
        
        no_contact = temp_df.query("pdb_contact == 'No Contact'")['auc_mean'].values
        dip, pval = diptest.diptest(no_contact, boot_pval=True, n_boot=int(i))
        print(f"Non-contact sites - Dip: {dip} P-value: {pval}")
        
        contact = temp_df.query("pdb_contact == 'K3 Contact'")['auc_mean'].values
        dip, pval = diptest.diptest(contact, boot_pval=True, n_boot=int(i))
        print(f"Contact sites - Dip: {dip} P-value: {pval}")
        print()
    print("###\n")

Hartigan dip test (1 boostrap iterations)
K3L-Null
Non-contact sites - Dip: 0.009357381191472718 P-value: 1.0
Contact sites - Dip: 0.05394342196188357 P-value: 0.0

K3L-WT
Non-contact sites - Dip: 0.01606227717043062 P-value: 1.0
Contact sites - Dip: 0.10526623882697403 P-value: 0.0

K3L-H47R
Non-contact sites - Dip: 0.018717475160905894 P-value: 0.0
Contact sites - Dip: 0.021399107101934947 P-value: 1.0

###

Hartigan dip test (1000.0 boostrap iterations)
K3L-Null
Non-contact sites - Dip: 0.009357381191472718 P-value: 1.0
Contact sites - Dip: 0.05394342196188357 P-value: 0.013

K3L-WT
Non-contact sites - Dip: 0.01606227717043062 P-value: 0.904
Contact sites - Dip: 0.10526623882697403 P-value: 0.0

K3L-H47R
Non-contact sites - Dip: 0.018717475160905894 P-value: 0.692
Contact sites - Dip: 0.021399107101934947 P-value: 0.992

###

Hartigan dip test (1000000.0 boostrap iterations)
K3L-Null
Non-contact sites - Dip: 0.009357381191472718 P-value: 1.0
Contact sites - Dip: 0.05394342196188357 

In [18]:
# af2 contacts
for i in [1,1e3,1e6]: # n bootstrap iterations
    print(f"Hartigan dip test ({i} boostrap iterations)\n")
    for k3, k3_name in zip(['K3L-Null','K3L-WT','K3L-H47R'],["K3Δ58","K3-WT","K3-H47R"]):
        temp_df = df.query('k3 == @k3 and pkr_type != "Nonsense"')
    
        print(k3)
        
        no_contact = temp_df.query("af2_contact == 'No Contact'")['auc_mean'].values
        dip, pval = diptest.diptest(no_contact, boot_pval=True, n_boot=int(i))
        print(f"Non-contact sites - Dip: {dip} P-value: {pval}")
        
        contact = temp_df.query("af2_contact == 'K3 Contact'")['auc_mean'].values
        dip, pval = diptest.diptest(contact, boot_pval=True, n_boot=int(i))
        print(f"Contact sites - Dip: {dip} P-value: {pval}")
        print()
    print("###\n")

Hartigan dip test (1 boostrap iterations)

K3L-Null
Non-contact sites - Dip: 0.011537036343668006 P-value: 1.0
Contact sites - Dip: 0.04391630888567744 P-value: 0.0

K3L-WT
Non-contact sites - Dip: 0.016479237319341117 P-value: 1.0
Contact sites - Dip: 0.09081772236391696 P-value: 0.0

K3L-H47R
Non-contact sites - Dip: 0.016447520721817658 P-value: 1.0
Contact sites - Dip: 0.0248762285048945 P-value: 1.0

###

Hartigan dip test (1000.0 boostrap iterations)

K3L-Null
Non-contact sites - Dip: 0.011537036343668006 P-value: 0.998
Contact sites - Dip: 0.04391630888567744 P-value: 0.089

K3L-WT
Non-contact sites - Dip: 0.016479237319341117 P-value: 0.881
Contact sites - Dip: 0.09081772236391696 P-value: 0.0

K3L-H47R
Non-contact sites - Dip: 0.016447520721817658 P-value: 0.892
Contact sites - Dip: 0.0248762285048945 P-value: 0.892

###

Hartigan dip test (1000000.0 boostrap iterations)

K3L-Null
Non-contact sites - Dip: 0.011537036343668006 P-value: 0.999584
Contact sites - Dip: 0.0439163088

In [19]:
# pdb contacts are still pval 0 for K3-WT, 1e7 iterations

In [7]:
# test, pdb contacts
for i in [1e3]: # n bootstrap iterations
    print(f"Hartigan dip test ({i} boostrap iterations)\n")
    for k3, k3_name in zip(['K3L-WT'],["K3-WT"]):
        temp_df = df.query('k3 == @k3 and pkr_type != "Nonsense"')
    
        print(k3)
        
        no_contact = temp_df.query("pdb_contact == 'No Contact'")['auc_mean'].values
        dip, pval = diptest.diptest(no_contact, boot_pval=True, n_boot=int(i))
        print(f"Non-contact sites - Dip: {dip} P-value: {pval}")
        
        contact = temp_df.query("pdb_contact == 'K3 Contact'")['auc_mean'].values
        dip, pval = diptest.diptest(contact, boot_pval=True, n_boot=int(i))
        print(f"Contact sites - Dip: {dip} P-value: {pval}")
        print()
    print("###\n")

Hartigan dip test (1000.0 boostrap iterations)

K3L-WT
Non-contact sites - Dip: 0.01606227717043062 P-value: 0.897
Contact sites - Dip: 0.10526623882697403 P-value: 0.0

###

