In [1]:
import sys
sys.path.insert(1, '../../code/scripts')

In [2]:
# analysis
import numpy as np
from scipy.stats import mannwhitneyu
import pandas as pd
import tempparse as paf

# plotting
import matplotlib.pyplot as plt
import seaborn as sns
plt.rcParams.update({'font.size': 14})
plt.rcParams['font.family'] = 'sans-serif'
plt.rcParams['font.sans-serif'] = "Arial"

In [3]:
import warnings
warnings.filterwarnings('ignore')

## Setting up

In [4]:
# Calculate scaled hydropathy
kd_scale = {"A": 1.8, "R": -4.5, "N": -3.5, "D": -3.5, "C": 2.5, "E": -3.5,
                      "Q": -3.5, "G": -0.4, "H": -3.2, "I": 4.5, "L": 3.8, "K": -3.9,
                      "M": 1.9, "F": 2.8, "P": -1.6, "S": -0.8, "T": -0.7, "W": -0.9,
                      "Y": -1.3, "V": 4.2}

vals = list(kd_scale.values())
scaled_vals = [v-np.min(vals) for v in vals]


kd_scale_norm = {}
for k in kd_scale.keys():
    kd_scale_norm[k] = (kd_scale[k] - np.min(vals)) / np.max(scaled_vals)

In [5]:
def append_hydropathy(row):
    seq = row['region_seq']
    return paf.calculate_hydropathy(seq, scale=kd_scale_norm)

In [6]:
def append_norm_nc(row):
    seq = row['region_seq']
    nc = np.abs((seq.count('R') + seq.count('K') - seq.count('E') + seq.count('D'))) / row['len_region']
    return nc

In [7]:
def append_uversky_pred(row):
    norm_hydrop = row['norm_hydrop']
    norm_nc = row['norm_nc']
    if norm_nc > (2.785 * norm_hydrop - 1.151):
        return "disordered"
    else:
        return "helix"

In [8]:
def append_uversky_correctness(row):
    if row['label'] == row['uversky_pred']:
        return 1
    else:
        return 0

## Test dataset of LR Model

In [9]:
df = pd.read_csv('../../data/af_regions/sc_af_regions_testing.csv')
df.head()

Unnamed: 0,region_seq,start,end,uni_id,len_region,label,freq_A,freq_C,freq_D,freq_E,...,freq_M,freq_N,freq_P,freq_Q,freq_R,freq_S,freq_T,freq_V,freq_W,freq_Y
0,VHKQKLLNDLTYAFSSSLRKIDEERSTIEKFDNKIN,372,407,P18409,36,helix,0.027778,0.0,0.083333,0.083333,...,0.0,0.083333,0.0,0.027778,0.055556,0.111111,0.055556,0.027778,0.0,0.027778
1,DKIEQKWQDEQELKKKEKELKRKNDAEAKRLRMEERKRQQMQKKIA...,176,249,P31376,74,helix,0.067568,0.0,0.040541,0.175676,...,0.027027,0.027027,0.0,0.148649,0.081081,0.0,0.0,0.0,0.013514,0.0
2,PIEMEEQRMTALKEITDIEYKFAQLRQKLYDNQLVRLQTELQMCLE,171,216,P31385,46,helix,0.043478,0.021739,0.043478,0.152174,...,0.065217,0.021739,0.021739,0.130435,0.065217,0.0,0.065217,0.021739,0.0,0.043478
3,YSIREAAVNNLKRLTEIFGSDWCRDEIISRLL,490,521,P31383,32,helix,0.0625,0.03125,0.0625,0.09375,...,0.0,0.0625,0.0,0.0,0.125,0.09375,0.03125,0.03125,0.03125,0.03125
4,FNQYLDKVYKKHFSKVMSRTRYVLMNFFKDAFTGGAFMYPFKGFLEFNT,34,82,P31379,49,helix,0.040816,0.0,0.040816,0.020408,...,0.061224,0.061224,0.020408,0.020408,0.040816,0.040816,0.061224,0.061224,0.0,0.081633


In [10]:
df['norm_hydrop'] = df.apply(lambda row: append_hydropathy(row), axis=1)
df['norm_nc'] = df.apply(lambda row: append_norm_nc(row), axis=1)
df['uversky_pred'] = df.apply(lambda row: append_uversky_pred(row), axis=1)
df['uversky_correctness'] = df.apply(lambda row:append_uversky_correctness(row), axis=1)

In [11]:
df.to_csv('../../data/uversky/uversky_all.csv', index=False)

In [12]:
df = df.sort_values(by='label', ascending=False)

## Random non-highly-charged regions

In [13]:
df_rd = pd.read_csv('../../data/af_regions/random_af_regions_low_thresh.csv')
df_rd.head()

Unnamed: 0,uni_id,seq,left_bound,right_bound,label
0,P53258,DSFTPTLTRDEKFRLKYKLPANENILEDTNAEVSFATSIKDGKGHS...,18,73,disordered
1,P47154,AHEIGHWQKNHIVNMVIFSQLHTFLIFSLFTSIYRNTSFYNTFGFF...,295,392,helix
2,P46973,LKCYKDAAKHVHKESEQPRAGTEANVEVVNNDKIINSSLAMNKTLK...,29,93,disordered
3,P32849,ILQLDCFLTSLIFEERNDGESLMKRRRTEGGNKREKDNGNFGRTLT...,279,362,disordered
4,P49687,LAQSKPSDKEVILKTDGTFGTLSGKDDSIVEEKAYEPDLSDADFEG...,639,722,disordered


In [14]:
df_rd = df_rd.rename(columns={'seq': 'region_seq'})

In [15]:
def append_region_len(row):
    return len(row['region_seq'])

In [16]:
df_rd['len_region'] = df_rd.apply(lambda row: append_region_len(row), axis=1)

In [17]:
df_rd['norm_hydrop'] = df_rd.apply(lambda row: append_hydropathy(row), axis=1)
df_rd['norm_nc'] = df_rd.apply(lambda row: append_norm_nc(row), axis=1)
df_rd['uversky_pred'] = df_rd.apply(lambda row: append_uversky_pred(row), axis=1)
df_rd['uversky_correctness'] = df_rd.apply(lambda row:append_uversky_correctness(row), axis=1)

In [18]:
df_rd.to_csv('../../data/uversky/uversky_random.csv', index=False)

## Highly charged regions - trimmed

In [19]:
df_hc = pd.read_csv('../../data/charged_regions/cr_trimmed_filtered_aflabel.csv')
df_hc.head()

Unnamed: 0,orf,gene,seq.len,left.bound,right.bound,region.seq,region.len,charge.asymmetry,frac.charge,uni_id,orf_label,kappa,label
0,YAL011W,SWC3,626,5,53,RTRSKESSIEQKPASRTRTRSRRGKRGRDDDDDDDDEESDDAYDEVGND,49,0.017593,0.591837,P31376,verified,0.539053,disordered
1,YAL011W,SWC3,626,169,261,RLFILKNDKIEQKWQDEQELKKKEKELKRKNDAEAKRLRMEERKRQ...,93,0.052936,0.55914,P31376,verified,0.098538,helix
2,YAL011W,SWC3,626,361,424,KTAATEPEPKKADDENAEKQQSKEAKTTAESTQVDVKKEEEDVKEK...,64,0.007812,0.5,P31376,verified,0.089231,disordered
3,YAL013W,DEP1,406,84,159,ESLKRPHEDEKEAIDEAKKMKVPGENEDESKEEEKSQELEEAIDSK...,76,0.138444,0.592105,P31385,verified,0.219047,disordered
4,YAL019W,FUN30,1132,463,538,ERETKRIRNTTKPKVVEDEDDDVDLEAIDDELPQSEHEDDDYEEED...,76,0.269474,0.644737,P31380,verified,0.498598,disordered


In [20]:
df_hc = df_hc.rename(columns={'region.seq': 'region_seq', 'region.len': 'len_region'})

In [21]:
df_hc['norm_hydrop'] = df_hc.apply(lambda row: append_hydropathy(row), axis=1)
df_hc['norm_nc'] = df_hc.apply(lambda row: append_norm_nc(row), axis=1)
df_hc['uversky_pred'] = df_hc.apply(lambda row: append_uversky_pred(row), axis=1)
df_hc['uversky_correctness'] = df_hc.apply(lambda row:append_uversky_correctness(row), axis=1)

In [22]:
df_hc.to_csv('../../data/uversky/uversky_hc_trimmed.csv', index=False)

## Accuracy

In [23]:
cnf_matrix = np.array([[0, 0], [0, 0]])
cnf_matrix[0, 0] = len(df_hc[(df_hc.label == 'disordered') & (df_hc.uversky_pred == 'disordered')])
cnf_matrix[0, 1] = len(df_hc[(df_hc.label == 'disordered') & (df_hc.uversky_pred == 'helix')])
cnf_matrix[1, 0] = len(df_hc[(df_hc.label == 'helix') & (df_hc.uversky_pred == 'disordered')])
cnf_matrix[1, 1] = len(df_hc[(df_hc.label == 'helix') & (df_hc.uversky_pred == 'helix')])

In [24]:
cnf_matrix

array([[382,   0],
       [237,   0]])

In [30]:
1 - np.sum(df_hc['uversky_correctness']) / len(df_hc)

0.382875605815832

In [27]:
len(df_hc[(df_hc.label == 'helix') & (df_hc.uversky_pred == 'disordered')]) / len(df_hc[df_hc.label == 'helix'])

1.0

In [60]:
len(df[(df.label == 'helix') & (df.uversky_pred == 'helix')]) / len(df[df.uversky_pred == 'helix'])

0.7138952164009111

In [64]:
len(df[(df.label == 'disordered') & (df.uversky_pred == 'disordered')]) / len(df[df.uversky_pred == 'disordered'])

0.7792286668691163

In [65]:
len(df[(df.label == 'disordered') & (df.uversky_pred == 'helix')]) / len(df[df.uversky_pred == 'helix'])

0.28610478359908886