In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
from matplotlib import pyplot
from matplotlib import pyplot as plt
import seaborn as sns
from tqdm import tqdm
from scipy.stats import ttest_ind
from numpy import median

# Path to the functional-classification table. The file name suggests the
# thresholds used upstream (p = 0.05, effect size = 2.0) -- TODO confirm against
# the script that produced it. This variable is reused later in the notebook
# to derive the name of the exported "_functional.csv" file.
input_data_str = 'data/func_class_p0.05_effectsize2.0.csv'
# Load the table. Judging by the displayed rows, each row carries one assay
# reading (columns `assay` / `assay_value`) for a construct.
df = pd.read_csv(input_data_str)
# Bare last expression so the notebook renders the frame.
df

Unnamed: 0.1,Unnamed: 0,correct,protein_id,sp_id,run_label,seqID,prot_correct,sp_correct,family,run_id,run_index,assay,assay_value,old_correct,plot_id,func
0,0,True,Arnold_002,sps12-2,amylase_1,seq19,True,True,amylase,19_12-2,21,assay_3_value,0.502,True,,True
1,1,True,Arnold_002,sps12-2,amylase_1,seq19,True,True,amylase,19_12-2,33,assay_1_value,0.378,True,Pro_02-2,True
2,2,True,Arnold_002,sps12-2,amylase_1,seq19,True,True,amylase,19_12-2,21,assay_4_value,0.618,True,,True
3,3,True,Arnold_002,sps12-2,amylase_1,seq19,True,True,amylase,19_12-2,21,assay_1_value,0.430,True,Pro_02-2,True
4,4,True,Arnold_002,sps12-2,amylase_1,seq19,True,True,amylase,19_12-2,33,assay_2_value,0.294,True,,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1348,1348,True,Arnold_039,Neg3,negative,seq3,True,True,negative,3_3_neg,1147,assay_1_value,0.572,False (-SP/+Protein),random_3,False
1349,1349,True,Arnold_039,Neg3,negative,seq3,True,True,negative,3_3_neg,1144,assay_2_value,0.384,False (-SP/-Protein),,False
1350,1350,True,Arnold_039,Neg3,negative,seq3,True,True,negative,3_3_neg,1147,assay_3_value,0.484,False (-SP/+Protein),,False
1351,1351,True,Arnold_039,Neg3,negative,seq3,True,True,negative,3_3_neg,1147,assay_2_value,0.574,False (-SP/+Protein),,False


In [2]:
def sp_prot_match(row):
    ''' Return True if the SP ID matches the protein ID in the row's run_id
    and the row is not a positive/negative control.

    The run_id is normalised by turning '-' into '_' and splitting on '_'.
    The first token is compared with the second (judging by the sample rows,
    '19_12-2' pairs seq19 with sps12-2, so it does not match, whereas
    '39_39-3' does).

    Args:
        row: mapping-like object (e.g. a DataFrame row) with 'family' and
            'run_id' entries.

    Returns:
        bool: True only when
            * row['family'] is neither 'positive' nor 'negative',
            * the first two run_id tokens are equal, and
            * the last run_id token is not 'neg' / 'pos'.
        False otherwise, including when run_id is missing (non-string, e.g.
        NaN) or has fewer than two tokens.
    '''
    # Controls never count as a match.
    if row['family'] in ('positive', 'negative'):
        return False

    run_id = row['run_id']
    # A missing value read from the CSV arrives as a float NaN; it cannot match.
    if not isinstance(run_id, str):
        return False

    tokens = run_id.replace('-', '_').split('_')
    # Without a separator there is no second token to compare against.
    if len(tokens) < 2:
        return False

    return tokens[0] == tokens[1] and tokens[-1] not in ('neg', 'pos')

# Flag, row by row, the constructs whose SP and protein IDs agree.
df['match'] = df.apply(sp_prot_match, axis=1)

In [3]:
# Collapse the per-measurement rows: group on every column except the ones
# that vary per reading, which are removed from the key list below.
# `list.remove` is kept deliberately -- it raises if an expected column is
# missing, so schema drift in the input file fails loudly here.
cols = list(df.columns)
for per_measurement_col in ('assay', 'assay_value', 'Unnamed: 0', 'run_index', 'old_correct'):
    cols.remove(per_measurement_col)

# numeric_only=True: the non-key columns include text columns (`assay`,
# `old_correct`); pandas >= 2.0 raises TypeError when asked to average them,
# while older versions silently dropped them. Passing the flag explicitly
# gives the same result on both.
# NOTE(review): groupby discards rows whose key columns contain NaN, and
# `plot_id` appears empty on many rows in the displayed table -- confirm that
# dropping those rows is intended.
df = df.groupby(cols).mean(numeric_only=True).reset_index()
# The averaged numeric columns are not used further; keep only the keys.
df = df.drop(['Unnamed: 0', 'assay_value', 'run_index'], axis=1)

### !!!! ####
# Throwing out seq12, which did not have sufficient activity for any construct
df = df[df['seqID'] != 'seq12']

In [4]:
# Number of constructs flagged as an SP/protein match.
len(df.loc[df['match'].eq(True)])

29

In [5]:
# Overall summary: fraction of functional constructs per category.

# Negative controls
neg = df[df['family'] == 'negative'].copy()
negfunc = neg[neg['func'] == True]

# Positive controls
pos = df[df['family'] == 'positive'].copy()
posfunc = pos[pos['func'] == True]

# Everything that is not a control is treated as a generated construct
gen = df[~df['family'].isin(['positive', 'negative'])].copy()
genfunc = gen[gen['func'] == True]

# Generated constructs whose SP ID matches the protein ID
matched = gen[gen['match'] == True].copy()
matchfunc = matched[matched['func'] == True]

# One report line per category: "<fraction> : <functional> out of <total> <description>"
for n_func, n_total, description in (
    (len(negfunc), len(neg), 'negative constructs functional'),
    (len(posfunc), len(pos), 'positive constructs functional'),
    (len(genfunc), len(gen), 'generated functional'),
    (len(matchfunc), len(matched), 'sps generated for the matching protein are functional'),
):
    if n_total == 0:
        # An empty category would otherwise raise ZeroDivisionError and abort
        # the cell; the per-target breakdown guards the same way.
        print('n/a', ':', n_func, 'out of', n_total, description)
    else:
        print("{0:.2f}".format(n_func / n_total), ':', n_func, 'out of', n_total, description)

0.06 : 1 out of 18 negative constructs functional
0.79 : 27 out of 34 positive constructs functional
0.48 : 53 out of 111 generated functional
0.62 : 18 out of 29 sps generated for the matching protein are functional


In [6]:
# Generated constructs = every family other than the positive/negative controls.
is_control = df['family'].isin(['positive', 'negative'])
gen = df.loc[~is_control].copy()

# Keep the functional ones and write them next to the input file
# (input name minus its 4-character extension, plus '_functional.csv').
gen_func = gen.loc[gen['func'] == True]
functional_csv_path = input_data_str[:-4] + '_functional.csv'
gen_func.to_csv(functional_csv_path)
gen_func

Unnamed: 0,correct,protein_id,sp_id,run_label,seqID,prot_correct,sp_correct,family,run_id,plot_id,func,match
12,True,Arnold_001,sps2-2,amylase_2,seq39,True,True,amylase,39_2-2,Lip_08-2,True,False
15,True,Arnold_001,sps20-2,amylase_1,seq39,True,True,amylase,39_20-2,Pro_04-2,True,False
17,True,Arnold_001,sps20-4,amylase_1,seq39,True,True,amylase,39_20-4,Pro_04-4,True,False
19,True,Arnold_001,sps39-3,amylase_1,seq39,True,True,amylase,39_39-3,Amy_01-3,True,True
20,True,Arnold_001,sps39-4,amylase_1,seq39,True,True,amylase,39_39-4,Amy_01-4,True,True
22,True,Arnold_002,sps12-2,amylase_1,seq19,True,True,amylase,19_12-2,Pro_02-2,True,False
23,True,Arnold_002,sps12-4,amylase_1,seq19,True,True,amylase,19_12-4,Pro_02-4,True,False
24,True,Arnold_002,sps19-1,amylase_1,seq19,True,True,amylase,19_19-1,Amy_02-1,True,True
25,True,Arnold_002,sps19-2,amylase_1,seq19,True,True,amylase,19_19-2,Amy_02-2,True,True
26,True,Arnold_002,sps19-3,amylase_1,seq19,True,True,amylase,19_19-3,Amy_02-3,True,True


In [7]:
# Per-target breakdown: for every seqID, report the functional fraction of
# negative controls, positive controls, generated constructs, and generated
# constructs whose SP matches the target.
seqs = sorted(set(df['seqID'].values))

# NOTE(review): these two counters are initialised but never updated in this cell.
neg_total = 0
negfunc_total = 0

for seqID in seqs:
    print(seqID)
    # All constructs for this target
    _df = df.loc[df['seqID'] == seqID].copy()
    print(_df['protein_id'].iloc[0])

    # Controls
    neg = _df.loc[_df['family'] == 'negative'].copy()
    negfunc = neg.loc[neg['func'] == True]
    pos = _df.loc[_df['family'] == 'positive'].copy()
    posfunc = pos.loc[pos['func'] == True]

    # Generated data (anything that is not a control)
    gen = _df.loc[~_df['family'].isin(['negative', 'positive'])].copy()
    genfunc = gen.loc[gen['func'] == True]

    # Generated constructs flagged as an SP/protein match
    matched = gen.loc[gen['match'] == True].copy()
    matchfunc = matched.loc[matched['func'] == True]

    # (functional rows, all rows, trailing words of the report line, message when empty)
    report_rows = (
        (negfunc, neg, ('negatives functional',), 'no negatives found'),
        (posfunc, pos, ('positives functional',), 'no positives found'),
        (genfunc, gen, ('generated functional',), 'no generated functional'),
        (matchfunc, matched, ('generated for', str(seqID), 'functional'), 'no matched found'),
    )
    for func_rows, all_rows, tail_words, empty_message in report_rows:
        if len(all_rows) == 0:
            print(empty_message)
        else:
            print("{0:.2f}".format(len(func_rows) / len(all_rows)), ':',
                  len(func_rows), 'out of', len(all_rows), *tail_words)
    print('-----\n')

# Also get the number where SP matches prot

seq1
Arnold_036
0.33 : 1 out of 3 negatives functional
0.83 : 5 out of 6 positives functional
0.50 : 4 out of 8 generated functional
no matched found
-----

seq19
Arnold_002
no negatives found
no positives found
0.70 : 7 out of 10 generated functional
1.00 : 4 out of 4 generated for seq19 functional
-----

seq21
Arnold_015
no negatives found
1.00 : 1 out of 1 positives functional
0.70 : 7 out of 10 generated functional
0.75 : 3 out of 4 generated for seq21 functional
-----

seq25
Arnold_026
0.00 : 0 out of 3 negatives functional
0.80 : 4 out of 5 positives functional
0.58 : 7 out of 12 generated functional
0.75 : 3 out of 4 generated for seq25 functional
-----

seq26
Arnold_040
0.00 : 0 out of 3 negatives functional
0.80 : 4 out of 5 positives functional
0.67 : 8 out of 12 generated functional
0.75 : 3 out of 4 generated for seq26 functional
-----

seq3
Arnold_039
0.00 : 0 out of 3 negatives functional
0.50 : 3 out of 6 positives functional
0.20 : 2 out of 10 generated functional
0.00 