In [1]:
import pandas as pd
import numpy as np
import datetime
import matplotlib.pyplot as plt 
from collections import Counter
import re
import difflib
from Bio.SeqUtils.ProtParam import ProteinAnalysis
import diff

import venn as venn
%matplotlib inline

In [2]:
df = pd.read_excel ('LLPS.xls')
df=df.drop(['Entry ID','Fusion', 'Cleaved', 'Repeat', 'Protein name', 'Pressure', 'Incubation time', 'other molecules', 'Detection method', 'Phase diagram', 'Morphology', 'Description', 'PMID', 'DOI', 'Nucleic acid', 'Protein structure type'], axis=1)
df = df[(df['Components type'] == 'protein(1)') & (df['Protein type (N/D)'] == 'N') & (df['PTM'] =='-')]
df=df.drop([ 'Components type', 'Protein type (N/D)', 'PTM'], axis = 1)


In [3]:
sequence = df['Sequence'].tolist()
lst = [str(residue).split('|') for residue in sequence]
flat_list = []
for sublist in lst:
    flat_list.append(''.join(sublist[-1].split('\n')[1:-1]))
    
df['protein'] = flat_list


In [4]:
boolean = df.groupby(by='Protein ID')
protein_id = []
for index, row in boolean:
    if len(row) >= 5:
        protein_id.append(index)

In [5]:
df=df[df['Protein ID'].isin(protein_id)]
df.index=[df['Protein ID'], df['PSID']]
df=df.drop(['PSID', 'Protein ID'], axis=1)
df['quantified mutation'] = df['Mutation'].apply(lambda x: x.isalnum())

In [6]:
clean_df = df.drop_duplicates(subset=['protein'])
clean_df.groupby(by='Protein ID').size()

Protein ID
p0001     7
p0002    38
p0003     6
p0004     4
p0006     1
p0007     3
p0008    10
p0011     6
p0017     4
p0020    10
p0023    25
p0024     4
p0032    10
p0038    21
p0048     9
p0049     7
p0105     1
p0106     3
p0108     1
p0117     8
p0120     2
p0128     1
p0129     3
p0132     3
p0137     7
p0138     5
p0139     1
p0149     3
p0150     7
p0152     1
p0154     1
p0157     1
p0171     6
p0175     5
p0187     1
p0199     8
p0216     5
p0252     1
p0259     1
p0260     1
p0261    13
p0264     1
p0274     1
p0275     1
p0276     4
p0277     1
p0278     1
dtype: int64

Unique sequences: p0002 - 38, p0008 - 10, p0020 - 10, p0023 - 25, p0032 - 10, p0038 - 21, p0261 - 13

In [7]:
p0002 = df.loc['p0002']

In [8]:
p0002['crowding 0/1'] = p0002['Crowding agent'].apply(lambda x:x == '-')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  p0002['crowding 0/1'] = p0002['Crowding agent'].apply(lambda x:x == '-')


In [9]:
def temp_checker(x):
    if x == 'RT':
        return True
    try:
        x = int(x)
    except: 
        return np.nan
    
    if x>=20 and x<=30:
        return True
    else:
        return False
    
    
def phase_checker(x):
    if x == 'Yes':
        return True
    else:
        return False
    
def salt_checker(x):
    x = str(x).split()[0]
    if x == '-':
        return False
    
    if x[0] in ['≤', '≥', '<', '>','≥']:
        x = x[1:]
        
    if '–' in x:
        x = x.replace('–', '-')
    try:
        x = float(x)
    except:
        x = float(x.split('-')[1])
    
        
    if x>=100 and x<=500:
        return True
    else:
        return False

def tiret(x):
    if type(x) == datetime.datetime:
        x = np.nan
        
    if str(x)[0] in ['≤', '≥', '<', '>', '＞']:
        x = x[1:]
    
    if '–' in str(x):
        x = str(x).replace('–', '-')
        
    if '-' in str(x):
        return x.split('-')[1]
    
    if 'K' in str(x):
        x = int(str(x).strip('K'))-273
        
    else:
        return x
    
z = re.compile('^[\d \. \-]+')
def space_checker(x):
    if ' ' in x:
        return x
    else:
        match = z.match(x)
        match_end = match.end()
        x = x[:match_end]+' '+x[match_end]
        return x

In [10]:
def concentration_checker(data):
    data['solute_split_1'] = data['Solute concentration'].str.split(" \[").str[0]
    data['solute_split_1'] = data['solute_split_1'].str.rstrip()
    data['solute_split_1'] = data['solute_split_1'].str.lstrip()
    data['solute_split_1'] = data['solute_split_1'].apply(lambda x: space_checker(x))
    data['solute_concentration'] = data['solute_split_1'].str.split(' ').str[0]
    data['solute_concentration'] = data['solute_concentration'].apply(lambda x: tiret(x))
    data['solute_conc_unit'] = data['solute_split_1'].str.split(' ').str[1]
    data = data.drop(['solute_split_1'], axis=1)
#data['Mw'] = [ProteinAnalysis(x).molecular_weight() for x in data['Construct_sequence']]  µM 

    data['conc_to_uM_conversion_factor'] = 0
    data['conc_to_uM_conversion_factor'] = np.where(data['solute_conc_unit'] == 'µM', 1, data['conc_to_uM_conversion_factor'])
    data['conc_to_uM_conversion_factor'] = np.where(data['solute_conc_unit'] == 'μM', 1, data['conc_to_uM_conversion_factor'])
    data['conc_to_uM_conversion_factor'] = np.where(data['solute_conc_unit'] == 'µ', 1, data['conc_to_uM_conversion_factor'])
    data['conc_to_uM_conversion_factor'] = np.where(data['solute_conc_unit'] == 'uM', 1, data['conc_to_uM_conversion_factor'])
    data['conc_to_uM_conversion_factor'] = np.where(data['solute_conc_unit'] == 'nM', 0.001, data['conc_to_uM_conversion_factor'])
    data['conc_to_uM_conversion_factor'] = np.where(data['solute_conc_unit'] == 'mM', 1000, data['conc_to_uM_conversion_factor'])
#data['conc_to_uM_conversion_factor'] = np.where(data['solute_conc_unit'] == 'mg/ml',
                                    #10**6/data['Mw'], data['conc_to_uM_conversion_factor'])
#data['conc_to_uM_conversion_factor'] = np.where(data['solute_conc_unit'] == 'mg/mL',
                                    #10**6/data['Mw'], data['conc_to_uM_conversion_factor'])
    data['conc_uM_final'] = data['conc_to_uM_conversion_factor'] * data['solute_concentration'].astype(float)
    data = data.drop(columns = ['solute_concentration', 'solute_conc_unit', 'conc_to_uM_conversion_factor'])
    return data

In [11]:
p0002['clean temp'] = p0002['Temperature'].apply(lambda x: tiret(x))
p0002['temp 0/1'] = p0002['clean temp'].apply(lambda x: temp_checker(x))
p0002['salt 0/1'] = p0002['Salt concentration'].apply(lambda x: salt_checker(x))
p0002['phase 0/1'] = p0002['Phase separation'].apply(lambda x: phase_checker(x))
p0002 = concentration_checker(p0002)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  p0002['clean temp'] = p0002['Temperature'].apply(lambda x: tiret(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  p0002['temp 0/1'] = p0002['clean temp'].apply(lambda x: temp_checker(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  p0002['salt 0/1'] = p0002['Salt concentration'].apply(lambda x:

In [12]:
p0002_1 = pd.DataFrame(p0002[[ 'crowding 0/1', 'temp 0/1', 'phase 0/1', 'salt 0/1']])
p0002_1['sum'] = p0002_1.sum(axis=1)
p0002_1['concentration']=p0002['conc_uM_final']
p0002_1['rank']=p0002_1.sort_values(['sum', 'concentration'], ascending=[False, True]).groupby(['sum', 'concentration'], sort=False).ngroup() + 1
p0002_1 = p0002_1.drop([ 'crowding 0/1', 'temp 0/1', 'phase 0/1', 'salt 0/1'], axis=1)
p0002_1['sequence_final']=p0002['protein']
p0002_1['mutation'] = p0002['quantified mutation']
p0002_1=p0002_1.sort_values(['rank'], ascending=True)
p0002_1.to_csv('p0002.csv')

In [13]:
p0002_1

Unnamed: 0_level_0,sum,concentration,rank,sequence_final,mutation
PSID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
PS00000217,4.0,1.0,1,MASNDYTQQATQSYGAYPTQPGQGYSQQSSQPYGQQSYSGYSQSTD...,True
PS00000137,4.0,1.0,1,MASNDYTQQARQSYGAYPTQPRQGYSQQRSQPYGQQSYSGYSQRTD...,True
PS00000218,4.0,1.0,1,MASNDYTQQATQSYGAYPTQPGQGYSQQSSQPYGQQSYSGYSQSTD...,True
PS00000221,4.0,1.0,1,MASNDYTQQATQSYGAYPTQPGQGYSQQSSQPYGQQSYSGYSQSTD...,True
PS00000224,4.0,1.0,1,MASNDYTQQATQSYGAYPTQPGQGYSQQSSQPYGQQSYSGYSQSTD...,True
...,...,...,...,...,...
PS00000115,2.0,250.0,42,MASNDYTQQATQSYGAYPTQPGQGYSQQSSQPYGQQSYSGYSQSTD...,False
PS00000116,2.0,250.0,42,MASNDYTQQATQSYGAYPTQPGQGYSQQSSQPYGQQSYSGYSQSTD...,False
PS00000212,1.0,1.0,43,MASNDYTQQATQSYGAYPTQPGQGYSQQSSQPYGQQSYSGYSQSTD...,False
PS00000093,1.0,250.0,44,,False


In [14]:
def sequencinator(lst1, lst2):
    compatibility =[]
    for length in lst1:
        length = length.split(':')[1]
        length = length.split('a')[0]
        for sequence in lst2:
            if len(sequence) != int(length):
                compatibility.append(0)
            else:
                compatibility.append(1)
    return compatibility

In [15]:
counts = p0002['protein'].value_counts().to_dict()
def wild_type(counts):
    sequences = {}
    for seq, val in counts.items():
        sequence = {}
        sequence['Name'] = seq
        sequence['frequency'] = val
        sequence['length'] = len(seq)
        sequences[seq] = sequence
    
    return sequences
sequences = wild_type(counts)

def by_length(sequences):
    wt_length = {}
    for seq, dic in sequences.items():
        current_length = dic['length']
        current_sequence = dic['Name']
        if current_length not in wt_length.keys():
            current_frequency = dic['frequency']
            wt_length[current_length] = current_sequence
        elif current_length in wt_length.keys() and dic['frequency'] > current_frequency:
            current_frequency = dic['frequency']
            wt_length[current_length] = current_sequence
            
    return wt_length
wt_length = by_length(sequences)
            

In [16]:
sequences


{'MASNDYTQQATQSYGAYPTQPGQGYSQQSSQPYGQQSYSGYSQSTDTSGYGQSSYSSYGQSQNTGYGTQSTPQGYGSTGGYGSSQSSQSSYGQQSSYPGYGQQPAPSSTSGSYGSSSQSSSYGQPQSGSYSQQPSYGGQQQSYGQQQSYNPPQGYGQQNQYNSSSGGGGGGGGGGNYGQDQSSMSSGGGSGGGYGNQDQSGGGGSGGYGQQDRGGRGRGGSGGGGGGGGGGYNRSSGGYEPRGRGGGRGGRGGMGGSDRGGFNKFGGPRDQGSRHDSEQDNSDNNTIFVQGLGENVTIESVADYFKQIGIIKTNKKTGQPMINLYTDRETGKLKGEATVSFDDPPSAKAAIDWFDGKEFSGNPIKVSFATRRADFNRGGGNGRGGRGRGGPMGRGGYGGGGSGGGGRGGFPSGGGGGGGQQRAGDWKCPNPTCENMNFSWRNECNQCKAPKPDGPGGGPGGSHMGGNYGDDRRGGRGGYD': {'Name': 'MASNDYTQQATQSYGAYPTQPGQGYSQQSSQPYGQQSYSGYSQSTDTSGYGQSSYSSYGQSQNTGYGTQSTPQGYGSTGGYGSSQSSQSSYGQQSSYPGYGQQPAPSSTSGSYGSSSQSSSYGQPQSGSYSQQPSYGGQQQSYGQQQSYNPPQGYGQQNQYNSSSGGGGGGGGGGNYGQDQSSMSSGGGSGGGYGNQDQSGGGGSGGYGQQDRGGRGRGGSGGGGGGGGGGYNRSSGGYEPRGRGGGRGGRGGMGGSDRGGFNKFGGPRDQGSRHDSEQDNSDNNTIFVQGLGENVTIESVADYFKQIGIIKTNKKTGQPMINLYTDRETGKLKGEATVSFDDPPSAKAAIDWFDGKEFSGNPIKVSFATRRADFNRGGGNGRGGRGRGGPMGRGGYGGGGSGGGGRGGFPSGGGGGGGQQRAGDWKCPNPTCENMNFSWRNECNQCKAPKPDGPGGGPGGSHMGGNYGDDRRGGRGGYD',
  'frequency': 54,
  '

In [17]:
wt_length

{480: 'MASNDYTQQATQSYGAYPTQPGQGYSQQSSQPYGQQSYSGYSQSTDTSGYGQSSYSSYGQSQNTGYGTQSTPQGYGSTGGYGSSQSSQSSYGQQSSYPGYGQQPAPSSTSGSYGSSSQSSSYGQPQSGSYSQQPSYGGQQQSYGQQQSYNPPQGYGQQNQYNSSSGGGGGGGGGGNYGQDQSSMSSGGGSGGGYGNQDQSGGGGSGGYGQQDRGGRGRGGSGGGGGGGGGGYNRSSGGYEPRGRGGGRGGRGGMGGSDRGGFNKFGGPRDQGSRHDSEQDNSDNNTIFVQGLGENVTIESVADYFKQIGIIKTNKKTGQPMINLYTDRETGKLKGEATVSFDDPPSAKAAIDWFDGKEFSGNPIKVSFATRRADFNRGGGNGRGGRGRGGPMGRGGYGGGGSGGGGRGGFPSGGGGGGGQQRAGDWKCPNPTCENMNFSWRNECNQCKAPKPDGPGGGPGGSHMGGNYGDDRRGGRGGYD',
 180: 'MASNDYTQQATQSYGAYPTQPGQGYSQQSSQPYGQQSYSGYSQSTDTSGYGQSSYSSYGQSQNTGYGTQSTPQGYGSTGGYGSSQSSQSSYGQQSSYPGYGQQPAPSSTSGSYGSSSQSSSYGQPQSGSYSQQPSYGGQQQSYGQQQSYNPPQGYGQQNQYNSSSGGGGGGGGGGNYGQD',
 120: 'MASNDYTQQATQSYGAYPTQPGQGYSQQSSQPYGQQSYSGYSQSTDTSGYGQSSYSSYGQSQNTGYGTQSTPQGYGSTGGYGSSQSSQSSYGQQSSYPGYGQQPAPSSTSGSYGSSSQSS',
 240: 'QDRGGRGRGGSGGGGGGGGGGYNRSSGGYEPRGRGGGRGGRGGMGGSDRGGFNKFGGPRDQGSRHDSEQDNSDNNTIFVQGLGENVTIESVADYFKQIGIIKTNKKTGQPMINLYTDRETGKLKGEATVSFDDPPSAKAAIDWFDGKEFSGNPIKVSFATRRADFNRGGGNGRGGRGRGGPMG

In [18]:
def mutation_seeker2(sequence):
    mutations = ''
    
    for key in wt_length.keys():
        if (len(sequence) <= key +5) and (len(sequence) >= key-5):
            wild_type = wt_length[key]
    
            if sequence == wild_type:
                return ''
    
        #else:
            s = difflib.SequenceMatcher(None, wild_type, sequence, autojunk=False)
            for tag, i1, i2, j1, j2 in s.get_opcodes():
                if tag =='replace':
                    mutations += wild_type[i1] + str(i1+1) + sequence[j1]+ ' '
                elif tag == 'delete':
                    mutations += wild_type[i1] + str(i1+1) + '△ '
                elif tag == 'insert':
                    try:
                        mutations += '△'+ wild_type[i1] + str(i1+1) +  sequence[j1] + ' '
                    except IndexError: 
                        mutations += '△'+ str(i1) +  sequence[j1] + ' '
    
            lst = mutations.rstrip().split(' ')#[:-1]
            return lst
                

In [19]:
p0002['mutation_seeker'] = p0002['protein'].apply(lambda x: mutation_seeker2(x))
p0002['protein_len'] = p0002['protein'].apply( lambda x: len(x))
p0002['mutation_seeker'].value_counts()
p0002_new = p0002.drop(['protein', 'clean temp', 'Sequence length', 'Sequence', 'Solute concentration', 'Salt concentration', 'Buffer', 'Crowding agent', 'Temperature', 'Phase separation', 'In vivo/In cell', 'quantified mutation', 'crowding 0/1', 'salt 0/1', 'temp 0/1', 'phase 0/1', 'conc_uM_final'], axis =1)
p0002_new['mutation_len'] = [len(mutation) for mutation in p0002_new['mutation_seeker']]

p0002_new.to_csv('p0002_new.csv')

In [21]:
total.dtypes

sum               float64
concentration     float64
rank                int64
sequence_final     object
mutation             bool
Mutation           object
protein_len         int64
mutation_len        int64
dtype: object

In [23]:
total.to_csv('total2.csv')
#simple_df.to_csv('simple2.csv')

In [26]:
wt_rank = total.loc[total['sequence_final'].isin(wt_length.values())]

long_dic = {}
for index, row in wt_rank.iterrows():
    if len(row['sequence_final']) in long_dic.keys():
        continue
    else:
        long_dic[len(row['sequence_final'])] = (row['rank'], row['sequence_final'])

In [27]:
long_dic

{481: (1,
  'MASNDYTQQATQSYGAYPTQPGQGYSQQSSQPYGQQSYSGYSQSTDTSGYGQSSYSSYGQSQNTGYGTQSTPQGYGSTGGYGSSQSSQSSYGQQSSYPGYGQQPAPSSTSGSAGSSSQSSSAGQPQSGSASQQPSAGGQQQSAGQQQSANPPQGYGQQNQANSSSGRGGRGGRGGNYGQDQSSMSSGGGSGGGYGNQDQSGRGGRGGYGQQDRGGRGRGGRGGGRGGRGRGYNRSSGGYEPRGRGGGRGGRGGRGGSDRGGFNKFGGPRDQGSRHDSEQDNSDNNTIFVQGLGENVTIESVADYFKQIGIIKTNKKTGQPMINLYTDRETGKLKGEATVSFDDPPSAKAAIDWFDGKEFSGNPIKVSFATRRADFNRGRGGGRGGRGGRGGPMGRGGYRGGRGGRGGRGGFPSGGGGGGGQQRAGDWKCPNPTCENMNFSWRNECNQCKAPKPDGPRGGPGGSHRGGNYGDDRRGGRGGYD'),
 480: (2,
  'MASNDYTQQATQSYGAYPTQPGQGYSQQSSQPYGQQSYSGYSQSTDTSGYGQSSYSSYGQSQNTGYGTQSTPQGYGSTGGYGSSQSSQSSYGQQSSYPGYGQQPAPSSTSGSYGSSSQSSSYGQPQSGSYSQQPSYGGQQQSYGQQQSYNPPQGYGQQNQYNSSSGGGGGGGGGGNYGQDQSSMSSGGGSGGGYGNQDQSGGGGSGGYGQQDRGGRGRGGSGGGGGGGGGGYNRSSGGYEPRGRGGGRGGRGGMGGSDRGGFNKFGGPRDQGSRHDSEQDNSDNNTIFVQGLGENVTIESVADYFKQIGIIKTNKKTGQPMINLYTDRETGKLKGEATVSFDDPPSAKAAIDWFDGKEFSGNPIKVSFATRRADFNRGGGNGRGGRGRGGPMGRGGYGGGGSGGGGRGGFPSGGGGGGGQQRAGDWKCPNPTCENMNFSWRNECNQCKAPKPDGPGGGPGGSHMGGNYGDDRRGGRGGYD'),
 420:

In [28]:
def mutation_difference(sequence, rank):
    for key, value in long_dic.items():
        if key == len(sequence):
            difference =int(value[0]) - rank
    return difference

In [29]:
total['rank_difference'] = total.apply(lambda x: mutation_difference(x['sequence_final'], x['rank']), axis=1)

In [30]:
total = total.sort_values(by = 'rank_difference', ascending = False)

In [34]:
total = total.drop([ 'mutation'], axis = 1)
total = total.drop_duplicates(subset = 'sequence_final')


total.to_csv('rank_diff2.csv')

In [35]:
total

Unnamed: 0_level_0,sum,concentration,rank,sequence_final,Mutation,protein_len,mutation_len,rank_difference
PSID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
PS00002266,4.0,20.0,7,MASNDYTQQATQSYGAYPTQPGQGYSQQSSQPYGQQSYSGYSQSTD...,-,240,2,14
PS00002170,4.0,80.0,8,ASNDYTQQATQSYGAYPTQPGQGYSQQSSQPYGQQSYSGYSQSTDT...,-,180,2,8
PS00000129,3.0,1.5,13,MASNDYTQQATQSYDAYPTQPGQGYDQQSSQPYDQQSYDGYDQSTD...,M2,180,6,3
PS00000232,4.0,20.0,7,RRADFNRGGGNGRGGRGRGGPMGRGGYGGGGSGGGGRGGFPSGGGG...,-,120,13,2
PS00000217,4.0,1.0,1,MASNDYTQQATQSYGAYPTQPGQGYSQQSSQPYGQQSYSGYSQSTD...,M19,480,8,1
PS00000218,4.0,1.0,1,MASNDYTQQATQSYGAYPTQPGQGYSQQSSQPYGQQSYSGYSQSTD...,M10,480,9,1
PS00000221,4.0,1.0,1,MASNDYTQQATQSYGAYPTQPGQGYSQQSSQPYGQQSYSGYSQSTD...,M8,480,19,1
PS00000224,4.0,1.0,1,MASNDYTQQATQSYGAYPTQPGQGYSQQSSQPYGQQSYSGYSQSTD...,M9,480,20,1
PS00000137,4.0,1.0,1,MASNDYTQQARQSYGAYPTQPRQGYSQQRSQPYGQQSYSGYSQRTD...,M11,480,29,1
PS00000192,4.0,7.0,5,MASNDYTQQATQSYGAYPTQPGQGYSQQSSQPYGQQSYSGYSQSTD...,-,420,0,0
