# Script for generating scores for Pymol visualization
- Author: Savandara BESSE & Leo BLONDEL
- Creation: 06-30-2017
- Last modification : 09-03-2019


### Required inputs
- ../Data/01_Oskar_identification/oskar_tracker_results/oskar_filtered.aligned.fasta
- ../Data/02_Oskar_analyses/2.5/FASTA/OSKAR_Monomeric_alignment.fasta
- ../Data/02_Oskar_analyses/2.5/FASTA/OSKAR_Dimeric_alignment.fasta
- ../Data/02_Oskar_analyses/2.4/FASTA/OSKAR_hemimetabola.fasta
- ../Data/02_Oskar_analyses/2.4/FASTA/OSKAR_holometabola.fasta

#### If the CSV files have already been generated:
- Call `createTable(CSV_PATH)` directly
> Note: `logratio.csv` and `RNABindR_scores.csv` must be provided

### Description
Computes all possible scores for Oskar alignment using `score_conservation.py` and `besse_blondel_conservation_scores.py`

### Generated outputs
Available in `../Data/03_Oskar_scores_generation/CSV/`
- scores.csv
- mapping.csv

In [297]:
import numpy as np
import pandas as pd
import os, progressbar, re, sys, time

from Bio.Seq import Seq
from Bio import AlignIO, SeqIO
from Bio.SeqRecord import SeqRecord
from Bio.Alphabet import generic_protein
from Bio.Align import MultipleSeqAlignment

## STEP 1 : Run all Python scripts to generate the scores tables for each alignment (optional if CSV files already generated)

In [298]:
fasta_files = {
    'OSKAR': "../Data/01_Oskar_identification/oskar_tracker_results/oskar_filtered.aligned.fasta",
    'OSKAR_Monomers': "../Data/02_Oskar_analyses/2.5/FASTA/OSKAR_Monomeric_alignment.fasta",
    'OSKAR_Dimers': "../Data/02_Oskar_analyses/2.5/FASTA/OSKAR_Dimeric_alignment.fasta",
    'OSKAR_Hemimetabola': "../Data/02_Oskar_analyses/2.4/FASTA/OSKAR_hemimetabola.fasta",
    'OSKAR_Holometabola': "../Data/02_Oskar_analyses/2.4/FASTA/OSKAR_holometabola.fasta"
}

In [299]:
CSV_PATH = '../Data/03_Oskar_scores_generation/CSV'

In [245]:
fasta_files = {
    'OSKAR_Monomers': "../Data/02_Oskar_analyses/2.5/FASTA/OSKAR_Monomeric_alignment.fasta",
    'OSKAR_Dimers': "../Data/02_Oskar_analyses/2.5/FASTA/OSKAR_Dimeric_alignment.fasta",
}

In [300]:
# Run both conservation scorers on every alignment listed in `fasta_files`,
# writing their outputs into CSV_PATH. The `!` lines are IPython shell
# escapes, so this cell only runs under IPython/Jupyter.
bar = progressbar.ProgressBar()
for alignment in bar(fasta_files):
    ## JSD_scores
    !python2 ./score_conservation.py -o {os.path.join(CSV_PATH, alignment + "_JSD_scores.csv")} {fasta_files[alignment]} 
    # Other scores, written to <CSV_PATH>/<alignment>_other_scores.csv
    !python3 besse_blondel_conservation_scores.py -a {fasta_files[alignment]} -o {os.path.join(CSV_PATH, alignment)}_other_scores.csv  

N/A% (0 of 5) |                          | Elapsed Time: 0:00:00 ETA:  --:--:--

Computing Scores
 [Elapsed Time: 0:00:00] |                                  | (ETA:  --:--:--) Calculating Weights...
 [Elapsed Time: 0:00:07] |##################################| (Time:  0:00:07) 


 20% (1 of 5) |#####                     | Elapsed Time: 0:00:12 ETA:   0:00:50

Computing Scores
 [Elapsed Time: 0:00:00] |                                  | (ETA:  --:--:--) Calculating Weights...
^C
Traceback (most recent call last):
  File "besse_blondel_conservation_scores.py", line 276, in <module>
    scores.save_Score(score_type, output, weighted)
  File "besse_blondel_conservation_scores.py", line 172, in save_Score
    results = self.get_Score(score_type, weighted)
  File "besse_blondel_conservation_scores.py", line 160, in get_Score
    scores['valdar'].append(self.Score_Valdar(position))
  File "besse_blondel_conservation_scores.py", line 70, in Score_Valdar
    self.get_weight_sequences()
  File "besse_blondel_conservation_scores.py", line 50, in get_weight_sequences
    W.append(self.weight(i))
  File "besse_blondel_conservation_scores.py", line 42, in weight
    w += self.distance_seq(list(self.alignment[i].seq),list(self.alignment[j].seq))
  File "/home/lblondel/.local/lib/python3.5/site-packages/Bio/Seq.py", line 237, in __getitem__
    return sel

 40% (2 of 5) |##########                | Elapsed Time: 0:00:23 ETA:   0:00:32

Computing Scores
 [Elapsed Time: 0:00:00] |                                  | (ETA:  --:--:--) Calculating Weights...
 [Elapsed Time: 0:00:10] |######                            | (ETA:   0:00:45) ^C
Traceback (most recent call last):
  File "besse_blondel_conservation_scores.py", line 276, in <module>
    scores.save_Score(score_type, output, weighted)
  File "besse_blondel_conservation_scores.py", line 172, in save_Score
    results = self.get_Score(score_type, weighted)
  File "besse_blondel_conservation_scores.py", line 160, in get_Score
    scores['valdar'].append(self.Score_Valdar(position))
  File "besse_blondel_conservation_scores.py", line 85, in Score_Valdar
    M = self.valdar_table[Six][Sjx]
KeyboardInterrupt
 [Elapsed Time: 0:00:10] |##################################| (Time:  0:00:10) 


 60% (3 of 5) |###############           | Elapsed Time: 0:00:35 ETA:   0:00:22

^C
Traceback (most recent call last):
  File "./score_conservation.py", line 813, in <module>
    names, alignment = read_fasta_alignment(align_file)
  File "./score_conservation.py", line 664, in read_fasta_alignment
    if aa not in iupac_alphabet:
KeyboardInterrupt
^C
Traceback (most recent call last):
  File "besse_blondel_conservation_scores.py", line 14, in <module>
    import pandas as pd
  File "/home/lblondel/.local/lib/python3.5/site-packages/pandas/__init__.py", line 42, in <module>
    from pandas.core.api import *
  File "/home/lblondel/.local/lib/python3.5/site-packages/pandas/core/api.py", line 10, in <module>
    from pandas.core.groupby import Grouper
  File "/home/lblondel/.local/lib/python3.5/site-packages/pandas/core/groupby.py", line 49, in <module>
    from pandas.core.frame import DataFrame
  File "/home/lblondel/.local/lib/python3.5/site-packages/pandas/core/frame.py", line 67, in <module>
    from pandas.core.generic import NDFrame, _shared_docs
  File "/home/l

 80% (4 of 5) |####################      | Elapsed Time: 0:00:35 ETA:   0:00:06

Computing Scores
 [Elapsed Time: 0:00:00] |                                  | (ETA:  --:--:--) Calculating Weights...
^C
Traceback (most recent call last):
  File "besse_blondel_conservation_scores.py", line 276, in <module>
    scores.save_Score(score_type, output, weighted)
  File "besse_blondel_conservation_scores.py", line 172, in save_Score
    results = self.get_Score(score_type, weighted)
  File "besse_blondel_conservation_scores.py", line 160, in get_Score
    scores['valdar'].append(self.Score_Valdar(position))
  File "besse_blondel_conservation_scores.py", line 70, in Score_Valdar
    self.get_weight_sequences()
  File "besse_blondel_conservation_scores.py", line 50, in get_weight_sequences
    W.append(self.weight(i))
  File "besse_blondel_conservation_scores.py", line 42, in weight
    w += self.distance_seq(list(self.alignment[i].seq),list(self.alignment[j].seq))
  File "/home/lblondel/.local/lib/python3.5/site-packages/Bio/Seq.py", line 230, in __getitem__
    def __geti

100% (5 of 5) |##########################| Elapsed Time: 0:00:42 Time:  0:00:42


In [186]:
RNA = pd.read_csv('../Data/03_Oskar_scores_generation/CSV/RNABindR_scores.csv')

In [187]:
mapping  = pd.read_csv('../Data/03_Oskar_scores_generation/CSV/mapping.csv')

In [188]:
# Add the 'alignment' column of `mapping` to the RNABindR table (joined on
# 'structure'), then write the result back to the same CSV it was read from.
# NOTE(review): because input and output are the same file, re-running this
# cell presumably merges a second time -- confirm before re-executing.
RNA = RNA.merge(mapping[['structure', 'alignment']], on="structure")
RNA.to_csv('../Data/03_Oskar_scores_generation/CSV/RNABindR_scores.csv', index=False)

## STEP 2: Merge CSV tables and make scores.csv table

In [192]:
def _alignment_suffix(filename):
    """Return the column suffix for the alignment subset named in *filename*.

    Markers are tested in the order Holo, Hemi, Dim, Mon and the first one
    found wins; a file name containing none of them gets an empty suffix.
    """
    for marker, suffix in (('Holo', 'holo'), ('Hemi', 'hemi'),
                           ('Dim', 'dim'), ('Mon', 'mon')):
        if marker in filename:
            return suffix
    return ''


def _merge_scores(scores, tmp):
    """Outer-merge *tmp* into *scores* on the 'position' column."""
    if len(scores) == 0:
        scores['position'] = tmp['position']
    return scores.merge(tmp, on='position', how='outer')


def collectScores(CSV_PATH, length, validate=False):
    """Merge every score table found in *CSV_PATH* into one DataFrame.

    Files are recognised by substrings of their names:

    * ``holo_hemi_conservation``: 'alignment' is renamed to 'position' and
      'ratio_log' to 'Logratio'.
    * ``RNABindR``: 'alignment' is renamed to 'position'.
    * ``OSKAR`` + ``JSD``: tab-separated file with '#' comments; the score
      column is named 'JSD<suffix>' and values of -1000 are replaced by 0.
    * ``OSKAR`` without ``JSD``: 'valdar', 'hydro' and 'elec' are renamed to
      'Valdar<suffix>', 'Hydro<suffix>' and 'Elec<suffix>'.

    Parameters
    ----------
    CSV_PATH : str
        Directory containing the CSV files.
    length : int
        Number of positions; the result starts with positions 0..length-1.
    validate : bool, optional
        When True, print the number of missing values per column and raise
        if any column other than 'RNABindR' has missing values. Defaults to
        False, which returns the merged table unchecked (this check used to
        sit after a ``return`` and could never run).

    Returns
    -------
    pandas.DataFrame
        One row per position, with the 'domain_x' column dropped and
        'domain_y' renamed to 'domain'.

    Raises
    ------
    KeyError
        If the merged table has no 'domain_x' column.
    ValueError
        If *validate* is True and a column other than 'RNABindR' has
        missing values.
    """
    scores = pd.DataFrame()
    scores['position'] = range(length)
    # List the directory once instead of once per loop iteration.
    for CSV in os.listdir(CSV_PATH):
        path = os.path.join(CSV_PATH, CSV)
        # The three tests are independent on purpose: a file name may match
        # more than one of them.
        if 'holo_hemi_conservation' in CSV:
            tmp = pd.read_csv(path)
            tmp = tmp.rename(columns={'alignment': 'position', 'ratio_log': 'Logratio'})
            scores = _merge_scores(scores, tmp)
        if 'RNABindR' in CSV:
            tmp = pd.read_csv(path)
            tmp = tmp.rename(columns={'alignment': 'position'})
            scores = scores.merge(tmp, on='position', how='outer')
        if 'OSKAR' in CSV:
            suf = _alignment_suffix(CSV)
            if 'JSD' in CSV:
                jsd_col = 'JSD{}'.format(suf)
                tmp = pd.read_csv(path, comment='#', sep='\t', names=[jsd_col, 'column']).reset_index()
                tmp = tmp.rename(columns={'index': 'position'})
                tmp = tmp[['position', jsd_col]]
                # NOTE(review): -1000 looks like a "not scored" sentinel of
                # score_conservation.py -- confirm.
                tmp = tmp.replace(-1000, 0)
            else:
                tmp = pd.read_csv(path)
                tmp = tmp.rename(columns={
                    'Unnamed: 0': 'position',
                    'valdar': 'Valdar{}'.format(suf),
                    'hydro': 'Hydro{}'.format(suf),
                    'elec': 'Elec{}'.format(suf),
                })
            scores = _merge_scores(scores, tmp)
    del scores['domain_x']
    scores = scores.rename(columns={'domain_y': 'domain'})

    if validate:
        # Checking that all the data is here and nothing is missing
        for col in scores.columns:
            missing = len(scores[scores[col].isna()])
            print(col, missing)
            if col != 'RNABindR' and missing != 0:
                raise ValueError(
                    "column {!r} has {} missing value(s)".format(col, missing)
                )
    return scores


In [247]:
# Read every record of the filtered Oskar alignment into a list.
OskarAlignPath = "../Data/01_Oskar_identification/oskar_tracker_results/oskar_filtered.aligned.fasta"
OskarAlign = SeqIO.parse(OskarAlignPath, 'fasta')
OskarSeqs = list(OskarAlign)

In [248]:
scores = collectScores(CSV_PATH, len(OskarSeqs[0]))
scores.to_csv('../Data/03_Oskar_scores_generation/CSV/scores.csv', index=False)

In [102]:
OSK_surface = "GHTSGTYNdS---iNsDydaylLDfplm--rMELKCrfRRHERvLq-gl-vsgLt-nG-rnRlKrvQlPEGTq-ivnI-s--imrGKPlVQ-eHdfrlliKEmHNMRlVpiltnlaPLgNYCHDKVLCDKiYR-nKFirSECCHLKvId-hsC-inErGVvRfDcfQAspRQVTGSKEPylfwNkI-rqR-lqV-eTSLEY"

In [111]:
dmel = [s for s in OskarSeqs if "melanoga" in s.description][0]

In [162]:
res = [0 for i in range(dmel.seq.find("TSGT"))]
len(res)

1192

In [163]:
# Score every surface-annotation character from index 2 onwards:
# '-' -> 0, a character equal to its upper-case form -> 1, anything else -> 0.5.
res.extend(
    0 if symbol == '-' else (1 if symbol.upper() == symbol else 0.5)
    for symbol in OSK_surface[2:]
)

In [166]:
# Pad with zeros until `res` is as long as the aligned D. melanogaster sequence.
res.extend([0] * (len(dmel.seq) - len(res)))

In [167]:
scores['OSK_surface'] = res

In [169]:
scores.to_csv('tmp_scores.csv')

In [172]:
scores[scores['OSK_surface'] == 1]['Valdar'].mean()

0.23123268518828735

In [175]:
scores[scores['OSK_surface'] == 0.5]['Valdar'].mean()

0.23128881162268236

In [174]:
scores[(scores['domain'] == 'OSK') & (scores['OSK_surface'] == 0)]['Valdar'].mean()

0.4957269877508402

In [None]:
scores[scores['OSK_surface'] == 0]['Valdar'].mean()

In [176]:
scores[(scores['domain'] == 'OSK')]['Valdar'].mean()

0.5075840381423258

In [177]:
scores[(scores['domain'] == 'LOTUSVASA')]['Valdar'].mean()

0.4883670619729115

In [None]:
# Show the score rows for positions 1380..1392 (the range end is exclusive).
# The closing parenthesis of isin(...) was missing, which made this cell a
# syntax error.
scores[scores['position'].isin(range(1380, 1393))]

### OSK alleles

424, 435, 438, 441, 452, 457, 561, 566, 584, 593, 595

In [282]:
mapping[mapping['structure'].isin([158, 161, 162, 165, 155, 184])]['alignment'].values

array([465, 468, 472, 473, 478, 539])

In [290]:
scores[scores['position'].isin([465, 468, 472, 473, 478, 539])][['structure', 'Dmel_Seq', 'Valdar', 'Elec', 'Hydro']]

Unnamed: 0,structure,Dmel_Seq,Valdar,Elec,Hydro
465,155.0,D,0.601302,-0.741425,2.107018
468,158.0,S,0.420062,0.051451,1.142401
472,161.0,R,0.694775,0.91029,4.328918
473,162.0,A,0.638443,0.005277,1.255594
478,165.0,L,0.090067,0.009235,-0.291293
539,184.0,L,0.486447,-0.002639,-0.689024


In [296]:
scores[scores['position'].isin([465, 468, 472, 473, 478, 539])][['structure', 'Dmel_Seq', 'Valdarholo', 'Valdarhemi', 'Elecholo', 'Elechemi', 'Hydroholo', 'Hydrohemi']]

Unnamed: 0,structure,Dmel_Seq,Valdarholo,Valdarhemi,Elecholo,Elechemi,Hydroholo,Hydrohemi
465,155.0,D,0.610468,0.50442,-0.74928,-0.65625,2.141412,1.734063
468,158.0,S,0.427892,0.413289,0.053314,0.03125,1.134294,1.230312
472,161.0,R,0.695469,0.747159,0.912104,0.890625,4.287752,4.775312
473,162.0,A,0.63611,0.73087,0.002882,0.03125,1.209942,1.750625
478,165.0,L,0.106256,0.0,0.008646,0.015625,-0.331873,0.14875
539,184.0,L,0.497876,0.396095,0.005764,-0.09375,-0.73245,-0.218125


In [295]:
scores.columns

Index(['position', 'RNABindR', 'structure', 'Dmel_Seq', 'JSDmon', 'hemi_elec',
       'hemi_valdar', 'holo_elec', 'holo_valdar', 'domain', 'ratio',
       'Logratio', 'log_ratio', 'JSDholo', 'Elecholo', 'Hydroholo',
       'Valdarholo', 'Elecdim', 'Valdardim', 'Hydrodim', 'JSD', 'JSDhemi',
       'Elec', 'Hydro', 'Valdar', 'Valdarmon', 'Elecmon', 'Hydromon', 'JSDdim',
       'Valdarhemi', 'Hydrohemi', 'Elechemi'],
      dtype='object')

In [294]:
scores[scores['position'].isin([465, 468, 472, 473, 478, 539])][['structure', 'Dmel_Seq', 'Valdardim', 'Valdarmon', 'Elecdim', 'Elecmon', 'Hydrodim', 'Hydromon']]

Unnamed: 0,structure,Dmel_Seq,Valdardim,Valdarmon,Elecdim,Elecmon,Hydrodim,Hydromon
465,155.0,D,0.80994,0.798875,-0.953488,-0.878788,2.620349,1.669545
468,158.0,S,0.52544,0.565621,0.0,0.090909,0.61093,0.229848
472,161.0,R,0.86541,0.736631,0.976744,0.931818,3.85814,4.574545
473,162.0,A,0.682488,0.836787,0.0,0.0,0.613256,1.740303
478,165.0,L,0.616669,0.127867,0.0,0.007576,-1.354884,-0.455606
539,184.0,L,0.495584,0.809248,0.011628,0.015152,0.577209,-1.50303


In [None]:
mapping[mapping['structure'].isin([424, 435, 438, 441, 452, 457, 561, 566, 584, 593, 595])]

## STEP 3: Create base mapping table

In [23]:
df = pd.read_csv('../Data/03_Oskar_scores_generation/CSV/structure_dmel_mapping.csv')

In [24]:
handle = SeqIO.parse('../Data/01_Oskar_identification/oskar_tracker_results/oskar_filtered.aligned.fasta', 'fasta')
dmel = [seq for seq in handle if 'melanogaster' in seq.description][0]

In [25]:
def make_map(df, domain, seq):
    """Map structure residue numbers onto positions of an aligned sequence.

    For every distinct value of ``df['domain']`` the residues listed in
    'Dmel_Seq' are matched, in order, against ``seq.seq``. Characters of
    ``seq.seq`` that do not equal the next expected residue are skipped.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'structure', 'domain' and 'Dmel_Seq'.
    domain : object
        Unused: every domain present in *df* is processed. The parameter is
        kept so existing calls keep working.
    seq : object
        Record whose ``seq`` attribute supports ``count``, ``find`` and
        integer indexing, and whose ``len()`` bounds the scan.

    Returns
    -------
    pandas.DataFrame
        Columns 'structure', 'domain' and 'alignment', one row per mapped
        residue.

    Raises
    ------
    AssertionError
        If the matched residues do not spell out the domain sequence.
    """
    results = []
    # A separate loop name avoids silently rebinding the ``domain`` argument.
    for domain_name in df['domain'].unique():
        in_domain = df[df['domain'] == domain_name]
        PDB_pos = in_domain['structure'].values
        domain_seq = in_domain['Dmel_Seq'].values
        # Look for a short prefix of the domain (2 to 9 residues) that occurs
        # exactly once in the sequence, to anchor the scan.
        for n in range(2, 10):
            nMer = ''.join(domain_seq[:n])
            if seq.seq.count(nMer) == 1:
                break
        # find() returns -1 when the prefix is absent; starting the scan at
        # index -1 would test the last character first and could record
        # position -1, so fall back to scanning from the start.
        start_pos = max(seq.seq.find(nMer), 0)
        mapping = {}
        check = {}

        domain_index = 0
        for pos in range(start_pos, len(seq)):
            if domain_seq[domain_index] == seq.seq[pos]:
                mapping[PDB_pos[domain_index]] = pos
                check[PDB_pos[domain_index]] = seq.seq[pos]
                domain_index += 1
            if domain_index >= len(domain_seq):
                break
        # Sanity check: the residues picked up must reproduce the domain.
        recovered_seq = ''.join(check[key] for key in sorted(check))
        assert ''.join(domain_seq) == recovered_seq
        for key in sorted(mapping):
            results.append([key, domain_name, mapping[key]])
    res = pd.DataFrame(results, columns=['structure', 'domain', 'alignment'])
    return res

In [26]:
mapping = make_map(df, 'OSK', dmel)

In [27]:
mapping = df.merge(mapping, on=['structure', 'domain'], how='outer')

In [66]:
mapping.to_csv('../Data/03_Oskar_scores_generation/CSV/mapping.csv')

## STEP 4 : Create RNABindR scores table (Run if RNABindR\_scores.csv not provided)

### 1. Format OSKAR alignment into dataframe

In [9]:
# Load the filtered Oskar alignment and tabulate it as one row per sequence.
fastaFile = '../Data/01_Oskar_identification/oskar_tracker_results/oskar_filtered.aligned.fasta'
# Pass the path instead of open(fastaFile): the handle created by open() was
# never closed, whereas Biopython opens and closes the file itself.
MSA = AlignIO.read(fastaFile, 'fasta')
length = MSA.get_alignment_length()
records = list(SeqIO.parse(fastaFile, 'fasta'))

ID = [record.id for record in records]
seq = [str(record.seq) for record in records]
# Keep only the text after the last '|' of each record description.
description = [record.description.split('|')[-1] for record in records]

seqInfos = [ID, description, seq]
MSA_DF = pd.DataFrame(data=seqInfos).T
MSA_DF.columns = ['ID', 'Description', 'Sequence']

### 2. Create alignment mapping table

In [10]:
def MSA_MAP(MSA_DF):
    """Return, for each sequence ID, the alignment columns that are not gaps.

    *MSA_DF* must provide the columns 'ID' and 'Sequence'. The result maps
    every ID to the list of 0-based indices of the characters of its
    sequence that differ from '-'.
    """
    return {
        name: [column for column, residue in enumerate(aligned) if residue != '-']
        for name, aligned in zip(MSA_DF['ID'], MSA_DF['Sequence'])
    }

In [11]:
alignment_mapping = MSA_MAP(MSA_DF)

### 3. Collect Oskar sequences into a dictionary

In [12]:
# Index the unaligned Oskar sequences: sequence string -> record ID.
seqFile = '../Data/01_Oskar_identification/oskar_tracker_results/oskar_filtered.fasta'

records = list(SeqIO.parse(seqFile, 'fasta'))
ID = [record.id for record in records]
seq = [str(record.seq) for record in records]

# If two records share a sequence, the later ID wins.
oskInfos = dict(zip(seq, ID))

### 4. Collect scores for RNA prediction from RNABindR outputs

#### A. Collect RNABindR scores per protein positions

In [13]:
def return_id(seq):
    """Return the ID stored in ``oskInfos`` for *seq*, or None if it is absent."""
    return oskInfos.get(seq)

In [14]:
# Parse every RNABindR output file and store its predicted scores under the
# ID of the matching Oskar sequence.
rna_scores = {}
RNABindR_PATH = './RNABindR_raw_sources'
for filename in os.listdir(RNABindR_PATH):
    # ``with`` closes the file even if reading fails.
    with open(os.path.join(RNABindR_PATH, filename)) as score_file:
        allLines = score_file.readlines()
    # Line 2 holds the sequence and line 4 the comma-separated scores.
    sequence = allLines[1].replace('sequence:\t\t', '').strip('\n')
    predicted_scores = allLines[3].strip('\n').replace('predicted score:\t', '').split(',')
    name = return_id(sequence)
    # return_id gives None for an unknown sequence; the former test against
    # the string 'None' was always true and stored those under the key None.
    if name is not None:
        rna_scores[name] = predicted_scores

#### B. Add NaN values to represent gap positions for each protein

In [15]:
def get_position(ID, position):
    """Return the index of *position* in ``alignment_mapping[ID]``.

    Returns None when *position* is not in that list.
    """
    try:
        return alignment_mapping[ID].index(position)
    except ValueError:
        return None

In [16]:
# For every scored protein that is part of the alignment, build one value per
# alignment column: the RNABindR score of the residue found there, or NaN
# where get_position reports no residue.
scores_matrix = {}
# Build the ID lookup once instead of scanning the column for every protein.
aligned_ids = set(MSA_DF['ID'].values)
for ID in rna_scores:
    if ID not in aligned_ids:
        continue
    name = ID
    List = []
    for i in range(length):
        indice = get_position(name, i)
        # Compare against None explicitly: index 0 (the first residue) is a
        # valid result but is falsy, so a bare truth test turned it into NaN.
        if indice is not None:
            List.append(float(rna_scores[ID][indice]))
        else:
            List.append(float('nan'))
    scores_matrix[name] = List

#### C. Save the final score table

In [18]:
# One row per protein, one column per alignment position; missing values are
# replaced by 0 so they count as 0 in the per-position average below.
DF = pd.DataFrame.from_dict(scores_matrix).T.fillna(0)

n_sequences = len(DF.index)
Prediction = []
# Average of each position's column over all proteins.
Scores = [sum(DF[position]) / n_sequences for position in DF.columns]

RNA_scores_df = (
    pd.DataFrame(Scores, columns=['scores'])
    .reset_index()
    .rename(columns={'index': 'position'})
)

In [186]:
RNA_scores_df.to_csv('../Data/03_Oskar_scores_generation/CSV/RNABindR_scores.csv', index=False)