# 0. Common functions

In [None]:
# Get reverse complement sequence
def reverse_comp(seq):
  """Return the reverse complement of a DNA sequence.

  The sequence is reversed and each base is swapped with its Watson-Crick
  partner (A<->T, C<->G). Lower-case bases are complemented to lower-case.

  Any other character (e.g. 'N', gaps, end markers) is passed through
  unchanged. The previous implementation turned every character that was not
  'A', 'T' or 'C' into 'C', which silently corrupted such input.

  Args:
    seq: Sequence of single-character bases (normally a str).

  Returns:
    The reverse-complemented sequence as a str.
  """
  complement = {
    'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C',
    'a': 't', 't': 'a', 'c': 'g', 'g': 'c',
  }
  # Fall back to the base itself so unknown symbols are not rewritten.
  return ''.join(complement.get(base, base) for base in seq[::-1])

# 1. Import modules & data

In [None]:
import collections
import os
import pathlib
import re
import string
import sys
import tempfile
import time
import logging

import numpy as np
import pandas as pd
import json

import tensorflow as tf

# 2. Identify original RNA sequence & build dictionary

Import unmodified RNA descriptions

In [None]:
# Load the unmodified RNA library descriptions (the first line of the CSV is skipped)
# and replace missing cells with empty strings instead of NaN
rna_library_df = pd.read_csv(
  '1-s2.0-S1097276521002197-mmc3.csv',
  skiprows=1,
).fillna('')

Construct RNA sequence from descriptions

In [None]:
# Identify fixed region: reverse complement of the 'seq' entry of each construct's
# reference row ('perfect_ds' for B2, 'mNG_perfect_ds' for mNG)
is_B2_reference = (rna_library_df['desc'] == 'perfect_ds') & (rna_library_df['B2.mNG'] == 'B2')
is_mNG_reference = (rna_library_df['desc'] == 'mNG_perfect_ds') & (rna_library_df['B2.mNG'] == 'mNG')

# .item() raises unless exactly one row matches the selector
fixed_B2 = reverse_comp(rna_library_df.loc[is_B2_reference, 'seq'].item())
fixed_mNG = reverse_comp(rna_library_df.loc[is_mNG_reference, 'seq'].item())

In [None]:
# Row selectors for the two constructs
is_B2 = rna_library_df['B2.mNG'] == 'B2'
is_mNG = rna_library_df['B2.mNG'] == 'mNG'

# Add fixed region
rna_library_df.loc[is_B2, 'seq_fixed'] = fixed_B2
rna_library_df.loc[is_mNG, 'seq_fixed'] = fixed_mNG

# Add remaining sequences
# 5' end: '5' marker + Buffer + F
seq_5_all = '5' + rna_library_df['Buffer'] + rna_library_df['F']
rna_library_df.loc[is_B2, 'seq_5'] = seq_5_all[is_B2]
rna_library_df.loc[is_mNG, 'seq_5'] = seq_5_all[is_mNG]

# 3' end: loop + barcode + BstBI + R + seq + AscI + '3' marker
seq_3_all = (
  rna_library_df['loop']
  + rna_library_df['barcode']
  + rna_library_df['BstBI']
  + rna_library_df['R']
  + rna_library_df['seq']
  + rna_library_df['AscI']
  + '3'
)
rna_library_df.loc[is_B2, 'seq_3'] = seq_3_all[is_B2]
rna_library_df.loc[is_mNG, 'seq_3'] = seq_3_all[is_mNG]

# 3. Build data set

Import data set CSV files

In [7]:
# Import modified RNA descriptions as DataFrames: For HEK293T (Interferon-alpha stimulated),
# one file per construct (B2, mNG) and replicate (rep1, rep2)
(
  exper_rna_df_B21,
  exper_rna_df_B22,
  exper_rna_df_mNG1,
  exper_rna_df_mNG2,
) = (
  pd.read_csv(csv_path)
  for csv_path in (
    'GSM4705211_293T_B2_IFN_rep1.csv',
    'GSM4705212_293T_B2_IFN_rep2.csv',
    'GSM4705213_293T_mNG_IFN_rep1.csv',
    'GSM4705214_293T_mNG_IFN_rep2.csv',
  )
)

In [8]:
# Stack the four data sets row-wise into one DataFrame (each frame keeps its own row index)
replicate_frames = [exper_rna_df_B21, exper_rna_df_B22, exper_rna_df_mNG1, exper_rna_df_mNG2]
exper_rna_df = pd.concat(replicate_frames)

In [9]:
# Rename column 'BC' to 'barcode' so it matches rna_library_df, then attach the
# library's sequence parts to every experimental row (default inner join on 'barcode')
library_seq_parts = rna_library_df[['barcode','seq_5', 'seq_fixed','seq_3']]
exper_rna_df = (
  exper_rna_df
  .rename(columns={'BC': 'barcode'})
  .merge(library_seq_parts, on='barcode')
)

### Input sequence:

In [10]:
# Unmodified RNA sequence: 5' part, then the fixed region, then the 3' part
five_prime_part = exper_rna_df['seq_5']
fixed_part = exper_rna_df['seq_fixed']
three_prime_part = exper_rna_df['seq_3']
exper_rna_df['in_rna_seq'] = five_prime_part + fixed_part + three_prime_part

### Output sequence:

Edit adenosine to inosine, based on a probability threshold

In [11]:
# Set probability threshold above which A to I editing is assumed
PROB_THRESHOLD = 30 # percent

In [12]:
# Identify adenosine positions: 0-based indices of every 'A' in each fixed region
aden_pos_B2 = [pos for pos in range(len(fixed_B2)) if fixed_B2[pos] == 'A']
aden_pos_mNG = [pos for pos in range(len(fixed_mNG)) if fixed_mNG[pos] == 'A']

In [13]:
# Define a function that updates adenosine to inosine if larger than threshold
def adenosine_to_inosine(row, threshold=None):
  """Return the row's 'out_rna_seq' with edited adenosines replaced by 'I'.

  The k-th 'A' found in row['out_rna_seq'] (1-based, left to right) is replaced
  by 'I' when row['A<k>'] is strictly greater than the threshold.
  NOTE(review): this keeps the original assumption that column 'A<k>' describes
  the k-th adenosine of the fixed region -- confirm against the data files.

  Fixes over the previous version:
    * Adenosine positions are taken from the row's own sequence, so mNG rows
      are no longer edited at the B2 positions.
    * The replacement is done in place, so the sequence keeps its length. The
      old slice dropped two characters per edit and ignored the leading '5'
      marker, which shifted every subsequent position.

  Args:
    row: Mapping-like row (pandas Series or dict) holding 'out_rna_seq' and the
      'A1', 'A2', ... editing columns.
    threshold: Editing level above which an adenosine is considered edited.
      Defaults to the notebook-level PROB_THRESHOLD.

  Returns:
    The edited sequence string. A non-string 'out_rna_seq' (e.g. NaN) is
    returned unchanged.
  """
  if threshold is None:
    threshold = PROB_THRESHOLD

  seq = row['out_rna_seq']
  if not isinstance(seq, str):
    return seq

  bases = list(seq)
  aden_count = 0
  for pos, base in enumerate(seq):
    if base != 'A':
      continue
    aden_count += 1
    # A missing column yields None and NaN compares False: both leave the base unedited.
    level = row.get('A' + str(aden_count))
    if level is not None and level > threshold:
      bases[pos] = 'I' # Update to inosine
  return ''.join(bases)

In [14]:
# Start out_rna_seq from the fixed region wrapped in the '5' and '3' end markers
exper_rna_df['out_rna_seq'] = '5' + exper_rna_df['seq_fixed'] + '3'

# Run the A-to-I update on every row and store the edited sequences
edited_seqs = exper_rna_df.apply(adenosine_to_inosine, axis=1)
exper_rna_df['out_rna_seq'] = edited_seqs

### Export data

In [None]:
# Keep only the input and output sequence columns
export_columns = ['in_rna_seq','out_rna_seq']
processed_rna_df = exper_rna_df[export_columns]

In [None]:
# Serialise the final dataset as line-delimited JSON (one record per line) and write it to disk
processed_rna_json = processed_rna_df.to_json(orient='records', lines=True)

with open('rna_dataset_processed.json', 'w') as json_file:
  json_file.write(processed_rna_json)