## Import statements

In [1]:
from Bio import SeqIO
import pandas as pd
import numpy as np

from scripts import utils

In [None]:
rna_inter_df = utils.load_rna_inter('miRBase')
rna_inter_df.to_parquet('Download_data_RP_miRBase.parquet', engine='pyarrow', compression=None)

# Load miRBase from database file
miRBase is stored in embl file format

In [9]:
miRNA_df = pd.DataFrame([dict(
    Sequence_1_ID=record.id,
    Sequence_1=str(record.seq),
    Raw_ID1=f"miRBase:{record.id}",
    Sequence_1_len=len(str(record.seq))
) for record in SeqIO.parse("miRNA.dat", "embl")])
rna_inter_df = pd.read_parquet('Download_data_RP_miRBase.parquet', engine='pyarrow')
rna_inter_df = rna_inter_df.loc[:, ['Raw_ID1']].drop_duplicates()
miRNA_df = miRNA_df.merge(rna_inter_df, how='inner', on='Raw_ID1')
# miRNA_df = miRNA_df.dropna(subset=['Sequence_1'])
miRNA_df = miRNA_df.reset_index()
miRNA_df = utils.remove_illegal_nucleotides(miRNA_df, ['Y', 'R', 'W', 'N', 'S', 'K', 'M', 'B'])

utils.check_sequences(miRNA_df)
miRNA_df.to_parquet('miRNA.parquet', engine='pyarrow', compression=None)

# Calc recovery rate

In [10]:
miRNA_df = pd.read_parquet('miRNA.parquet', engine='pyarrow')
rna_inter_df = pd.read_parquet('Download_data_RP_Ensembl.parquet', engine='pyarrow')
utils.calc_recovery_rate(rna_inter_df, miRNA_df)

Unique Gene IDs before extraction:	11,040
Unique Gene IDs after extraction:	3,803
Extraction rate:	34.45%


# Load miRBase entries from RNAInter

In [None]:
rna_inter_df = utils.load_rna_inter('miRBase')
rna_inter_df.to_parquet('Download_data_RP_miRBase.parquet', engine='pyarrow', compression=None)

In [None]:
rna_inter_df = pd.read_parquet('Download_data_RP_miRBase.parquet', engine='pyarrow')
print(f"RNAInter database loaded with size: {rna_inter_df.shape[0]:,}")

In [None]:
merged_df_1 = pd.merge(miRNA_df, rnainter_df, left_on='sequence_id', right_on='Raw_ID1')
print(merged_df_1.shape[0])
merged_df_2 = pd.merge(miRNA_df, rnainter_df, left_on='sequence_id', right_on='Raw_ID2')
print(merged_df_2.shape[0])
merged_df = pd.concat([merged_df_1, merged_df_2])
print(merged_df.shape[0])
merged_df.to_csv('merged_miRNABase.csv')

In [None]:
merged_df = pd.read_csv('merged_miRNABase.csv')

# Analyze sizes merged dataframe

In [None]:
sizes = [100, 200, 216]
for size in sizes:
    df_size = merged_df[merged_df['sequence_len'] < size].shape[0]
    print(f"There are {df_size:,} interactions with a length < {size}")
print(f"Max sequence length: {merged_df['sequence_len'].max()}")

In [None]:
merged_df.columns

In [None]:
merged_df['strong_exists'] = ~merged_df['strong'].isnull()
merged_df['weak_exists'] = ~merged_df['weak'].isnull()
merged_df['predict_exists'] = ~merged_df['predict'].isnull()
# df['Discount_rating'] = np.where(df['Discount'] > 2000, 'Good', 'Bad')


In [None]:
for quality in ['strong', 'weak', 'predict']:
    df_size = merged_df[merged_df[f'{quality}_exists'] == True].shape[0]
    print(f"There are {df_size:,} interactions with a {quality} prediction")

In [None]:
ranges = np.arange(0.0, 1.1, 0.1)
merged_df['score'] = merged_df['score'].astype(float)
merged_df = merged_df[~pd.isnull(merged_df['score'])]
for idx in range(1, len(ranges)):
    df_size = merged_df[(merged_df['score'] >= ranges[idx -1 ]) & (merged_df['score'] < ranges[idx])].shape[0]
    print(f"There are {df_size:,} interactions between confidence score {round(ranges[idx - 1], 2)} and {round(ranges[idx], 2)}")