# Analyze sequences with range
How many of them are a valid option?

# Import Module


In [1]:
import pickle
from scripts import utils
import pandas as pd
from Bio import Entrez
from more_itertools import chunked
from tqdm import tqdm
Entrez.email = "gernel@informatik.uni-freiburg.de"

# Fetch regular sequences from genes

In [None]:
# Load the 'NCBI' entries through the project helper, then cache the table
# as an uncompressed parquet file so the following cells can reload it.
rna_inter_df = utils.load_rna_inter('NCBI')
rna_inter_df.to_parquet(
    'Download_data_RP_NCBI.parquet',
    engine='pyarrow',
    compression=None,
)

In [None]:
# Reload the cached interaction table and report its columns and size.
rna_inter_df = pd.read_parquet('Download_data_RP_NCBI.parquet', engine='pyarrow')
print(rna_inter_df.columns)

print(f"There are {rna_inter_df.shape[0]:,} interactions with RNAs from NCBI")
# Collect the distinct values of the Raw_ID1 column.
ncbi_rna_ids = list(rna_inter_df['Raw_ID1'].unique())
print(f"There are {len(ncbi_rna_ids):,} unique gene IDs")
# Drop the first five characters of every ID (presumably the "NCBI:" prefix
# that the merge cell prepends again later) so only the bare gene ID remains.
ncbi_rna_ids = [rna_id[5:] for rna_id in ncbi_rna_ids]
# The table is no longer needed; release it to save memory.
del rna_inter_df
# Persist the ID list. The context manager closes the file even if
# pickling fails part-way.
with open('ncbi_rna_ids.pickle', 'wb') as file:
    pickle.dump(ncbi_rna_ids, file)

In [None]:
# Reload the pickled gene IDs; the context manager closes the file even if
# unpickling raises.
with open('ncbi_rna_ids.pickle', 'rb') as file:
    ncbi_rna_ids = pickle.load(file)

In [None]:
def fetch_ncbi_nucleotide_ids(ids: list) -> list:
    """Look up the nucleotide IDs linked to NCBI gene IDs.

    Queries Entrez ELink with ``dbfrom="gene"`` and ``db="nucleotide"`` and
    keeps only the links named ``gene_nuccore_refseqrna``.

    Args:
        ids: Gene IDs handed to ``Entrez.elink`` as ``id``.

    Returns:
        A list of dicts with the keys ``Raw_ID1`` (the gene ID reported by
        the link set) and ``Sequence_1_ID`` (one linked nucleotide ID).
        Link sets without a ``gene_nuccore_refseqrna`` link contribute no
        entries.

    Raises:
        AssertionError: If a link set reports anything other than exactly
            one source ID (apart from the skipped ``['0', '0']`` case).
    """
    handle = Entrez.elink(db="nucleotide", dbfrom="gene", id=ids)
    try:
        records = Entrez.read(handle)
    finally:
        # Release the connection even when parsing the response fails.
        handle.close()

    results = []
    for record in records:
        if record['IdList'] == ['0', '0']:
            # This entry does not exist, so it is skipped.
            continue
        assert len(record['IdList']) == 1, \
            f"expected exactly one gene ID per link set, got {record['IdList']}"
        gene_id = record['IdList'][0]
        for link in record['LinkSetDb']:
            if link['LinkName'] != 'gene_nuccore_refseqrna':
                continue
            results.extend(
                dict(Raw_ID1=gene_id, Sequence_1_ID=seq['Id'])
                for seq in link['Link']
            )
    return results

In [None]:
def fetch_ncbi_nucleotide_sequences(ids: list) -> list:
    """Fetch GenBank records for nucleotide IDs and extract their sequences.

    Calls Entrez EFetch on the ``nucleotide`` database (``rettype="gb"``,
    ``retmode="xml"``) and turns every returned record into a flat dict.

    Args:
        ids: Nucleotide IDs handed to ``Entrez.efetch`` as ``id``.

    Returns:
        One dict per record with the keys ``Sequence_1`` (``GBSeq_sequence``),
        ``Sequence_1_len`` (its length), ``Sequence_1_ID`` (``GBSeq_locus``)
        and ``Sequence_1_ID2`` (second ``GBSeq_other-seqids`` entry without
        its first three characters).

    Raises:
        TypeError: If a record has no ``GBSeq_sequence`` or no
            ``GBSeq_other-seqids`` field.
        IndexError: If ``GBSeq_other-seqids`` has fewer than two entries.
    """
    handle = Entrez.efetch(db="nucleotide", id=ids, rettype="gb", retmode='xml')
    try:
        # Entrez.read parses the complete response into a list of records.
        records = Entrez.read(handle)
    finally:
        # Release the connection even when parsing the response fails.
        handle.close()

    results = []
    for record in records:
        sequence = record.get('GBSeq_sequence')
        results.append(dict(
            Sequence_1=sequence,
            # NOTE(review): raises TypeError when the record carries no sequence.
            Sequence_1_len=len(sequence),
            Sequence_1_ID=record.get('GBSeq_locus'),
            # Presumably the second seqid looks like "gi|<number>" and the
            # slice strips the "gi|" prefix -- TODO confirm against real data.
            Sequence_1_ID2=record.get('GBSeq_other-seqids')[1][3:],
        ))
    return results

In [None]:
# Resolve the gene IDs to linked sequence IDs through the project helper
# (the 100 is presumably the chunk size -- see utils.call_fetch_function).
ncbi_results_df = utils.call_fetch_function(
    fetch_ncbi_nucleotide_ids,
    100,
    ncbi_rna_ids,
)
# Report how many distinct gene IDs produced at least one link.
print(f"{len(ncbi_results_df['Raw_ID1'].unique())}/{len(ncbi_rna_ids)} unique gene_ids fetched")
ncbi_results_df.to_parquet(
    'ncbi_results_1.parquet',
    engine='pyarrow',
    compression=None,
)

In [None]:
# Fetch the missing chunk again: split the gene IDs into chunks of 1000 and
# take the one at index 57.
id_chunks = list(chunked(ncbi_rna_ids, 1000))
missing_rna_ids = id_chunks[57]
del id_chunks
missing_sequence_ids = pd.DataFrame(fetch_ncbi_nucleotide_ids(missing_rna_ids))
print(f"Fetched {len(missing_sequence_ids['Raw_ID1'].unique())}/{len(missing_rna_ids)} of missing rna ids")
missing_sequence_ids.to_parquet(
    'ncbi_results_2.parquet',
    engine='pyarrow',
    compression=None,
)

In [None]:
# Stack both partial link tables (in this order) and store the combined result.
ncbi_results_df = pd.concat([
    pd.read_parquet(part_path, engine='pyarrow')
    for part_path in ('ncbi_results_1.parquet', 'ncbi_results_2.parquet')
])
ncbi_results_df.to_parquet(
    'ncbi_results.parquet',
    engine='pyarrow',
    compression=None,
)

In [None]:
# Reload the combined link table and report its number of rows.
ncbi_results_df = pd.read_parquet('ncbi_results.parquet', engine='pyarrow')
print(ncbi_results_df.shape[0])
# Distinct sequence IDs that still need their sequences fetched.
unique_sequence_ids = list(ncbi_results_df['Sequence_1_ID'].unique())
print(len(unique_sequence_ids))
# Persist the ID list. The context manager closes the file even if
# pickling fails part-way.
with open('unique_sequence_ids.pickle', 'wb') as file:
    pickle.dump(unique_sequence_ids, file)

In [None]:
# Reload the sequence IDs whose sequences are fetched next; the context
# manager closes the file even if unpickling raises.
with open('unique_sequence_ids.pickle', 'rb') as file:
    unique_sequence_ids = pickle.load(file)

In [None]:
# [{'Id': '332164660'}, {'Id': '332164659'}]
# NOTE(review): only the first 100 sequence IDs are fetched here, which looks
# like a leftover from debugging -- remove the slice for a complete run.
unique_sequence_ids = unique_sequence_ids[:100]
ncbi_rna_sequences_df = utils.call_fetch_function(fetch_ncbi_nucleotide_sequences, 10, unique_sequence_ids)
# Stored as part 1: the concat cell reads 'ncbi_rna_sequences_1.parquet' and
# 'ncbi_rna_sequences_2.parquet' and only then writes the combined
# 'ncbi_rna_sequences.parquet'. Compression is active by default.
ncbi_rna_sequences_df.to_parquet('ncbi_rna_sequences_1.parquet', engine='pyarrow')

In [None]:
# Fetch the sequences that belong to the re-fetched (previously missing)
# sequence IDs and store them as the second part.
unique_sequence_ids_2 = list(missing_sequence_ids['Sequence_1_ID'].unique())
ncbi_rna_sequences_2_df = utils.call_fetch_function(
    fetch_ncbi_nucleotide_sequences,
    100,
    unique_sequence_ids_2,
)
ncbi_rna_sequences_2_df.to_parquet(
    'ncbi_rna_sequences_2.parquet',
    engine='pyarrow',
)
print(f"{len(ncbi_rna_sequences_2_df['Sequence_1_ID'].unique())}/{len(unique_sequence_ids_2)} from missing sequences fetched.")

In [None]:
# Stack both partial sequence tables (in this order) into one frame.
ncbi_rna_sequences_df = pd.concat([
    pd.read_parquet(part_path, engine='pyarrow')
    for part_path in ('ncbi_rna_sequences_1.parquet', 'ncbi_rna_sequences_2.parquet')
])
# The 'Sequence_1_ID2' column is kept here (dropping it was tried before):
# ncbi_rna_sequences_df = ncbi_rna_sequences_df.drop(columns='Sequence_1_ID2')
ncbi_rna_sequences_df.to_parquet(
    'ncbi_rna_sequences.parquet',
    engine='pyarrow',
    compression=None,
)

In [6]:
# Join the gene-ID/sequence-ID links with the fetched sequences.
ncbi_results_df = pd.read_parquet('ncbi_results.parquet', engine='pyarrow')
ncbi_rna_sequences_df = pd.read_parquet('ncbi_rna_sequences.parquet', engine='pyarrow')
# Swap the contents of the two ID columns so that 'Sequence_1_ID' carries the
# value used as the join key below.
ncbi_rna_sequences_df[['Sequence_1_ID', 'Sequence_1_ID2']] = ncbi_rna_sequences_df[['Sequence_1_ID2', 'Sequence_1_ID']]
ncbi_rna_df = pd.merge(
    ncbi_results_df,
    ncbi_rna_sequences_df,
    on='Sequence_1_ID',
    how='inner',
)
# The inputs are no longer needed once merged.
del ncbi_results_df, ncbi_rna_sequences_df
# Restore the "NCBI:" prefix on the gene IDs.
ncbi_rna_df['Raw_ID1'] = "NCBI:" + ncbi_rna_df['Raw_ID1'].astype(str)
ncbi_rna_df = ncbi_rna_df.drop(columns=['Sequence_1_ID2'])
# Upper-case the sequences and write them in the RNA alphabet (T -> U).
ncbi_rna_df['Sequence_1'] = ncbi_rna_df['Sequence_1'].str.upper().str.replace('T', 'U')
# Hand the listed ambiguity codes to the project helper, then run its
# sequence check before saving.
ncbi_rna_df = utils.remove_illegal_nucleotides(
    ncbi_rna_df,
    ['N', 'Y', 'B', 'M', 'S', 'K', 'R', 'W', 'D'],
)
utils.check_sequences(ncbi_rna_df)
ncbi_rna_df.to_parquet(
    'ncbi_rna.parquet',
    engine='pyarrow',
    compression=None,
)

# Calculate recovery rate

In [7]:
# Compare the original interaction table with the table of recovered
# sequences; the project helper prints the statistics.
ncbi_rna_df = pd.read_parquet('ncbi_rna.parquet', engine='pyarrow')
rna_inter_df = pd.read_parquet('Download_data_RP_NCBI.parquet', engine='pyarrow')
utils.calc_recovery_rate(rna_inter_df, ncbi_rna_df)

Unique Gene IDs before extraction:	148,535
Unique Gene IDs after extraction:	131,277
Extraction rate:	88.38%


In [11]:
# Analyse the merged NCBI result table.
ncbi_rna_df = pd.read_parquet('ncbi_rna.parquet', engine='pyarrow')
print(f"Size of ncbi_rna_df: {ncbi_rna_df.shape[0]}")
# 'Sequence_1_ID2' is dropped before 'ncbi_rna.parquet' is written, so the
# retained 'Sequence_1_ID' column is the one that can be counted here.
print(f"Number of unique sequence_ids: {ncbi_rna_df['Sequence_1_ID'].nunique()}")
print(f"Number of unique sequences: {ncbi_rna_df['Sequence_1'].nunique()}")

# Spread of the sequence lengths per gene ID (standard deviation).
test_df_2 = ncbi_rna_df.groupby(['Raw_ID1'])['Sequence_1_len'].std()
# Inspect all rows of one example gene: NCBI:100001267
test_df = ncbi_rna_df[ncbi_rna_df['Raw_ID1'] == 'NCBI:100001267']

Size of ncbi_rna_df: 494057
Number of unique sequence_ids: 493411
Number of unique sequences: 472378


KeyboardInterrupt: 

# Sequences with ranges

In [None]:
# Load the sequences that come with a range; the context manager closes the
# file even if unpickling raises.
with open('sequence_ids/sequence_ids_with_range.pickle', 'rb') as file:
    sequences_with_range = pickle.load(file)

In [None]:
# Flatten the mapping into (key, entry) pairs, one pair per entry.
sequences_with_range = [
    (gene_key, entry)
    for gene_key, entries in sequences_with_range.items()
    for entry in entries
]
# One row per pair: entry[0] is stored as the sequence ID, entry[1] and
# entry[2] as integer start/end, and seq_len as their difference.
seq_df = pd.DataFrame([
    {
        'gene_id': gene_key,
        'seq_id': entry[0],
        'seq_start': int(entry[1]),
        'seq_end': int(entry[2]),
        'seq_len': int(entry[2]) - int(entry[1]),
    }
    for gene_key, entry in sequences_with_range
])
seq_df.to_parquet(
    'sequence_ids_with_range.parquet',
    engine='pyarrow',
    compression=None,
)

In [None]:
# Reload the range table and rewrite seq_id as "<seq_id>:<start>-<end>".
seq_df = pd.read_parquet('sequence_ids_with_range.parquet')
seq_df['seq_id'] = (
    seq_df['seq_id'].astype(str)
    + ":"
    + seq_df['seq_start'].astype(str)
    + "-"
    + seq_df['seq_end'].astype(str)
)

In [None]:
# Keep only the rows whose seq_len is below 2000.
seq_df = seq_df.loc[seq_df['seq_len'] < 2000]
# seq_df = seq_df.to_dict('records')
# results = utils.fetch_ncbi_rna_fasta_with_range(seq_df)

In [None]:
# This file contains RNA sequences which are shorter than 2000 bps.
# The context manager closes the file even if unpickling raises.
with open('fetch_results.pickle', 'rb') as file:
    raw_sequences = pickle.load(file)
# Flatten the list: pair the first element of every entry with each item of
# its second element.
raw_sequences = [(entry[0], record) for entry in raw_sequences for record in entry[1]]

In [None]:
# One row per fetched record. The sequence_id is the record id up to the
# first '.', followed by ':' and the text after the ':' in the id's second
# dot-separated part. sequence_len is the sequence length minus one.
rna_seq_df = pd.DataFrame([
    {
        'sequence_id': f"{record.id.split('.')[0]}:{record.id.split('.')[1].split(':')[1]}",
        'sequence': str(record.seq),
        'sequence_len': len(str(record.seq)) - 1,
    }
    for _, record in raw_sequences
])