In [1]:
import pandas as pd
import requests
from scripts import utils
import pickle
import json
from more_itertools import chunked
from tqdm import tqdm

In [None]:
def get_fasta_post(ensemble_ids, retries=3, timeout=60):
    """Fetch sequences for a batch of Ensembl IDs with one POST to ``/sequence/id``.

    Args:
        ensemble_ids: Ensembl IDs to request together in a single call.
        retries: Maximum number of attempts before giving up on the batch.
        timeout: Seconds to wait for the server on each attempt, so a stalled
            connection cannot block the notebook forever.

    Returns:
        The decoded JSON body of the first successful response, or ``[]`` when
        there is nothing to request or every attempt failed.

    Raises:
        TypeError: If ``ensemble_ids`` cannot be serialised to JSON.
    """
    import warnings

    ensemble_ids = list(ensemble_ids)
    if not ensemble_ids:
        # Nothing to look up; skip the network round trip entirely.
        return []

    # Everything below is identical for every attempt, so build it once.
    url = "https://apr2022.rest.ensembl.org" + "/sequence/id"
    headers = {"Content-Type": "application/json", "Accept": "application/json"}
    payload = json.dumps(dict(ids=ensemble_ids))

    last_problem = None
    for _ in range(retries):
        try:
            r = requests.post(url, headers=headers, data=payload, timeout=timeout)
        except requests.RequestException as exc:
            # Only network-level failures are retried; KeyboardInterrupt and
            # programming errors propagate instead of being swallowed.
            last_problem = exc
            continue

        if not r.ok:
            last_problem = f"HTTP {r.status_code}"
            continue

        try:
            return r.json()
        except ValueError as exc:
            # The body was not valid JSON; treat it like any other failed attempt.
            last_problem = exc

    # Stay best-effort (callers concatenate the result), but make the loss visible.
    warnings.warn(
        f"get_fasta_post: giving up on {len(ensemble_ids)} IDs after "
        f"{retries} attempts (last problem: {last_problem})",
        stacklevel=2,
    )
    return []

In [2]:
# Load RNAInter through the project helper (called with 'Ensembl') and cache
# the result as an uncompressed parquet file so later cells can reload it.
rna_inter_df = utils.load_rna_inter('Ensembl')
rna_inter_df.to_parquet(
    'Download_data_RP_Ensembl.parquet',
    engine='pyarrow',
    compression=None,
)

In [3]:
# Reload the cached RNAInter table from parquet.
rna_inter_df = pd.read_parquet(
    'Download_data_RP_Ensembl.parquet',
    engine='pyarrow',
)
# Report the number of rows that were loaded.
print(f"RNAInter database loaded with size: {rna_inter_df.shape[0]:,}")

RNAInter database loaded with size: 7,471,922


In [None]:
# Collect the distinct values of the Raw_ID1 column.
ensembl_ids = list(rna_inter_df['Raw_ID1'].unique())
# The full table is not needed for the rest of this cell; release its memory.
del rna_inter_df
# Drop the first 8 characters of each ID, i.e. the length of the 'Ensembl:'
# prefix (assumes every Raw_ID1 value starts with it -- TODO confirm).
ensembl_ids = [ensembl_id[len('Ensembl:'):] for ensembl_id in ensembl_ids]
# Persist the ID list; the context manager closes the file even if pickling fails.
with open('ensembl_ids.pickle', 'wb') as file:
    pickle.dump(ensembl_ids, file)

In [None]:
# Query the server in batches of 50 IDs per request (presumably the endpoint's
# POST size limit -- confirm against the Ensembl REST documentation).
results = []
ensembl_ids_chunks = list(chunked(ensembl_ids, 50))
for ensembl_ids_chunk in tqdm(ensembl_ids_chunks):
    # get_fasta_post returns [] for a batch that could not be fetched, so a
    # failed batch contributes nothing to the results.
    results += get_fasta_post(ensembl_ids_chunk)
# Cache the raw responses; the context manager closes the file even if pickling fails.
with open('Ensembl_sequences.pickle', 'wb') as file:
    pickle.dump(results, file)

In [4]:
# Reload the cached server responses. Only unpickle files produced by this
# notebook: pickle.load can execute arbitrary code from an untrusted file.
with open('Ensembl_sequences.pickle', 'rb') as file:
    ensembl_sequences = pickle.load(file)

# Obtain the sequences that were fetched from the server

In [5]:
print(f"{len(ensembl_sequences):,} sequences fetched.")

# One row per fetched record. Raw_ID1 carries the 'Ensembl:' prefix, while
# Sequence_1_ID keeps the bare ID as returned by the server.
Ensembl_df = pd.DataFrame([
    {
        'Raw_ID1': f"Ensembl:{seq['id']}",
        'Sequence_1': seq['seq'],
        'Sequence_1_len': len(seq['seq']),
        'Sequence_1_ID': seq['id'],
    }
    for seq in tqdm(ensembl_sequences)
])
# Rewrite every 'T' as 'U' so the sequences use the RNA alphabet. regex=False
# makes this a literal replacement regardless of the pandas version's default.
Ensembl_df['Sequence_1'] = Ensembl_df['Sequence_1'].str.replace('T', 'U', regex=False)
# Drop sequences containing 'N' (presumably ambiguous bases) with a direct
# boolean mask; .copy() gives an independent frame rather than a filtered view.
Ensembl_df = Ensembl_df[~Ensembl_df['Sequence_1'].str.contains('N', regex=False)].copy()
utils.check_sequences(Ensembl_df)
Ensembl_df.to_parquet('Ensembl.parquet', engine='pyarrow', compression=None)

41,100 sequences fetched.


100%|██████████| 41100/41100 [00:00<00:00, 1034772.74it/s]


In [6]:
# Reload the cleaned sequence table from its parquet cache.
Ensembl_df = pd.read_parquet(
    'Ensembl.parquet',
    engine='pyarrow',
)

# Calculate the recovery rate

In [7]:
# Reload both cached tables so this cell does not depend on earlier kernel state.
rna_inter_df = pd.read_parquet(
    'Download_data_RP_Ensembl.parquet',
    engine='pyarrow',
)
Ensembl_df = pd.read_parquet(
    'Ensembl.parquet',
    engine='pyarrow',
)
# Pass both tables to the project helper that reports the recovery rate.
utils.calc_recovery_rate(rna_inter_df, Ensembl_df)

Unique Gene IDs before extraction:	45,235
Unique Gene IDs after extraction:	41,086
Extraction rate:	90.83%
