# BIOLOGICAL FEATURES

In [None]:
# ================================
#           libraries
# ================================

# %pip install -q MDTraj    # uncomment this line when running on Google Colab
# Standard library imports
import math
import ssl
import urllib.request
from copy import deepcopy
from random import random

# Third-party imports
import mdtraj as md
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import plotly.io as pio
import scipy.ndimage as nd
from Bio import SeqIO
from tqdm import tqdm

In [None]:
# Location of the chr1 3D model (PDB) that this notebook downloads.
# Model details page: https://gsdb.mu.hekademeia.org/details.php?id=GSE105544_ENCFF010WBP
# Model resolution: 500 kb per bin.
url = (
    'https://calla.rnet.missouri.edu/genome3d/GSDB/Database/'
    'AX9716PF/GSE105544_ENCFF010WBP/VC/LorDG/chr1.pdb'
)

In [None]:
# Read the raw PDB text so individual lines can be inspected during EDA.
#
# On a fresh kernel this cell runs before the cell that downloads
# 'chr1.pdb', so the file may not exist yet. In that case it is fetched here
# (same URL and same TLS settings as the download cell) instead of failing
# with FileNotFoundError.
try:
    with open('chr1.pdb', 'r') as f:
        file = f.read()
except FileNotFoundError:
    # NOTE(review): certificate verification is disabled, mirroring the
    # download cell; the payload is therefore not authenticated.
    with urllib.request.urlopen(
        url, context=ssl._create_unverified_context(), timeout=60
    ) as response:
        payload = response.read()
    # Write only after the full payload has arrived, so a failed transfer
    # never leaves a truncated file behind.
    with open('chr1.pdb', 'wb') as out_file:
        out_file.write(payload)
    with open('chr1.pdb', 'r') as f:
        file = f.read()

# One list entry per line of the PDB file.
lines = file.split('\n')

In [3]:
# Download the chromosome model, store it as 'chr1.pdb', and load its
# coordinates.
#
# SECURITY NOTE(review): certificate verification is switched off for this
# request (via the private `ssl._create_unverified_context`), so the
# downloaded file is not authenticated. Switch to
# `ssl.create_default_context()` if the server's certificate validates —
# TODO confirm.
ssl_context = ssl._create_unverified_context()

# Fetch the whole payload first. The explicit timeout keeps a stalled server
# from blocking the notebook indefinitely.
with urllib.request.urlopen(url, context=ssl_context, timeout=60) as response:
    pdb_bytes = response.read()

# Open the output only after the transfer succeeded, so a failed download
# cannot truncate a previously downloaded copy.
with open('chr1.pdb', 'wb') as out_file:
    out_file.write(pdb_bytes)

# Load from local file
chr_str = md.load_pdb('chr1.pdb')
# Collapse the trajectory array into one (x, y, z) row per point.
# NOTE(review): mdtraj reports coordinates in nanometres (PDB values / 10) —
# confirm this scaling is acceptable for downstream use.
coords = chr_str.xyz.reshape(-1, 3)

In [4]:
fasta_file = '../data/raw/hg38_chr1_FASTA.fasta'

# Parse every record of the FASTA file into a list.
sequences = list(SeqIO.parse(fasta_file, 'fasta'))

# Print a short header summary per record; the sequence itself is
# deliberately not printed.
separator = '_' * 60
for record in sequences:
    print(separator)
    print(f'ID: {record.id}')
    print(f'Description: {record.description}')
    print(separator)


____________________________________________________________
ID: CM000663.2
Description: CM000663.2 Homo sapiens chromosome 1, GRCh38 reference primary assembly
____________________________________________________________


In [None]:
def gc_content_calculator(sequence, window) -> pd.DataFrame:
    """Calculate GC content over consecutive, non-overlapping windows.

    The sequence is upper-cased and cut into full windows of ``window``
    characters starting at offset 0. A trailing fragment shorter than
    ``window`` is ignored.

    Parameters
    ----------
    sequence
        Nucleotide sequence; anything ``str()`` can convert to the plain
        sequence text.
    window : int
        Window size in bases. Must be positive.

    Returns
    -------
    pd.DataFrame
        One row per full window, with two columns:

        * ``bin`` - 0-based start offset of the window.
        * ``gc_content_percentage`` - percentage of G/C characters relative
          to the window size, rounded to 2 decimals (``0.0`` when the window
          has no G/C), or the string ``'telo-centro'`` when the window has
          no G/C and contains at least one ``N``.

    Raises
    ------
    ValueError
        If ``window`` is not positive.
    """
    if window <= 0:
        raise ValueError(f'window must be a positive integer, got {window!r}')

    sequence_upper = str(sequence).upper()
    seq_len = len(sequence_upper)

    bins = []
    gc_percentages = []
    for start in range(0, seq_len - window + 1, window):
        fragment = sequence_upper[start:start + window]
        # str.count runs in C, which matters for chromosome-sized inputs.
        gc_count = fragment.count('G') + fragment.count('C')
        if gc_count == 0 and 'N' in fragment:
            # Unsequenced gap: keep the label instead of a number.
            gc_percentages.append('telo-centro')
        else:
            # Every other window gets a row, including 0 % GC ones, so no
            # window is silently dropped. The denominator is the window
            # size, so any N characters count towards it.
            gc_percentages.append(round((gc_count / window) * 100, 2))
        bins.append(start)

    return pd.DataFrame({
        'bin': bins,
        'gc_content_percentage': gc_percentages
    })

In [107]:
# GC content of the first FASTA record in 500,000-base windows (the 500 kb
# resolution noted for the 3D model), converted from DataFrame to array.
gc_table = gc_content_calculator(sequences[0].seq, 500000)
gc = np.array(gc_table)



In [None]:
# ==========================================================
#                       BINS STAT
# ==========================================================
# PDB   -> points in chr_str ------------------> 451 bins
# FASTA -> all windows ------------------------> 492 bins
# FASTA -> without 'telo-centro' windows ------> 462 bins
# ==========================================================

In [115]:
# Build one row per 3D point: its three coordinates followed by the two GC
# columns (bin, gc_content_percentage).
# NOTE(review): zip() stops at the shorter input, so extra rows of the longer
# one are silently dropped and the pairing is purely positional — confirm
# that point k really corresponds to the k-th sequence window.
merged_rows = [[*xyz, *gc_row] for xyz, gc_row in zip(coords, gc)]

merged_file = np.array(merged_rows)
merged_file.shape

(451, 5)

In [116]:
# Persist the merged coordinate + GC table in NumPy's .npy format.
output_path = 'x_y_z_bin_gc.npy'
np.save(output_path, merged_file)