# BIOLOGICAL FEATURES

In [1]:
# ================================
#           libraries
# ================================

# %pip install -q MDTraj    # uncomment this line when running on Google Colab
# Standard library imports
import math
import ssl
import urllib.request
from copy import deepcopy
from random import random

# Third-party imports
import mdtraj as md
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import plotly.io as pio
import scipy.ndimage as nd
from Bio import SeqIO
from tqdm import tqdm

In [2]:
# Chromosome 1 3D model (PDB file) used throughout this notebook.
# Dataset details: https://gsdb.mu.hekademeia.org/details.php?id=GSE105544_ENCFF010WBP
# Model resolution: 500Kb
url = (
    'https://calla.rnet.missouri.edu/genome3d/GSDB/Database/AX9716PF/'
    'GSE105544_ENCFF010WBP/VC/LorDG/chr1.pdb'
)

In [3]:
# Load the raw PDB text to inspect the file for some EDA.
#
# The cell that downloads 'chr1.pdb' sits further down in this notebook, so on a
# fresh checkout (Restart Kernel -> Run All) the file does not exist yet when this
# cell runs. Fetch it from `url` in that case instead of failing with
# FileNotFoundError.
try:
    f = open('chr1.pdb', 'r')
except FileNotFoundError:
    # NOTE(security): certificate verification is disabled here, mirroring the
    # download cell of this notebook. ssl._create_unverified_context() is a private
    # API; switch to a verified context once the server certificate validates.
    with urllib.request.urlopen(url, context=ssl._create_unverified_context()) as response:
        with open('chr1.pdb', 'wb') as out_file:
            out_file.write(response.read())
    f = open('chr1.pdb', 'r')

with f:
    file = f.read()

# One entry per text line; a trailing newline yields a final empty string.
lines = file.split('\n')

In [4]:
# Fetch the PDB model from `url` and store it next to the notebook.
# NOTE(security): TLS certificate verification is switched off for this download
# only; ssl._create_unverified_context() is a private API and offers no protection
# against a tampered response.
ssl_context = ssl._create_unverified_context()

with urllib.request.urlopen(url, context=ssl_context) as response, \
        open('chr1.pdb', 'wb') as out_file:
    out_file.write(response.read())

# Parse the saved file with MDTraj and flatten the coordinates into rows of (x, y, z).
chr_str = md.load_pdb('chr1.pdb')
coords = chr_str.xyz.reshape(-1, 3)

In [5]:
# Read every record of the FASTA file and echo its header fields.
fasta_file = '../data/raw/hg38_chr1_FASTA.fasta'
separator = '_' * 60

sequences = []
for record in SeqIO.parse(fasta_file, 'fasta'):
    sequences.append(record)
    # The sequence itself (record.seq) is intentionally left out of the output.
    print(
        separator,
        f'ID: {record.id}',
        f'Description: {record.description}',
        separator,
        sep='\n',
    )


____________________________________________________________
ID: CM000663.2
Description: CM000663.2 Homo sapiens chromosome 1, GRCh38 reference primary assembly
____________________________________________________________


In [19]:
def gc_content_calculator(sequence, window):
    """Compute the GC percentage of consecutive, non-overlapping windows.

    The sequence is upper-cased and cut into full windows of ``window``
    characters starting at offset 0. A trailing fragment shorter than
    ``window`` is ignored.

    Parameters
    ----------
    sequence : str or str-convertible
        Nucleotide sequence; it is passed through ``str()`` before use.
    window : int
        Window (bin) size in characters. Must be positive.

    Returns
    -------
    pandas.DataFrame
        One row per window with the columns

        * ``bin`` - 0-based start offset of the window,
        * ``gc_content_percentage`` - ``(G + C) / window * 100`` rounded to
          two decimals (every character of the window, including ``N``,
          counts towards the denominator), or the string ``'telo-centro'``
          for windows that contain no G/C but at least one ``N``.

    Raises
    ------
    ValueError
        If ``window`` is not positive.
    """
    if window <= 0:
        raise ValueError(f'window must be a positive integer, got {window!r}')

    sequence_upper = str(sequence).upper()
    seq_len = len(sequence_upper)
    bins = []
    gc_percentages = []

    for start in range(0, seq_len - window + 1, window):
        sequence_frag = sequence_upper[start:start + window]
        # str.count is far faster than a per-character Python loop on
        # chromosome-sized input and yields the same number.
        gc_count = sequence_frag.count('G') + sequence_frag.count('C')

        if gc_count == 0 and 'N' in sequence_frag:
            # No G/C and at least one N: flagged with the sentinel label.
            gc_percentages.append('telo-centro')
        else:
            # Includes windows without any G/C and without N (e.g. pure A/T),
            # which are a genuine 0.0 % and must not be dropped.
            gc_percentages.append(round((gc_count / window) * 100, 2))
        bins.append(start)

    return pd.DataFrame({
        'bin': bins,
        'gc_content_percentage': gc_percentages
    })

In [20]:
# GC content of the first FASTA record in 500 kb windows, the same bin size as the
# 500Kb resolution noted for the 3D model.
gc = gc_content_calculator(sequence=sequences[0].seq, window=500_000)




In [None]:
# ==========================================================
#                       BINS STAT
# ==========================================================
# PDB ---> chr_str ---> 451 bins
# FASTA -> total -----> 492 bins
# FASTA -> w/o tel ---> 462 bins
# ==========================================================

In [None]:
# Put the model coordinates and the per-bin GC content side by side.
coords_df = pd.DataFrame(coords, columns=['x', 'y', 'z'])

# Drop the windows flagged 'telo-centro'. The column still has object dtype after
# the filter because it used to hold that string label, so cast it back to float.
gc_valid = (
    gc[gc['gc_content_percentage'] != 'telo-centro']
    .astype({'gc_content_percentage': float})
    .reset_index(drop=True)
)

# NOTE(review): the join is purely positional - row i of the coordinates is paired
# with the i-th remaining GC window, and surplus GC windows are cut off at the end.
# This assumes the model points follow the same bin order with no gaps in between
# - TODO confirm against the model's bin mapping.
merged_file = pd.concat(
    [coords_df.reset_index(drop=True), gc_valid],
    axis=1,
).iloc[:len(coords_df), :]  # keep one row per model point instead of a hard-coded count


merged_file



Unnamed: 0,x,y,z,bin,gc_content_percentage
0,-0.3264,0.3988,-0.2650,0,33.65
1,-0.3026,0.4367,-0.2681,500000,43.05
2,-0.2171,0.5026,-0.2616,1000000,60.5
3,-0.0999,0.6025,-0.2160,1500000,54.07
4,0.0170,0.6524,-0.1841,2000000,58.55
...,...,...,...,...,...
446,-0.3537,0.2988,0.4387,240500000,39.82
447,-0.3457,0.2375,0.4932,241000000,39.81
448,-0.3446,0.1994,0.5161,241500000,40.06
449,-0.3101,0.2030,0.5102,242000000,40.85


In [57]:
# Export the merged table; the row index is written as the first (unnamed) column.
merged_file.to_csv('x_y_z_bin_gc.csv', index=True)

In [56]:
# 3D line trace of the model, coloured by the GC content of each point.
line_dict = {
    'width': 4,
    'color': merged_file['gc_content_percentage'],
    'colorscale': 'Turbo',
}

data = go.Scatter3d(
    x=merged_file['x'],
    y=merged_file['y'],
    z=merged_file['z'],
    mode='lines',
    line=line_dict,
)

fig = go.Figure(data)

# Hide all three axes, remove the margins and use a transparent background.
fig.update_layout(
    scene={axis: {'visible': False} for axis in ('xaxis', 'yaxis', 'zaxis')},
    margin={'l': 0, 'r': 0, 'b': 0, 't': 0, 'pad': 0},
    paper_bgcolor='rgba(0,0,0,0)',
    showlegend=True,
)

fig.show()

In [116]:
# Save the merged table as a plain float64 array
# (column order: x, y, z, bin, gc_content_percentage).
# Passing the DataFrame itself produces an object-dtype array whenever one of its
# columns has object dtype; NumPy stores such arrays via pickle and np.load then
# refuses to read them unless allow_pickle=True is passed.
np.save('x_y_z_bin_gc.npy', merged_file.to_numpy(dtype=np.float64))