# BIOLOGICAL FEATURES

In [1]:
# ================================
#           libraries
# ================================

# %pip install -q MDTraj    # uncomment this line when running on Google Colab
import mdtraj as md
from Bio import SeqIO

import pandas as pd
import numpy as np
import scipy.ndimage as nd
import math
from tqdm import tqdm
from random import random
import plotly.graph_objects as go
import plotly.io as pio

import urllib.request
import ssl
from copy import deepcopy

In [None]:
# Location of the chr1 PDB file that is downloaded and loaded with MDTraj further down in this notebook.
url = 'https://calla.rnet.missouri.edu/genome3d/GSDB/Database/AX9716PF/GSE105544_ENCFF010WBP/VC/LorDG/chr1.pdb'
# Chromosome model used (GSDB entry): https://gsdb.mu.hekademeia.org/details.php?id=GSE105544_ENCFF010WBP
# Model resolution: 500 Kb

In [None]:
# Read the raw PDB text so it can be inspected line by line (EDA).
# NOTE(review): this expects 'chr1.pdb' to already exist in the working
# directory; the cell that downloads it appears later in this notebook.

with open('chr1.pdb', mode='r') as pdb_handle:
    file = pdb_handle.read()
    lines = file.split('\n')

In [3]:
# TLS context with certificate verification switched off, used only for the
# download below.
# NOTE(review): ssl._create_unverified_context() is a private helper and skips
# certificate verification - confirm it is still required for this host.
ssl_context = ssl._create_unverified_context()

# Fetch the PDB file and store it in the working directory as 'chr1.pdb'.
with urllib.request.urlopen(url, context=ssl_context) as response, \
        open('chr1.pdb', 'wb') as out_file:
    out_file.write(response.read())

# Parse the saved file with MDTraj, then flatten the coordinate array into
# rows of three values (x, y, z).
chr_str = md.load_pdb('chr1.pdb')
coords = chr_str.xyz.reshape((-1, 3))

In [4]:
fasta_file = '../data/raw/hg38_chr1_FASTA.fasta'

# Collect every record of the FASTA file and print its ID and description
# between two horizontal rules; the sequence itself is not printed.
sequences = []
for record in SeqIO.parse(fasta_file, 'fasta'):
    sequences.append(record)
    print(
        '_' * 60,
        f'ID: {record.id}',
        f'Description: {record.description}',
        '_' * 60,
        sep='\n',
    )


____________________________________________________________
ID: CM000663.2
Description: CM000663.2 Homo sapiens chromosome 1, GRCh38 reference primary assembly
____________________________________________________________


In [71]:

def gc_content_calculator(sequence, window):
    """Compute GC content over consecutive, non-overlapping windows.

    The sequence is cut into full windows of ``window`` bases starting at
    offset 0. A trailing fragment shorter than ``window`` is ignored.

    Parameters
    ----------
    sequence : str or sequence object
        Nucleotide sequence. It must support ``len()``, slicing, ``.upper()``
        and ``.count()`` (a plain ``str`` works). Case is ignored.
    window : int
        Window (bin) size in bases. Must be positive.

    Returns
    -------
    pandas.DataFrame
        One row per full window, with two columns:

        * ``bin`` - 0-based start offset of the window.
        * ``gc_content_percentage`` - ``(G + C) / window * 100`` rounded to
          two decimals, or the string ``'telo-centro'`` when the window has
          no G/C at all but contains at least one ``N``.

        The denominator is always ``window``, so ``N`` bases inside a mixed
        window lower the reported percentage.

    Raises
    ------
    ValueError
        If ``window`` is not positive.
    """
    if window <= 0:
        raise ValueError(f'window must be a positive integer, got {window!r}')

    gc_content = {'bin': [], 'gc_content_percentage': []}

    # "+ 1" keeps the final full window when len(sequence) is an exact
    # multiple of window; without it that last window is skipped.
    for start in range(0, len(sequence) - window + 1, window):
        fragment = sequence[start:start + window].upper()

        n_count = fragment.count('N')
        gc_count = fragment.count('G') + fragment.count('C')

        if gc_count == 0 and n_count > 0:
            # No G/C and at least one N: flagged with a label instead of a number.
            value = 'telo-centro'
        else:
            # Also covers windows with no G/C and no N (e.g. pure A/T), which
            # are recorded as 0.0 so that no bin goes missing from the table.
            value = round((gc_count / window) * 100, 2)

        gc_content['bin'].append(start)
        gc_content['gc_content_percentage'].append(value)

    return pd.DataFrame(gc_content)




In [83]:
# GC content of the first FASTA record in 500,000-base windows, followed by
# the number of windows whose value is not the 'telo-centro' label.
gc = gc_content_calculator(sequences[0].seq, 500_000)
condition = gc['gc_content_percentage'].ne('telo-centro')
gc.loc[condition].count()


bin                      462
gc_content_percentage    462
dtype: int64

In [None]:
# ==========================================================
#                     BIN STATISTICS
# ==========================================================
# PDB   -> chr_str ----------------------> 451 bins
# FASTA -> all bins ---------------------> 492 bins
# FASTA -> excluding 'telo-centro' bins -> 462 bins
# ==========================================================

In [92]:
# Scratch check: append the i-th value of `b` to the i-th row of `a`.
# `b` has one more element than `a` has rows, so its last value is left unused.
a = [[10, 11, 12], [20, 21, 22], [30, 31, 32]]
b = [13, 23, 33, 43]

for idx in range(min(len(a), len(b))):
    a[idx].append(b[idx])

print(a)


[[10, 11, 12, 13], [20, 21, 22, 23], [30, 31, 32, 33]]
