# BIOLOGICAL FEATURES

In [3]:
# ================================
#           libraries
# ================================

# %pip install -q MDTraj    # uncomment this line when running on Google Colab
import mdtraj as md
from Bio import SeqIO

import pandas as pd
import numpy as np
import scipy.ndimage as nd
import math
from tqdm import tqdm
from random import random
import plotly.graph_objects as go
import plotly.io as pio

import urllib.request
import ssl
from copy import deepcopy

In [4]:
# Remote location of the chr1 PDB file (GSDB path on calla.rnet.missouri.edu)
url = (
    'https://calla.rnet.missouri.edu/genome3d/GSDB/Database/'
    'AX9716PF/GSE105544_ENCFF010WBP/VC/LorDG/chr1.pdb'
)

In [5]:
# TLS certificate verification is switched off, but only for this one request:
# the context is handed to urlopen explicitly rather than installed globally.
ssl_context = ssl._create_unverified_context()

# Fetch the remote PDB and save the raw bytes as 'chr1.pdb' in the working directory
with urllib.request.urlopen(url, context=ssl_context) as response, \
        open('chr1.pdb', 'wb') as out_file:
    out_file.write(response.read())

# Parse the saved copy with MDTraj, then flatten its coordinates into rows of 3 values
chr_str = md.load_pdb('chr1.pdb')
coords = chr_str.xyz.reshape((-1, 3))

In [6]:
# Path to the hg38 chr1 FASTA file, relative to the working directory
fasta_file = '../data/raw/hg38_chr1_FASTA.fasta'

# Keep every parsed record and print a short banner (id + description) for each.
# The sequence text itself is deliberately not printed.
sequences = []
for record in SeqIO.parse(fasta_file, 'fasta'):
    sequences.append(record)
    print(
        '_' * 60,
        f'ID: {record.id}',
        f'Description: {record.description}',
        '_' * 60,
        sep='\n',
    )


____________________________________________________________
ID: CM000663.2
Description: CM000663.2 Homo sapiens chromosome 1, GRCh38 reference primary assembly
____________________________________________________________


In [10]:
# Module-level accumulator filled by gc_content_calculator():
#   'bin'        -> start offset of each window within the sequence
#   'gc_content' -> fraction of G/C characters in that window
# Re-running this cell resets it; calling the function repeatedly keeps appending.
gc_content = {'bin': [], 'gc_content': []}

def gc_content_calculator(sequence, window):
    """Append the GC fraction of each full, non-overlapping window to ``gc_content``.

    The sequence is walked in steps of ``window``. For every window that fits
    completely, the start offset is appended to ``gc_content['bin']`` and the
    fraction ``(count('G') + count('C')) / window`` is appended to
    ``gc_content['gc_content']``. Counting is case-insensitive because each
    fragment is upper-cased first. A trailing fragment shorter than ``window``
    is ignored.

    Parameters
    ----------
    sequence : str or any sliceable object whose slices offer ``upper()`` and ``count()``
        The nucleotide sequence to scan.
    window : int
        Window length (and step) in characters; must be positive.

    Returns
    -------
    None
        Results are accumulated in the module-level ``gc_content`` dict.

    Raises
    ------
    ValueError
        If ``window`` is not positive.
    """
    if window <= 0:
        raise ValueError(f'window must be a positive integer, got {window!r}')

    # "+ 1" keeps the last start offset at which a full window still fits,
    # so a sequence whose length is an exact multiple of `window` is fully covered.
    for i in range(0, len(sequence) - window + 1, window):
        # Slice (":"), not tuple-index (","), to extract the window
        sequence_frag = sequence[i:i + window].upper()
        gc_count = sequence_frag.count('G') + sequence_frag.count('C')
        gc_content['bin'].append(i)
        gc_content['gc_content'].append(gc_count / window)




In [11]:
# Quick check of slice syntax: show the first five characters of a toy sequence
seq = 'ATGCGCCGATATAGCGGGGCGTGGAG'

seq[:5]

'ATGCG'