In [1]:
import numpy as np
import pandas as pd
import itertools
from Bio import SeqIO

## Wczytanie i przygotowanie danych

### Pliki `.bed`

In [2]:
negative_set = pd.read_csv('negative_set.bed', delimiter='\t', header=None, names=["Chromosome", "StartPos", "EndPos"])
positive_set = pd.read_csv('GM12878.bed', delimiter='\t', header=None, names=["Chromosome", "StartPos", "EndPos", "Score"])

In [3]:
negative_set.head(5)

Unnamed: 0,Chromosome,StartPos,EndPos
0,chr10,32251,36771
1,chr10,39431,39891
2,chr10,72312,74222
3,chr10,84717,85177
4,chr10,90499,91949


In [4]:
positive_set.head(5)

Unnamed: 0,Chromosome,StartPos,EndPos,Score
0,chr1,773300,774100,7.866088
1,chr1,778980,779450,6.472419
2,chr1,800100,802000,11.010675
3,chr1,825670,826410,6.114487
4,chr1,839470,842590,8.848865


In [5]:
# dodanie etykiety do zbiorów
negative_set["IsEnhancer"] = 0
positive_set["IsEnhancer"] = 1

positive_set = positive_set.drop(columns="Score")

In [6]:
# połączenie zbiorów danych
total_set = pd.concat([negative_set, positive_set])

# usunięcie nieprawidłowych chromosomów
total_set = total_set.loc[total_set["Chromosome"].str.match(r"chr\d+")]

In [7]:
total_set

Unnamed: 0,Chromosome,StartPos,EndPos,IsEnhancer
0,chr10,32251,36771,0
1,chr10,39431,39891,0
2,chr10,72312,74222,0
3,chr10,84717,85177,0
4,chr10,90499,91949,0
...,...,...,...,...
48555,chr9,140651150,140652330,1
48556,chr9,140702130,140703100,1
48557,chr9,140703310,140704120,1
48558,chr9,140710520,140711890,1


### Plik `.fasta`

In [8]:
# odczyt pliku za pomocą biblioteki biopython

fasta_sequences = {}

with open("GRCh37.primary_assembly.genome.fa", "r") as handle:
    for record in SeqIO.parse(handle, "fasta"):
        fasta_sequences[record.id] = record.seq

In [9]:
# mamy podział na chromosomy
list(fasta_sequences.keys())[:5]

['chr1', 'chr10', 'chr11', 'chr12', 'chr13']

In [10]:
fasta_sequences["chr1"]

Seq('NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN...NNN')

In [11]:
# na podstawie danych z plików .bed
# wybieramy odpowiednie fragmenty DNA

dna_sequence_list = []

for index, data in total_set.iterrows():
    chromosome = data["Chromosome"]
    start_pos = data["StartPos"]
    end_pos = data["EndPos"]
    dna_sequence = str(fasta_sequences[chromosome][start_pos:end_pos])
    dna_sequence_list.append(dna_sequence)

total_set["DNA sequence"] = dna_sequence_list

In [12]:
total_set.head(5)

Unnamed: 0,Chromosome,StartPos,EndPos,IsEnhancer,DNA sequence
0,chr10,32251,36771,0,NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN...
1,chr10,39431,39891,0,NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN...
2,chr10,72312,74222,0,CTGCCACCGTGCCTGGCTAATTTCTGTATTTTTAGTAGAGAGGGGG...
3,chr10,84717,85177,0,AGAAGTTGAATCTCTGAATAGACCAATAACAGGATCTGAAATTGTG...
4,chr10,90499,91949,0,AGGTCAGGTGATCTGCAGCCAACCAAGCAGCTGCTAAGTGGCCAAC...


In [13]:
# usuwanie rekordów z 'N' w sekwencji DNA
total_set = total_set[~total_set["DNA sequence"].str.contains("N")]

### k-mery

In [14]:
# tutaj trzeba policzyć k-mery
# i sconcatować kolumny do tej ramki

total_set

Unnamed: 0,Chromosome,StartPos,EndPos,IsEnhancer,DNA sequence
2,chr10,72312,74222,0,CTGCCACCGTGCCTGGCTAATTTCTGTATTTTTAGTAGAGAGGGGG...
3,chr10,84717,85177,0,AGAAGTTGAATCTCTGAATAGACCAATAACAGGATCTGAAATTGTG...
4,chr10,90499,91949,0,AGGTCAGGTGATCTGCAGCCAACCAAGCAGCTGCTAAGTGGCCAAC...
5,chr10,104967,107977,0,TTTTCCCCACTTGGTATGATACTAGCTGTGGGTCTGTCATATATGA...
6,chr10,114029,114779,0,GAGAGCTGTTAAAGCACTATGTGTGGGTATGTCTGTGAAGGTGTTT...
...,...,...,...,...,...
48555,chr9,140651150,140652330,1,TTGGGGTAGAGATTTCATCTTTGTCATCTTGGCTGTAATGTGCAAA...
48556,chr9,140702130,140703100,1,TGTATTTTTAGTAGAGGCGGGGTTTCACCACGTTGCCCAGGCTGGT...
48557,chr9,140703310,140704120,1,GTTGCAGTGAACCAAGATCGTGCCAACTGCATTCCAGCCTGGGTGA...
48558,chr9,140710520,140711890,1,GGCAGCTTCCTGCCGGAGCCCCACATTCTGCTCGTATTAGCACGTA...


In [15]:
def Kmers_cols(column, k):
    kmers = itertools.product('ACTG', repeat=k)
    df = pd.DataFrame()
    for i in kmers:
        kmer = ''.join(i)
        rev_kmer = kmer.replace('A', '%temp%').replace('T', 'A').replace('%temp%', 'T').replace('C', '%temp%').replace('G', 'C').replace('%temp%', 'G')
        if(rev_kmer not in df.columns):
            df[kmer] = column.str.count(kmer)/(len(column.iloc[0])-k+1)
        else:
            df[rev_kmer] += column.str.count(kmer)/(len(column.iloc[0])-k+1)
    return df

In [16]:
# liczy się do dosyć wolno

import warnings

with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    kmers = Kmers_cols(total_set["DNA sequence"].head(100), k=4)
    
kmers

Unnamed: 0,AAAA,AAAC,AAAT,AAAG,AACA,AACC,AACT,AACG,AATA,AATC,...,CGCT,CGCG,CGTA,CGTC,CGTT,CGTG,CGGA,CGGC,CGGT,CGGG
2,0.018353,0.012061,0.019927,0.012061,0.008390,0.008390,0.008915,0.003671,0.013110,0.006293,...,0.004195,0.002098,0.003671,0.004719,0.006817,0.006817,0.008915,0.004195,0.006817,0.006817
3,0.005768,0.001573,0.004195,0.003146,0.002098,0.002098,0.002622,0.000000,0.006817,0.004719,...,0.000000,0.000000,0.001049,0.001049,0.003146,0.000524,0.000524,0.000524,0.000524,0.000000
4,0.002098,0.003671,0.001049,0.005244,0.004719,0.006817,0.005244,0.003146,0.002098,0.001573,...,0.001573,0.000000,0.003146,0.005768,0.002622,0.004195,0.006293,0.001049,0.005244,0.005768
5,0.042475,0.027268,0.039853,0.026744,0.024122,0.014158,0.015732,0.008390,0.026744,0.009439,...,0.000524,0.000524,0.006293,0.004195,0.007341,0.006293,0.003671,0.001049,0.004719,0.001573
6,0.002098,0.002098,0.003671,0.004195,0.001573,0.002098,0.002098,0.001049,0.005768,0.002622,...,0.000000,0.000000,0.002098,0.001573,0.002622,0.001049,0.001573,0.001573,0.000524,0.002098
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
97,0.019402,0.009439,0.015732,0.010488,0.007866,0.005768,0.005244,0.006293,0.008390,0.005244,...,0.000524,0.001573,0.003146,0.005768,0.002622,0.004195,0.003671,0.000000,0.002098,0.002098
98,0.002098,0.000524,0.003671,0.001573,0.003146,0.001573,0.000524,0.001049,0.000524,0.002098,...,0.001049,0.000000,0.000524,0.000524,0.002098,0.003671,0.000000,0.002098,0.000524,0.000524
99,0.005768,0.004719,0.009439,0.004195,0.006817,0.003671,0.002622,0.003146,0.009439,0.006293,...,0.000000,0.000000,0.003671,0.003146,0.001049,0.001573,0.002098,0.000000,0.001049,0.000524
100,0.004195,0.002098,0.003671,0.002622,0.003146,0.000524,0.000000,0.000524,0.005768,0.002622,...,0.000000,0.000524,0.000000,0.001049,0.001049,0.002622,0.003146,0.001049,0.002622,0.001049


In [17]:
# podział na zbiory treningowy i testowy

test_set = total_set.loc[total_set["Chromosome"].isin(["chr1", "chr14", "chr21"])]
trening_set = total_set.loc[~total_set["Chromosome"].isin(["chr1", "chr14", "chr21"])]