In [8]:
from Bio import SeqIO
import pandas as pd

In [1]:
fasta_path = "/data/projects/Resources/HumanReferenceGenome/chr22.fa"

In [9]:
def get_subsequences_with_coords(sequence, subseq_length, slide):
    """
    Extracts subsequences of specified length from a DNA sequence with a sliding window, along with their coordinates.

    :param sequence: String, the DNA sequence.
    :param subseq_length: Integer, length of each subsequence to extract.
    :param slide: Integer, the number of base pairs to slide the window each step.
    :return: List of tuples containing coordinates and subsequences.
    """
    subsequences_with_coords = []
    for i in range(0, len(sequence) - subseq_length + 1, slide):
        subsequence = sequence[i:i + subseq_length]
        coordinates = f"{i+1}-{i+subseq_length}"  # 1-based indexing
        subsequences_with_coords.append((coordinates, subsequence))
    return subsequences_with_coords

In [10]:
# Read the sequence from the FASTA file
with open(fasta_path, "r") as fasta_file:
    for record in SeqIO.parse(fasta_file, "fasta"):
        chromosome_22_sequence = str(record.seq)

In [11]:
# Get subsequences of length 90 with a sliding window of 50 along with their coordinates
subsequences_with_coords = get_subsequences_with_coords(chromosome_22_sequence, 90, 50)

In [12]:
# Create a DataFrame
df = pd.DataFrame(subsequences_with_coords, columns=["Coordinates", "Sequence"])
df

Unnamed: 0,Coordinates,Sequence
0,1-90,NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN...
1,51-140,NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN...
2,101-190,NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN...
3,151-240,NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN...
4,201-290,NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN...
...,...,...
1016363,50818151-50818240,NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN...
1016364,50818201-50818290,NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN...
1016365,50818251-50818340,NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN...
1016366,50818301-50818390,NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN...


In [13]:
df_filtered = df[~df['Sequence'].str.contains('N')]
df_filtered

Unnamed: 0,Coordinates,Sequence
210200,10510001-10510090,GAATTCTTGTGTTTATATAATAAGATGTCCTATAATTTCTGTTTGG...
210201,10510051-10510140,TAAAATCAGCAACTAATATGTATTTTCAAAGCATTATAAATACAGA...
210202,10510101-10510190,TAAGTTACTTCACTGTGAAATGTAGTCATATAAAGAACATAATAAT...
210203,10510151-10510240,CTGGATTATTTTTAAATGGGCTGTCTAACATTATATTAAAAGGTTT...
210204,10510201-10510290,AGTAATTCATTATATCAAAATGCTCCAggccaggcgtggtggctta...
...,...,...
1016163,50808151-50808240,gagggtgagggtgagggtgggttagggttagggttagggttcgggt...
1016164,50808201-50808290,gttcgggttcgggttcgggttcgggttcgggttcgggttcgggttc...
1016165,50808251-50808340,tcgggttgggttagggttaggggttaggggttaggggttagggtta...
1016166,50808301-50808390,tagggttagggttagggttagggttaggcttagggttagggtaggc...


In [14]:
def seq2kmer(seq, k):
    seq=seq.upper()
    kmer = [seq[x:x+k] for x in range(len(seq)+1-k)]
    kmers = " ".join(kmer)
    return kmers

In [15]:
df_filtered['6-mer'] = df_filtered['Sequence'].apply(lambda x: seq2kmer(x, 6))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['6-mer'] = df_filtered['Sequence'].apply(lambda x: seq2kmer(x, 6))


In [16]:
df_filtered

Unnamed: 0,Coordinates,Sequence,6-mer
210200,10510001-10510090,GAATTCTTGTGTTTATATAATAAGATGTCCTATAATTTCTGTTTGG...,GAATTC AATTCT ATTCTT TTCTTG TCTTGT CTTGTG TTGT...
210201,10510051-10510140,TAAAATCAGCAACTAATATGTATTTTCAAAGCATTATAAATACAGA...,TAAAAT AAAATC AAATCA AATCAG ATCAGC TCAGCA CAGC...
210202,10510101-10510190,TAAGTTACTTCACTGTGAAATGTAGTCATATAAAGAACATAATAAT...,TAAGTT AAGTTA AGTTAC GTTACT TTACTT TACTTC ACTT...
210203,10510151-10510240,CTGGATTATTTTTAAATGGGCTGTCTAACATTATATTAAAAGGTTT...,CTGGAT TGGATT GGATTA GATTAT ATTATT TTATTT TATT...
210204,10510201-10510290,AGTAATTCATTATATCAAAATGCTCCAggccaggcgtggtggctta...,AGTAAT GTAATT TAATTC AATTCA ATTCAT TTCATT TCAT...
...,...,...,...
1016163,50808151-50808240,gagggtgagggtgagggtgggttagggttagggttagggttcgggt...,GAGGGT AGGGTG GGGTGA GGTGAG GTGAGG TGAGGG GAGG...
1016164,50808201-50808290,gttcgggttcgggttcgggttcgggttcgggttcgggttcgggttc...,GTTCGG TTCGGG TCGGGT CGGGTT GGGTTC GGTTCG GTTC...
1016165,50808251-50808340,tcgggttgggttagggttaggggttaggggttaggggttagggtta...,TCGGGT CGGGTT GGGTTG GGTTGG GTTGGG TTGGGT TGGG...
1016166,50808301-50808390,tagggttagggttagggttagggttaggcttagggttagggtaggc...,TAGGGT AGGGTT GGGTTA GGTTAG GTTAGG TTAGGG TAGG...


In [17]:
output_path = "/data/private/pdutta/DNABERT_2"

In [18]:
df_filtered.to_csv(output_path+"/chr22_all_info.csv", sep=",", index=False)

In [20]:
df_filtered[['Sequence','6-mer']].to_csv(output_path+"/chr22_seq.csv", sep=",", index=False)