In [5]:
def fix_line_breaks(input_file, output_file):
    """Rewrite a FASTA file so every record's sequence sits on a single line.

    Header lines (those starting with ">") are copied through; all lines
    between two headers are stripped of surrounding whitespace and joined
    into one line.  Every emitted line is newline-terminated, so a header
    can never end up glued to the end of the preceding sequence.

    Args:
        input_file: Path of the FASTA file to read.
        output_file: Path of the file to write.  The input is read
            completely before the output is opened, so both paths may
            point at the same file.
    """
    corrected_lines = []
    fragments = []  # stripped pieces of the record currently being read

    def flush_sequence():
        # Emit the collected pieces as one newline-terminated line.
        if fragments:
            corrected_lines.append("".join(fragments) + "\n")
            fragments.clear()

    with open(input_file, 'r') as file:
        for line in file:
            if line.startswith(">"):
                # A new header closes the previous record.
                flush_sequence()
                corrected_lines.append(line if line.endswith("\n") else line + "\n")
            else:
                piece = line.strip()
                if piece:  # skip blank lines instead of collecting empty strings
                    fragments.append(piece)

    # The last record has no following header to trigger the flush.
    flush_sequence()

    with open(output_file, 'w') as file:
        file.writelines(corrected_lines)

# Disabled alternative: run the same clean-up on the chr2-only file.
#fix_line_breaks('/data/jenuwein/processing/20230412_Alecia_PoreC/chr2_withInsertion.fa', '/data/jenuwein/processing/20230412_Alecia_PoreC/chr2_withInsertion.fixed.fa')

# Put each sequence of mm10.PlusInsertion.fa on a single line.
fix_line_breaks(
    input_file='/data/jenuwein/processing/20230412_Alecia_PoreC/mm10.PlusInsertion.fa',
    output_file='/data/jenuwein/processing/20230412_Alecia_PoreC/mm10.PlusInsertion.fixed.fa',
)


In [4]:
def exchange_sequences(input_file, sequence_name1, sequence_file2, sequence_name2, output_file):
    """Replace one record of a FASTA file with the sequence from another file.

    The record is located by its header.  A header matches when its text
    (without the leading ">") equals ``sequence_name1`` or when its first
    whitespace-separated field equals ``sequence_name1``; only if no header
    matches that way does the function fall back to a plain prefix match.
    This keeps a name such as "1" from selecting ">10".

    The whole record is replaced, i.e. the header and every line up to the
    next header (or the end of the file), so wrapped multi-line sequences
    are removed completely.

    Args:
        input_file: Path of the FASTA file that contains the record to replace.
        sequence_name1: Header text (or leading ID) of the record to replace.
        sequence_file2: Path of the file holding the replacement sequence.
            A leading header line, if present, is discarded.
        sequence_name2: Header text (without ">") written for the new record.
        output_file: Path of the file to write.

    Returns:
        None.  If no header matches, a message is printed and no output
        file is written.
    """
    with open(input_file, 'r') as file:
        lines = file.readlines()

    header_indices = [i for i, line in enumerate(lines) if line.startswith(">")]

    def is_exact(index):
        # Full header text or its first field equals the requested name.
        text = lines[index][1:].strip()
        fields = text.split()
        return text == sequence_name1.strip() or (bool(fields) and fields[0] == sequence_name1)

    def is_prefix(index):
        # Fallback: the looser prefix comparison.
        return lines[index].startswith(">" + sequence_name1)

    index1 = next((i for i in header_indices if is_exact(i)), None)
    if index1 is None:
        index1 = next((i for i in header_indices if is_prefix(i)), None)

    if index1 is None:
        print("Sequence 1 not found in the input file.")
        return

    # The record ends where the next header starts, or at the end of the file.
    end1 = next((i for i in header_indices if i > index1), len(lines))

    with open(sequence_file2, 'r') as file:
        sequence2_lines = file.readlines()

    # Drop the replacement file's own header; keep everything if it has none.
    if sequence2_lines and sequence2_lines[0].startswith(">"):
        sequence2_lines = sequence2_lines[1:]

    sequence2_entry = [">" + sequence_name2 + "\n"] + sequence2_lines
    # Terminate the last line so the following header starts on its own line.
    if not sequence2_entry[-1].endswith("\n"):
        sequence2_entry[-1] += "\n"

    lines[index1:end1] = sequence2_entry

    with open(output_file, 'w') as file:
        file.writelines(lines)

# Usage example: swap the record with the given chromosome-2 header in
# genome.fa for the sequence in chr2.withInsertFinal.fa, renamed to "2".
exchange_sequences(
    input_file='/data/repository/organisms/GRCm38_ensembl/genome_fasta/genome.fa',
    sequence_name1='2 dna_sm:chromosome chromosome:GRCm38:2:1:182113224:1 REF',
    sequence_file2='/data/jenuwein/processing/20230412_Alecia_PoreC/chr2.withInsertFinal.fa',
    sequence_name2='2',
    output_file='/data/jenuwein/processing/20230412_Alecia_PoreC/mm10.PlusInsertion.fa',
)

In [9]:
#split fasta
import os

def split_fasta(input_file, output_directory):
    """Split a multi-record FASTA file into one file per record.

    Each record is written to ``<output_directory>/<name>.fasta`` where
    ``<name>`` is the first whitespace-separated field of the header.  The
    name of every record is printed as it is written.  Lines that appear
    before the first header are ignored.

    Args:
        input_file: Path of the FASTA file to split.
        output_directory: Directory that receives the per-record files; it
            is created if it does not exist.
    """
    sequence_name = None
    sequence_lines = []

    # Stream the input so only one record is held in memory at a time.
    with open(input_file, 'r') as file:
        for line in file:
            if line.startswith(">"):
                # A new header closes the previous record.
                if sequence_name is not None:
                    print(sequence_name)
                    write_sequence_file(sequence_name, sequence_lines, output_directory)

                # split() without an argument also removes the trailing
                # newline, so headers without a description yield a clean name.
                fields = line[1:].split()
                sequence_name = fields[0] if fields else ""
                sequence_lines = []
            else:
                sequence_lines.append(line)

    # The last record has no following header to trigger the write.
    if sequence_name is not None:
        print(sequence_name)
        write_sequence_file(sequence_name, sequence_lines, output_directory)

def write_sequence_file(sequence_name, sequence_lines, output_directory):
    """Write one FASTA record to ``<output_directory>/<sequence_name>.fasta``.

    Args:
        sequence_name: Record name; used for the header line and the filename.
        sequence_lines: Sequence lines, written unchanged (they are expected
            to carry their own line endings).
        output_directory: Target directory; created if it does not exist.
    """
    os.makedirs(output_directory, exist_ok=True)

    output_file = os.path.join(output_directory, sequence_name + ".fasta")
    with open(output_file, 'w') as file:
        file.write(">" + sequence_name + "\n")
        file.writelines(sequence_lines)

# Usage example: write every record of genome.fa to its own file in split_fasta/.
split_fasta(
    input_file='/data/repository/organisms/GRCm38_ensembl/genome_fasta/genome.fa',
    output_directory='/data/jenuwein/processing/20230412_Alecia_PoreC/split_fasta/',
)

1
10
11
12
13
14
15
16
17
18
19
2
3
4
5
6
7
8
9
MT
X
Y
JH584299.1
GL456233.1
JH584301.1
GL456211.1
GL456350.1
JH584293.1
GL456221.1
JH584297.1
JH584296.1
GL456354.1
JH584294.1
JH584298.1
JH584300.1
GL456219.1
GL456210.1
JH584303.1
JH584302.1
GL456212.1
JH584304.1
GL456379.1
GL456216.1
GL456393.1
GL456366.1
GL456367.1
GL456239.1
GL456213.1
GL456383.1
GL456385.1
GL456360.1
GL456378.1
GL456389.1
GL456372.1
GL456370.1
GL456381.1
GL456387.1
GL456390.1
GL456394.1
GL456392.1
GL456382.1
GL456359.1
GL456396.1
GL456368.1
JH584292.1


In [14]:
#change line length
def change_line_length(input_file, output_file, line_length=60):
    """Re-wrap the sequence lines of a FASTA file to a fixed width.

    Header lines are copied through.  The sequence lines of each record are
    stripped, joined and split again into chunks of ``line_length``
    characters; the last chunk of a record may be shorter.  Records are
    handled one by one, so every sequence stays directly below its own
    header when the file contains several records.

    Args:
        input_file: Path of the FASTA file to read.
        output_file: Path of the file to write.
        line_length: Number of sequence characters per output line.

    Raises:
        ValueError: If ``line_length`` is smaller than 1.
    """
    if line_length < 1:
        raise ValueError("line_length must be a positive integer")

    with open(input_file, 'r') as file:
        lines = file.readlines()

    new_lines = []
    fragments = []  # stripped pieces of the record currently being read

    def flush_sequence():
        # Re-wrap what was collected for the current record and reset.
        sequence = "".join(fragments)
        fragments.clear()
        new_lines.extend(
            sequence[i:i + line_length] + '\n'
            for i in range(0, len(sequence), line_length)
        )

    for line in lines:
        if line.startswith(">"):
            # Emit the previous record's sequence before the next header.
            flush_sequence()
            new_lines.append(line if line.endswith('\n') else line + '\n')
        else:
            fragments.append(line.strip())

    # The last record has no following header to trigger the flush.
    flush_sequence()

    with open(output_file, 'w') as file:
        file.writelines(new_lines)

# Usage example: re-wrap chr2_plusInsertion.fa to 60 characters per line.
change_line_length(
    input_file='/data/jenuwein/processing/20230412_Alecia_PoreC/customeGenome/chr2_plusInsertion.fa',
    output_file='/data/jenuwein/processing/20230412_Alecia_PoreC/customeGenome/chr2_plusInsertion.FINAL.fa',
    line_length=60,
)

In [19]:
def check_length(input_file, line_length=60):
    """Report the first sequence line of a FASTA file with a wrong width.

    Prints "continue" for every header line.  A sequence line is a problem
    when it is longer than ``line_length``, or when it is shorter and is
    followed by another sequence line of the same record.  The last line of
    a record may be shorter, because a sequence length is rarely a multiple
    of the line width.  On the first problem the function prints
    "we got problems" followed by the 1-based line number of the offending
    line and stops scanning.

    Args:
        input_file: Path of the FASTA file to check.
        line_length: Expected number of characters per sequence line.
    """
    # Line number of a short line that is only valid if it ends its record.
    short_line = None

    with open(input_file, 'r') as file:
        for counter, line in enumerate(file, start=1):
            if line.startswith(">"):
                print("continue")
                # The previous record ended; its short last line was fine.
                short_line = None
                continue

            width = len(line.strip())
            if short_line is not None:
                # The earlier short line was not the last line of its record.
                problem = short_line
            elif width > line_length:
                problem = counter
            else:
                problem = None
                if width < line_length:
                    short_line = counter

            if problem is not None:
                print("we got problems")
                print(str(problem))
                break

# Disabled alternative: check the re-wrapped chr2 file instead.
#check_length('/data/jenuwein/processing/20230412_Alecia_PoreC/customeGenome/chr2_plusInsertion.FINAL.fa')

# Check the line widths of the chromosome-2 file produced by split_fasta.
check_length(
    input_file='/data/jenuwein/processing/20230412_Alecia_PoreC/split_fasta/2.fasta',
)

continue
we got problems
3035291
