In [1]:
from PIL import Image
import shutil
import re
import glob
import subprocess
import os
import fitz  # PyMuPDF
import template_matching

templates = "templates/"

In [None]:

def convert_pdf_to_jpg(pdf_path, output_dir, file_name, start_page, end_page, dpi=300):
    """
    Convert specified pages of a PDF to JPG images.

    Args:
        pdf_path (str): Path to the input PDF file.
        output_dir (str): Directory to save the output images.
        start_page (int): Start page number (1-based index).
        end_page (int): End page number (1-based index, inclusive).
        dpi (int): Resolution of the output images in dots per inch.
    """
    # Ensure the output directory exists
    os.makedirs(output_dir, exist_ok=True)

    # Open the PDF file
    pdf_file = fitz.open(pdf_path)

    # Convert pages to images
    for page_num in range(start_page - 1, end_page):  # Adjust for 0-based indexing
        page = pdf_file[page_num]
        pix = page.get_pixmap(dpi=dpi)  # Generate pixmap with specified DPI

        # Define the output file path
        output_file_path = os.path.join(output_dir, f"{file_name}_page_{page_num + 1}.jpg")

        # Save as JPG
        pix.pil_save(output_file_path, format="JPEG")
        print(f"Saved page {page_num + 1} as image: {output_file_path}")

    # Close the PDF file
    pdf_file.close()

# Carvykty (Oezdemirli) Vector

## Create JPG from PDF

In [29]:
#Ozdemirli_2024_Indolent-CD4+_Supplement
pdf_path = "Ozdemirli_2024_Indolent-CD4+_Supplement.pdf"
# Example: Convert pages 5 to 8 of a PDF to JPG

# Ensure the output directory exists
output_dir = "ciltacel_oezdemirli_output"
os.makedirs(output_dir, exist_ok=True)

convert_pdf_to_jpg(
    pdf_path=pdf_path,
    output_dir=output_dir,
    file_name="ciltacel_ozde",
    start_page=6,
    end_page=8,
    dpi=500  # High-quality output
)

Saved page 6 as image: ciltacel_oezdemirli_output/ciltacel_ozde_page_6.jpg
Saved page 7 as image: ciltacel_oezdemirli_output/ciltacel_ozde_page_7.jpg
Saved page 8 as image: ciltacel_oezdemirli_output/ciltacel_ozde_page_8.jpg


Additional rectangles were added to cover the annotations (because they include additional A, T, C, Gs). Otherwise this would lead to additional matched by the template matching script.

## Get Sequences from image

In [6]:
# Define commands dynamically based on parameters
output_dir = "ciltacel_oezdemirli_output"
def generate_commands(file_name, pages, templates, template_strings, scales, eps, threshold, sequences):
    commands = []
    for page, sequence in zip(pages, sequences):
        commands.append({
            "description": f"{file_name} Page {page['page_number']}",
            "cmd": [
                "python", "template_matching.py",
                "--image_path", f"ciltacel_oezdemirli_output/{file_name}_page_{page['page_number']}.jpg",
                "--templates", *templates,
                "--template_strings", *template_strings,
                "--scales", str(scales[0]), str(scales[1]), str(scales[2]),
                "--eps", str(eps),
                "--threshold", str(threshold),
                "--output_image", f"{output_dir}/{file_name}_page_{page['page_number']}_highlighted.jpg",
                "--output_text", f"{output_dir}/{file_name}_part{page['part_number']}.txt",
                "--start_strings", sequence["start"],
                "--end_strings", sequence["end"]
            ]
        })
    return commands


# Configuration for pages
file_name = "ciltacel_ozde"
pages = [
    {"page_number": 6, "part_number": 1},
    {"page_number": 7, "part_number": 2},
    {"page_number": 8, "part_number": 3}
]
sequences = [
    {"start": "TGGAAGGG", "end": "ATTCTTG"},
    {"start": "ATCCCGAGCTT", "end": "GGGCCAGAACCAGCTCTATAAC"},
    {"start": "GAGCTCAATCTA", "end": "AAAATCTCTAGCA"}
]

# Shared configuration
templates = ["templates/A3.jpg", "templates/G3.jpg", "templates/C3.jpg", "templates/T3.jpg"]
template_strings = ["A", "G", "C", "T"]
scales = (0.8, 1.1, 50)   # (start, end, steps)
eps = 30
threshold = 0.85

# Generate commands
commands = generate_commands(file_name, pages, templates, template_strings, scales, eps, threshold, sequences)

# Launch all processes in the background
processes = []
for cmd_info in commands:
    process = subprocess.Popen(cmd_info["cmd"], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    processes.append((cmd_info["description"], process))
    print(f"{cmd_info['description']} is running in background with PID: {process.pid}")

# Wait for all processes to complete
for description, process in processes:
    stdout, stderr = process.communicate()
    if process.returncode == 0:
        print(f"{description} completed successfully.")
    else:
        print(f"{description} failed with error:\n{stderr.decode()}")

ciltacel_ozde Page 6 is running in background with PID: 2570616
ciltacel_ozde Page 7 is running in background with PID: 2570617
ciltacel_ozde Page 8 is running in background with PID: 2570618
ciltacel_ozde Page 6 completed successfully.
ciltacel_ozde Page 7 completed successfully.
ciltacel_ozde Page 8 completed successfully.


## Compare sequences

In [7]:
output_dir = "ciltacel_oezdemirli_output/"
# Define input pattern and output file name
input_pattern = output_dir + "ciltacel_ozde*.txt"
output_file = output_dir + "ciltacel_ozde_combined.txt"

# Find all matching files
files = sorted(glob.glob(input_pattern))

# Concatenate the contents of all files
with open(output_file, "w") as outfile:
    outfile.write(">ciltacel_ozde")
    outfile.write("\n")
    for file in files:
        with open(file, "r") as infile:
            content = infile.read()
            outfile.write(content)  # Add newline between parts for clarity

print(f"All ciltacel_ozde_part* files have been concatenated into {output_file}.")

# Read the combined sequence file and extract the vector sequence
with open(output_file, 'r') as combined_file:
    combined_lines = combined_file.readlines()
    vector_sequence = ''.join(combined_lines[1:]).strip()  # Extract sequence from the second line onward

# Step 4: Compare with the FASTA sequence
fasta_file_path = '../Vector_systems/Sequences/Carvykty_Vector.fasta'
with open(fasta_file_path, 'r') as fasta_file:
    lines = fasta_file.readlines()
    fasta_sequence = ''.join(lines[1:]).strip()  # Extract sequence from FASTA file

# Step 5: Compare sequences and output results
if fasta_sequence.upper() == vector_sequence.upper():
    print("The sequences are identical.")
else:
    print("The sequences are different.")
    print(f"Vector Sequence (Extracted): {vector_sequence[:50]}...{vector_sequence[-50:]}")
    print(f"FASTA Sequence: {fasta_sequence[:50]}...{fasta_sequence[-50:]}")

# Optional: Save the new vector sequence to a file
output_fasta_path = 'new_vector_sequence.fasta'
with open(output_fasta_path, 'w') as new_file:
    new_file.write('>New_Vector_Sequence\n')
    new_file.write(vector_sequence + "\n")
    new_file.write('>Old_Vector_Sequence\n')
    new_file.write(fasta_sequence + "\n")

print(f"New vector sequence saved to {output_fasta_path}.")

All ciltacel_ozde_part* files have been concatenated into ciltacel_oezdemirli_output/ciltacel_ozde_combined.txt.
The sequences are identical.
New vector sequence saved to new_vector_sequence.fasta.


# Carvykti Patent (US20230270786A1) 

## Create JPG from PDF	

In [3]:
# Ensure the output directory exists
output_dir = "ciltacel_patent_US20230270786A1_output"
os.makedirs(output_dir, exist_ok=True)

pdf_document = "US20230270786A1.pdf"
file_name = "Ciltacel_patent_US20230270786A1"
start_page = 140
end_page = 141
dpi = 300

# Use the function to convert specified pages to JPG images
convert_pdf_to_jpg(pdf_path=pdf_document, 
                   output_dir=output_dir, 
                   file_name=file_name, 
                   start_page=start_page, 
                   end_page=end_page, 
                   dpi=dpi)

Saved page 140 as image: ciltacel_patent_US20230270786A1_output/Ciltacel_patent_US20230270786A1_page_140.jpg
Saved page 141 as image: ciltacel_patent_US20230270786A1_output/Ciltacel_patent_US20230270786A1_page_141.jpg


## Get Sequences from image

In [7]:
import os
import subprocess

# Define commands dynamically based on parameters
def generate_commands(file_name, start_page, end_page, templates, template_strings, scales, eps, threshold, sequences):
    commands = []
    for page_number, sequence in zip(range(start_page, end_page + 1), sequences):
        commands.append({
            "description": f"{file_name} Page {page_number}",
            "cmd": [
                "python", "template_matching.py",
                "--image_path", f"{output_dir}/{file_name}_page_{page_number}.jpg",
                "--templates", *templates,
                "--template_strings", *template_strings,
                "--scales", str(scales[0]), str(scales[1]), str(scales[2]),
                "--eps", str(eps),
                "--threshold", str(threshold),
                "--output_image", f"{output_dir}/{file_name}_page_{page_number}_highlighted.jpg",
                "--output_text", f"{output_dir}/{file_name}_part{page_number - start_page + 1}.txt",
                "--start_strings", sequence["start"],
                "--end_strings", sequence["end"]
            ]
        })
    return commands

# Configuration
file_name = "ciltacel_patent_US20230270786A1_output"
start_page = 140
end_page = 141

# Sequences for the corresponding pages
sequences = [
    {"start": "ATGGCTCT", "end": "GCAGATG"},  
    {"start": "AACTCCCTGA", "end": "TCGCTAA"} 
]

# Shared configuration
templates = ["templates/a5.jpg", "templates/g5.jpg", "templates/c5.jpg", "templates/t5.jpg"]
template_strings = ["A", "G", "C", "T"]
scales = (0.8, 0.95, 20)  # (start, end, steps)
eps = 20
threshold = 0.85

# Generate commands
commands = generate_commands(file_name, start_page, end_page, templates, template_strings, scales, eps, threshold, sequences)
print(commands)
# Launch all processes in the background
# processes = []
# for cmd_info in commands:
#     process = subprocess.Popen(cmd_info["cmd"], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
#     processes.append((cmd_info["description"], process))
#     print(f"{cmd_info['description']} is running in background with PID: {process.pid}")


[{'description': 'ciltacel_patent_US20230270786A1_output Page 140', 'cmd': ['python', 'template_matching.py', '--image_path', 'ciltacel_patent_US20230270786A1_output/ciltacel_patent_US20230270786A1_output_page_140.jpg', '--templates', 'templates/a5.jpg', 'templates/g5.jpg', 'templates/c5.jpg', 'templates/t5.jpg', '--template_strings', 'A', 'G', 'C', 'T', '--scales', '0.8', '0.95', '20', '--eps', '20', '--threshold', '0.85', '--output_image', 'ciltacel_patent_US20230270786A1_output/ciltacel_patent_US20230270786A1_output_page_140_highlighted.jpg', '--output_text', 'ciltacel_patent_US20230270786A1_output/ciltacel_patent_US20230270786A1_output_part1.txt', '--start_strings', 'ATGGCTCT', '--end_strings', 'GCAGATG']}, {'description': 'ciltacel_patent_US20230270786A1_output Page 141', 'cmd': ['python', 'template_matching.py', '--image_path', 'ciltacel_patent_US20230270786A1_output/ciltacel_patent_US20230270786A1_output_page_141.jpg', '--templates', 'templates/a5.jpg', 'templates/g5.jpg', 'temp

In [11]:
output_dir = "ciltacel_patent_US20230270786A1_output/"
os.makedirs(output_dir, exist_ok=True)

# Define input pattern and output file name
input_pattern = output_dir + "ciltacel_patent_US20230270786A1_part*.txt"
output_file = output_dir + "ciltacel_patent_US20230270786A1_combined.txt"

# Find all matching files
files = sorted(glob.glob(input_pattern))
print(files)

# Concatenate the contents of all files
with open(output_file, "w") as outfile:
    outfile.write(">Ciltacel_patent_US20230270786A1")
    outfile.write("\n")
    for file in files:
        with open(file, "r") as infile:
            content = infile.read()
            outfile.write(content)  # Add newline between parts for clarity

print(f"All Ciltacel_patent* files have been concatenated into {output_file}.")


['ciltacel_patent_US20230270786A1_output/ciltacel_patent_US20230270786A1_part1.txt', 'ciltacel_patent_US20230270786A1_output/ciltacel_patent_US20230270786A1_part2.txt']
All Ciltacel_patent* files have been concatenated into ciltacel_patent_US20230270786A1_output/ciltacel_patent_US20230270786A1_combined.txt.


# Carvykty Patent

## Create JPG from PDF

In [14]:
# Ensure the output directory exists
output_dir = "ciltacel_patent_output"
os.makedirs(output_dir, exist_ok=True)

pdf_document = "WO2022116086A1.pdf"
file_name = "Ciltacel_patent"
start_page = 82
end_page = 83
dpi = 300

# Use the function to convert specified pages to JPG images
convert_pdf_to_jpg(pdf_path=pdf_document, 
                   output_dir=output_dir, 
                   file_name=file_name, 
                   start_page=start_page, 
                   end_page=end_page, 
                   dpi=dpi)

Saved page 82 as image: ciltacel_patent_output/Ciltacel_patent_page_82.jpg
Saved page 83 as image: ciltacel_patent_output/Ciltacel_patent_page_83.jpg


Ciltacel_patent Page 140 is running in background with PID: 1826901
Ciltacel_patent Page 141 is running in background with PID: 1826902


## Get Sequences from image

In [None]:
import os
import subprocess

# Define commands dynamically based on parameters
def generate_commands(file_name, start_page, end_page, templates, template_strings, scales, eps, threshold, sequences):
    commands = []
    for page_number, sequence in zip(range(start_page, end_page + 1), sequences):
        commands.append({
            "description": f"{file_name} Page {page_number}",
            "cmd": [
                "python", "template_matching.py",
                "--image_path", f"{output_dir}/{file_name}_page_{page_number}.jpg",
                "--templates", *templates,
                "--template_strings", *template_strings,
                "--scales", str(scales[0]), str(scales[1]), str(scales[2]),
                "--eps", str(eps),
                "--threshold", str(threshold),
                "--output_image", f"{output_dir}/{file_name}_page_{page_number}_highlighted.jpg",
                "--output_text", f"{output_dir}/{file_name}_part{page_number - start_page + 1}.txt",
                "--start_strings", sequence["start"],
                "--end_strings", sequence["end"]
            ]
        })
    return commands

# Configuration
file_name = "Ciltacel_patent"
start_page = 82
end_page = 83

# Sequences for the corresponding pages
sequences = [
    {"start": "ATGGCTCT", "end": "TGAACTG"},  # Sequence for page 81
    {"start": "AGAGTGAAG", "end": "TCGCTAA"}   # Sequence for page 82
]

# Shared configuration
templates = ["templates/A4.jpg", "templates/G4.jpg", "templates/C4.jpg", "templates/T4.jpg"]
template_strings = ["A", "G", "C", "T"]
scales = (0.9, 1.0, 20)  # (start, end, steps)
eps = 20
threshold = 0.85

# Generate commands
commands = generate_commands(file_name, start_page, end_page, templates, template_strings, scales, eps, threshold, sequences)

# Launch all processes in the background
processes = []
for cmd_info in commands:
    process = subprocess.Popen(cmd_info["cmd"], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    processes.append((cmd_info["description"], process))
    print(f"{cmd_info['description']} is running in background with PID: {process.pid}")


Ciltacel_patent Page 82 is running in background with PID: 2315161
Ciltacel_patent Page 83 is running in background with PID: 2315162


## Compare Sequences

In [33]:
output_dir = "ciltacel_patent_output/"
os.makedirs(output_dir, exist_ok=True)

# Define input pattern and output file name
input_pattern = output_dir + "Ciltacel_patent_part*.txt"
output_file = output_dir + "Ciltacel_patent_combined.txt"

# Find all matching files
files = sorted(glob.glob(input_pattern))
print(files)

# Concatenate the contents of all files
with open(output_file, "w") as outfile:
    outfile.write(">Ciltacel_patent")
    outfile.write("\n")
    for file in files:
        with open(file, "r") as infile:
            content = infile.read()
            outfile.write(content)  # Add newline between parts for clarity

print(f"All Ciltacel_patent* files have been concatenated into {output_file}.")

# Read the combined sequence file and extract the vector sequence
with open(output_file, 'r') as combined_file:
    combined_lines = combined_file.readlines()
    vector_sequence = ''.join(combined_lines[1:]).strip()  # Extract sequence from the second line onward

# Step 4: Compare with the FASTA sequence
fasta_file_path = '../CAR_constructs/Sequences/Ciltacel_patent.fasta'
with open(fasta_file_path, 'r') as fasta_file:
    lines = fasta_file.readlines()
    fasta_sequence = ''.join(lines[1:]).strip()  # Extract sequence from FASTA file

# Step 5: Compare sequences and output results
if fasta_sequence.upper() == vector_sequence.upper():
    print("The sequences are identical.")
else:
    print("The sequences are different.")

['ciltacel_patent_output/Ciltacel_patent_part1.txt', 'ciltacel_patent_output/Ciltacel_patent_part2.txt']
All Ciltacel_patent* files have been concatenated into ciltacel_patent_output/Ciltacel_patent_combined.txt.
The sequences are identical.


# Idecel

## Create JPG from PDF

In [34]:
# Define parameters

# Ensure the output directory exists
output_dir = "idecel_output"
os.makedirs(output_dir, exist_ok=True)

pdf_document = "WO2021091978A1.pdf"
file_name = "Idecel"
start_page = 38
end_page = 44
dpi = 300

# Use the function to convert specified pages to JPG images
convert_pdf_to_jpg(pdf_path=pdf_document, 
                   output_dir=output_dir, 
                   file_name=file_name, 
                   start_page=start_page, 
                   end_page=end_page, 
                   dpi=dpi)

Saved page 37 as image: idecel_output/Idecel_page_37.jpg
Saved page 38 as image: idecel_output/Idecel_page_38.jpg
Saved page 39 as image: idecel_output/Idecel_page_39.jpg
Saved page 40 as image: idecel_output/Idecel_page_40.jpg
Saved page 41 as image: idecel_output/Idecel_page_41.jpg
Saved page 42 as image: idecel_output/Idecel_page_42.jpg
Saved page 43 as image: idecel_output/Idecel_page_43.jpg
Saved page 44 as image: idecel_output/Idecel_page_44.jpg


## Get Sequences from image

In [37]:
# Function to dynamically generate commands
def generate_commands(file_name, start_page, end_page, templates, template_strings, scales, eps, threshold, sequences):
    commands = []
    for page_number, sequence in zip(range(start_page, end_page + 1), sequences):
        commands.append({
            "description": f"{file_name} Page {page_number}",
            "cmd": [
                "python", "template_matching.py",
                "--image_path", f"{output_dir}/{file_name}_page_{page_number}.jpg",
                "--templates", *templates,
                "--template_strings", *template_strings,
                "--scales", *map(str, scales),
                "--eps", str(eps),
                "--threshold", str(threshold),
                "--output_image", f"{output_dir}/{file_name}_page_{page_number}_highlighted.jpg",
                "--output_text", f"{output_dir}/{file_name}_part{page_number - start_page + 1}.txt",
                "--start_strings", sequence["start"],
                "--end_strings", sequence["end"]
            ]
        })
    return commands

# Configuration
file_name = "Ciltacel_patent"
output_dir = "idecel_output"
start_page = 82
end_page = 83

# Sequences for the corresponding pages
sequences = [
    {"start": "ATGGCTCT", "end": "TGAACTG"},  # Sequence for page 82
    {"start": "AGAGTGAAG", "end": "TCGCTAA"}  # Sequence for page 83
]

# Shared configuration
templates = ["templates/A4.jpg", "templates/G4.jpg", "templates/C4.jpg", "templates/T4.jpg"]
template_strings = ["A", "G", "C", "T"]
scales = (0.9, 1.0, 20)  # (start, end, steps)
eps = 20
threshold = 0.85

# Generate commands
commands = generate_commands(file_name, start_page, end_page, templates, template_strings, scales, eps, threshold, sequences)

# Launch all processes in the background
processes = []
for cmd_info in commands:
    process = subprocess.Popen(cmd_info["cmd"], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    processes.append((cmd_info["description"], process))
    print(f"{cmd_info['description']} is running in background with PID: {process.pid}")


Vector Page 42 is running in background with PID: 2345356


## Compare Sequences

In [42]:
import glob

def read_and_concatenate_files(pattern, output_file=None):
    files = sorted(glob.glob(pattern))
    combined_content = "".join(open(file, "r").read().strip() for file in files)
    if output_file:
        with open(output_file, "w") as outfile:
            outfile.write(combined_content)
        print(f"Combined content saved to {output_file}")
    return combined_content

# Patterns for files
vector_pattern = "idecel_output/Idecel_vector_part*.txt"
car_pattern = "idecel_output/Idecel_part*.txt"

# Read and concatenate parts
vector_combined = read_and_concatenate_files(vector_pattern, output_file="idecel_output/idecel_vector_combined.txt")
car_combined = read_and_concatenate_files(car_pattern, output_file="idecel_output/idecel_car_combined.txt")

# Function to compare sequences and write results
def compare_and_write_fasta(sequence, fasta_path, output_path):
    with open(fasta_path, 'r') as file:
        fasta_sequence = ''.join(file.readlines()[1:]).strip().replace("\n", "").upper()
    if sequence == fasta_sequence:
        print("The sequences are identical.")
    else:
        print("The sequences are different.")
    with open(output_path, 'w') as new_file:
        new_file.write('>New_Vector_Sequence\n')
        new_file.write(sequence + "\n")
        new_file.write('>Old_one\n')
        new_file.write(fasta_sequence)
    print(f"Results written to {output_path}")

# Compare CAR sequence
compare_and_write_fasta(
    sequence=car_combined.upper(),
    fasta_path='../CAR_constructs/Sequences/Idecel.fasta',
    output_path='new_car_sequence.fasta'
)

# Compare Vector sequence
compare_and_write_fasta(
    sequence=vector_combined.upper(),
    fasta_path='../Vector_systems/Sequences/Idecel_Vector.fasta',
    output_path='new_vector_sequence.fasta'
)


Combined content saved to idecel_output/idecel_vector_combined.txt
Combined content saved to idecel_output/idecel_car_combined.txt
The sequences are identical.
Results written to new_car_sequence.fasta
The sequences are identical.
Results written to new_vector_sequence.fasta


# Tisacel

## Create JPG from PDF

In [8]:
#US9499629
# Ensure the output directory exists
output_dir = "tisacel_output"
os.makedirs(output_dir, exist_ok=True)

# Define parameters
pdf_document = "US9499629.pdf"
file_name = "Tisacel_car"
start_page = 70
end_page = 71
dpi = 300

# Use the function to convert specified pages to JPG images
convert_pdf_to_jpg(pdf_path=pdf_document, 
                   output_dir=output_dir, 
                   file_name=file_name, 
                   start_page=start_page, 
                   end_page=end_page, 
                   dpi=dpi)


file_name = "Tisacel_vector"
start_page = 64
end_page = 67

convert_pdf_to_jpg(pdf_path=pdf_document, 
                   output_dir=output_dir, 
                   file_name=file_name, 
                   start_page=start_page, 
                   end_page=end_page, 
                   dpi=dpi)

Saved page 70 as image: tisacel_output/Tisacel_car_page_70.jpg
Saved page 71 as image: tisacel_output/Tisacel_car_page_71.jpg
Saved page 64 as image: tisacel_output/Tisacel_vector_page_64.jpg
Saved page 65 as image: tisacel_output/Tisacel_vector_page_65.jpg
Saved page 66 as image: tisacel_output/Tisacel_vector_page_66.jpg
Saved page 67 as image: tisacel_output/Tisacel_vector_page_67.jpg


## Get Sequences

In [13]:
import subprocess  # Ensure subprocess is imported

output_dir = "tisacel_output"

def generate_commands(file_name, pages, templates, template_strings, scales, eps, threshold, sequences):
    commands = []
    for page, sequence in zip(pages, sequences):
        commands.append({
            "description": f"{file_name} Page {page['page_number']}",
            "cmd": [
                "python", "template_matching.py",
                "--image_path", f"{output_dir}/{file_name}_page_{page['page_number']}.jpg",
                "--templates", *templates,
                "--template_strings", *template_strings,
                "--scales", str(scales[0]), str(scales[1]), str(scales[2]),
                "--eps", str(eps),
                "--threshold", str(threshold),
                "--output_image", f"{output_dir}/{file_name}_page_{page['page_number']}_highlighted.jpg",
                "--output_text", f"{output_dir}/{file_name}_part{page['part_number']}.txt",
                "--start_strings", sequence["start"],
                "--end_strings", sequence["end"]
            ]
        })
    return commands

# Configuration for CAR pages
file_name_car = "Tisacel_car"
car_pages = [
    {"page_number": 70, "part_number": 1},
    {"page_number": 71, "part_number": 2}
]
car_sequences = [
    {"start": "atggccttac", "end": "caaggggcac"},
    {"start": "gatggccttt", "end": "cccctcgct"}
]

# Configuration for Vector pages
file_name_vector = "Tisacel_vector"
vector_pages = [
    {"page_number": 64, "part_number": 1},
    {"page_number": 65, "part_number": 2},
    {"page_number": 66, "part_number": 3},
    {"page_number": 67, "part_number": 4}
]
vector_sequences = [
    {"start": "gcgcgctcac", "end": "ggcgataagt"},
    {"start": "cgtgtcttac", "end": "tagtaggagg"},
    {"start": "cttggtaggtttaagaa", "end": "aaactgcaggagtca"},
    {"start": "ggacctggcc", "end": "gtattac"}
]

# Shared configuration
templates = ["templates/a1.jpg", "templates/g1.jpg", "templates/c1.jpg", "templates/t1.jpg"]
template_strings = ["a", "g", "c", "t"]
scales = (0.6, 1.0, 50)  # (start, end, steps)
eps = 30
threshold = 0.85

# Generate commands
car_commands = generate_commands(file_name_car, car_pages, templates, template_strings, scales, eps, threshold, car_sequences)
vector_commands = generate_commands(file_name_vector, vector_pages, templates, template_strings, scales, eps, threshold, vector_sequences)

# Combine all commands
all_commands = car_commands + vector_commands

# Launch all processes in the background
processes = []
for cmd_info in all_commands:
    if cmd_info["description"] == "Tisacel_vector Page 66":
        try:
            process = subprocess.Popen(cmd_info["cmd"], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            processes.append((cmd_info["description"], process))
            print(f"{cmd_info['description']} is running in background with PID: {process.pid}")
        except Exception as e:
            print(f"Failed to start process for {cmd_info['description']}: {e}")

# Optionally, wait for all processes to finish and handle their output
for description, process in processes:
    stdout, stderr = process.communicate()
    if process.returncode == 0:
        print(f"{description} completed successfully.")
    else:
        print(f"{description} failed with error:\n{stderr.decode()}")


Tisacel_vector Page 66 is running in background with PID: 2526349
Tisacel_vector Page 66 completed successfully.


## Compare sequences

In [18]:
# Define input pattern and output file name
input_pattern = "tisacel_output/Tisacel_car_part*.txt"
output_file = "tisacel_output/Tisacel_car_combined.txt"

# Find all matching files
files = sorted(glob.glob(input_pattern))

# Concatenate the contents of all files
with open(output_file, "w") as outfile:
    outfile.write(">tisacel_car\n")
    for file in files:
        with open(file, "r") as infile:
            content = infile.read()
            outfile.write(content)  # Add newline between parts for clarity

print(f"All Tisacel_car_part* files have been concatenated into {output_file}.")

# Define input pattern and output file name for vector parts
input_pattern_vector = "tisacel_output/Tisacel_vector_part*.txt"
output_file_vector = "tisacel_output/Tisacel_vector_combined.txt"

# Find all matching files
files_vector = sorted(glob.glob(input_pattern_vector))

# Concatenate the contents of all files
with open(output_file_vector, "w") as outfile_vector:
    outfile_vector.write(">tisacel_vector\n")
    for file in files_vector:
        with open(file, "r") as infile_vector:
            content_vector = infile_vector.read()
            outfile_vector.write(content_vector)  # Add newline between parts for clarity

print(f"All Tisacel_vector_part* files have been concatenated into {output_file_vector}.")




All Tisacel_car_part* files have been concatenated into tisacel_output/Tisacel_car_combined.txt.
All Tisacel_vector_part* files have been concatenated into tisacel_output/Tisacel_vector_combined.txt.


In [26]:
# Read the combined sequence file and extract the vector sequence
with open('tisacel_output/Tisacel_car_combined.txt', 'r') as combined_file:
    combined_lines = combined_file.readlines()
    tisacel_car_sequence = ''.join(combined_lines[1:]).strip()  # Extract sequence from the second line onward

# Read the FASTA sequence
fasta_file_path = '../CAR_constructs/Sequences/Tisacel.fasta'
with open(fasta_file_path, 'r') as fasta_file:
    fasta_lines = fasta_file.readlines()
    fasta_sequence = ''.join(fasta_lines[1:]).strip()  # Extract sequence from FASTA file

# Compare sequences and output results
if fasta_sequence.upper() == tisacel_car_sequence.upper():
    print("The sequences are identical.")
else:
    print("The sequences are different.")
    print(f"Tisacel CAR Sequence (Extracted): {tisacel_car_sequence[:50]}...{tisacel_car_sequence[-50:]}")
    print(f"FASTA Sequence: {fasta_sequence[:50]}...{fasta_sequence[-50:]}")

# Read the combined sequence file and extract the vector sequence
with open('tisacel_output/Tisacel_vector_combined.txt', 'r') as combined_file:
    combined_lines = combined_file.readlines()
    tisacel_vector_sequence = ''.join(combined_lines[1:]).strip()  # Extract sequence from the second line onward

# Read the FASTA sequence
fasta_file_path = '../Vector_systems/Sequences/Tisacel_Vector.fasta'
with open(fasta_file_path, 'r') as fasta_file:
    fasta_lines = fasta_file.readlines()
    fasta_sequence = ''.join(fasta_lines[1:]).strip()  # Extract sequence from FASTA file

# Compare sequences and output results
if fasta_sequence.upper() == tisacel_vector_sequence.upper():
    print("The sequences are identical.")
else:
    print("The sequences are different.")
    print(f"Tisacel CAR Sequence (Extracted): {tisacel_vector_sequence[:50]}...{tisacel_vector_sequence[-50:]}")
    print(f"FASTA Sequence: {fasta_sequence[:50]}...{fasta_sequence[-50:]}")

The sequences are identical.
The sequences are identical.
