In [7]:
import openpyxl

def remove_identical_sequences(pos_clean_file, non_redundant_file, output_file):
  """
  Removes from one workbook every sequence that also appears in a reference workbook.

  Both workbooks are read from their active sheet, skipping the first (header) row.
  Sequences come from the first column and are compared after stripping
  leading/trailing whitespace; labels come from the second column of
  non_redundant_file. Rows whose sequence cell is empty are ignored.

  Args:
      pos_clean_file (str): Path to the workbook holding the sequences to exclude.
      non_redundant_file (str): Path to the workbook holding the (sequence, label) rows to filter.
      output_file (str): Path where the surviving rows are written under a
          "sequence"/"label" header. A falsy value (e.g. None) skips writing.

  Returns:
      None. Three summary counts are printed.
  """

  def read_rows(path):
    # Returns [(stripped sequence, label), ...] for every non-empty data row.
    # data_only=True yields cached formula results instead of formula text.
    workbook = openpyxl.load_workbook(path, data_only=True)
    rows = []
    for row in workbook.active.iter_rows(min_row=2):  # min_row=2 skips the header row
      if row[0].value is None:
        continue  # empty cells (e.g. trailing blank rows) have nothing to strip
      label = row[1].value if len(row) > 1 else None  # single-column sheets carry no label
      rows.append((str(row[0].value).strip(), label))
    return rows

  # Keep the list for the reported count; use a set for constant-time membership tests
  pos_clean_sequences = [sequence for sequence, _ in read_rows(pos_clean_file)]
  excluded = set(pos_clean_sequences)

  # Read sequences and labels in a single pass, then drop every excluded sequence
  candidates = read_rows(non_redundant_file)
  kept = [(sequence, label) for sequence, label in candidates if sequence not in excluded]

  # Write filtered sequences and labels (optional) to a new file
  if output_file:
    output_wb = openpyxl.Workbook()
    output_sheet = output_wb.active
    output_sheet.append(["sequence", "label"])  # Header row
    for sequence, label in kept:
      output_sheet.append([sequence, label])
    output_wb.save(output_file)

  # Report the file that was actually read rather than a fixed file name
  print(f"Total sequences in {pos_clean_file}: {len(pos_clean_sequences)}")
  print(f"Total sequences before filtering: {len(candidates)}")
  print(f"Total sequences after filtering: {len(kept)}")

# Input/output workbooks for this step; adjust the paths to your data
output_file = "filtered_sequences.xlsx"  # None disables writing the result
non_redundant_file = "Final_non_redundant_whole_sample_dataset_Non_Redundant_Peptides_without_holdout_2015_CPP_4319_Non_CPP.xlsx"
pos_clean_file = "pos-clean.xlsx"

# Drop every sequence of non_redundant_file that also occurs in pos_clean_file
remove_identical_sequences(
  pos_clean_file=pos_clean_file,
  non_redundant_file=non_redundant_file,
  output_file=output_file,
)


Total sequences in pos-clean.xlsx: 96
Total sequences before filtering: 6463
Total sequences after filtering: 6253


In [8]:
import openpyxl

def remove_identical_sequences(pos_clean_file, non_redundant_file, output_file):
  """
  Removes from one workbook every sequence that also appears in a reference workbook.

  Both workbooks are read from their active sheet, skipping the first (header) row.
  Sequences come from the first column and are compared after stripping
  leading/trailing whitespace; labels come from the second column of
  non_redundant_file. Rows whose sequence cell is empty are ignored.

  Args:
      pos_clean_file (str): Path to the workbook holding the sequences to exclude.
      non_redundant_file (str): Path to the workbook holding the (sequence, label) rows to filter.
      output_file (str): Path where the surviving rows are written under a
          "sequence"/"label" header. A falsy value (e.g. None) skips writing.

  Returns:
      None. Three summary counts are printed.
  """

  def read_rows(path):
    # Returns [(stripped sequence, label), ...] for every non-empty data row.
    # data_only=True yields cached formula results instead of formula text.
    workbook = openpyxl.load_workbook(path, data_only=True)
    rows = []
    for row in workbook.active.iter_rows(min_row=2):  # min_row=2 skips the header row
      if row[0].value is None:
        continue  # empty cells (e.g. trailing blank rows) have nothing to strip
      label = row[1].value if len(row) > 1 else None  # single-column sheets carry no label
      rows.append((str(row[0].value).strip(), label))
    return rows

  # Keep the list for the reported count; use a set for constant-time membership tests
  pos_clean_sequences = [sequence for sequence, _ in read_rows(pos_clean_file)]
  excluded = set(pos_clean_sequences)

  # Read sequences and labels in a single pass, then drop every excluded sequence
  candidates = read_rows(non_redundant_file)
  kept = [(sequence, label) for sequence, label in candidates if sequence not in excluded]

  # Write filtered sequences and labels (optional) to a new file
  if output_file:
    output_wb = openpyxl.Workbook()
    output_sheet = output_wb.active
    output_sheet.append(["sequence", "label"])  # Header row
    for sequence, label in kept:
      output_sheet.append([sequence, label])
    output_wb.save(output_file)

  # Report the file that was actually read rather than a fixed file name
  print(f"Total sequences in {pos_clean_file}: {len(pos_clean_sequences)}")
  print(f"Total sequences before filtering: {len(candidates)}")
  print(f"Total sequences after filtering: {len(kept)}")

# Input/output workbooks for this step; adjust the paths to your data
# NOTE(review): pos_clean_file holds the neg-clean workbook in this cell — the
# variable name no longer matches its content; consider renaming.
output_file = "filtered_sequences-new.xlsx"  # None disables writing the result
non_redundant_file = "filtered_sequences.xlsx"
pos_clean_file = "neg-clean.xlsx"

# Drop every sequence of non_redundant_file that also occurs in pos_clean_file
remove_identical_sequences(
  pos_clean_file=pos_clean_file,
  non_redundant_file=non_redundant_file,
  output_file=output_file,
)


Total sequences in pos-clean.xlsx: 96
Total sequences before filtering: 6253
Total sequences after filtering: 6066


In [9]:
import openpyxl

def check_redundant_sequences(filtered_sequences_file):
  """
  Reports whether a workbook contains the same sequence more than once.

  Sequences are read from the first column of the active sheet (header row
  skipped) and compared after stripping leading/trailing whitespace. Rows
  whose sequence cell is empty are ignored.

  Args:
      filtered_sequences_file (str): Path to the workbook to inspect.

  Returns:
      None. A one-line verdict is printed.
  """

  # data_only=True yields cached formula results instead of formula text
  filtered_wb = openpyxl.load_workbook(filtered_sequences_file, data_only=True)
  sequences = [
    str(row[0].value).strip()
    for row in filtered_wb.active.iter_rows(min_row=2)  # min_row=2 skips the header row
    if row[0].value is not None  # empty cells (e.g. trailing blank rows) have nothing to strip
  ]

  # A set collapses duplicates, so a size mismatch means at least one repeat
  if len(set(sequences)) != len(sequences):
    print("There are redundant sequences present in the filtered file.")
  else:
    print("No redundant sequences found in the filtered file.")

# Workbook to inspect for repeated sequences; adjust the path to your data
# NOTE(review): this is the output of the pos-clean filtering step, not the
# later filtered_sequences-new.xlsx — confirm this is the intended input.
filtered_sequences_file = "filtered_sequences.xlsx"

check_redundant_sequences(filtered_sequences_file=filtered_sequences_file)


There are redundant sequences present in the filtered file.


In [12]:
import openpyxl

def remove_redundant_sequences(filtered_sequences_file, output_file):
  """
  Writes the distinct sequences of a workbook to a new single-column workbook.

  Sequences are read from the first column of the active sheet (header row
  skipped) and stripped of leading/trailing whitespace. Duplicates are dropped
  keeping the first occurrence, so the output follows the input order and is
  the same on every run. Rows whose sequence cell is empty are ignored.
  Labels are not carried over.

  Args:
      filtered_sequences_file (str): Path to the workbook to de-duplicate.
      output_file (str): Path of the workbook to write (header "sequence").
          A falsy value (e.g. None) skips writing.

  Returns:
      None. The number of removed duplicates is printed when it is non-zero.
  """

  # data_only=True yields cached formula results instead of formula text
  filtered_wb = openpyxl.load_workbook(filtered_sequences_file, data_only=True)
  sequences = [
    str(row[0].value).strip()
    for row in filtered_wb.active.iter_rows(min_row=2)  # min_row=2 skips the header row
    if row[0].value is not None  # empty cells (e.g. trailing blank rows) have nothing to strip
  ]

  # dict keys are unique and keep insertion order, unlike a set whose
  # iteration order for strings can differ between interpreter runs
  non_redundant_sequences = list(dict.fromkeys(sequences))

  # Print information about redundancy (only when something was removed)
  removed = len(sequences) - len(non_redundant_sequences)
  if removed:
    print(f"Removed {removed} redundant sequences.")

  # Write non-redundant sequences to a new file (optional)
  if output_file:
    output_wb = openpyxl.Workbook()
    output_sheet = output_wb.active
    output_sheet.append(["sequence"])  # Header row
    for sequence in non_redundant_sequences:
      output_sheet.append([sequence])
    output_wb.save(output_file)

# Input/output workbooks for this step; adjust the paths to your data
# NOTE(review): the input is the output of the pos-clean filtering step, not the
# later filtered_sequences-new.xlsx — confirm this is the intended input.
output_file = "non_redundant_sequences.xlsx"  # None disables writing the result
filtered_sequences_file = "filtered_sequences.xlsx"

remove_redundant_sequences(
  filtered_sequences_file=filtered_sequences_file,
  output_file=output_file,
)


Removed 587 redundant sequences.


In [13]:
import openpyxl

def check_redundant_sequences(filtered_sequences_file):
  """
  Reports whether a workbook contains the same sequence more than once.

  Sequences are read from the first column of the active sheet (header row
  skipped) and compared after stripping leading/trailing whitespace. Rows
  whose sequence cell is empty are ignored.

  Args:
      filtered_sequences_file (str): Path to the workbook to inspect.

  Returns:
      None. A one-line verdict is printed.
  """

  # data_only=True yields cached formula results instead of formula text
  filtered_wb = openpyxl.load_workbook(filtered_sequences_file, data_only=True)
  sequences = [
    str(row[0].value).strip()
    for row in filtered_wb.active.iter_rows(min_row=2)  # min_row=2 skips the header row
    if row[0].value is not None  # empty cells (e.g. trailing blank rows) have nothing to strip
  ]

  # A set collapses duplicates, so a size mismatch means at least one repeat
  if len(set(sequences)) != len(sequences):
    print("There are redundant sequences present in the filtered file.")
  else:
    print("No redundant sequences found in the filtered file.")

# Workbook to inspect for repeated sequences; adjust the path to your data
filtered_sequences_file = "non_redundant_sequences.xlsx"

check_redundant_sequences(filtered_sequences_file=filtered_sequences_file)


No redundant sequences found in the filtered file.


In [14]:
import openpyxl

def remove_redundant_sequences(filtered_sequences_file, output_file):
  """
  Writes the distinct sequences of a workbook, with their labels, to a new workbook.

  Sequences are read from the first column and labels from the second column
  of the active sheet (header row skipped). Sequences are stripped of
  leading/trailing whitespace; for repeated sequences only the first row (and
  its label) is kept, so the output follows the input order. Rows whose
  sequence cell is empty are ignored.

  Args:
      filtered_sequences_file (str): Path to the workbook to de-duplicate.
      output_file (str): Path of the workbook to write under a
          "sequence"/"label" header. A falsy value (e.g. None) skips writing.

  Returns:
      None. The number of removed duplicates is printed.
  """

  # data_only=True yields cached formula results instead of formula text
  filtered_wb = openpyxl.load_workbook(filtered_sequences_file, data_only=True)

  total = 0
  seen = set()
  non_redundant_rows = []
  # Single pass over the sheet: read, normalise and de-duplicate together
  for row in filtered_wb.active.iter_rows(min_row=2):  # min_row=2 skips the header row
    if row[0].value is None:
      continue  # empty cells (e.g. trailing blank rows) have nothing to strip
    total += 1
    sequence = str(row[0].value).strip()
    if sequence in seen:
      continue  # later repeats are dropped; the first occurrence wins
    seen.add(sequence)
    label = row[1].value if len(row) > 1 else None  # single-column sheets carry no label
    non_redundant_rows.append((sequence, label))

  # Print information about redundancy
  print(f"Removed {total - len(non_redundant_rows)} redundant sequences.")

  # Write non-redundant sequences and labels to a new file (optional)
  if output_file:
    output_wb = openpyxl.Workbook()
    output_sheet = output_wb.active
    output_sheet.append(["sequence", "label"])  # Header row
    for sequence, label in non_redundant_rows:
      output_sheet.append([sequence, label])
    output_wb.save(output_file)

# Input/output workbooks for this step; adjust the paths to your data
# NOTE(review): the input is the output of the pos-clean filtering step, not the
# later filtered_sequences-new.xlsx — confirm this is the intended input.
output_file = "non_redundant_sequences_New.xlsx"  # None disables writing the result
filtered_sequences_file = "filtered_sequences.xlsx"

remove_redundant_sequences(
  filtered_sequences_file=filtered_sequences_file,
  output_file=output_file,
)


Removed 587 redundant sequences.
