In [None]:
from pathlib import Path
import pandas as pd
import numpy as np

import starfile

"""
Goal = find and remove bad particles in peak_local_max output

Requires = a unique particle identifier = particle coordinates and ts_name


Using run_it025_data.star...

Use = "rlnCoordinateX", "rlnCoordinateY", "rlnCoordinateZ", "rlnTomoName"

Refinement stores translation and rotation info in the following columns:
    rlnOriginXAngst
    rlnOriginYAngst
    rlnOriginZAngst
    rlnAngleRot
    rlnAngleTilt
    rlnAnglePsi

Classification results are in the following columns:
    rlnClassNumber
    rlnLogLikeliContribution
    rlnMaxValueProbDistribution
    rlnNrOfSignificantSamples - not a good indicator of quality for cleaning

"""

# Set paths

In [105]:
# Root of the project; every other path in this notebook is built from it
project_dir = Path("/mnt/scratch/ribosomes/wws_EGFcontrol/")

# Input STAR files: the picked peaks and the particle data of the 3D classification
peaks_star_path = project_dir / "subset_abs5rel0.1.star"
class_data_star_path = project_dir / "Class3D/init_bin4/run_it025_data.star"

# Load both files; the classification file is indexed by block name below,
# and its "particles" block is the table that gets cleaned
peaks_df = starfile.read(peaks_star_path)
class_df = starfile.read(class_data_star_path)
class_particles_df = class_df["particles"]

# Input bad classes and separate them from good classes

In [106]:
# Class numbers to discard, given as strings because the class column is
# converted with astype(str) before the membership test below
bad_classes = ["3"]

# Output locations: bad_peaks.star lives in the project directory, while
# bad_particles.star and the cleaned data go next to class_data_star_path
class_job_dir = class_data_star_path.parent
bad_peaks_star_path = project_dir / "bad_peaks.star"
bad_particles_star_path = class_job_dir / "bad_particles.star"
cleaned_data_star_path = class_job_dir / "cleaned_run_it025_data.star"

# Report how the particles are distributed over the classes
print("Number of particles in each class:")
print(class_particles_df["rlnClassNumber"].value_counts())

# One boolean mask decides membership; its negation gives the cleaned set
is_bad_class = class_particles_df["rlnClassNumber"].astype(str).isin(bad_classes)
bad_particles_df = class_particles_df[is_bad_class]
cleaned_particles_df = class_particles_df[~is_bad_class]

# Report the size of both subsets
print("Number of particles in the data after removing bad classes:")
print(len(cleaned_particles_df))
print("Number of bad particles:")
print(len(bad_particles_df))

Number of particles in each class:
rlnClassNumber
2    1151
3       8
Name: count, dtype: int64
Number of particles in the data after removing bad classes:
1151
Number of bad particles:
8



# Save the bad particles to a star file. If the file already exists, append to it if the particles are not already in it

In [107]:
# Columns that together identify a particle: its coordinates and its tomogram name
particle_id_columns = ["rlnCoordinateX", "rlnCoordinateY", "rlnCoordinateZ", "rlnTomoName"]

# Save the bad particles next to the classification data.
# If the file already exists, append only the particles that are not in it yet.
if bad_particles_star_path.exists():
    bad_particles_star_df = starfile.read(bad_particles_star_path)
    # Print the number of new bad particles
    print("Number of new bad particles in this classification:")
    print(bad_particles_df.shape[0])
    # Append the new bad particles to the previously saved ones. Previously the
    # file was read and written back unchanged, so nothing was ever appended.
    bad_particles_star_df = pd.concat([bad_particles_star_df, bad_particles_df], ignore_index=True)
    # Existing rows come first in the concat, so keep="first" preserves them
    bad_particles_star_df = bad_particles_star_df.drop_duplicates(subset=particle_id_columns, keep="first")
    # Save the merged bad particles to a star file
    starfile.write(bad_particles_star_df, bad_particles_star_path, overwrite=True)
else:
    starfile.write(bad_particles_df, bad_particles_star_path, overwrite=False)

# Save the cleaned data to a star file.
# Note: this replaces the "particles" entry of class_df in place; class_particles_df
# still refers to the original, unfiltered table.
class_df["particles"] = cleaned_particles_df
starfile.write(class_df, cleaned_data_star_path, overwrite=True)

# Append the bad particles to bad_peaks_star_path
if bad_peaks_star_path.exists():
    bad_peaks_star_df = starfile.read(bad_peaks_star_path)
    # Print the number of bad particles in the previous classification
    print("Number of previously saved bad peaks:")
    print(bad_peaks_star_df.shape[0])
    # Print the number of new bad particles
    print("Number of new bad particles:")
    print(bad_particles_df.shape[0])
    # Append the new bad particles to the old ones
    bad_peaks_star_df = pd.concat([bad_peaks_star_df, bad_particles_df], ignore_index=True)
    # Drop particles that were already recorded (same coordinates and tomogram)
    bad_peaks_star_df = bad_peaks_star_df.drop_duplicates(subset=particle_id_columns, keep="first")
    # Print the number of bad particles after the last classification
    print("Number of bad peaks after the last classification:")
    print(bad_peaks_star_df.shape[0])
    # Save the bad particles to a star file
    starfile.write(bad_peaks_star_df, bad_peaks_star_path, overwrite=True)
else:
    starfile.write(bad_particles_df, bad_peaks_star_path, overwrite=False)

Number of new bad particles in this classification:
8
Number of previously saved bad peaks:
8
Number of new bad particles:
8
Number of bad peaks after the last classification:
8



# Create a new particle STAR file from the new peaks file, keeping the already-processed entry for every peak that was processed before


In [138]:
# First, read the previously processed peaks
processed_star_path = project_dir / "Class3D/init_bin4/run_it025_data.star"
processed_star_df = starfile.read(processed_star_path)
# Shallow copy: the blocks inside are still shared with processed_star_df, so
# they are replaced below with new objects instead of being modified in place
new_star_df = processed_star_df.copy()
# Print the number of peaks in the processed data
print("Number of peaks in the processed data:")
print(processed_star_df["particles"].shape[0])
# Then, read the new peaks
new_peaks_star_path = project_dir / "particles_abs5rel0.1.star"
new_peaks_star_df = starfile.read(new_peaks_star_path)
# Print the number of new peaks
print("Number of peaks in the new file:")
print(new_peaks_star_df.shape[0])
# Append the new peaks to the old ones; processed rows come first so that
# keep="first" below retains the processed version of a duplicated peak
new_particles_df = pd.concat([processed_star_df["particles"], new_peaks_star_df], ignore_index=True)
# Print the number of peaks after appending
print("Number of peaks after appending:")
print(new_particles_df.shape[0])
# Drop duplicates (same coordinates and tomogram name)
new_particles_df = new_particles_df.drop_duplicates(subset=["rlnCoordinateX", "rlnCoordinateY", "rlnCoordinateZ", "rlnTomoName"], keep="first")
# Print the number of peaks after dropping duplicates
print("Number of peaks after dropping duplicates:")
print(new_particles_df.shape[0])
print("This should be equal to the number of peaks in the new file.")
# NOTE(review): the processed file still contains the particles of the bad
# classes and nothing here filters against bad_peaks.star — confirm that the
# bad peaks are removed in a later step.
# Remove the columns that should not be carried over to the new file
new_particles_df = new_particles_df.drop(columns=[
    "rlnRandomSubset",
    "rlnGroupNumber",
    "rlnClassNumber",
    "rlnNormCorrection",
    "rlnLogLikeliContribution",
    "rlnMaxValueProbDistribution",
    "rlnNrOfSignificantSamples",
    #"rlnOpticsGroup",
    "rlnImageName",
    "rlnCtfImage",
    "staParticleIndex",
])
new_particles_df["rlnOpticsGroup"] = 1
# Rows without origin offsets get an offset of 0
origin_columns = ["rlnOriginXAngst", "rlnOriginYAngst", "rlnOriginZAngst"]
new_particles_df[origin_columns] = new_particles_df[origin_columns].fillna(0)
new_star_df["particles"] = new_particles_df
# Non-inplace drop: builds a new optics table and leaves the one held by
# processed_star_df untouched
new_star_df["optics"] = new_star_df["optics"].drop(columns=[
    "rlnCtfDataAreCtfPremultiplied",
    "rlnImageDimensionality",
    "rlnTomoSubtomogramBinning",
    "rlnImagePixelSize",
    "rlnImageSize",
])
# Save the new star file
new_star_path = project_dir / f"ready_{new_peaks_star_path.stem}.star"
starfile.write(new_star_df, new_star_path, overwrite=True)

# Save the new star file to the optimisation set
# This is a temporary solution until I figure out how to do this in the pipeline
# IT MIGHT NOT WORK
# Derived from project_dir rather than repeating the absolute project root
old_optimisation_set = project_dir / "ImportTomo/job002/optimisation_set.star"
old_optimisation_set_df = starfile.read(old_optimisation_set)
new_optimisation_set_df = old_optimisation_set_df.copy()
# Only the file name is stored; the optimisation set is written to the same
# directory as the new star file
new_optimisation_set_df["rlnTomoParticlesFile"] = new_star_path.name
starfile.write(new_optimisation_set_df, project_dir / "new_optimisation_set.star", overwrite=True)

Number of peaks in the processed data:
1159
Number of peaks in the new file:
4601
Number of peaks after appending:
5760
Number of peaks after dropping duplicates:
4601
This should be equal to the number of peaks in the new file.
