In [2]:
from pathlib import Path
import pandas as pd
import numpy as np

import starfile

"""
Goal = find and remove bad particles in peak_local_max output

Requires = a unique particle identifier = particle coordinates and ts_name


Using run_it025_data.star...

Use = "rlnCoordinateX", "rlnCoordinateY", "rlnCoordinateZ", "rlnTomoName"

Refinement stores translation and rotation info in the following columns:
    rlnOriginXAngst
    rlnOriginYAngst
    rlnOriginZAngst
    rlnAngleRot
    rlnAngleTilt
    rlnAnglePsi

Classification results are in the following columns:
    rlnClassNumber
    rlnLogLikeliContribution
    rlnMaxValueProbDistribution
    rlnNrOfSignificantSamples - not a good indicator of quality for cleaning

"""

'\nGoal = find and remove bad particles in peak_local_max output\n\nRequires = a unique particle identifier = particle coordinates and ts_name\n\n\nUsing run_it025_data.star...\n\nUse = "rlnCoordinateX", "rlnCoordinateY", "rlnCoordinateZ", "rlnTomoName"\n\nRefinement stores translation and rotation info in the following columns:\n    rlnOriginXAngst\n    rlnOriginYAngst\n    rlnOriginZAngst\n    rlnAngleRot\n    rlnAngleTilt\n    rlnAnglePsi\n\nClassification results are in the following columns:\n    rlnClassNumber\n    rlnLogLikeliContribution\n    rlnMaxValueProbDistribution\n    rlnNrOfSignificantSamples - not a good indicator of quality for cleaning\n\n'

# Set paths and junk classes

In [None]:
# Root of the RELION project; every other path in this notebook hangs off it.
project_dir = Path("/mnt/scratch/ribosomes/wws_EGFcontrol/")

# Inputs: the peak list that was classified, and the Class3D result for it.
processed_peaks_star_path = project_dir / "subset_abs5rel0.1.star"
processed_classes_path = project_dir / "Class3D/init_bin4/run_it025_data.star"

# Class numbers (as strings) whose particles are treated as junk.
junk_classes = ["3"]



# Separate particles into junk and usable groups 

In [None]:
# Load the peak list and the classification result; only the "particles"
# entry of the classification star file is split below.
processed_peaks_df = starfile.read(processed_peaks_star_path)
processed_classes_df = starfile.read(processed_classes_path)
processed_classes_particles_df = processed_classes_df["particles"]

# Columns that together identify one particle across star files.
identity_columns = ["rlnCoordinateX", "rlnCoordinateY", "rlnCoordinateZ", "rlnTomoName"]

print("Number of particles in each class:")
print(processed_classes_particles_df["rlnClassNumber"].value_counts())

# Class numbers are compared as strings because junk_classes holds strings.
is_junk = processed_classes_particles_df["rlnClassNumber"].astype(str).isin(junk_classes)
processed_usable_df = processed_classes_particles_df[~is_junk]
processed_junk_df = processed_classes_particles_df[is_junk]

print("Number of particles in the data after removing bad classes:")
print(len(processed_usable_df))
print("Number of bad particles:")
print(len(processed_junk_df))


# Save the groups to star files.

In [None]:
def _merge_into_master_list(master_path, new_df, key_columns, previous_msg, new_msg, total_msg):
    """Append ``new_df`` to the star file at ``master_path`` and return the result.

    If the master file does not exist yet it is created from ``new_df`` alone.
    Otherwise the stored rows come first, ``new_df`` is appended, and rows that
    repeat an earlier ``key_columns`` combination are dropped (the first
    occurrence wins), so rerunning with the same input does not grow the list.
    The three message arguments are printed verbatim before the matching count.
    """
    if not master_path.exists():
        starfile.write(new_df, master_path, overwrite=False)
        return new_df

    master_df = starfile.read(master_path)
    print(previous_msg)
    print(master_df.shape[0])
    print(new_msg)
    print(new_df.shape[0])

    merged_df = pd.concat([master_df, new_df], ignore_index=True)
    merged_df = merged_df.drop_duplicates(subset=key_columns, keep="first")
    print(total_msg)
    print(merged_df.shape[0])

    starfile.write(merged_df, master_path, overwrite=True)
    return merged_df


# Save the junk found in this classification next to the classification output.
processed_junk_path = processed_classes_path.parent / "junk_run_it025_data.star"
starfile.write(processed_junk_df, processed_junk_path, overwrite=True)

# Save the cleaned data with the same blocks as the input star file. A new
# mapping is built instead of assigning into processed_classes_df, so the
# object loaded by the previous cell still holds the full particle table and
# this cell can be rerun on its own.
processed_usable_path = processed_classes_path.parent / "usable_run_it025_data.star"
usable_star = {**processed_classes_df, "particles": processed_usable_df}
starfile.write(usable_star, processed_usable_path, overwrite=True)

# Save the junk to the master list
all_junk_path = project_dir / "all_junk.star"
all_junk_df = _merge_into_master_list(
    all_junk_path,
    processed_junk_df,
    identity_columns,
    previous_msg="Number of previously saved bad peaks:",
    new_msg="Number of new bad particles:",
    total_msg="Number of total junk peaks after the last classification:",
)

# Save the usable to the master list
# NOTE(review): rows are only ever added here, so a particle that was usable in
# an earlier run and junk in a later one ends up in both master lists - confirm
# whether all_usable.star should be filtered against all_junk.star.
all_usable_path = project_dir / "all_usable.star"
all_usable_df = _merge_into_master_list(
    all_usable_path,
    processed_usable_df,
    identity_columns,
    previous_msg="Number of previously saved usable particles:",
    new_msg="Number of new usable particles:",
    total_msg="Number of total usable particles after the last classification:",
)


# Create a new peaks file with the junk particles removed

In [None]:
# Re-read the classified particles from disk so this cell does not depend on
# what earlier cells did to processed_classes_df.
processed_classes_path = project_dir / "Class3D/init_bin4/run_it025_data.star"
processed_classes_df = starfile.read(processed_classes_path)
processed_particles = processed_classes_df["particles"]
print("Number of peaks in the processed data:")
print(processed_particles.shape[0])

# The new peak list that the processed particles are merged into.
new_peaks_path = project_dir / "eman2/segmentations/particles_abs1rel0.1.star"
new_peaks_df = starfile.read(new_peaks_path)
print("Number of peaks in the new file:")
print(new_peaks_df.shape[0])

# Processed rows go first so that, for a particle present in both tables,
# drop_duplicates(keep="first") retains the processed row.
# NOTE(review): every processed particle is merged in, including the junk
# classes; nothing in this cell filters against all_junk.star even though the
# section heading mentions removal - confirm the intended behaviour.
new_peaks_df = pd.concat([processed_particles, new_peaks_df], ignore_index=True)
print("Number of peaks after appending:")
print(new_peaks_df.shape[0])

new_peaks_df = new_peaks_df.drop_duplicates(subset=identity_columns, keep="first")
print("Number of peaks after dropping duplicates:")
print(new_peaks_df.shape[0])
print("This should be equal to the number of peaks in the new file.")

# Columns removed before the merged table is written out.
columns_to_strip = [
    "rlnRandomSubset",
    "rlnGroupNumber",
    "rlnClassNumber",
    "rlnNormCorrection",
    "rlnLogLikeliContribution",
    "rlnMaxValueProbDistribution",
    "rlnNrOfSignificantSamples",
    "rlnOpticsGroup",
    "rlnImageName",
    "rlnCtfImage",
#    "staParticleIndex",
]
new_peaks_df = new_peaks_df.drop(columns=columns_to_strip)

# Disabled code, kept as found:
#new_particles_df["rlnOpticsGroup"] = 1
#new_particles_df[["rlnOriginXAngst", "rlnOriginYAngst", "rlnOriginZAngst"]] = new_particles_df[["rlnOriginXAngst", "rlnOriginYAngst", "rlnOriginZAngst"]].fillna(0)
#new_star_df["particles"] = new_particles_df
#new_star_df["optics"].drop(columns=[
#    "rlnCtfDataAreCtfPremultiplied",
#    "rlnImageDimensionality",
#    "rlnTomoSubtomogramBinning",
#    "rlnImagePixelSize",
#    "rlnImageSize",
#], axis=1, inplace=True)

# Write the merged table as ready_<name of the new peaks file>.star
new_star_path = project_dir / f"ready_{new_peaks_path.stem}.star"
starfile.write(new_peaks_df, new_star_path, overwrite=True)

## Replace rlnOriginXAngst, rlnOriginYAngst, rlnOriginZAngst with the previously found values
## Read the newly imported particles
#imported_particles_star_path = project_dir / Path("ImportTomo/abs5rel0.1/particles.star")
#imported_particles_star_df = starfile.read(imported_particles_star_path)
## Read the previously processed particles
#processed_particles_star_path = project_dir / Path("ready_particles_abs5rel0.1.star")
#processed_particles_star_df = starfile.read(processed_particles_star_path)
#
## If a row in imported_particles_star_df has the same identity_columns as a row in processed_particles_star_df, replace the ["rlnOriginXAngst", "rlnOriginYAngst", "rlnOriginZAngst"] values in that row in imported_particles_star_df with the values in the row in processed_particles_star_df
#for index_processed, row_processed in processed_particles_star_df["particles"].iterrows():
#    for index_imported, row_imported in imported_particles_star_df["particles"].iterrows():
#        if index_imported % 1000 == 0 and index_imported != 0:
#            print(f"Processed {index_imported} rows.")
#        if (row_processed[identity_columns] == row_imported[identity_columns]).all():
#            imported_particles_star_df["particles"].loc[index_imported, ["rlnOriginXAngst", "rlnOriginYAngst", "rlnOriginZAngst"]] = row_processed[["rlnOriginXAngst", "rlnOriginYAngst", "rlnOriginZAngst"]]
#
## Save the new star file
#save_path = project_dir / Path(f"ready_{imported_particles_star_path.stem}.star")
#starfile.write(imported_particles_star_df, save_path, overwrite=True)
#

# Prepare the particles.star from a PseudoSubtomo job for refinement or classification
## Imports translation and rotation information for particles from previous jobs

In [40]:
def prepare_for_refinement(
        relion_project_directory: Path,
        particles_star: Path,
        refined_particles_star: Path,
        ) -> None:
    """
    Merge previously refined particle rows into a freshly extracted particles.star.

    Rows from ``refined_particles_star`` are placed before the rows of
    ``particles_star``; for particles present in both (same coordinates and
    tomogram name) the refined row is kept, so its translation/rotation columns
    carry over. The job directory inside ``rlnImageName`` and ``rlnCtfImage`` is
    then pointed at the job that produced ``particles_star`` for every row, and
    the result is written to ``prepared_for_refinement.star`` in
    ``relion_project_directory`` (overwriting an existing file).

    Parameters
    ----------
    relion_project_directory
        Directory that receives ``prepared_for_refinement.star``.
    particles_star
        Star file with a ``particles`` block; its other blocks are written
        through unchanged.
    refined_particles_star
        Star file holding the refined particles, either as a single table or
        as a multi-block file with a ``particles`` block.

    Raises
    ------
    ValueError
        If the ``particles`` block of ``particles_star`` has no rows, because
        the target job directory is read from its first row.
    """

    relion_project_directory = Path(relion_project_directory).absolute()
    particles_star = Path(particles_star).absolute()
    refined_particles_star = Path(refined_particles_star).absolute()

    identity_columns = ["rlnCoordinateX", "rlnCoordinateY", "rlnCoordinateZ", "rlnTomoName"]
    path_columns = ["rlnImageName", "rlnCtfImage"]

    new_star = starfile.read(particles_star)
    new_particles = new_star["particles"]

    # A star file with one data block is read as a bare table, one with
    # several blocks as a mapping; accept both for the refined input.
    refined_particles = starfile.read(refined_particles_star)
    if isinstance(refined_particles, dict):
        refined_particles = refined_particles["particles"]

    if new_particles.empty:
        raise ValueError(f"No particles found in {particles_star}")

    # Refined rows first: keep="first" then prefers them over the new rows.
    merged_particles = (
        pd.concat([refined_particles, new_particles], ignore_index=True)
        .drop_duplicates(subset=identity_columns, keep="first")
        .reset_index(drop=True)
    )

    # Second path component of rlnImageName, read positionally from the first row.
    new_job_number = new_particles["rlnImageName"].iloc[0].split("/")[1]

    def _point_at_new_job(match):
        # Keep the leading "<component>/" and swap only the component after it.
        return match.group(1) + new_job_number

    # Rewrite the job directory of every row, whichever earlier job it came
    # from: the refined table may have been accumulated over several jobs, and
    # an anchored pattern cannot touch a look-alike substring later in the path.
    # NOTE(review): this assumes file names below the job directory are the
    # same in the old and new jobs - confirm against the extraction output.
    for column in path_columns:
        merged_particles[column] = merged_particles[column].str.replace(
            r"^([^/]*/)[^/]+", _point_at_new_job, regex=True
        )

    # Fresh mapping so the table returned by starfile.read is left untouched.
    prepared_star = dict(new_star)
    prepared_star["particles"] = merged_particles

    save_path = relion_project_directory / "prepared_for_refinement.star"
    starfile.write(prepared_star, save_path, overwrite=True)

In [41]:
# Combine the particles extracted by PseudoSubtomo/job009 with the rows kept in
# the all_usable.star master list; the result is written to
# prepared_for_refinement.star inside the project directory.
prepare_for_refinement(
    relion_project_directory="/mnt/scratch/ribosomes/wws_EGFcontrol/",
    particles_star="/mnt/scratch/ribosomes/wws_EGFcontrol/PseudoSubtomo/job009/particles.star",
    refined_particles_star="/mnt/scratch/ribosomes/wws_EGFcontrol/all_usable.star",
)