In [None]:
# Step 1: deisotope

import pandas as pd
import glob
from ms_deisotope import deconvolute_peaks, Averagine, MSDeconVFitter
import os


# Specify the directory containing your CSV files
# (only files matching *.csv are processed by this step)
input_directory = "Input_files_folder"
# Specify the directory to save the results
output_directory = "Output_files_folder"

# exist_ok=True makes the cell safe to re-run and avoids a separate existence check
os.makedirs(output_directory, exist_ok=True)

# The averagine model and the scorer do not depend on the file being processed,
# so they are built once here instead of once per file.
# Custom averagine model: element -> count used for the averaged composition
custom_averagine = Averagine({"C": 4.9384, "H": 7.7583, "N": 1.3577, "O": 1.4773, "S": 0.0417})
scorer = MSDeconVFitter(1.5)  # score threshold passed to the fitter; change the value based on your data

# sorted() gives a reproducible processing order across platforms
for file_path in sorted(glob.glob(os.path.join(input_directory, '*.csv'))):
    # Column names are taken from the second line of the file (row index 1);
    # the file must provide 'mass' and 'abund' columns.
    df = pd.read_csv(file_path, header=[1])
    peaks = list(zip(df['mass'], df['abund']))
    deconvoluted_peaks, _ = deconvolute_peaks(peaks, averagine=custom_averagine, scorer=scorer)

    # Each deconvoluted peak is read through its 'mz' and 'intensity' attributes;
    # they are written back under the same column names as the input files.
    mz_values = [peak.mz for peak in deconvoluted_peaks]
    abund_values = [peak.intensity for peak in deconvoluted_peaks]
    deisotoped_df = pd.DataFrame({'mass': mz_values, 'abund': abund_values})

    # Save next to the other results as 'deisotoped_<original file name>'
    base_name = os.path.basename(file_path)
    output_file_path = os.path.join(output_directory, 'deisotoped_' + base_name)
    deisotoped_df.to_csv(output_file_path, index=False)


In [None]:
# Step 2: average all the m/z values within 0.01 m/z range, to generate a list of m/z for alignment

import os
import pandas as pd
import numpy as np

# Specify the directory containing your CSV files
# (expected to be the deisotoped files with 'mass' and 'abund' columns)
input_directory = "Input_files_folder"
output_directory = "Output_files_folder"

# Peaks whose mass lies within this distance of the first (lowest) mass of a
# group are merged into that group.
mz_tolerance = 0.01

# Name of the file written by this step
averaged_filename = "average_all_mz_from_deisotoped.csv"

# create the output directory if it doesn't exist (safe to re-run)
os.makedirs(output_directory, exist_ok=True)


def average_close_masses(peaks, tolerance=0.01):
    """Merge peaks with near-identical masses into one averaged peak each.

    The rows are sorted by 'mass' and walked once. A group starts at the
    lowest mass not yet assigned and absorbs every following row whose mass
    is within ``tolerance`` of that *first* mass (not of the previous row, so
    a group never spans more than ``tolerance``). Each group is reduced to
    the mean of its 'mass' values and the mean of its 'abund' values.

    Parameters
    ----------
    peaks : pandas.DataFrame
        Table with numeric 'mass' and 'abund' columns.
    tolerance : float
        Maximum absolute mass difference to the first peak of a group.

    Returns
    -------
    pandas.DataFrame
        One row per group with columns 'mass' and 'abund', ordered by mass.
        Empty (with the same columns) when ``peaks`` has no rows.
    """
    ordered = peaks.sort_values('mass')
    masses = ordered['mass'].to_numpy(dtype=float)
    abunds = ordered['abund'].to_numpy(dtype=float)

    records = []
    group_start = 0
    for i in range(1, len(masses) + 1):
        # Close the current group at the end of the data or as soon as a mass
        # is no longer within tolerance of the group's first mass. Written as
        # "not (<=)" so that NaN masses always start a group of their own.
        if i == len(masses) or not (abs(masses[i] - masses[group_start]) <= tolerance):
            records.append({
                'mass': masses[group_start:i].mean(),
                'abund': abunds[group_start:i].mean(),
            })
            group_start = i

    # Build the result once; growing a DataFrame row by row is quadratic
    return pd.DataFrame(records, columns=['mass', 'abund'])


# Load every CSV file of the input directory. The result file of this step is
# skipped so that re-running with identical input/output folders does not fold
# the previous averages back into the data.
frames = [
    pd.read_csv(os.path.join(input_directory, filename))
    for filename in sorted(os.listdir(input_directory))
    if filename.endswith('.csv') and filename != averaged_filename
]
if not frames:
    raise FileNotFoundError(f"No CSV files found in {input_directory!r}")

# Combine all peaks in one table, sorted by 'mass'
# (DataFrame.append no longer exists in pandas 2.x, hence pd.concat)
all_data = pd.concat(frames, ignore_index=True).sort_values('mass')

# Average the 'mass' and 'abund' values of each group of similar masses
averaged_data = average_close_masses(all_data, mz_tolerance)

averaged_data.to_csv(os.path.join(output_directory, averaged_filename), index=None)


In [None]:
# Step 3: align all deisotoped csv files/peaks

import pandas as pd
import os
import numpy as np

# specify the directory you want to use
input_dir = "Input_files_folder"
output_dir = "Output_files_folder"

# Path of the reference m/z list generated by the averaging step. The default
# looks in the current working directory; point it at the averaging step's
# output folder if the file was left there.
unique_mz_path = "average_all_mz_from_deisotoped.csv"

# define the m/z difference tolerance
mz_tolerance = 0.01

# define the ppm difference tolerance
ppm_tolerance = 5

# create the output directory if it doesn't exist (safe to re-run)
os.makedirs(output_dir, exist_ok=True)


def snap_to_reference(mz, reference, mz_tolerance, ppm_tolerance):
    """Replace each m/z value by its nearest reference value when close enough.

    A value is replaced when its nearest reference value is within
    ``mz_tolerance`` (absolute) OR within ``ppm_tolerance`` (relative to the
    value itself, in parts per million). Values without such a neighbour are
    returned unchanged. When two reference values are equally close, the
    lower one is used.

    Parameters
    ----------
    mz : array-like of float
        Values to align.
    reference : array-like of float
        Reference m/z list; NaN entries are ignored, order does not matter.
    mz_tolerance : float
        Absolute tolerance.
    ppm_tolerance : float
        Relative tolerance in ppm.

    Returns
    -------
    numpy.ndarray
        Aligned copy of ``mz`` (same length and order).
    """
    mz = np.asarray(mz, dtype=float)
    reference = np.asarray(reference, dtype=float)
    reference = np.sort(reference[~np.isnan(reference)])
    if mz.size == 0 or reference.size == 0:
        return mz.copy()

    # Binary search gives the insertion point; the nearest reference value is
    # either the neighbour just below or just above it.
    pos = np.searchsorted(reference, mz)
    below = reference[np.clip(pos - 1, 0, reference.size - 1)]
    above = reference[np.clip(pos, 0, reference.size - 1)]
    nearest = np.where(np.abs(mz - below) <= np.abs(above - mz), below, above)

    diff = np.abs(nearest - mz)
    # mz == 0 would divide by zero; such values simply never match by ppm
    with np.errstate(divide='ignore', invalid='ignore'):
        ppm = diff / np.abs(mz) * 1e6
    within = (diff <= mz_tolerance) | (ppm <= ppm_tolerance)
    return np.where(within, nearest, mz)


# read all csv files and store them in a dictionary
dataframes = {}
# not used by the alignment below; kept because other cells may read it
all_mz_values = []
for filename in sorted(os.listdir(input_dir)):
    # the reference list itself is not a spectrum and must not be aligned
    if not filename.endswith('.csv') or filename == os.path.basename(unique_mz_path):
        continue
    df = pd.read_csv(os.path.join(input_dir, filename))
    dataframes[filename] = df
    all_mz_values.extend(df.iloc[:, 0].values)

# get a unique m/z value list
unique_mz = pd.read_csv(unique_mz_path)  # this is the m/z list csv file generated from average
unique_mz_values = unique_mz.mass.to_list()

# iterate over all dataframes
for filename, df in dataframes.items():
    # the first column holds the m/z values; snap each one to the nearest
    # reference m/z within tolerance (values without a match are left as is)
    mz_column = df.columns[0]
    df[mz_column] = snap_to_reference(
        df[mz_column].to_numpy(dtype=float),
        unique_mz_values,
        mz_tolerance,
        ppm_tolerance,
    )

    # save the result as a new csv file
    df.to_csv(os.path.join(output_dir, 'aligned_' + filename), index=False)
