In [1]:
# Standard library
import glob
import os
import sys

# Third-party
import pandas as pd

# The project helpers live outside the notebook folder, so that folder has to be
# on sys.path before `quadrop` can be imported.
sys.path.append('../../py_files/')
import quadrop as qd

# Call quadrop's plotting-style setup (presumably sets global matplotlib
# defaults for this notebook -- see quadrop for details).
qd.set_plotting_style()

In [2]:
def concatenate_merged_expression_piv(data_paths, output_save_folder=None):
    """
    Concatenate the merged_expression_PIV.csv files from several data folders into one CSV.

    Every entry of `data_paths` is expected to contain
    `output_data/merged_expression_PIV.csv`. Folders where that file is missing
    are reported on stdout and skipped.

    Parameters:
    - data_paths: A single path, or an iterable of paths, to the data folders to combine.
    - output_save_folder: Base folder for the result. The concatenated table is written to
      `<output_save_folder>/output_data/merged_expression_PIV.csv`. If None, the base folder
      is `<data_paths[0]>/output_data`, so the file is written to
      `<data_paths[0]>/output_data/output_data/merged_expression_PIV.csv`.

    Returns:
    - The concatenated DataFrame (index rebuilt from 0), or None when none of the
      folders contained a merged file (nothing is written in that case).
    """

    output_folder = "output_data"
    merged_file_name = "merged_expression_PIV.csv"
    output_file_name = "merged_expression_PIV.csv"

    # A bare string would otherwise be iterated character by character; any
    # other iterable is materialised so that data_paths[0] is available below.
    if isinstance(data_paths, (str, os.PathLike)):
        data_paths = [data_paths]
    else:
        data_paths = list(data_paths)

    # One DataFrame per folder that actually holds a merged file
    dataframes = []
    for path in data_paths:
        merged_file_path = os.path.join(path, output_folder, merged_file_name)

        if os.path.isfile(merged_file_path):
            dataframes.append(pd.read_csv(merged_file_path))
            print(f"Loaded data from: {merged_file_path}")
        else:
            print(f"File not found: {merged_file_path}")

    if not dataframes:
        print("No valid files found to concatenate.")
        return None

    concatenated_df = pd.concat(dataframes, ignore_index=True)

    if output_save_folder is None:
        # NOTE(review): combined with the join below this nests "output_data"
        # twice. It is kept on purpose: writing straight into
        # <data_paths[0]>/output_data would overwrite the first input file,
        # because input and output share the same file name.
        output_save_folder = os.path.join(data_paths[0], output_folder)

    # Create the destination folder if needed and write the combined table
    output_dir = os.path.join(output_save_folder, output_folder)
    os.makedirs(output_dir, exist_ok=True)

    output_file_path = os.path.join(output_dir, output_file_name)
    concatenated_df.to_csv(output_file_path, index=False)

    print(f"Concatenated DataFrame saved to: {output_file_path}")

    return concatenated_df
    


In [3]:

import os
import pandas as pd
import matplotlib.pyplot as plt
from scipy.ndimage import gaussian_filter1d
import numpy as np

def sanitize_filename(name):
    """Make `name` filename-friendly: spaces and "/" become "_", square brackets are dropped."""
    # Single-character substitutions, applied in one pass over the string.
    substitutions = str.maketrans({" ": "_", "/": "_", "[": None, "]": None})
    return name.translate(substitutions)

def _smooth_values(values, sigma):
    """Return the Series `values` as a NumPy array, Gaussian-smoothed when `sigma` is not None."""
    if sigma is None:
        return values.to_numpy()
    # gaussian_filter1d returns an array of the input's dtype, so an integer
    # column would come back truncated to integers; smooth in float instead.
    return gaussian_filter1d(values.astype(float).to_numpy(), sigma=sigma)


def _format_axes(ax, title, x_label, y_label, x_log, y_log):
    """Apply title, axis labels, optional log scales and a grid to `ax`."""
    ax.set_title(title)
    ax.set_xlabel(x_label)
    ax.set_ylabel(y_label)
    if x_log:
        ax.set_xscale('log')
    if y_log:
        ax.set_yscale('log')
    ax.grid(True)


def plot_expression_piv(data_path, conditions, x_column, y_column, output_folder="output_data", 
                        merged_file="merged_expression_PIV.csv", plot_output_folder="output_data/expression_piv_plots", 
                        sigma_x=None, sigma_y=1, x_log=False, y_log=False, frame_ranges=None, 
                        individual_plots=True, fill_na_method=None):
    """
    Plot x_column vs y_column for each condition and save a combined plot of all conditions.

    The data is read from `<data_path>/<output_folder>/<merged_file>` and the PNG
    files are written to `<data_path>/<plot_output_folder>`.

    Parameters:
    - data_path: Path to the data folder.
    - conditions: List of values of the 'condition' column to plot.
    - x_column: The column to use for the x-axis.
    - y_column: The column to use for the y-axis.
    - output_folder: Folder (inside data_path) where the merged data is stored.
    - merged_file: The merged CSV file name.
    - plot_output_folder: Folder (inside data_path) where plots will be saved.
    - sigma_x: Gaussian smoothing sigma for the x data (if None, no smoothing applied).
    - sigma_y: Gaussian smoothing sigma for the y data (if None, no smoothing applied).
    - x_log: If True, use a log x-axis and drop points whose (smoothed) x is not positive.
    - y_log: If True, use a log y-axis and drop points whose (smoothed) y is not positive.
    - frame_ranges: List of (min_frame, max_frame) tuples, one per condition. They are
      positional row slices within that condition's rows; max_frame=None means "to the end".
    - individual_plots: If True, also save one plot per condition. Default is True.
    - fill_na_method: 'ffill', 'bfill' or 'zero' fills NA values in the whole table; any
      other value (including None) drops the rows where x_column or y_column is NA.
      Note that the fill is applied before the table is split by condition.

    Returns:
    - The DataFrame restricted to the requested conditions and frame ranges.

    Raises:
    - ValueError: if frame_ranges is missing or its length differs from conditions.
    """

    # Load the merged DataFrame from the output_data folder
    merged_file_path = os.path.join(data_path, output_folder, merged_file)
    merged_df = pd.read_csv(merged_file_path)

    # Handle NA values. DataFrame.ffill()/bfill() replace the deprecated
    # fillna(method=...) spelling.
    if fill_na_method == 'ffill':
        merged_df = merged_df.ffill()
    elif fill_na_method == 'bfill':
        merged_df = merged_df.bfill()
    elif fill_na_method == 'zero':
        merged_df = merged_df.fillna(0)
    else:
        merged_df = merged_df.dropna(subset=[x_column, y_column])

    # Ensure frame_ranges is provided and matches the length of conditions
    if frame_ranges is None or len(frame_ranges) != len(conditions):
        raise ValueError("frame_ranges must be provided and match the length of conditions.")

    # Keep, for each condition, only the rows inside its frame range
    # (iloc accepts None as an open upper bound).
    sliced_dfs = [
        merged_df[merged_df['condition'] == condition].iloc[min_frame:max_frame]
        for condition, (min_frame, max_frame) in zip(conditions, frame_ranges)
    ]
    merged_df = pd.concat(sliced_dfs)

    # Define the output folder for plots
    plot_output_dir = os.path.join(data_path, plot_output_folder)
    os.makedirs(plot_output_dir, exist_ok=True)

    # Computed once, outside the loop, so the combined-plot filename is
    # available even when every condition turns out to be empty.
    sanitized_x_column = sanitize_filename(x_column)
    sanitized_y_column = sanitize_filename(y_column)

    # The combined figure is addressed through its own Axes object, so the
    # per-condition figures created below cannot interfere with it.
    combined_fig, combined_ax = plt.subplots(figsize=(10, 8))

    for condition in conditions:
        condition_df = merged_df[merged_df['condition'] == condition]

        if condition_df.empty:
            print(f"No data available for condition: {condition}")
            continue

        smoothed_x = _smooth_values(condition_df[x_column], sigma_x)
        smoothed_y = _smooth_values(condition_df[y_column], sigma_y)

        # Non-positive values cannot be drawn on a log axis; drop those points.
        if x_log or y_log:
            positive_mask = np.ones(len(smoothed_x), dtype=bool)
            if x_log:
                positive_mask &= smoothed_x > 0
            if y_log:
                positive_mask &= smoothed_y > 0
            smoothed_x = smoothed_x[positive_mask]
            smoothed_y = smoothed_y[positive_mask]

        if individual_plots:
            sanitized_condition = sanitize_filename(condition)

            fig, ax = plt.subplots(figsize=(8, 6))
            ax.plot(smoothed_x, smoothed_y, label=condition)
            _format_axes(ax, f'{x_column} vs {y_column} for Condition: {condition}',
                         x_column, y_column, x_log, y_log)

            output_file = os.path.join(plot_output_dir, f"{sanitized_x_column}_vs_{sanitized_y_column}_{sanitized_condition}.png")
            fig.savefig(output_file, dpi=300)
            plt.close(fig)
            print(f"Plot saved for condition {condition} at {output_file}")

        # Add the condition's curve to the combined figure
        combined_ax.plot(smoothed_x, smoothed_y, label=condition)

    # Finalize the combined plot with all conditions
    _format_axes(combined_ax, f'{x_column} vs {y_column} for All Conditions (Smoothed)',
                 x_column, y_column, x_log, y_log)
    if combined_ax.get_legend_handles_labels()[0]:
        combined_ax.legend()

    # Save the combined plot
    combined_plot_file = os.path.join(plot_output_dir, f"{sanitized_x_column}_vs_{sanitized_y_column}_All_Conditions.png")
    combined_fig.savefig(combined_plot_file, dpi=300)
    plt.close(combined_fig)

    print(f"Combined plot saved at {combined_plot_file}")

    return merged_df

In [4]:
# Per-experiment data folders; the merged table of each one is read from
# <folder>/output_data/merged_expression_PIV.csv.
k401 = "../../../../Thomson Lab Dropbox/David Larios/activedrops/ubuntu/101324-k401-titration-rt/2p5TMB-1ulDNA_/"
kif3 = "../../../../Thomson Lab Dropbox/David Larios/activedrops/main/100624-kif3-titration-RT/2p5ulTMB-1ulDNAXnM_/"

# Folder that receives the combined table; it is reused further down as the
# data path for plotting.
output_save_folder = "../../../../Thomson Lab Dropbox/David Larios/activedrops/main/102224-k401-kif3_titrations-RT/"

# Combine both titrations into a single table and write it under output_save_folder.
titration_paths = [k401, kif3]
df = concatenate_merged_expression_piv(titration_paths, output_save_folder=output_save_folder)

df

Loaded data from: ../../../../Thomson Lab Dropbox/David Larios/activedrops/ubuntu/101324-k401-titration-rt/2p5TMB-1ulDNA_/output_data/merged_expression_PIV.csv
Loaded data from: ../../../../Thomson Lab Dropbox/David Larios/activedrops/main/100624-kif3-titration-RT/2p5ulTMB-1ulDNAXnM_/output_data/merged_expression_PIV.csv
Concatenated DataFrame saved to: ../../../../Thomson Lab Dropbox/David Larios/activedrops/main/102224-k401-kif3_titrations-RT/output_data/merged_expression_PIV.csv


Unnamed: 0,condition,subcondition,time (s),Time_min,Time_h,Mean Intensity,Protein Concentration_ng_ul,Protein Concentration_nM,Number of Protein Molecules,Rate of Change of Protein Molecules per Second,...,dcev [1]_mean,shear [1/s]_mean,strain [1/s]_mean,vector direction [degrees]_mean,correlation length [m]_mean,distance [m]_mean,power [W]_mean,work [J]_mean,time (min),time (h)
0,K401_0p625nM-RT,Rep1,0,0.000000,0.000000,10.795248,1.376854,30.630794,3.675695e+10,,...,,,,,,,,,,
1,K401_0p625nM-RT,Rep1,60,1.000000,0.016667,10.763086,1.372752,30.539536,3.664744e+10,,...,,,,,,,,,,
2,K401_0p625nM-RT,Rep1,120,2.000000,0.033333,10.526032,1.342518,29.866912,3.584029e+10,,...,,,,,,,,,,
3,K401_0p625nM-RT,Rep1,180,3.000000,0.050000,10.755514,1.371786,30.518051,3.662166e+10,,...,,,,,,,,,,
4,K401_0p625nM-RT,Rep1,240,4.000000,0.066667,10.683785,1.362638,30.314526,3.637743e+10,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58555,Kif3_160nM_2-RT,Rep1,14360,239.333333,3.988889,28.764805,1.049415,16.295271,1.955433e+10,1.069131e+06,...,12.003832,1.496365e-05,0.000006,-90.366889,0.000054,0.00048,4.908141e-19,4.442985e-16,239.333333,3.988889
58556,Kif3_160nM_2-RT,Rep1,14368,239.466667,3.991111,28.877934,1.053543,16.359359,1.963123e+10,1.076540e+06,...,13.938095,-4.202169e-06,0.000002,74.834350,0.000054,0.00048,2.743649e-19,4.445728e-16,239.466667,3.991111
58557,Kif3_160nM_2-RT,Rep1,14376,239.600000,3.993333,28.730720,1.048172,16.275962,1.953115e+10,1.285783e+06,...,13.310110,1.362065e-05,0.000004,43.676002,0.000054,0.00048,2.028110e-20,4.445931e-16,239.600000,3.993333
58558,Kif3_160nM_2-RT,Rep1,14384,239.733333,3.995556,28.754401,1.049036,16.289377,1.954725e+10,1.941200e+06,...,19.892953,-9.859973e-07,0.000004,-58.293122,0.000054,0.00048,4.434767e-20,4.446374e-16,239.733333,3.995556


In [6]:
import pandas as pd
import numpy as np

# Keep only rows whose condition label contains an underscore, i.e. follows the
# "<motor>_<DNA>nM[_<replicate>]-RT" naming scheme parsed below.
# The explicit .copy() makes `df` an independent frame, so the column
# assignments that follow never write into a slice of the unfiltered table
# (this is what triggers pandas' SettingWithCopyWarning).
df = df[df['condition'].str.contains('_')].copy()

# Motor protein type: the text before the first underscore
# (e.g. "K401_0p625nM-RT" -> "K401")
df['motor'] = df['condition'].apply(lambda x: x.split('_')[0])

# DNA concentration: the second "_"-separated field of the part before the
# first "-", with the "nM" suffix removed and "p" read as the decimal point
# (e.g. "0p625nM" -> 0.625)
df['DNA'] = df['condition'].apply(
    lambda x: float(x.split('-')[0].split('_')[1].replace('nM', '').replace('p', '.'))
)

# Replicate number defaults to 1 ...
df['replicate'] = 1

# ... and is overridden for labels that carry a third "_"-separated field
# (e.g. "Kif3_160nM_2-RT" -> 2)
has_third_entry = df['condition'].apply(lambda x: len(x.split('_')) > 2)

df.loc[has_third_entry, 'replicate'] = df.loc[has_third_entry, 'condition'].apply(
    lambda x: int(x.split('_')[2].replace('-RT', ''))
)

# Display the parsed DataFrame (the notebook truncates the rendering of long frames)
df

Unnamed: 0,condition,subcondition,time (s),Time_min,Time_h,Mean Intensity,Protein Concentration_ng_ul,Protein Concentration_nM,Number of Protein Molecules,Rate of Change of Protein Molecules per Second,...,vector direction [degrees]_mean,correlation length [m]_mean,distance [m]_mean,power [W]_mean,work [J]_mean,time (min),time (h),motor,DNA,replicate
0,K401_0p625nM-RT,Rep1,0,0.000000,0.000000,10.795248,1.376854,30.630794,3.675695e+10,,...,,,,,,,,K401,0.625,1
1,K401_0p625nM-RT,Rep1,60,1.000000,0.016667,10.763086,1.372752,30.539536,3.664744e+10,,...,,,,,,,,K401,0.625,1
2,K401_0p625nM-RT,Rep1,120,2.000000,0.033333,10.526032,1.342518,29.866912,3.584029e+10,,...,,,,,,,,K401,0.625,1
3,K401_0p625nM-RT,Rep1,180,3.000000,0.050000,10.755514,1.371786,30.518051,3.662166e+10,,...,,,,,,,,K401,0.625,1
4,K401_0p625nM-RT,Rep1,240,4.000000,0.066667,10.683785,1.362638,30.314526,3.637743e+10,,...,,,,,,,,K401,0.625,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58555,Kif3_160nM_2-RT,Rep1,14360,239.333333,3.988889,28.764805,1.049415,16.295271,1.955433e+10,1.069131e+06,...,-90.366889,0.000054,0.00048,4.908141e-19,4.442985e-16,239.333333,3.988889,Kif3,160.000,2
58556,Kif3_160nM_2-RT,Rep1,14368,239.466667,3.991111,28.877934,1.053543,16.359359,1.963123e+10,1.076540e+06,...,74.834350,0.000054,0.00048,2.743649e-19,4.445728e-16,239.466667,3.991111,Kif3,160.000,2
58557,Kif3_160nM_2-RT,Rep1,14376,239.600000,3.993333,28.730720,1.048172,16.275962,1.953115e+10,1.285783e+06,...,43.676002,0.000054,0.00048,2.028110e-20,4.445931e-16,239.600000,3.993333,Kif3,160.000,2
58558,Kif3_160nM_2-RT,Rep1,14384,239.733333,3.995556,28.754401,1.049036,16.289377,1.954725e+10,1.941200e+06,...,-58.293122,0.000054,0.00048,4.434767e-20,4.446374e-16,239.733333,3.995556,Kif3,160.000,2


In [None]:
# Unique condition labels, collected while scanning df from its last row to its first.
list(df['condition'].iloc[::-1].unique())

In [None]:
# All column names of df, as a plain Python list.
df.columns.tolist()

In [None]:
# Conditions to plot. Comment / uncomment entries to change the selection; the
# `frame_ranges` argument of plot_expression_piv needs one (min_frame, max_frame)
# pair per selected condition.
kif3_conditions = [
    'Kif3_160nM_1-RT',
    'Kif3_80nM_1-RT',
    'Kif3_40nM_1-RT',
    'Kif3_20nM_1-RT',
    'Kif3_10nM_1-RT',
    'Kif3_5nM_1-RT',
    'Kif3_2p5nM_1-RT',
    'Kif3_1p25nM_1-RT',
]

# K401 titration labels, currently not selected:
#     'K401_160nM-RT',
#     'K401_80nM-RT',
#     'K401_40nM-RT',
#     'K401_20nM-RT',
#     'K401_10nM-RT',
#     'K401_5nM-RT',
#     'K401_2p5nM-RT',
#     'K401_1p25nM-RT',
#     'K401_0p625nM-RT'

a = list(kif3_conditions)

a

In [171]:
# Frame window per motor: (min_frame, max_frame) are positional row bounds
# applied within each condition; None as upper bound means "up to the last frame".
k401_min = 0
k401_max = None
kif3_min = 0
kif3_max = None

frame_range_by_motor = {
    'K401': (k401_min, k401_max),
    'Kif3': (kif3_min, kif3_max),
}

# One (min_frame, max_frame) pair per condition in `a`, looked up from the motor
# prefix of the label (the text before the first "_"). Deriving the list from
# `a` keeps both in step when conditions are added or removed; a label with an
# unknown motor prefix fails here with a KeyError instead of being plotted with
# the wrong window.
frame_ranges = [frame_range_by_motor[condition.split('_')[0]] for condition in a]


In [None]:
# Columns to plot.
x_column = "time (min)"
y_column = "Protein Concentration_nM"

# Other (x_column, y_column) pairs used with this notebook -- uncomment one
# pair to switch:
# x_column, y_column = "Protein Concentration_nM", "velocity magnitude [m/s]_mean"
# x_column, y_column = "time (min)", "velocity magnitude [m/s]_mean"
# x_column, y_column = "time (min)", "power [W]_mean"
# x_column, y_column = "time (h)", "work [J]_mean"
# x_column, y_column = "time (h)", "distance [m]_mean"
# x_column, y_column = "time (min)", 'correlation length [m]_mean'
# x_column, y_column = "Protein Concentration_nM", "power [W]_mean"

# Plot the selected conditions from the combined table stored under
# output_save_folder. Note that `df` is rebound here to the frame returned by
# plot_expression_piv (the rows kept after frame slicing).
df = plot_expression_piv(
    data_path=output_save_folder,
    conditions=a,
    x_column=x_column,
    y_column=y_column,
    sigma_x=0.1,
    sigma_y=0.1,
    x_log=False,
    y_log=False,
    frame_ranges=frame_ranges,
    individual_plots=False,
)

In [None]:
# Inspect the frame returned by plot_expression_piv in the previous cell.
df

In [None]:
# Distinct condition labels present in df, in order of first appearance.
pd.unique(df['condition'])

In [None]:
# NOTE(review): this cell calls the quadrop library function
# qd.plot_expression_piv (with min_frame / max_frame), not the
# plot_expression_piv defined in this notebook. The conditions "H-RT" and
# "H-29C" do not match the K401_* / Kif3_* labels shown elsewhere in this
# notebook -- TODO confirm they exist in the data under output_save_folder.

# y columns drawn on a linear y-axis
linear_y_columns = [
    "Protein Concentration_nM", 
    "velocity magnitude [m/s]_mean", 
    "distance [m]_mean",
    "Rate of Change of Protein Molecules per Second",
    "Translation Rate aa_s",
    "correlation length [m]_mean"
    ]

# y columns drawn on a logarithmic y-axis
log_y_columns = [
    "power [W]_mean", 
    "work [J]_mean", 
    # "Number of Protein Molecules",
    ]

# Same call for every column; only the y-axis scale differs between the two groups.
for y_columns, use_log_y in ((linear_y_columns, False), (log_y_columns, True)):
    for y_column in y_columns:
        qd.plot_expression_piv(
            output_save_folder,
            ["H-RT", "H-29C",],
            x_column, 
            y_column, 
            sigma_x=0.1, 
            sigma_y=10, 
            x_log=False, 
            y_log=use_log_y, 
            min_frame=0, 
            max_frame=None, 
            individual_plots=False
            )

In [None]:

# List of features (column names) for PCA
features_pca = [
    "vorticity [1/s]_mean",
    # "velocity magnitude [m/s]_mean",
    # "distance [m]_mean",
    "divergence [1/s]_mean",
    "shear [1/s]_mean",
    "strain [1/s]_mean",
    # "correlation length [m]_mean", 
    "power [W]_mean",
    # "work [J]_mean",
    'vector direction [degrees]_mean',
    "Protein Concentration_nM", 
]

# `conditions` was not defined anywhere in this notebook, so this cell raised a
# NameError on a fresh kernel. Use the condition labels present in the current df.
# NOTE(review): confirm this is the intended selection for the PCA.
conditions = list(df['condition'].unique())

# Run PCA and save plot (with all conditions and subconditions in the same plot)
qd.plot_pca_expression_piv(output_save_folder, conditions=conditions, subconditions=['Rep1'], features=features_pca, sigma=1)
