In [3]:
import numpy as np
import math
import glob
import os
import pandas as pd
from collections import Counter
from statsmodels.nonparametric.smoothers_lowess import lowess

def combine_csvs_with_roi_and_drop_errors(count_folder, roi_folder, errors_file=None):
    """Build one long table of per-droplet counts joined with ROI areas.

    Layout read from disk:
      * ``count_folder/<slice>/<scenario>/*.csv`` -- count tables.
      * ``roi_folder/<slice>.csv`` -- a two-column table whose columns are
        renamed to ``Droplet`` and ``Area``.

    Each count table is placed side by side with the ROI table of its slice
    (rows are paired by index). The count file name, minus ``.csv`` and the
    ``_Simple Segmentation`` suffix, is split on ``_`` into ``time`` (an
    ``<N>h`` prefix converted to int), ``Well`` and ``date``.

    Args:
        count_folder: Folder holding one sub-folder per slice.
        roi_folder: Folder holding one ROI CSV per slice.
        errors_file: Optional Excel file; each column is named after a slice
            and lists droplet numbers to discard for that slice.

    Returns:
        A DataFrame with columns ``Slice, Count, Droplet, Area, time, Well,
        date, Scenario``, or ``None`` when no count file was found.
    """
    frames = []

    for slice_dir in glob.glob(os.path.join(count_folder, '*')):
        slice_name = os.path.basename(slice_dir)
        roi_path = os.path.join(roi_folder, f'{slice_name}.csv')

        # A slice without ROI information cannot be joined; skip it entirely.
        if not os.path.exists(roi_path):
            print(f"ROI file for slice {slice_name} is missing.")
            continue

        roi_table = pd.read_csv(roi_path)
        roi_table.columns = ['Droplet', 'Area']

        for scenario_dir in glob.glob(os.path.join(slice_dir, '*')):
            scenario_name = os.path.basename(scenario_dir)
            csv_paths = glob.glob(os.path.join(scenario_dir, '*.csv'))

            if not csv_paths:
                print(f"Skipping scenario {scenario_name} for slice {slice_name} as count files are missing.")
                continue

            print(f"Processing scenario: {scenario_name} for slice {slice_name}")

            for csv_path in csv_paths:
                counts = pd.read_csv(csv_path)
                counts['Scenario'] = scenario_name
                # Side-by-side join: row i of the counts pairs with row i of the ROI table.
                merged = pd.concat([counts, roi_table], axis=1)
                # 'Slice' temporarily holds the file stem; it is parsed further below.
                merged['Slice'] = os.path.basename(csv_path).replace(".csv", "")
                frames.append(merged)

    if not frames:
        print("No data to process.")
        return None

    table = pd.concat(frames)
    table = table.drop(columns=['Total Area', 'Average Size', '%Area', 'Perim.'])

    # Parse "<N>h_<Well>_<date>" out of the file stem.
    table['Slice'] = table['Slice'].str.replace("_Simple Segmentation", "")
    table[['time', 'Well', 'date']] = table['Slice'].str.split('_', n=2, expand=True)
    table['time'] = table['time'].str.replace("h", "").astype(int)
    # With the time prefix removed, 'Slice' becomes "<Well>_<date>".
    table['Slice'] = table['Slice'].str.replace(r'\d+h_', '', regex=True)
    table = table[['Slice', 'Count', 'Droplet', 'Area', 'time', 'Well', 'date', 'Scenario']]

    if errors_file is None:
        return table

    # One column per slice; the cells list the droplet numbers to discard.
    droplets_to_remove = pd.read_excel(errors_file)
    for column in droplets_to_remove.columns:
        rejected = droplets_to_remove[column].dropna().astype(int).tolist()
        is_rejected = table['Droplet'].isin(rejected) & (table['Slice'] == column)
        table = table.loc[~is_rejected, :]

    return table


# Example run: point the loader at the count, ROI and error-list locations.
count_folder = 'C:\\Users\\dinam\\Documents\\master\\Experiments\\New analysis\\Results_files\\Count'
roi_folder = 'C:\\Users\\dinam\\Documents\\master\\Experiments\\New analysis\\Results_files\\ROIcsv'
errors_file = 'C:\\Users\\dinam\\Documents\\master\\Experiments\\New analysis\\Results_files\\droplets_toRemove.xlsx'

combined_df = combine_csvs_with_roi_and_drop_errors(
    count_folder=count_folder,
    roi_folder=roi_folder,
    errors_file=errors_file,
)
print(combined_df)


Processing scenario: A for slice A1_300623
Processing scenario: B for slice A1_300623
Processing scenario: A for slice A2_250623
Processing scenario: B for slice A2_250623
Processing scenario: A for slice A2_260623
Processing scenario: B for slice A2_260623
Processing scenario: A for slice B1_300623
Processing scenario: B for slice B1_300623
Processing scenario: A for slice B2_020723
Processing scenario: B for slice B2_020723
Processing scenario: A for slice B2_200623
Processing scenario: B for slice B2_200623
Processing scenario: A for slice B2_250623
Processing scenario: B for slice B2_250623
Processing scenario: A for slice B2_260623
Processing scenario: B for slice B2_260623
Processing scenario: A for slice B2_300623
Processing scenario: B for slice B2_300623
Processing scenario: A for slice B3_100423
Processing scenario: B for slice B3_100423
Processing scenario: A for slice C3_100423
Processing scenario: B for slice C3_100423
          Slice  Count  Droplet        Area  time Well

In [8]:
# Work on a copy so the loaded table stays available unchanged.
df = combined_df.copy()
# 'DW' is the droplet identifier: "<Droplet>_<Slice>".
droplet_label = df['Droplet'].astype(str)
slice_label = df['Slice'].astype(str)
df['DW'] = droplet_label + '_' + slice_label

def update_droplet_counts(df):
    """Zero the counts of droplets with a low but non-zero total in any scenario.

    For every (``DW``, ``Scenario``) pair the ``Count`` values are summed. A
    droplet is flagged when at least one of its scenarios has a total below 25
    while not consisting solely of zeros. Every row of a flagged droplet --
    in all scenarios -- gets ``Count`` set to 0.

    Args:
        df: DataFrame with ``DW``, ``Scenario`` and ``Count`` columns.

    Returns:
        The same DataFrame object, modified in place.
    """
    keys = [df['DW'], df['Scenario']]

    sum_counts = df.groupby(['DW', 'Scenario'])['Count'].transform('sum')
    # Built-in 'all' reduction instead of a Python lambda evaluated per group.
    all_zeros = df['Count'].eq(0).groupby(keys).transform('all')

    # Low total, but not simply an empty droplet.
    update_condition = (sum_counts < 25) & ~all_zeros

    # One offending scenario is enough to flag the whole droplet.
    dws_to_update = df.loc[update_condition, 'DW'].unique()

    df.loc[df['DW'].isin(dws_to_update), 'Count'] = 0

    return df


def remove_irregular_droplets(df):
    """Split off droplets that have many, but not only, zero weighted counts.

    A droplet (``DW``) is removed when more than 4 of its ``Weighted_Count``
    values equal zero while at least one value is non-zero.

    Args:
        df: DataFrame with ``DW`` and ``Weighted_Count`` columns.

    Returns:
        ``(filtered_df, removed_droplets)``: the rows that are kept, and the
        de-duplicated rows of the removed droplets.
    """
    is_zero = df['Weighted_Count'] == 0
    by_droplet = df['DW']

    # Per-row view of how many zeros the row's droplet has, and whether it is all zeros.
    zeros_in_droplet = is_zero.astype(int).groupby(by_droplet).transform('sum')
    droplet_all_zero = is_zero.groupby(by_droplet).transform('all')

    irregular = (zeros_in_droplet > 4) & ~droplet_all_zero

    kept_rows = df[~irregular]
    dropped_rows = df[irregular].drop_duplicates()
    return kept_rows, dropped_rows



# Zero out low-count droplets; a copy is passed because the helper edits its argument
filtered_df = update_droplet_counts(df.copy())


# Shift each nominal time point by a per-well offset
# (values are written as x/60 -- presumably minutes expressed in hours; confirm)
time_delay_dict = {'A1': 0, 'A2': 15/60, 'B2': 45/60, 'B1': 30/60, 'B3': 0, 'C3': 27/60}
well_delay = filtered_df['Well'].map(time_delay_dict)
filtered_df.loc[:, 'actual_time'] = filtered_df['time'] + well_delay

# Area -> Volume: D is the diameter of a circle with the same area; the volume
# expression looks like a spherical cap with contact angle Theta (32 degrees)
Theta = np.radians(32)
D = 2 * np.sqrt(filtered_df['Area'] / np.pi)
cap_shape_factor = (2 - 3 * np.cos(Theta) + np.cos(Theta)**3) / (np.sin(Theta)**3)
filtered_df.loc[:, 'Volume'] = ((np.pi * D**3) / 24) * cap_shape_factor
filtered_df.loc[:, 'log_Volume'] = np.log10(filtered_df['Volume'])

# InitialOD label: '0.01' for wells starting with 'A' and for B3/C3, otherwise '0.03'
condition_a = filtered_df['Well'].str.startswith('A')
condition_b3_c3 = filtered_df['Well'].isin(['B3', 'C3'])
low_od_wells = condition_a | condition_b3_c3
filtered_df.loc[:, 'InitialOD'] = np.where(low_od_wells, '0.01', '0.03')

# Bin log10(Volume) into unit-wide intervals, once as intervals and once as text labels
vol_labels = ['3 - 4', '4 - 5', '5 - 6', '6 - 7', '7 - 8']
cut_bins_vol = [3, 4, 5, 6, 7, 8]
filtered_df.loc[:, 'Bins_vol'] = pd.cut(filtered_df['log_Volume'], bins=cut_bins_vol)
filtered_df.loc[:, 'Bins_vol_txt'] = pd.cut(filtered_df['log_Volume'], bins=cut_bins_vol, labels=vol_labels)


# Define Gaussian functions for each scenario
def right_tail_gaussian(x, mu=0, sigma=4):
    """Return the unnormalised Gaussian ``exp(-(x - mu)^2 / (2 sigma^2))``.

    The value is 1 at ``x == mu``. Works on scalars and array-likes.
    """
    squared_offset = (x - mu) ** 2
    two_variance = 2 * sigma ** 2
    return np.exp(-squared_offset / two_variance)

def normal_gaussian(x, mu=24, sigma=8):
    """Return the unnormalised Gaussian ``exp(-(x - mu)^2 / (2 sigma^2))``.

    The value is 1 at ``x == mu`` (24 by default). Works on scalars and
    array-likes.
    """
    squared_offset = (x - mu) ** 2
    two_variance = 2 * sigma ** 2
    return np.exp(-squared_offset / two_variance)

# Per-row weights: rows of scenario A get the mu=0 Gaussian of 'time', rows of
# scenario B get the mu=24 Gaussian; the other scenario's weight is 0
scenario_col = filtered_df['Scenario']
filtered_df['Weight_A'] = np.where(scenario_col == 'A', right_tail_gaussian(filtered_df['time']), 0)
filtered_df['Weight_B'] = np.where(scenario_col == 'B', normal_gaussian(filtered_df['time']), 0)

# Rescale so that the weights of one (Slice, time, Droplet) group add up to 1
group_cols = ['Slice', 'time', 'Droplet']
summed_weights = filtered_df.groupby(group_cols)[['Weight_A', 'Weight_B']].transform('sum')
total_weights = summed_weights.sum(axis=1)
filtered_df['Weight_A'] /= total_weights
filtered_df['Weight_B'] /= total_weights

# Contribution of each row to the weighted count
filtered_df['Weighted_Count_Row'] = filtered_df['Weight_A'] * filtered_df['Count'] + filtered_df['Weight_B'] * filtered_df['Count']

# Collapse the scenarios: one weighted count per droplet and time point
result_df = (
    filtered_df.groupby(group_cols)
    .agg({'Weighted_Count_Row': 'sum'})
    .rename(columns={'Weighted_Count_Row': 'Weighted_Count'})
    .reset_index()
)

# Carry over descriptive columns, taking the first value found in each group
carried_columns = ['Well', 'date', 'DW', 'actual_time','Area', 'Volume', 'log_Volume', 'InitialOD', 'Bins_vol', 'Bins_vol_txt']
additional_columns_df = filtered_df.groupby(group_cols)[carried_columns].first().reset_index()

result_df = pd.merge(result_df, additional_columns_df, on=group_cols, how='left')

# Drop droplets with many-but-not-all zero time points and report what is left
result_df, removed_droplets = remove_irregular_droplets(result_df)
print(f" ** {result_df['DW'].nunique()} droplets after filtering")



 ** 19655 droplets after filtering


In [27]:
# Order rows droplet by droplet, each droplet chronologically
sort_keys = ['DW', 'time']
weighted_counts_df = result_df.sort_values(sort_keys)

# Define a function to apply the condition for each droplet
def update_counts(group):
    """Raise the first two weighted counts of a droplet to 1 when only they are low.

    If the first two ``Weighted_Count`` values are both below 1 and every
    later value is greater than 0, the first two values are set to 1.
    Groups with fewer than two rows are returned unchanged.

    Args:
        group: Rows of one droplet, already ordered by time, with a
            ``Weighted_Count`` column.

    Returns:
        The group itself when nothing changes, otherwise an updated copy.
    """
    counts = group['Weighted_Count']

    # The rule needs two leading time points; shorter groups used to raise IndexError.
    if len(counts) < 2:
        return group

    leading_low = counts.iloc[0] < 1 and counts.iloc[1] < 1
    if leading_low and (counts.iloc[2:] > 0).all():
        # Write through a single positional indexer on a copy: the chained
        # `group[col].iloc[i] = ...` form is not guaranteed to reach the frame.
        group = group.copy()
        group.iloc[:2, group.columns.get_loc('Weighted_Count')] = 1
    return group

# Apply the per-droplet correction group by group.
# Iterating the groups (instead of `groupby('DW').apply(...)`) keeps the 'DW'
# column in every piece; apply on the grouping column is deprecated in recent
# pandas and would leave the result without 'DW' after reset_index(drop=True).
adjusted_groups = [update_counts(droplet_rows) for _, droplet_rows in weighted_counts_df.groupby('DW')]
if adjusted_groups:
    weighted_counts_df = pd.concat(adjusted_groups).reset_index(drop=True)
else:
    # Nothing to correct on an empty table; just normalise the index.
    weighted_counts_df = weighted_counts_df.reset_index(drop=True)

# Analysis table: drop the well/date columns, keep droplets with log10(Volume) >= 3
filtered_df = weighted_counts_df.copy()
filtered_df = filtered_df.drop(['Well', 'date'], axis=1)
filtered_df = filtered_df.loc[filtered_df['log_Volume'] >= 3]
filtered_df['InitialOD'] = filtered_df['InitialOD'].astype(str)



In [31]:
def OnlyBac_df(data_df):
    """Return the rows of droplets whose summed ``Weighted_Count`` is positive.

    Droplets (``DW``) whose weighted counts add up to zero or less across
    all of their rows are dropped.
    """
    total_per_droplet = data_df.groupby('DW')['Weighted_Count'].transform('sum')
    has_cells = total_per_droplet > 0
    return data_df[has_cells]

def log_mean_fill(series):
    """Fill missing values using neighbours of the original series.

    For each NaN at position 1 or later:
      * if both direct neighbours are present and positive, the gap is filled
        with their geometric mean (mean taken in log10 space);
      * otherwise it is filled with the nearest earlier value that is present
        and positive, if any.

    Neighbours are always read from the input, never from already-filled
    values. A NaN at position 0 is left untouched, and so is a NaN with no
    positive value before it.

    Args:
        series: Numeric pandas Series.

    Returns:
        A filled copy; the input is not modified.
    """
    result = series.copy()
    size = len(series)

    # Nearest earlier positive value for every position, computed once
    # instead of scanning backwards from each gap.
    previous_positive = series.where(series > 0).ffill().shift(1)

    for idx in range(1, size):
        if pd.notna(series.iloc[idx]):
            continue

        before = series.iloc[idx - 1]
        after = series.iloc[idx + 1] if idx + 1 < size else np.nan

        if pd.notna(before) and pd.notna(after) and before > 0 and after > 0:
            result.iloc[idx] = 10 ** ((np.log10(before) + np.log10(after)) / 2)
            continue

        fallback = previous_positive.iloc[idx]
        if pd.notna(fallback):
            result.iloc[idx] = fallback

    return result

def process_droplet(group):
    """Blank out sharp drops in one droplet's counts and fill the gaps.

    Adds a ``Percent_Change`` column (row-to-row relative change of
    ``Weighted_Count``), replaces every count that fell by more than 50 %
    relative to the previous row with NaN, then fills those gaps with
    ``log_mean_fill``.

    Args:
        group: Rows of one droplet, ordered by time, with a
            ``Weighted_Count`` column.

    Returns:
        A new DataFrame with the added column and the repaired counts.
    """
    # Work on a copy so the caller's rows are not edited as a side effect.
    group = group.copy()
    group['Percent_Change'] = group['Weighted_Count'].pct_change()

    decrease_threshold = -0.5  # Threshold for decrease
    sharp_drop = group['Percent_Change'] < decrease_threshold

    # Assign the masked column back explicitly: an inplace call on
    # `group[col]` is not guaranteed to write through to the frame.
    group['Weighted_Count'] = group['Weighted_Count'].mask(sharp_drop)
    group['Weighted_Count'] = log_mean_fill(group['Weighted_Count'])
    return group


def apply_lowess(group):
    """Smooth one droplet's ``Weighted_Count`` with LOWESS in log10 space.

    The counts are transformed with ``log10(count + 1)``, smoothed against
    their row position (0, 1, 2, ...) with ``frac=0.2`` and transformed
    back. Groups with at most one row yield NaN.

    Args:
        group: Rows of one droplet with a ``Weighted_Count`` column.

    Returns:
        A Series of smoothed values indexed like ``group``.
    """
    counts = group['Weighted_Count'].values

    # Nothing to smooth with fewer than two points.
    if len(counts) <= 1:
        return pd.Series([np.nan] * len(group), index=group.index)

    log_counts = np.log10(counts + 1)  # +1 keeps zero counts finite
    positions = np.arange(len(log_counts))
    fitted = lowess(log_counts, positions, frac=0.2)
    smoothed_log_values = fitted[:, 1]
    smoothed_values = 10**smoothed_log_values - 1  # undo the +1 shift
    return pd.Series(smoothed_values, index=group.index)
    
# Short labels for the slices; slices missing from this mapping become NaN
slice_to_new_name = {
    'A1_300623': 'C1LD',
    'B3_100423': 'C2LD',
    'C3_100423': 'C3LD',
    'A2_260623': 'C4LD',
    'A2_250623': 'C5LD',
    'B1_300623': 'C6HD',
    'B2_020723': 'C7HD',
    'B2_300623': 'C8HD',
    'B2_260623': 'C9HD',
    'B2_200623': 'C10HD',
    'B2_250623': 'C11HD'}

filtered_df['Slice'] = filtered_df['Slice'].map(slice_to_new_name)
filtered_df.to_csv('filtered_df.csv')

# Droplets that contain cells at some time point. An explicit copy replaces
# the previous trick of clearing the private `_is_copy` attribute.
cb = OnlyBac_df(filtered_df).copy()
# Floor the counts at 1 (NaN stays NaN)
cb['Weighted_Count'] = cb['Weighted_Count'].clip(lower=1)

# Repair sharp drops droplet by droplet. Iterating the groups keeps the 'DW'
# column in every piece; `groupby('DW').apply(...)` on the grouping column is
# deprecated in recent pandas and would lose 'DW' after reset_index(drop=True).
processed_groups = [process_droplet(droplet_rows) for _, droplet_rows in cb.groupby('DW')]
if processed_groups:
    cb = pd.concat(processed_groups).reset_index(drop=True)
else:
    cb = cb.reset_index(drop=True)

# Apply LOWESS smoothing to each group; the pieces keep cb's row index, so the
# assignment aligns row by row whatever the number of droplets
smoothed_groups = [apply_lowess(droplet_rows) for _, droplet_rows in cb.groupby('DW')]
cb['lowess_count'] = pd.concat(smoothed_groups) if smoothed_groups else np.nan

# Merge carrying capacity (mean count at time >= 21) and initial cell count
# (count at time == 0) back onto every row of the droplet
cc = cb[cb['time'] >= 21].groupby('DW')['Weighted_Count'].mean().rename('Carrying_Capacity')
ic = cb[cb['time'] == 0].groupby('DW')['Weighted_Count'].first().rename('initial_cell_count')
cb = cb.merge(cc, on='DW', how='left').merge(ic, on='DW', how='left')
cb['Divisions'] = np.log2(cb['Carrying_Capacity'] / cb['initial_cell_count'])
# Assign the result back: an inplace replace on `cb[col]` may act on a temporary
cb['Divisions'] = cb['Divisions'].replace([np.inf, -np.inf], np.nan)
cb['normalized_count'] = cb['lowess_count'] / cb['Carrying_Capacity']
cb.to_csv('cb.csv')