From da8e9755cc9742b02370df14d5c95653290f51ee Mon Sep 17 00:00:00 2001 From: Claude Date: Tue, 4 Nov 2025 18:28:40 +0000 Subject: [PATCH] Add average_expression_replicates method to MSExpression Integrates a new method to average expression data across replicates for each strain. This method: - Takes replicate columns (e.g., ACN2586_1, ACN2586_2) and averages them into single columns per strain - Properly uses MSCondition class instead of a fake ExpressionCondition - Returns a new MSExpression object with averaged data - Handles cases where replicates exist, single columns exist, or no data is found - Includes comprehensive logging and error handling This method is useful for preprocessing expression data before analysis. --- modelseedpy/multiomics/msexpression.py | 68 ++++++++++++++++++++++++++ 1 file changed, 68 insertions(+) diff --git a/modelseedpy/multiomics/msexpression.py b/modelseedpy/multiomics/msexpression.py index 65bd775..e93d1e8 100644 --- a/modelseedpy/multiomics/msexpression.py +++ b/modelseedpy/multiomics/msexpression.py @@ -598,6 +598,74 @@ def translate_data(self, target_type: str) -> 'MSExpression': new_expression._data.loc[feature.id, condition.id] = value return new_expression + def average_expression_replicates(self, strain_list: list) -> 'MSExpression': + """Average expression replicates for each strain. + + Takes an MSExpression object with replicate columns (e.g., ACN2586_1, ACN2586_2, ...) + and averages them to create single columns per strain (e.g., ACN2586). + + Args: + strain_list: List of strain names (e.g., ["ACN2586", "ACN2821", ...]) + + Returns: + New MSExpression object with averaged data per strain + + Raises: + ValueError: If no data found for any strain in the list + """ + try: + # Access the underlying DataFrame + expression_df = self._data.copy() + + # Create new DataFrame for averaged data + averaged_data = {} + + # Keep the index (gene/protein IDs) + averaged_data['index'] = expression_df.index + + # For each strain, find and average its replicates + for strain in strain_list: + # Find columns that match this strain pattern (e.g., ACN2586_1, ACN2586_2, ...) + replicate_cols = [col for col in expression_df.columns if col.startswith(f"{strain}_")] + + if replicate_cols: + # Average the replicates + averaged_data[strain] = expression_df[replicate_cols].mean(axis=1) + logger.info(f"Averaged {len(replicate_cols)} replicates for strain {strain}") + else: + # No replicates found - check if strain column exists as-is + if strain in expression_df.columns: + averaged_data[strain] = expression_df[strain] + logger.info(f"No replicates found for {strain}, using existing column") + else: + logger.warning(f"No data found for strain {strain}") + + # Create new DataFrame from averaged data + averaged_df = pd.DataFrame(averaged_data) + averaged_df.set_index('index', inplace=True) + + # Create a deep copy of the expression object + averaged_expression = copy.deepcopy(self) + + # Replace the data with averaged data + averaged_expression._data = averaged_df + + # Update conditions list to match new columns + # Clear and rebuild conditions using proper MSCondition class + averaged_expression.conditions = DictList() + for strain in strain_list: + if strain in averaged_df.columns: + condition = MSCondition(strain, averaged_expression) + averaged_expression.conditions.append(condition) + + logger.info(f"Created averaged expression data with {len(averaged_expression.conditions)} conditions") + + return averaged_expression + + except Exception as e: + logger.error(f"Error averaging expression replicates: {str(e)}") + raise + def fit_model_flux_to_data( self, model: 'MSModelUtil',