# In [2]:  (Jupyter notebook cell prompt, kept as a comment so the file parses as Python)
import pandas as pd
import numpy as np
from scipy import stats

def grubbs_test(data, alpha=0.05):
    """Perform a two-sided Grubbs' test for outlier detection.

    The statistic is G = max|x_i - mean| / s, with s the sample standard
    deviation (ddof=1). It is compared with the critical value derived from
    Student's t distribution with N - 2 degrees of freedom at significance
    level alpha / (2 * N).

    Args:
        data: One-dimensional sequence of numbers (list, array, Series).
        alpha: Significance level of the test.

    Returns:
        A tuple ``(is_outlier, outlier_values)``. ``is_outlier`` is a plain
        ``bool``. ``outlier_values`` is a NumPy array holding every value
        whose normalized deviation exceeds the critical value when the test
        is positive, and an empty list otherwise.

    Notes:
        * Fewer than three observations cannot be tested (the t distribution
          would need zero or negative degrees of freedom), so ``(False, [])``
          is returned.
        * A sample with zero variance returns ``(False, [])``.
        * NaN values are not filtered: a NaN in ``data`` makes the mean and
          the statistic NaN, the comparison is false, and no outlier is
          reported.
    """
    data = np.array(data)
    N = len(data)
    # The critical value uses N - 2 degrees of freedom, which must be positive.
    if N < 3:
        return False, []

    mean = np.mean(data)
    std_dev = np.std(data, ddof=1)

    if std_dev == 0:
        return False, []  # No variability in data

    # Normalized absolute deviations; the Grubbs' statistic is their maximum.
    scores = np.abs(data - mean) / std_dev
    G = np.max(scores)

    # Critical value of the two-sided test.
    t_critical = stats.t.ppf(1 - alpha / (2 * N), N - 2)
    G_critical = (N - 1) / np.sqrt(N) * np.sqrt(t_critical**2 / (N - 2 + t_critical**2))

    # bool() keeps the flag type consistent with the early returns above.
    if not bool(G > G_critical):
        return False, []

    # Select with the same normalized scores used for the decision so the
    # most extreme value is always among the reported outliers.
    return True, data[scores > G_critical]

def average_batch(batch):
    """Return the arithmetic mean of ``batch``, leaving NaN entries out of the calculation."""
    values = np.array(batch)
    mean_value = np.nanmean(values)
    return mean_value

# Number of consecutive rows that are tested and averaged together.
BATCH_SIZE = 4

# Load the Excel file. header=None means the first row is data and the
# columns are labelled 0..n-1.
file_path = '/Users/drk29580/Downloads/flowtest.xlsx'
df = pd.read_excel(file_path, header=None)

# Initialize a DataFrame to hold the cleaned data
cleaned_df = df.copy()

# Process each column to identify and blank out outliers
for col in range(df.shape[1]):
    column_data = df.iloc[:, col]
    outlier_mask = np.zeros(len(column_data), dtype=bool)

    # Only complete batches are tested; a shorter trailing batch is left as is.
    for i in range(0, len(column_data) - BATCH_SIZE + 1, BATCH_SIZE):
        batch = column_data.iloc[i:i + BATCH_SIZE]
        is_outlier, outlier_values = grubbs_test(batch)
        if is_outlier:
            outlier_mask[i:i + BATCH_SIZE] = np.isin(batch, outlier_values)

    # Replace the whole column in one step. Series.mask upcasts integer data
    # so it can hold NaN, and isetitem swaps in the new array instead of
    # writing NaN in place into a column whose dtype may not accept it.
    if outlier_mask.any():
        cleaned_df.isetitem(col, column_data.mask(outlier_mask).to_numpy())

# Save the cleaned DataFrame to a new Excel file
# NOTE(review): index=False drops the row index, but the default header=True
# still writes the integer column labels as a first row - confirm this is wanted.
cleaned_file_path = '/Users/drk29580/Downloads/hepg2_cleaned.xlsx'
cleaned_df.to_excel(cleaned_file_path, index=False)

# Process each column to calculate the average of every BATCH_SIZE rows
averaged_df = cleaned_df.copy()
for col in range(cleaned_df.shape[1]):
    column_data = cleaned_df.iloc[:, col]
    averaged_column = np.empty(len(column_data), dtype=float)

    # Every row of a batch (including a shorter trailing batch) receives the
    # NaN-ignoring mean of that batch.
    for i in range(0, len(column_data), BATCH_SIZE):
        averaged_column[i:i + BATCH_SIZE] = average_batch(column_data.iloc[i:i + BATCH_SIZE])

    # Replace the column rather than assigning through .iloc[:, col], which
    # would try to write the float averages into the existing column's dtype.
    averaged_df.isetitem(col, averaged_column)

# Save the averaged DataFrame back to a new Excel file
averaged_df.to_excel('/Users/drk29580/Downloads/hepg2_averaged.xlsx', index=False)


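# Output captured from the notebook run (presumably the source line echoed by
# five pandas warnings; the warning text itself was not preserved):
#   averaged_df.iloc[:, col] = averaged_column
#   averaged_df.iloc[:, col] = averaged_column
#   averaged_df.iloc[:, col] = averaged_column
#   averaged_df.iloc[:, col] = averaged_column
#   averaged_df.iloc[:, col] = averaged_column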
