In [12]:
#Try comparing the collected data to a reference dataset
#https://onlinelibrary.wiley.com/doi/epdf/10.1002/cyto.b.20542

#the reference dataset only provides absolute number and 5th-95th percentile values (x10^6/L)
#can get a ROUGH estimate mean and SD

#1. Reasonable approxiate to take the median as the mean
#Mean ≈ Median
#2. Range between 5th and 95th perceentile covers ~2SD, can assume the SD is half the range divided by 2 (so /4)
#Standard Deviation ≈ (95th percentile - 5th percentile) / 4

import pandas as pd
from scipy import stats

#read in reference data table
df_ref = pd.read_csv('CSFrefstudy_counts.csv')

# Convert the 'Absolute' column to 'Mean'
df_ref['Mean'] = (df_ref['Median 5th'] + df_ref['Median 95th']) / 2

# Create a new calculated 'SD' column using the Median 5th and Median 95th percentile columns
df_ref['SD'] = (df_ref['Median 95th'] - df_ref['Median 5th']) / 4

# Drop the original Absolute and Median columns
df_ref.drop(columns=['Absolute', 'Median 5th', 'Median 95th'], inplace=True)

# Display the updated DataFrame
print(df_ref)


    Cell type Markers   Mean      SD
0      T cell     CD3  0.990  0.4200
1  CD4 T cell     CD4  0.755  0.3375
2  CD8 T cell     CD8  0.220  0.0900
3      B cell    CD19  0.015  0.0075


In [13]:
#read in actual dataset for first sample
df_actual = pd.read_csv('CSF 081723 844-220v1_counts.csv')
df_actual

Unnamed: 0,Cell type,Markers,Mean,SD
0,T cell,CD3,4559,1959
1,CD4 T cell,CD4,1963,847
2,CD8 T cell,CD8,16184,7482
3,B cell,CD19,4781,1683


In [14]:
#To account for machine setting differences
#1. Calculate the difference in means: Subtract the mean of the collected dataset from the mean of the reference dataset.
# Mean difference = Mean (collected dataset) - Mean (reference dataset)

#2. Calculate the ratio of standard deviations: Divide the standard deviation of the collected dataset by the standard deviation of the reference dataset.
# Standard deviation ratio = Standard deviation (collected dataset) / Standard deviation (reference dataset)

#3. Apply the adjustments to your collected data:
# Subtract the mean difference from each data point in your collected dataset.
# Divide each adjusted data point by the standard deviation ratio.

# Iterate over each row in the reference DataFrame
for index, row_ref in df_ref.iterrows():
    # Find the corresponding row in the actual DataFrame based on 'Markers'
    row_actual = df_actual.loc[df_actual['Markers'] == row_ref['Markers']]
    
    # If the marker exists in the actual DataFrame
    if not row_actual.empty:
        # Calculate mean difference
        mean_diff = row_actual['Mean'].values[0] - row_ref['Mean']
        
        # Calculate standard deviation ratio
        sd_ratio = row_actual['SD'].values[0] / row_ref['SD']
        
        # Apply adjustments to actual dataset
        df_actual.loc[df_actual['Markers'] == row_ref['Markers'], 'Mean'] -= mean_diff
        df_actual.loc[df_actual['Markers'] == row_ref['Markers'], 'SD'] /= sd_ratio

# Display the adjusted actual dataset
print(df_actual)

    Cell type Markers   Mean      SD
0      T cell     CD3  0.990  0.4200
1  CD4 T cell     CD4  0.755  0.3375
2  CD8 T cell     CD8  0.220  0.0900
3      B cell    CD19  0.015  0.0075


In [15]:
# Perform single-sample t-tests
for index, row in df_actual.iterrows():
    # Extract data for the current marker
    marker = row['Markers']
    mean_actual = row['Mean']
    sd_actual = row['SD']
    mean_ref = df_ref.loc[df_ref['Markers'] == marker, 'Mean'].values[0]
    sd_ref = df_ref.loc[df_ref['Markers'] == marker, 'SD'].values[0]
    
    # Perform t-test
    t_stat, p_value = stats.ttest_1samp([mean_actual], mean_ref)
    
    # Print results
    print(f"Marker: {marker}")
    print(f"T-statistic: {t_stat}")
    print(f"P-value: {p_value}")
    print(f"Mean (Actual): {mean_actual}, Mean (Reference): {mean_ref}")
    print(f"SD (Actual): {sd_actual}, SD (Reference): {sd_ref}")
    print()

Marker: CD3
T-statistic: nan
P-value: nan
Mean (Actual): 0.9899999999997817, Mean (Reference): 0.99
SD (Actual): 0.42000000000000004, SD (Reference): 0.42000000000000004

Marker: CD4
T-statistic: nan
P-value: nan
Mean (Actual): 0.7550000000001091, Mean (Reference): 0.755
SD (Actual): 0.33749999999999997, SD (Reference): 0.33749999999999997

Marker: CD8
T-statistic: nan
P-value: nan
Mean (Actual): 0.21999999999934516, Mean (Reference): 0.22
SD (Actual): 0.09000000000000001, SD (Reference): 0.09000000000000001

Marker: CD19
T-statistic: nan
P-value: nan
Mean (Actual): 0.015000000000327418, Mean (Reference): 0.015
SD (Actual): 0.0075, SD (Reference): 0.0075



  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
