In [22]:
import pandas as pd

# One-shot version of the cleaning pipeline: load, convert types, derive ratios, save.
# NOTE(review): output_path is the same processed CSV that the notebook's final
# cell writes to via OUTPUT_PATH — confirm both writers are still wanted.
data_path = "../data/raw/bf_efficiency_raw_data.csv"
output_path = "../data/processed/bf_efficiency_clean.csv"

# Read the raw file a single time.
df = pd.read_csv(data_path)

# Only the last expression of a cell is echoed, so intermediate checks are printed.
print(df.shape)
df.info()

numeric_cols = [
    "collection_1_output",
    "collection_2_output",
    "collection_3_output",
    "collection_4_output",
    "blood_input_ml",
    "egg_output_total"
]

# Remove "," from the text form of each value, then convert to float.
# astype(float) raises if a value still cannot be parsed.
for col in numeric_cols:
    df[col] = (
        df[col]
        .astype(str)
        .str.replace(",", "", regex=False)
        .astype(float)
    )

print(df[numeric_cols].dtypes)

date_cols = [
    "collection_1_date",
    "collection_2_date",
    "collection_3_date",
    "collection_4_date"
]

# Day-first parsing; values that cannot be parsed become NaT instead of raising.
for col in date_cols:
    df[col] = pd.to_datetime(df[col], dayfirst=True, errors="coerce")

print(df[date_cols].head())

# Swap zero denominators for NaN so the ratios come out as NaN rather than inf,
# which would otherwise be written into the CSV.
# NOTE(review): time_between_bf_hrs is used as loaded (it is not in numeric_cols)
# — confirm read_csv gives it a numeric dtype.
df["efficiency"] = (
    df["egg_output_total"] / df["blood_input_ml"].replace(0, float("nan"))
)
df["eggs_per_hour"] = (
    df["egg_output_total"] / df["time_between_bf_hrs"].replace(0, float("nan"))
)

print(df[["blood_input_ml", "egg_output_total", "efficiency", "eggs_per_hour"]].head())
print(df.shape)

df.to_csv(output_path, index=False)

# Kept as the cell's last expression so the summary table is rendered.
df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 158 entries, 0 to 157
Data columns (total 23 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   line_id              158 non-null    object
 1   trial_id             158 non-null    object
 2   colony_strength      158 non-null    object
 3   BF1                  158 non-null    object
 4   BF2                  158 non-null    object
 5   BF3                  158 non-null    object
 6   BF4                  158 non-null    object
 7   bf1_amounts_ml       158 non-null    int64 
 8   bf2_amounts_ml       158 non-null    int64 
 9   bf3_amounts_ml       158 non-null    int64 
 10  bf4_amounts_ml       158 non-null    int64 
 11  collection_1_date    158 non-null    object
 12  collection_2_date    158 non-null    object
 13  collection_3_date    158 non-null    object
 14  collection_4_date    158 non-null    object
 15  collection_1_output  158 non-null    object
 16  collecti

In [23]:
from pathlib import Path

import numpy as np
import pandas as pd

In [24]:
# Input (raw) and output (processed) CSV locations; both are relative paths.
DATA_PATH = "../data/raw/bf_efficiency_raw_data.csv"
OUTPUT_PATH = "../data/processed/bf_efficiency_clean.csv"

# Load the raw data into the working frame used by the cells below.
df = pd.read_csv(DATA_PATH)

# Fail fast if the file loaded with no rows or no columns.
assert not df.empty, "Raw dataset is empty"


In [25]:
# (rows, columns) of the frame as loaded, before any cleaning.
df.shape

(158, 23)

In [26]:
# Column names, non-null counts and dtypes of the frame as loaded.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 158 entries, 0 to 157
Data columns (total 23 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   line_id              158 non-null    object
 1   trial_id             158 non-null    object
 2   colony_strength      158 non-null    object
 3   BF1                  158 non-null    object
 4   BF2                  158 non-null    object
 5   BF3                  158 non-null    object
 6   BF4                  158 non-null    object
 7   bf1_amounts_ml       158 non-null    int64 
 8   bf2_amounts_ml       158 non-null    int64 
 9   bf3_amounts_ml       158 non-null    int64 
 10  bf4_amounts_ml       158 non-null    int64 
 11  collection_1_date    158 non-null    object
 12  collection_2_date    158 non-null    object
 13  collection_3_date    158 non-null    object
 14  collection_4_date    158 non-null    object
 15  collection_1_output  158 non-null    object
 16  collecti

In [27]:
# Columns to convert to numbers: the four per-collection outputs followed by
# three more columns used in the derived ratios.
numeric_cols = [f"collection_{n}_output" for n in range(1, 5)] + [
    "blood_input_ml",
    "egg_output_total",
    "time_between_bf_hrs",
]


In [28]:
def clean_numeric(series: pd.Series) -> pd.Series:
    """Convert a Series to numeric values via its string representation.

    Each value is cast to ``str``, every "," is removed, and surrounding
    whitespace is stripped before conversion.

    Args:
        series: The column to convert.

    Returns:
        A numeric Series; values that cannot be parsed become NaN
        (``errors="coerce"``) instead of raising.
    """
    as_text = series.astype(str)
    without_commas = as_text.str.replace(",", "", regex=False)
    trimmed = without_commas.str.strip()
    return pd.to_numeric(trimmed, errors="coerce")

# Run clean_numeric over each listed column and write the results back to df.
df[numeric_cols] = df[numeric_cols].apply(clean_numeric)

In [29]:
# Fraction of NaN values per numeric column, largest first. clean_numeric
# coerces unparseable values to NaN, so a non-zero share flags parse failures
# or missing data.
df[numeric_cols].isna().mean().sort_values(ascending=False)


collection_1_output    0.0
collection_2_output    0.0
collection_3_output    0.0
collection_4_output    0.0
blood_input_ml         0.0
egg_output_total       0.0
time_between_bf_hrs    0.0
dtype: float64

In [30]:
# The four collection date columns, in collection order.
date_cols = [f"collection_{n}_date" for n in range(1, 5)]

# Parse every date column day-first; unparseable values become NaT rather than raising.
df[date_cols] = df[date_cols].apply(pd.to_datetime, dayfirst=True, errors="coerce")


In [31]:
# Number of missing (NaT) values per date column; the parsing step above
# coerces unparseable dates to NaT.
df[date_cols].isna().sum()


collection_1_date    0
collection_2_date    0
collection_3_date    0
collection_4_date    0
dtype: int64

In [32]:
# Earliest and latest value in each date column, as a range sanity check.
df[date_cols].agg(["min", "max"])


Unnamed: 0,collection_1_date,collection_2_date,collection_3_date,collection_4_date
min,2025-09-03,2025-09-05,2025-09-09,2025-09-12
max,2025-12-03,2025-12-05,2025-12-08,2025-12-12


In [33]:
# Count of rows with a negative blood_input_ml.
df["blood_input_ml"].lt(0).sum()


np.int64(0)

In [34]:
# Count of rows with a negative egg_output_total.
df["egg_output_total"].lt(0).sum()


np.int64(0)

In [35]:
# Count of rows whose time_between_bf_hrs is zero or negative.
df["time_between_bf_hrs"].le(0).sum()


np.int64(0)

In [36]:
# egg_output_total divided by blood_input_ml. A zero denominator is swapped
# for NaN first, so the ratio is NaN rather than inf.
df["efficiency"] = df["egg_output_total"].div(
    df["blood_input_ml"].replace(0, np.nan)
)

# egg_output_total divided by time_between_bf_hrs, with the same zero guard.
df["eggs_per_hour"] = df["egg_output_total"].div(
    df["time_between_bf_hrs"].replace(0, np.nan)
)


In [37]:
# Turn any remaining +/-inf anywhere in the frame into NaN.
df = df.replace([np.inf, -np.inf], np.nan)


In [38]:
# Summary statistics, including the 5th and 95th percentiles, for the three
# source columns and the two derived ratios.
df[[
    "blood_input_ml",
    "egg_output_total",
    "time_between_bf_hrs",
    "efficiency",
    "eggs_per_hour"
]].describe(percentiles=[0.05, 0.25, 0.5, 0.75, 0.95])


Unnamed: 0,blood_input_ml,egg_output_total,time_between_bf_hrs,efficiency,eggs_per_hour
count,158.0,158.0,158.0,158.0,158.0
mean,2458.734177,1783843.0,48.0,722.576984,37163.396624
std,470.60164,660746.6,0.0,265.5368,13765.554613
min,240.0,9700.0,48.0,40.416667,202.083333
5%,2080.0,782050.0,48.0,351.927885,16292.708333
25%,2400.0,1410250.0,48.0,550.3125,29380.208333
50%,2400.0,1793500.0,48.0,696.25,37364.583333
75%,2400.0,2128000.0,48.0,860.416667,44333.333333
95%,3260.0,2783500.0,48.0,1159.791667,57989.583333
max,3600.0,4355000.0,48.0,1814.583333,90729.166667


In [39]:
# (rows, columns) after cleaning; efficiency and eggs_per_hour were added above.
df.shape

(158, 25)

In [40]:
# First five rows of the cleaned frame, for a visual check.
df.head()

Unnamed: 0,line_id,trial_id,colony_strength,BF1,BF2,BF3,BF4,bf1_amounts_ml,bf2_amounts_ml,bf3_amounts_ml,...,collection_1_output,collection_2_output,collection_3_output,collection_4_output,time_between_bf_hrs,blood_input_ml,egg_output_total,location,efficiency,eggs_per_hour
0,LAO BRO 20.2.1,20.2.1,80000,01-09-2025\n02-09-2025,04-09-2025\n05-09-2025,08-09-2025\n09-09-2025,11-09-2025\n12-09-2025,600,600,600,...,230500,415000,202000,218000,48,2400,1065500,22C,443.958333,22197.916667
1,LAO BRO 20.2.2,20.2.2,80000,01-09-2025\n02-09-2025,04-09-2025\n05-09-2025,08-09-2025\n09-09-2025,11-09-2025\n12-09-2025,600,600,600,...,273000,302000,187000,393000,48,2400,1155000,22C,481.25,24062.5
2,LAO BRO 20.2.3,20.2.3,120000,01-09-2025\n02-09-2025,04-09-2025\n05-09-2025,08-09-2025\n09-09-2025,11-09-2025\n12-09-2025,600,600,600,...,680000,520000,367000,350000,48,2400,1917000,22A,798.75,39937.5
3,LAO BRO 20.2.4,20.2.4,120000,01-09-2025\n02-09-2025,04-09-2025\n05-09-2025,08-09-2025\n09-09-2025,11-09-2025\n12-09-2025,600,600,600,...,238000,390000,295000,213000,48,2400,1136000,18B,473.333333,23666.666667
4,LAO BRO 20.2.5,20.2.5,120000,01-09-2025\n02-09-2025,04-09-2025\n05-09-2025,08-09-2025\n09-09-2025,11-09-2025\n12-09-2025,600,600,600,...,610000,550000,390000,200000,48,2400,1750000,18B,729.166667,36458.333333


In [41]:
# Create the destination folder if it is missing; to_csv does not create
# directories and fails when the parent folder is absent.
Path(OUTPUT_PATH).parent.mkdir(parents=True, exist_ok=True)

# Order rows by index label, then write the frame without the index column.
df = df.sort_index()
df.to_csv(OUTPUT_PATH, index=False)

print(f"Cleaned dataset saved to: {OUTPUT_PATH}")


Cleaned dataset saved to: ../data/processed/bf_efficiency_clean.csv
