In [None]:
import pandas as pd

# Location of the processed efficiency table, relative to the notebook's
# working directory.
CLEAN_DATA_PATH = "../data/processed/bf_efficiency_clean.csv"

# Read the CSV into the frame that every later cell works from.
df = pd.read_csv(CLEAN_DATA_PATH)

# Show the first rows as a quick sanity check on the load.
df.head()


In [None]:
# pandas' default summary statistics for the loaded frame, kept under a name
# and then displayed as the cell's output.
summary_stats = df.describe()
summary_stats

In [None]:
import matplotlib.pyplot as plt

# Histogram of the `efficiency` column using 20 bins.
fig, ax = plt.subplots()
ax.hist(df["efficiency"], bins=20)
ax.set_xlabel("Eggs per ml of blood")
ax.set_ylabel("Number of trials")
ax.set_title("Distribution of Egg Production Efficiency")
plt.show()

In [None]:
# Scatter of total egg output against blood input, one point per row of `df`.
fig, ax = plt.subplots()
ax.scatter(df["blood_input_ml"], df["egg_output_total"])
ax.set(
    xlabel="Blood Input (ml)",
    ylabel="Egg Output",
    title="Blood Input vs Egg Output",
)
plt.show()


In [None]:
# Mean efficiency for each `line_id`, displayed from highest to lowest.
mean_efficiency_by_line = df.groupby("line_id")["efficiency"].mean()
mean_efficiency_by_line.sort_values(ascending=False)


In [None]:
import matplotlib.pyplot as plt

# Scatter of efficiency against blood input across every row of `df`.
fig, ax = plt.subplots()
ax.scatter(df["blood_input_ml"], df["efficiency"])
ax.set_xlabel("Blood input (ml)")
ax.set_ylabel("Efficiency")
ax.set_title("Efficiency vs Input (All Trials)")
plt.show()


In [None]:
import os
# Make sure the output directory exists before the figure is written below.
os.makedirs("../reports/figures", exist_ok=True)

# Width of each blood-input bucket, in the same unit as `blood_input_ml`.
BUCKET_WIDTH_ML = 500

# Floor each row's blood input to the lower edge of its bucket
# (e.g. 1234 -> 1000), then compute, per bucket, the mean efficiency and the
# number of non-null efficiency values.
bucket_df = (
    df.assign(input_bucket=(df["blood_input_ml"] // BUCKET_WIDTH_ML) * BUCKET_WIDTH_ML)
      .groupby("input_bucket")
      .agg(
          avg_efficiency=("efficiency", "mean"),
          n_trials=("efficiency", "count"),
      )
      .reset_index()
)

plt.figure()
plt.plot(bucket_df["input_bucket"], bucket_df["avg_efficiency"], marker="o")
# The buckets are derived from `blood_input_ml`, so the axis unit is ml
# (the label previously said "g", which contradicted the other plots).
plt.xlabel("Input bucket (ml)")
plt.ylabel("Average efficiency")
plt.title("Average Efficiency by Input Bucket")
# Save before show(): the figure may be cleared once it has been displayed.
plt.savefig("../reports/figures/efficiency_by_bucket.png", dpi=300, bbox_inches="tight")
plt.show()

In [None]:
# Bar chart of how many efficiency values fall into each input bucket.
plt.figure(figsize=(10, 6))
plt.bar(
    # Bucket edges are cast to strings so matplotlib treats them as
    # categories and draws one evenly spaced bar per bucket.
    bucket_df["input_bucket"].astype(str),
    bucket_df["n_trials"],
)
# The buckets are derived from `blood_input_ml`, so the axis unit is ml
# (the label previously said "g", which contradicted the other plots).
plt.xlabel("Input bucket (ml)")
plt.ylabel("Number of trials")
plt.title("Trial Count per Input Bucket")
plt.show()

In [None]:
# Quick structural check of the bucketed table: shape, first rows, dtypes.
print(bucket_df.shape)
# Only a cell's final expression is rendered automatically, so a bare
# `bucket_df.head()` in the middle of the cell would be silently discarded.
# `display` (provided by the IPython kernel) renders it explicitly.
display(bucket_df.head())
bucket_df.dtypes

In [None]:
import numpy as np
# Make sure the figure directory is present before saving into it.
os.makedirs("../reports/figures", exist_ok=True)

x = df["blood_input_ml"]
y = df["efficiency"]

# Least-squares degree-2 polynomial fit of efficiency on blood input;
# `p` is the callable polynomial built from the fitted coefficients `z`.
z = np.polyfit(x, y, deg=2)
p = np.poly1d(z)

# Evaluate the fit at the inputs in ascending order so the curve is drawn
# left to right rather than zig-zagging in row order.
x_sorted = np.sort(x)

fig, ax = plt.subplots()
ax.scatter(x, y, alpha=0.3)
ax.plot(x_sorted, p(x_sorted))
ax.set_xlabel("Blood input (ml)")
ax.set_ylabel("Efficiency")
ax.set_title("Efficiency vs Input with Smoothed Trend")
fig.savefig("../reports/figures/efficiency_vs_input_smooth_trend.png", dpi=300, bbox_inches="tight")
plt.show()
