Section 1 - Generate Dataset


In [None]:
import pandas as pd
import numpy as np

# ---------------------------------------------------------------------------
# Synthetic column generation.
#
# Every draw below consumes NumPy's global random stream, which is seeded
# once here. The statements are therefore kept in a fixed order: moving any
# draw would change the values of every column generated after it.
# ---------------------------------------------------------------------------
np.random.seed(42)

# One row per cow.
num_rows = 90000


def _rounded_uniform(low, high, decimals):
    """Return ``num_rows`` uniform draws from [low, high), rounded to ``decimals`` places."""
    return np.round(np.random.uniform(low, high, size=num_rows), decimals)


# Identifiers: COW_00001 ... COW_<num_rows>, zero-padded to five digits.
cow_ids = list(map("COW_{:05d}".format, range(1, num_rows + 1)))

# Categorical vocabularies.
groups = ["Fresh", "Late Lactation", "Dry", "Heifer"]
health_statuses = ["Healthy", "Mastitis", "Lameness", "Metabolic Disorder"]

# Categorical draws. Groups are equiprobable; health status and treatment
# flag use explicit probabilities aligned position-by-position with the labels.
cow_groups = np.random.choice(groups, size=num_rows)
health_status = np.random.choice(
    health_statuses,
    size=num_rows,
    p=[0.85, 0.07, 0.05, 0.03],
)
treatment_flag = np.random.choice(["Yes", "No"], size=num_rows, p=[0.2, 0.8])

# Animal-level numeric attributes.
# randint's upper bound is exclusive, so stages span 1..304.
lactation_stage = np.random.randint(1, 305, size=num_rows)
body_condition_score = _rounded_uniform(2.0, 4.5, 1)

# Ration components.
corn_silage = _rounded_uniform(10, 30, 2)
alfalfa = _rounded_uniform(5, 15, 2)
soymeal = _rounded_uniform(1, 5, 2)
minerals = _rounded_uniform(0.1, 0.5, 2)

# Feed cost: per-ingredient weighted sum plus a uniform noise term. The noise
# is drawn here (after the ration, before the milk columns) and added last.
feed_cost_noise = np.random.uniform(0.5, 2.0, size=num_rows)
feed_cost = np.round(
    corn_silage * 0.2
    + alfalfa * 0.15
    + soymeal * 0.3
    + minerals * 0.5
    + feed_cost_noise,
    2,
)

# Milk production and quality.
milk_yield = _rounded_uniform(15, 45, 2)
milk_fat = _rounded_uniform(3.0, 4.5, 2)
milk_protein = _rounded_uniform(2.8, 3.6, 2)
somatic_cell_count = np.random.randint(50000, 400000, size=num_rows)

# Weather and soil readings.
temperature = _rounded_uniform(30, 95, 1)
humidity = _rounded_uniform(30, 90, 1)
precipitation = _rounded_uniform(0, 20, 1)
soil_moisture = _rounded_uniform(10, 40, 1)

# Operating costs and environmental measures.
vet_cost = _rounded_uniform(0.05, 5.0, 2)
fuel_cost = _rounded_uniform(0.5, 3.0, 2)
nitrogen_runoff = _rounded_uniform(0.1, 2.0, 2)
feed_waste = _rounded_uniform(0.1, 3.0, 2)

# Derived KPIs.
# Total ration divided by milk yield; milk_yield is drawn from [15, 45), so
# the divisor is never zero.
total_feed = corn_silage + alfalfa + soymeal + minerals
feed_to_yield_ratio = np.round(total_feed / milk_yield, 3)

# 1 for healthy animals, 0 for any other status.
is_healthy = health_status == "Healthy"
health_maintenance_index = np.where(is_healthy, 1, 0)

# Drawn independently of the other columns.
sustainability_index = _rounded_uniform(65, 95, 2)

# Assemble the output table from the generated arrays. Columns are grouped by
# theme; the groups are merged in order, so the resulting column order is the
# order in which the keys appear below.
animal_columns = {
    "Cow_ID": cow_ids,
    "Group": cow_groups,
    "Lactation_Stage": lactation_stage,
    "Body_Condition_Score": body_condition_score,
    "Health_Status": health_status,
    "Treatment_Flag": treatment_flag,
}
feed_columns = {
    "Corn_Silage_kg": corn_silage,
    "Alfalfa_kg": alfalfa,
    "Soymeal_kg": soymeal,
    "Minerals_kg": minerals,
    "Feed_Cost_USD": feed_cost,
}
milk_columns = {
    "Milk_Yield_Liters": milk_yield,
    "Milk_Fat_%": milk_fat,
    "Milk_Protein_%": milk_protein,
    "Somatic_Cell_Count": somatic_cell_count,
}
environment_columns = {
    "Temperature_F": temperature,
    "Humidity_%": humidity,
    "Precipitation_mm": precipitation,
    "Soil_Moisture_%": soil_moisture,
}
operations_columns = {
    "Veterinary_Cost_USD": vet_cost,
    "Fuel_Cost_USD": fuel_cost,
    "Nitrogen_Runoff_kg": nitrogen_runoff,
    "Feed_Waste_kg": feed_waste,
}
kpi_columns = {
    "Feed_to_Yield_Ratio": feed_to_yield_ratio,
    "Health_Maintenance_Index": health_maintenance_index,
    "Sustainability_Index": sustainability_index,
}

data = pd.DataFrame(
    {
        **animal_columns,
        **feed_columns,
        **milk_columns,
        **environment_columns,
        **operations_columns,
        **kpi_columns,
    }
)

# Write the table to the working directory without the positional index and
# report the file name that was written.
output_path = "cedar_creek_dairy_dataset_generated.csv"
data.to_csv(output_path, index=False)
print(f"Synthetic dataset generated and saved as {output_path}")

Section 2 - Predictive Modeling Script

In [None]:
# Predictive Modeling for Milk Yield
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns

# Read the CSV written by the dataset-generation cell.
df = pd.read_csv("cedar_creek_dairy_dataset_generated.csv")

# The regression target is the milk yield in liters; rows lacking it cannot be
# used for training or evaluation, so they are dropped up front.
target = 'Milk_Yield_Liters'
df = df.dropna(subset=[target])

# Engineered columns. Each is added only when its source column is present,
# so the script still runs on a CSV that lacks one of them.
if 'Health_Status' in df.columns:
    # Binary health indicator: 1 = Healthy, 0 = any other status.
    df['Health_Flag'] = df['Health_Status'].eq('Healthy').astype(int)

if 'Temperature_F' in df.columns:
    # Fahrenheit -> Celsius: (F - 32) * 5 / 9.
    df['Temperature_C'] = df['Temperature_F'].sub(32).mul(5.0).div(9.0)

# Candidate predictors, in the order they should appear in the design matrix.
candidate_features = [
    'Corn_Silage_kg',
    'Alfalfa_kg',
    'Soymeal_kg',
    'Minerals_kg',
    'Body_Condition_Score',
    'Health_Flag',
    'Temperature_C',
    'Humidity_%',
    'Precipitation_mm',
]
# Keep only the candidates actually available in the frame (order preserved).
available_columns = df.columns
features = [name for name in candidate_features if name in available_columns]

# Design matrix and target vector.
y = df[target]
X = df[features]

# Impute any missing feature value with that column's mean.
column_means = X.mean()
X = X.fillna(column_means)

# Train-test split: hold out 20% of the rows for evaluation. The fixed
# random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Model training: random forest with 100 trees, seeded for reproducibility.
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Predictions & evaluation on the held-out rows.
y_pred = model.predict(X_test)
# RMSE is the square root of the mean squared error. It is computed from the
# already-imported mean_squared_error: `root_mean_squared_error` is not imported
# in this script and does not accept a `squared` keyword, and the `squared`
# keyword of mean_squared_error is not available in newer scikit-learn
# releases, so this form works regardless of the installed version.
rmse = mean_squared_error(y_test, y_pred) ** 0.5
r2 = r2_score(y_test, y_pred)

print("Model Performance:")
print(f"RMSE: {rmse:.2f}")
print(f"R²: {r2:.3f}")

# Importance score of each feature as reported by the fitted model; the scores
# are in the same order as the `features` list used to build X.
importances = model.feature_importances_

# Tabulate feature names against their scores, most important first.
feature_importance_df = pd.DataFrame({'Feature': features, 'Importance': importances})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

# Horizontal bar chart: one bar per feature, bar length = importance.
plt.figure(figsize=(8, 5))
sns.barplot(
    data=feature_importance_df,
    x='Importance',
    y='Feature',
)
plt.title("Feature Importance for Milk Yield Prediction")
plt.tight_layout()
plt.show()