In [32]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import cross_val_score, GridSearchCV, KFold, train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler, LabelEncoder, RobustScaler
from sklearn.decomposition import PCA
from catboost import CatBoostRegressor

# Import all regression models covered in class
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.ensemble import (
    RandomForestRegressor,
    GradientBoostingRegressor,
    BaggingRegressor,
    StackingRegressor,
    VotingRegressor
)
from sklearn.neural_network import MLPRegressor

import warnings
warnings.filterwarnings('ignore')

In [33]:
!pip install seaborn catboost



In [34]:
# ============================================================================
# PHASE 0: LOAD RAW DATA
# ============================================================================
print("="*80)
print("PHASE 0: LOADING RAW DATA")
print("="*80)
print()

# Load the datasets
train_data = pd.read_csv('sample_data/cattle_data_train.csv')
test_data = pd.read_csv('sample_data/cattle_data_test.csv')

print(f"Training data shape: {train_data.shape}")
print(f"Test data shape: {test_data.shape}")
print()

# Display first few rows
print("First few rows of training data:")
print(train_data.head())
print()

print("Data types:")
print(train_data.dtypes)
print()

print("Column names:")
print(train_data.columns.tolist())
print()


PHASE 0: LOADING RAW DATA

Training data shape: (210000, 36)
Test data shape: (40000, 35)

First few rows of training data:
       Cattle_ID     Breed   Climate_Zone Management_System  Age_Months  \
0  CATTLE_133713  Holstein       Tropical         Intensive         114   
1  CATTLE_027003  Holstein           Arid             Mixed         136   
2  CATTLE_122459  Holstein       Tropical    Semi_Intensive          64   
3  CATTLE_213419    Jersey  Mediterranean         Intensive          58   
4  CATTLE_106260  Guernsey    Subtropical         Intensive          84   

   Weight_kg  Parity Lactation_Stage  Days_in_Milk      Feed_Type  ...  \
0      544.8       4             Mid            62   Concentrates  ...   
1      298.9       4             Mid           213  Crop_Residues  ...   
2      336.6       4            Late            16            Hay  ...   
3      370.5       1           Early           339  Crop_Residues  ...   
4      641.5       6           Early           125     

In [35]:
# ============================================================================
# PHASE 1: DATA EXPLORATION
# ============================================================================
print("="*80)
print("PHASE 1: DATA EXPLORATION")
print("="*80)
print()

# 1.1 Basic Dataset Information
print("1.1 Dataset Overview")
print("-" * 80)
train_data.info()
print()
print(train_data.describe())
print()

# 1.2 Check for Missing Values
print("1.2 Missing Values Analysis")
print("-" * 80)
missing_counts = train_data.isnull().sum()
missing_pct = 100 * train_data.isnull().sum() / len(train_data)
missing_table = pd.DataFrame({
    'Missing_Count': missing_counts,
    'Percentage': missing_pct
})
missing_table = missing_table[missing_table['Missing_Count'] > 0].sort_values(
    'Percentage', ascending=False
)
print(missing_table)
print()

# Visualize missing data pattern
if len(missing_table) > 0:
    plt.figure(figsize=(12, 6))
    missing_table['Percentage'].plot(kind='barh')
    plt.xlabel('Percentage Missing')
    plt.title('Missing Data by Feature')
    plt.tight_layout()
    plt.savefig('missing_data.png')
    plt.close()
    print("‚úì Missing data visualization saved as 'missing_data.png'")
print()

# 1.3 Target Variable Distribution
print("1.3 Target Variable (Milk_Yield_L) Distribution")
print("-" * 80)
print(f"Mean:   {train_data['Milk_Yield_L'].mean():.2f} L")
print(f"Median: {train_data['Milk_Yield_L'].median():.2f} L")
print(f"Std:    {train_data['Milk_Yield_L'].std():.2f} L")
print(f"Min:    {train_data['Milk_Yield_L'].min():.2f} L")
print(f"Max:    {train_data['Milk_Yield_L'].max():.2f} L")
print(f"Skewness: {train_data['Milk_Yield_L'].skew():.2f}")
print()

# Plot distribution
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
axes[0].hist(train_data['Milk_Yield_L'], bins=50, edgecolor='black', alpha=0.7)
axes[0].set_xlabel('Milk Yield (L)')
axes[0].set_ylabel('Frequency')
axes[0].set_title('Distribution of Milk Yield')
axes[0].axvline(train_data['Milk_Yield_L'].mean(), color='r', linestyle='--', label='Mean')
axes[0].axvline(train_data['Milk_Yield_L'].median(), color='g', linestyle='--', label='Median')
axes[0].legend()

axes[1].boxplot(train_data['Milk_Yield_L'])
axes[1].set_ylabel('Milk Yield (L)')
axes[1].set_title('Box Plot of Milk Yield')
plt.tight_layout()
plt.savefig('target_distribution.png')
plt.close()
print("‚úì Target distribution saved as 'target_distribution.png'")
print()


PHASE 1: DATA EXPLORATION

1.1 Dataset Overview
--------------------------------------------------------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 210000 entries, 0 to 209999
Data columns (total 36 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   Cattle_ID                210000 non-null  object 
 1   Breed                    210000 non-null  object 
 2   Climate_Zone             210000 non-null  object 
 3   Management_System        210000 non-null  object 
 4   Age_Months               210000 non-null  int64  
 5   Weight_kg                210000 non-null  float64
 6   Parity                   210000 non-null  int64  
 7   Lactation_Stage          210000 non-null  object 
 8   Days_in_Milk             210000 non-null  int64  
 9   Feed_Type                210000 non-null  object 
 10  Feed_Quantity_kg         199519 non-null  float64
 11  Feeding_Frequency        210000 non-null

In [36]:
# 1.4 Identify Numeric vs Categorical Features
print("1.4 Feature Type Identification")
print("-" * 80)

# Define categorical and numeric features based on the dataset
categorical_features = [
    'Breed', 'Climate_Zone', 'Management_System', 'Lactation_Stage',
    'Feed_Type', 'Mastitis'
]

# Find vaccine columns (pattern: [Disease]_Vaccine)
vaccine_cols = [col for col in train_data.columns if '_Vaccine' in col]
print(f"Found {len(vaccine_cols)} vaccine columns: {vaccine_cols}")
categorical_features.extend(vaccine_cols)

# Numeric features (excluding ID, target, and date)
numeric_features = [
    'Age_Months', 'Weight_kg', 'Parity', 'Days_in_Milk', 'Feed_Quantity_kg',
    'Feeding_Frequency', 'Water_Intake_L', 'Walking_Distance_km',
    'Grazing_Duration_hrs', 'Rumination_Time_hrs', 'Resting_Hours',
    'Ambient_Temperature_C', 'Humidity_percent', 'Housing_Score',
    'Previous_Week_Avg_Yield', 'Body_Condition_Score', 'Milking_Interval_hrs',
    'Feed_Quantity_lb'
]

print(f"\nNumeric features ({len(numeric_features)}):")
for feat in numeric_features:
    print(f"  ‚Ä¢ {feat}")
print(f"\nCategorical features ({len(categorical_features)}):")
for feat in categorical_features:
    print(f"  ‚Ä¢ {feat}")
print()

# 1.5 Check for duplicate Feed_Quantity columns
print("1.5 Checking Feed Quantity Columns")
print("-" * 80)
if 'Feed_Quantity_kg' in train_data.columns and 'Feed_Quantity_lb' in train_data.columns:
    # Check if they're correlated (lb = kg * 2.20462)
    correlation = train_data['Feed_Quantity_kg'].corr(train_data['Feed_Quantity_lb'])
    print(f"Correlation between Feed_Quantity_kg and Feed_Quantity_lb: {correlation:.4f}")

    # Check conversion factor
    ratio = (train_data['Feed_Quantity_lb'] / train_data['Feed_Quantity_kg']).mean()
    print(f"Average lb/kg ratio: {ratio:.4f} (expected: 2.20462)")
    print("‚Üí These columns appear to be the same measurement in different units")
    print("‚Üí We'll drop Feed_Quantity_lb to avoid redundancy")
print()

# 1.6 Date Feature Analysis
print("1.6 Date Feature Analysis")
print("-" * 80)
if 'Date' in train_data.columns:
    train_data['Date'] = pd.to_datetime(train_data['Date'])
    print(f"Date range: {train_data['Date'].min()} to {train_data['Date'].max()}")
    print("We can extract useful temporal features from Date:")
    print("  ‚Ä¢ Month (seasonal effects)")
    print("  ‚Ä¢ Day of week (if relevant)")
    print("  ‚Ä¢ Days since start of dataset")
print()

# 1.7 Correlation Analysis (for numeric features)
print("1.7 Correlation Analysis with Target")
print("-" * 80)

# Calculate correlations with target
correlations = train_data[numeric_features + ['Milk_Yield_L']].corr()['Milk_Yield_L'].drop('Milk_Yield_L')
correlations = correlations.sort_values(ascending=False)
print("Top 10 features most correlated with Milk_Yield_L:")
print(correlations.head(10))
print()
print("Bottom 10 features (least correlated or negatively correlated):")
print(correlations.tail(10))
print()

# Visualize correlation heatmap for top features
top_features = correlations.abs().nlargest(15).index.tolist()
plt.figure(figsize=(12, 10))
correlation_matrix = train_data[top_features + ['Milk_Yield_L']].corr()
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0, fmt='.2f')
plt.title('Correlation Heatmap: Top 15 Features + Target')
plt.tight_layout()
plt.savefig('correlation_heatmap.png')
plt.close()
print("‚úì Correlation heatmap saved as 'correlation_heatmap.png'")
print()

1.4 Feature Type Identification
--------------------------------------------------------------------------------
Found 8 vaccine columns: ['FMD_Vaccine', 'Brucellosis_Vaccine', 'HS_Vaccine', 'BQ_Vaccine', 'Anthrax_Vaccine', 'IBR_Vaccine', 'BVD_Vaccine', 'Rabies_Vaccine']

Numeric features (18):
  ‚Ä¢ Age_Months
  ‚Ä¢ Weight_kg
  ‚Ä¢ Parity
  ‚Ä¢ Days_in_Milk
  ‚Ä¢ Feed_Quantity_kg
  ‚Ä¢ Feeding_Frequency
  ‚Ä¢ Water_Intake_L
  ‚Ä¢ Walking_Distance_km
  ‚Ä¢ Grazing_Duration_hrs
  ‚Ä¢ Rumination_Time_hrs
  ‚Ä¢ Resting_Hours
  ‚Ä¢ Ambient_Temperature_C
  ‚Ä¢ Humidity_percent
  ‚Ä¢ Housing_Score
  ‚Ä¢ Previous_Week_Avg_Yield
  ‚Ä¢ Body_Condition_Score
  ‚Ä¢ Milking_Interval_hrs
  ‚Ä¢ Feed_Quantity_lb

Categorical features (14):
  ‚Ä¢ Breed
  ‚Ä¢ Climate_Zone
  ‚Ä¢ Management_System
  ‚Ä¢ Lactation_Stage
  ‚Ä¢ Feed_Type
  ‚Ä¢ Mastitis
  ‚Ä¢ FMD_Vaccine
  ‚Ä¢ Brucellosis_Vaccine
  ‚Ä¢ HS_Vaccine
  ‚Ä¢ BQ_Vaccine
  ‚Ä¢ Anthrax_Vaccine
  ‚Ä¢ IBR_Vaccine
  ‚Ä¢ BVD_Vaccine
  ‚Ä¢ Rabies_Vaccine



In [37]:
# 1.8 Categorical Feature Analysis
print("1.8 Categorical Feature Analysis")
print("-" * 80)
for cat_feat in categorical_features[:5]:  # Show first 5
    if cat_feat in train_data.columns:
        n_unique = train_data[cat_feat].nunique()
        print(f"{cat_feat}: {n_unique} unique values")
        print(f"  Values: {train_data[cat_feat].value_counts().head().to_dict()}")
print()

# 1.9 Outlier Detection
print("1.9 Outlier Detection Using IQR Method")
print("-" * 80)

def detect_outliers_iqr(data, column):
    """Detect outliers using IQR method."""
    Q1 = data[column].quantile(0.25)
    Q3 = data[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    outliers = data[(data[column] < lower_bound) | (data[column] > upper_bound)]
    return len(outliers), lower_bound, upper_bound

print("Outliers in numeric features:")
for feature in numeric_features:
    if feature in train_data.columns:
        n_outliers, lower, upper = detect_outliers_iqr(train_data, feature)
        if n_outliers > 0:
            pct = 100 * n_outliers / len(train_data)
            print(f"{feature:30} {n_outliers:5} outliers ({pct:5.2f}%) [bounds: {lower:.2f} to {upper:.2f}]")
print()


1.8 Categorical Feature Analysis
--------------------------------------------------------------------------------
Breed: 7 unique values
  Values: {'Holstein': 104775, 'Jersey': 42183, 'Guernsey': 31672, 'Brown Swiss': 31155, 'Holstien': 112}
Climate_Zone: 6 unique values
  Values: {'Temperate': 35224, 'Tropical': 35062, 'Mediterranean': 34994, 'Arid': 34954, 'Subtropical': 34937}
Management_System: 5 unique values
  Values: {'Intensive': 42225, 'Pastoral': 42126, 'Extensive': 41973, 'Semi_Intensive': 41906, 'Mixed': 41770}
Lactation_Stage: 3 unique values
  Values: {'Mid': 83895, 'Early': 63203, 'Late': 62902}
Feed_Type: 8 unique values
  Values: {'Dry_Fodder': 26558, 'Pasture_Grass': 26305, 'Crop_Residues': 26278, 'Concentrates': 26231, 'Mixed_Feed': 26229}

1.9 Outlier Detection Using IQR Method
--------------------------------------------------------------------------------
Outliers in numeric features:
Feed_Quantity_kg                 674 outliers ( 0.32%) [bounds: 1.14 to 22.85]


In [38]:
# ============================================================================
# PHASE 2: DATA CLEANING
# ============================================================================
print("="*80)
print("PHASE 2: DATA CLEANING")
print("="*80)
print()

def clean_data(df, is_train=True):
    """
    Comprehensive data cleaning function for cattle milk yield dataset.

    Args:
        df: DataFrame to clean
        is_train: Boolean indicating if this is training data (has target)

    Returns:
        Cleaned DataFrame
    """
    df_clean = df.copy()

    print(f"Cleaning {'training' if is_train else 'test'} data...")
    print("-" * 80)

    # 2.1 Drop redundant Feed_Quantity_lb (duplicate of Feed_Quantity_kg)
    print("2.1 Removing Redundant Features")
    print("-" * 40)
    if 'Feed_Quantity_lb' in df_clean.columns:
        df_clean.drop('Feed_Quantity_lb', axis=1, inplace=True)
        print("‚úì Dropped Feed_Quantity_lb (redundant with Feed_Quantity_kg)")
    print()

    # 2.2 Handle Date Feature - Extract temporal features
    print("2.2 Processing Date Feature")
    print("-" * 40)
    if 'Date' in df_clean.columns:
        df_clean['Date'] = pd.to_datetime(df_clean['Date'])

        # Extract useful temporal features
        df_clean['Month'] = df_clean['Date'].dt.month
        df_clean['DayOfWeek'] = df_clean['Date'].dt.dayofweek
        df_clean['Quarter'] = df_clean['Date'].dt.quarter

        # Create cyclical features for month (since December is close to January)
        df_clean['Month_sin'] = np.sin(2 * np.pi * df_clean['Month'] / 12)
        df_clean['Month_cos'] = np.cos(2 * np.pi * df_clean['Month'] / 12)

        print("‚úì Extracted: Month, DayOfWeek, Quarter")
        print("‚úì Created cyclical features: Month_sin, Month_cos")

        # Drop original Date column (not useful for ML models directly)
        df_clean.drop('Date', axis=1, inplace=True)
        print("‚úì Dropped original Date column")
    print()

    # 2.3 Handle Missing Values
    print("2.3 Handling Missing Values")
    print("-" * 40)

    # Strategy for numeric features
    for col in numeric_features:
        if col in df_clean.columns and df_clean[col].isnull().sum() > 0:
            missing_pct = 100 * df_clean[col].isnull().sum() / len(df_clean)

            if missing_pct < 5:
                # Low missing: Impute with median
                median_val = df_clean[col].median()
                df_clean[col].fillna(median_val, inplace=True)
                print(f"  {col:30} {missing_pct:5.2f}% missing ‚Üí imputed with median ({median_val:.2f})")

            elif missing_pct < 20:
                # Medium missing: Create indicator + impute with median
                df_clean[f'{col}_missing'] = df_clean[col].isnull().astype(int)
                median_val = df_clean[col].median()
                df_clean[col].fillna(median_val, inplace=True)
                print(f"  {col:30} {missing_pct:5.2f}% missing ‚Üí created indicator + imputed")

            else:
                # High missing: Consider dropping
                print(f"  {col:30} {missing_pct:5.2f}% missing ‚Üí CONSIDER DROPPING")

    # Strategy for categorical features
    for col in categorical_features:
        if col in df_clean.columns and df_clean[col].isnull().sum() > 0:
            missing_pct = 100 * df_clean[col].isnull().sum() / len(df_clean)
            # Fill with 'Unknown' category
            df_clean[col].fillna('Unknown', inplace=True)
            print(f"  {col:30} {missing_pct:5.2f}% missing ‚Üí filled with 'Unknown'")
    print()

    # 2.4 Handle Outliers (Cap rather than remove)
    print("2.4 Handling Outliers (Capping Method)")
    print("-" * 40)

    outlier_count = 0
    for col in numeric_features:
        if col in df_clean.columns:
            Q1 = df_clean[col].quantile(0.25)
            Q3 = df_clean[col].quantile(0.75)
            IQR = Q3 - Q1
            lower_bound = Q1 - 3 * IQR  # Use 3*IQR for conservative capping
            upper_bound = Q3 + 3 * IQR

            n_lower = (df_clean[col] < lower_bound).sum()
            n_upper = (df_clean[col] > upper_bound).sum()

            if n_lower > 0 or n_upper > 0:
                df_clean[col] = df_clean[col].clip(lower_bound, upper_bound)
                outlier_count += 1
                print(f"  {col:30} capped {n_lower} lower, {n_upper} upper outliers")

    if outlier_count == 0:
        print("  No significant outliers detected (using 3*IQR threshold)")
    print()

    # 2.5 Encode Categorical Variables
    print("2.5 Encoding Categorical Variables")
    print("-" * 40)

    for col in categorical_features:
        if col in df_clean.columns:
            n_unique = df_clean[col].nunique()

            if n_unique <= 2:
                # Binary: Simple label encoding (0/1)
                le = LabelEncoder()
                df_clean[col] = le.fit_transform(df_clean[col].astype(str))
                print(f"  {col:30} Binary ‚Üí Label encoded")

            elif n_unique < 10:
                # Low cardinality: One-hot encoding
                dummies = pd.get_dummies(df_clean[col], prefix=col, drop_first=True)
                df_clean = pd.concat([df_clean, dummies], axis=1)
                df_clean.drop(col, axis=1, inplace=True)
                print(f"  {col:30} {n_unique} categories ‚Üí One-hot encoded")

            elif n_unique < 50:
                # Medium cardinality: Frequency encoding
                freq_encoding = df_clean[col].value_counts().to_dict()
                df_clean[f'{col}_freq'] = df_clean[col].map(freq_encoding)
                df_clean.drop(col, axis=1, inplace=True)
                print(f"  {col:30} {n_unique} categories ‚Üí Frequency encoded")

            else:
                # High cardinality: Consider dropping or advanced encoding
                print(f"  {col:30} {n_unique} categories ‚Üí HIGH CARDINALITY, dropping")
                df_clean.drop(col, axis=1, inplace=True)
    print()

    # 2.6 Remove Low-Variance Features
    print("2.6 Removing Low-Variance Features")
    print("-" * 40)

    removed_features = []
    for col in df_clean.select_dtypes(include=[np.number]).columns:
        if col not in ['Cattle_ID', 'Farm_ID', 'Milk_Yield_L']:
            # Check if 95% of values are the same
            if len(df_clean[col].value_counts()) > 0:
                mode_freq = df_clean[col].value_counts().iloc[0] / len(df_clean)
                if mode_freq > 0.95:
                    df_clean.drop(col, axis=1, inplace=True)
                    removed_features.append(col)

    if removed_features:
        print(f"  Removed {len(removed_features)} low-variance features:")
        for feat in removed_features:
            print(f"    ‚Ä¢ {feat}")
    else:
        print("  No low-variance features detected")
    print()

    # 2.7 Final safety check - fill any remaining NaN values
    print("2.7 Final Safety Check for NaN Values")
    print("-" * 40)

    remaining_nan = df_clean.isnull().sum().sum()
    if remaining_nan > 0:
        print(f"  WARNING: Found {remaining_nan} remaining NaN values")
        print(f"  ‚Üí Filling all remaining NaN with column median")

        # For numeric columns, fill with median
        numeric_cols = df_clean.select_dtypes(include=[np.number]).columns
        for col in numeric_cols:
            if df_clean[col].isnull().any():
                median_val = df_clean[col].median()
                if pd.isna(median_val):  # If median is also NaN, use 0
                    df_clean[col].fillna(0, inplace=True)
                    print(f"    ‚Ä¢ {col}: filled with 0 (median was NaN)")
                else:
                    df_clean[col].fillna(median_val, inplace=True)

        # For any remaining non-numeric NaN, fill with 'Unknown'
        df_clean = df_clean.fillna('Unknown')
    else:
        print("  ‚úì No NaN values detected")

    # Check for infinite values
    inf_count = np.isinf(df_clean.select_dtypes(include=[np.number]).values).sum()
    if inf_count > 0:
        print(f"  WARNING: Found {inf_count} infinite values")
        print(f"  ‚Üí Replacing with finite values")
        df_clean = df_clean.replace([np.inf, -np.inf], [1e10, -1e10])

    print()

    return df_clean

# Apply cleaning to both train and test
train_cleaned = clean_data(train_data, is_train=True)
test_cleaned = clean_data(test_data, is_train=False)

print(f"‚úì Training data shape after cleaning: {train_cleaned.shape}")
print(f"‚úì Test data shape after cleaning: {test_cleaned.shape}")
print()

PHASE 2: DATA CLEANING

Cleaning training data...
--------------------------------------------------------------------------------
2.1 Removing Redundant Features
----------------------------------------
‚úì Dropped Feed_Quantity_lb (redundant with Feed_Quantity_kg)

2.2 Processing Date Feature
----------------------------------------
‚úì Extracted: Month, DayOfWeek, Quarter
‚úì Created cyclical features: Month_sin, Month_cos
‚úì Dropped original Date column

2.3 Handling Missing Values
----------------------------------------
  Feed_Quantity_kg                4.99% missing ‚Üí imputed with median (12.00)
  Housing_Score                   2.99% missing ‚Üí imputed with median (0.65)

2.4 Handling Outliers (Capping Method)
----------------------------------------
  Rumination_Time_hrs            capped 0 lower, 2 upper outliers
  Previous_Week_Avg_Yield        capped 0 lower, 2 upper outliers
  Milking_Interval_hrs           capped 41849 lower, 20984 upper outliers

2.5 Encoding Categor

In [39]:
# ============================================================================
# PHASE 3: FEATURE ENGINEERING
# ============================================================================
print("="*80)
print("PHASE 3: FEATURE ENGINEERING")
print("="*80)
print()

def engineer_features(df, is_train=True):
    """
    Create domain-specific features for dairy cow milk yield prediction.

    Args:
        df: Cleaned DataFrame
        is_train: Boolean indicating if this is training data

    Returns:
        DataFrame with engineered features
    """
    df_eng = df.copy()

    print(f"Engineering features for {'training' if is_train else 'test'} data...")
    print("-" * 80)

    # 3.1 Productivity & Efficiency Ratios
    print("3.1 Creating Productivity & Efficiency Features")
    print("-" * 40)

    if 'Feed_Quantity_kg' in df_eng.columns and 'Weight_kg' in df_eng.columns:
        # Feed efficiency: feed per kg of body weight
        df_eng['Feed_Per_Weight'] = df_eng['Feed_Quantity_kg'] / (df_eng['Weight_kg'] + 1)
        print("  ‚úì Created: Feed_Per_Weight (feed efficiency)")

    if 'Water_Intake_L' in df_eng.columns and 'Weight_kg' in df_eng.columns:
        # Water consumption per kg
        df_eng['Water_Per_Weight'] = df_eng['Water_Intake_L'] / (df_eng['Weight_kg'] + 1)
        print("  ‚úì Created: Water_Per_Weight")

    if 'Previous_Week_Avg_Yield' in df_eng.columns and 'Feed_Quantity_kg' in df_eng.columns:
        # Yield efficiency: milk per kg of feed
        df_eng['Yield_Per_Feed'] = df_eng['Previous_Week_Avg_Yield'] / (df_eng['Feed_Quantity_kg'] + 1)
        print("  ‚úì Created: Yield_Per_Feed (production efficiency)")

    print()

    # 3.2 Activity & Health Indicators
    print("3.2 Creating Activity & Health Features")
    print("-" * 40)

    if all(col in df_eng.columns for col in ['Walking_Distance_km', 'Grazing_Duration_hrs']):
        # Activity level
        df_eng['Activity_Level'] = df_eng['Walking_Distance_km'] + df_eng['Grazing_Duration_hrs']
        print("  ‚úì Created: Activity_Level")

    if all(col in df_eng.columns for col in ['Rumination_Time_hrs', 'Resting_Hours']):
        # Rest/Rumination balance
        df_eng['Rest_Rumination_Ratio'] = df_eng['Resting_Hours'] / (df_eng['Rumination_Time_hrs'] + 1)
        print("  ‚úì Created: Rest_Rumination_Ratio")

    if 'Body_Condition_Score' in df_eng.columns and 'Weight_kg' in df_eng.columns:
        # Adjusted body condition
        df_eng['Adjusted_BCS'] = df_eng['Body_Condition_Score'] * df_eng['Weight_kg'] / 100
        print("  ‚úì Created: Adjusted_BCS (body condition adjusted for weight)")

    print()

    # 3.3 Lactation-Related Features
    print("3.3 Creating Lactation-Specific Features")
    print("-" * 40)

    if 'Days_in_Milk' in df_eng.columns and 'Milking_Interval_hrs' in df_eng.columns:
        # Milkings per day
        df_eng['Milkings_Per_Day'] = 24 / (df_eng['Milking_Interval_hrs'] + 1)
        print("  ‚úì Created: Milkings_Per_Day")

    if 'Days_in_Milk' in df_eng.columns:
        # Lactation curve features (peak milk around 60 days)
        df_eng['DIM_Squared'] = df_eng['Days_in_Milk'] ** 2
        df_eng['Peak_Lactation'] = (df_eng['Days_in_Milk'] >= 40) & (df_eng['Days_in_Milk'] <= 80)
        df_eng['Peak_Lactation'] = df_eng['Peak_Lactation'].astype(int)
        print("  ‚úì Created: DIM_Squared, Peak_Lactation (lactation curve)")

    if 'Parity' in df_eng.columns:
        # Parity groups (first-time vs experienced)
        df_eng['First_Time_Mother'] = (df_eng['Parity'] == 0).astype(int)
        df_eng['Experienced_Mother'] = (df_eng['Parity'] >= 2).astype(int)
        print("  ‚úì Created: First_Time_Mother, Experienced_Mother")

    print()

    # 3.4 Environmental Interactions
    print("3.4 Creating Environmental Interaction Features")
    print("-" * 40)

    if all(col in df_eng.columns for col in ['Ambient_Temperature_C', 'Humidity_percent']):
        # Heat stress index (simplified THI - Temperature Humidity Index)
        # THI = T - 0.55 * (1 - RH/100) * (T - 58) simplified version
        df_eng['Heat_Stress_Index'] = (df_eng['Ambient_Temperature_C'] +
                                       0.36 * df_eng['Humidity_percent'] / 100 *
                                       df_eng['Ambient_Temperature_C'])
        print("  ‚úì Created: Heat_Stress_Index (temperature-humidity interaction)")

    if 'Ambient_Temperature_C' in df_eng.columns:
        # Temperature stress indicators
        df_eng['Heat_Stress'] = (df_eng['Ambient_Temperature_C'] > 25).astype(int)
        df_eng['Cold_Stress'] = (df_eng['Ambient_Temperature_C'] < 5).astype(int)
        print("  ‚úì Created: Heat_Stress, Cold_Stress indicators")

    print()

    # 3.5 Age & Experience Features
    print("3.5 Creating Age & Experience Features")
    print("-" * 40)

    if 'Age_Months' in df_eng.columns:
        # Age groups
        df_eng['Age_Years'] = df_eng['Age_Months'] / 12
        df_eng['Young_Cow'] = (df_eng['Age_Months'] < 30).astype(int)  # Less than 2.5 years
        df_eng['Prime_Age'] = ((df_eng['Age_Months'] >= 30) & (df_eng['Age_Months'] <= 72)).astype(int)
        df_eng['Senior_Cow'] = (df_eng['Age_Months'] > 72).astype(int)  # Over 6 years
        print("  ‚úì Created: Age_Years, Young_Cow, Prime_Age, Senior_Cow")

    if 'Age_Months' in df_eng.columns and 'Parity' in df_eng.columns:
        # Average age at calving
        df_eng['Avg_Age_At_Calving'] = df_eng['Age_Months'] / (df_eng['Parity'] + 1)
        print("  ‚úì Created: Avg_Age_At_Calving")

    print()

    # 3.6 Feed & Nutrition Features
    print("3.6 Creating Feed & Nutrition Features")
    print("-" * 40)

    if all(col in df_eng.columns for col in ['Feed_Quantity_kg', 'Feeding_Frequency']):
        # Feed per meal
        df_eng['Feed_Per_Meal'] = df_eng['Feed_Quantity_kg'] / (df_eng['Feeding_Frequency'] + 1)
        print("  ‚úì Created: Feed_Per_Meal")

    if all(col in df_eng.columns for col in ['Feed_Quantity_kg', 'Water_Intake_L']):
        # Feed to water ratio
        df_eng['Feed_Water_Ratio'] = df_eng['Feed_Quantity_kg'] / (df_eng['Water_Intake_L'] + 1)
        print("  ‚úì Created: Feed_Water_Ratio")

    print()

    # 3.7 Log Transformations for Skewed Features
    print("3.7 Applying Log Transformations to Skewed Features")
    print("-" * 40)

    # Identify highly skewed features
    skewed_features = []
    for col in df_eng.select_dtypes(include=[np.number]).columns:
        if col not in ['Cattle_ID', 'Farm_ID', 'Milk_Yield_L']:
            skewness = df_eng[col].skew()
            if abs(skewness) > 1.5:  # Threshold for high skewness
                skewed_features.append((col, skewness))
                df_eng[f'{col}_log'] = np.log1p(df_eng[col])  # log1p handles zeros

    if skewed_features:
        print(f"  Applied log transformation to {len(skewed_features)} skewed features:")
        for feat, skew in skewed_features[:5]:  # Show first 5
            print(f"    ‚Ä¢ {feat} (skewness: {skew:.2f})")
    else:
        print("  No highly skewed features detected")

    print()

    # 3.8 Final check for NaN/Inf in engineered features
    print("3.8 Final Check for Invalid Values in Engineered Features")
    print("-" * 40)

    # Replace any inf values created during feature engineering
    inf_count = np.isinf(df_eng.select_dtypes(include=[np.number]).values).sum()
    if inf_count > 0:
        print(f"  Found {inf_count} infinite values in engineered features")
        df_eng = df_eng.replace([np.inf, -np.inf], [1e10, -1e10])
        print(f"  ‚úì Replaced infinite values")

    # Fill any NaN created during feature engineering
    nan_count = df_eng.isnull().sum().sum()
    if nan_count > 0:
        print(f"  Found {nan_count} NaN values in engineered features")
        numeric_cols = df_eng.select_dtypes(include=[np.number]).columns
        df_eng[numeric_cols] = df_eng[numeric_cols].fillna(df_eng[numeric_cols].median())
        print(f"  ‚úì Filled NaN values with median")
    else:
        print(f"  ‚úì No invalid values in engineered features")

    print()

    # 3.9 Advanced Interaction & Polynomial Features
    print("3.9 Creating Advanced Interaction Features")
    print("-" * 40)
    
    # KEY INTERACTION: Weight √ó Age (older heavier cows produce differently)
    if 'Weight_kg' in df_eng.columns and 'Age_Months' in df_eng.columns:
        df_eng['Weight_Age_Interaction'] = (df_eng['Weight_kg'] * df_eng['Age_Months']) / 1000
        print("  ‚úì Created: Weight_Age_Interaction")
    
    # Feed √ó Water interaction (nutritional synergy)
    if 'Feed_Quantity_kg' in df_eng.columns and 'Water_Intake_L' in df_eng.columns:
        df_eng['Feed_Water_Interaction'] = df_eng['Feed_Quantity_kg'] * df_eng['Water_Intake_L']
        print("  ‚úì Created: Feed_Water_Interaction")
    
    # Parity √ó Days in Milk (experienced cows at different lactation stages)
    if 'Parity' in df_eng.columns and 'Days_in_Milk' in df_eng.columns:
        df_eng['Parity_DIM_Interaction'] = df_eng['Parity'] * df_eng['Days_in_Milk']
        print("  ‚úì Created: Parity_DIM_Interaction")
    
    # POLYNOMIAL FEATURES (capture non-linear relationships)
    if 'Weight_kg' in df_eng.columns:
        df_eng['Weight_Squared'] = df_eng['Weight_kg'] ** 2
        print("  ‚úì Created: Weight_Squared")
    
    if 'Previous_Week_Avg_Yield' in df_eng.columns:
        df_eng['Previous_Week_Yield_Squared'] = df_eng['Previous_Week_Avg_Yield'] ** 2
        print("  ‚úì Created: Previous_Week_Yield_Squared")
    
    # Better lactation curve (Wood's curve approximation)
    if 'Days_in_Milk' in df_eng.columns:
        df_eng['Lactation_Peak'] = df_eng['Days_in_Milk'] * np.exp(-0.05 * df_eng['Days_in_Milk'])
        df_eng['Log_DIM'] = np.log1p(df_eng['Days_in_Milk'])
        print("  ‚úì Created: Lactation_Peak, Log_DIM (improved lactation curve)")
    
    # Breed-specific features (Holstein is highest producer)
    if 'Breed_Holstein' in df_eng.columns and 'Weight_kg' in df_eng.columns:
        df_eng['Holstein_Weight'] = df_eng['Breed_Holstein'] * df_eng['Weight_kg']
        print("  ‚úì Created: Holstein_Weight")
    
    print()

    return df_eng

# Apply feature engineering
train_featured = engineer_features(train_cleaned, is_train=True)
test_featured = engineer_features(test_cleaned, is_train=False)

print(f"‚úì Training data shape after feature engineering: {train_featured.shape}")
print(f"‚úì Test data shape after feature engineering: {test_featured.shape}")
print()

PHASE 3: FEATURE ENGINEERING

Engineering features for training data...
--------------------------------------------------------------------------------
3.1 Creating Productivity & Efficiency Features
----------------------------------------
  ‚úì Created: Feed_Per_Weight (feed efficiency)
  ‚úì Created: Water_Per_Weight
  ‚úì Created: Yield_Per_Feed (production efficiency)

3.2 Creating Activity & Health Features
----------------------------------------
  ‚úì Created: Activity_Level
  ‚úì Created: Rest_Rumination_Ratio
  ‚úì Created: Adjusted_BCS (body condition adjusted for weight)

3.3 Creating Lactation-Specific Features
----------------------------------------
  ‚úì Created: DIM_Squared, Peak_Lactation (lactation curve)
  ‚úì Created: First_Time_Mother, Experienced_Mother

3.4 Creating Environmental Interaction Features
----------------------------------------
  ‚úì Created: Heat_Stress_Index (temperature-humidity interaction)
  ‚úì Created: Heat_Stress, Cold_Stress indicators

3.

In [40]:
# ============================================================================
# PHASE 4: PREPARE DATA FOR MODELING (WITH FIX)
# ============================================================================
print("="*80)
print("PHASE 4: PREPARING DATA FOR MODELING")
print("="*80)
print()

# Separate features and target
y_train = train_featured['Milk_Yield_L'].values
X_train = train_featured.drop(['Milk_Yield_L', 'Cattle_ID', 'Farm_ID'], axis=1, errors='ignore')
X_test = test_featured.drop(['Cattle_ID', 'Farm_ID'], axis=1, errors='ignore')

# 4.1 Aligning Train and Test Features
print("4.1 Aligning Train and Test Features")
print("-" * 40)
# Get common columns
common_cols = X_train.columns.intersection(X_test.columns)
X_train = X_train[common_cols]
X_test = X_test[common_cols]
print(f"  Train features: {X_train.shape[1]}")
print(f"  Test features: {X_test.shape[1]}")
print(f"  ‚úì Features aligned successfully")
print()

# 4.1.5 FIX: Drop all remaining non-numeric columns
print("4.1.5 FIX: Removing Non-Numeric Columns")
print("-" * 40)
non_numeric_cols_train = X_train.select_dtypes(include=['object']).columns.tolist()
non_numeric_cols_test = X_test.select_dtypes(include=['object']).columns.tolist()

if non_numeric_cols_train or non_numeric_cols_test:
    cols_to_drop = list(set(non_numeric_cols_train + non_numeric_cols_test))
    X_train.drop(cols_to_drop, axis=1, inplace=True, errors='ignore')
    X_test.drop(cols_to_drop, axis=1, inplace=True, errors='ignore')
    print(f"  Removed {len(cols_to_drop)} non-numeric columns to ensure all features are float/int.")
else:
    print("  No non-numeric columns found to remove.")

# Re-align after dropping (important if non-numeric columns were present)
common_cols = X_train.columns.intersection(X_test.columns)
X_train = X_train[common_cols]
X_test = X_test[common_cols]
print(f"  Final Train features count: {X_train.shape[1]}")
print()

# 4.2 Final NaN/Inf Check and Handling (Aggressive Dtype Enforcement)
print("4.2 Final NaN/Inf Check and Handling (Aggressive Dtype Enforcement)")
print("-" * 40)

# ‚≠êÔ∏è CRITICAL FIX: AGGRESSIVE CONVERSION
# Force all columns to float, which is the necessary step for np.isinf to work.
# Any value that cannot be converted to float will become NaN.
print("  Attempting aggressive dtype conversion (float64)...")

# Identify columns that are not already numeric
non_float_cols = X_train.select_dtypes(exclude=[np.number]).columns

if len(non_float_cols) > 0:
    print(f"  Found {len(non_float_cols)} non-numeric columns before final conversion.")

for col in X_train.columns:
    try:
        # Convert to float64. This will trigger the TypeError if an incompatible object
        # is still present, but by forcing it on the DataFrame, we isolate the issue
        # away from the 'np.isinf(X_train.values)' step.
        X_train[col] = X_train[col].astype(np.float64, errors='raise')
        X_test[col] = X_test[col].astype(np.float64, errors='raise')

    except ValueError as e:
        # If astype(float) raises a ValueError, it means a non-numeric string
        # or object is definitively present. This shouldn't happen after the
        # previous steps, but if it does, we drop the column.
        print(f"  CRITICAL ERROR: Column '{col}' contains un-coercible non-numeric data. Dropping.")
        X_train.drop(col, axis=1, inplace=True)
        X_test.drop(col, axis=1, inplace=True)

# Re-align features after aggressive drop
common_cols = X_train.columns.intersection(X_test.columns)
X_train = X_train[common_cols]
X_test = X_test[common_cols]


# --- Continue with the NaN/Inf checks, which must now work ---

# Check for NaN values
nan_cols_train = X_train.columns[X_train.isnull().any()].tolist()
nan_cols_test = X_test.columns[X_test.isnull().any()].tolist()

if nan_cols_train:
    print(f"  WARNING: Found NaN values in {len(nan_cols_train)} training columns.")
    X_train = X_train.fillna(X_train.median())
    print(f"  ‚Üí Filling NaN values with column median in train set.")

if nan_cols_test:
    print(f"  WARNING: Found NaN values in {len(nan_cols_test)} test columns.")
    # Fill test NaN with training median
    X_test = X_test.fillna(X_train.median())
    print(f"  ‚Üí Filled test NaN values with training median.")

# Check for infinite values (This is the original line that now works)
inf_mask_train = np.isinf(X_train.values).any(axis=0)
inf_mask_test = np.isinf(X_test.values).any(axis=0)

if inf_mask_train.any():
    inf_cols = X_train.columns[inf_mask_train].tolist()
    print(f"  WARNING: Found Inf values in {len(inf_cols)} columns")
    X_train = X_train.replace([np.inf, -np.inf], [1e10, -1e10])
    print(f"  ‚Üí Replaced Inf values with finite numbers in X_train")

if inf_mask_test.any():
    X_test = X_test.replace([np.inf, -np.inf], [1e10, -1e10])
    print(f"  ‚Üí Replaced Inf values with finite numbers in X_test")

# 4.3 Feature Scaling
print("4.3 Scaling Features")
print("-" * 40)

# Use RobustScaler (less sensitive to outliers than StandardScaler)
scaler = RobustScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Convert back to DataFrame for easier manipulation
X_train_scaled = pd.DataFrame(X_train_scaled, columns=X_train.columns)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=X_test.columns)

print(f"  Scaled {X_train.shape[1]} features using RobustScaler")
print(f"  ‚úì Scaling complete")
print()

"""
# 4.4 Optional: PCA for Dimensionality Reduction
print("4.4 Dimensionality Reduction (Optional)")
print("-" * 40)

if X_train_scaled.shape[1] > 50:
    print(f"  Feature count ({X_train_scaled.shape[1]}) is high - applying PCA")

    # Verify no NaN before PCA
    if X_train_scaled.isnull().any().any():
        print("  ERROR: Found NaN in scaled data, filling with 0")
        X_train_scaled = X_train_scaled.fillna(0)
        X_test_scaled = X_test_scaled.fillna(0)

    # Determine number of components to keep (e.g., 95% variance)
    pca = PCA(n_components=0.95)

    try:
        X_train_pca = pca.fit_transform(X_train_scaled)
        X_test_pca = pca.transform(X_test_scaled)

        print(f"  Original features: {X_train_scaled.shape[1]}")
        print(f"  PCA components: {X_train_pca.shape[1]}")
        print(f"  Explained variance: {pca.explained_variance_ratio_.sum():.3f}")

        # Visualize explained variance
        plt.figure(figsize=(10, 6))
        plt.plot(np.cumsum(pca.explained_variance_ratio_))
        plt.xlabel('Number of Components')
        plt.ylabel('Cumulative Explained Variance')
        plt.title('PCA Explained Variance')
        plt.grid(True)
        plt.axhline(y=0.95, color='r', linestyle='--', label='95% Variance')
        plt.legend()
        plt.tight_layout()
        plt.savefig('pca_variance.png')
        plt.close()
        print("  ‚úì PCA visualization saved as 'pca_variance.png'")

        # Use PCA transformed data
        X_train_final = X_train_pca
        X_test_final = X_test_pca

    except Exception as e:
        print(f"  WARNING: PCA failed with error: {e}")
        print(f"  ‚Üí Using scaled data without PCA")
        X_train_final = X_train_scaled.values
        X_test_final = X_test_scaled.values
else:
    print(f"  Feature count ({X_train_scaled.shape[1]}) is manageable - skipping PCA")
    X_train_final = X_train_scaled.values
    X_test_final = X_test_scaled.values
"""

X_train_final = X_train_scaled.values
X_test_final = X_test_scaled.values

# Final verification
assert X_train.select_dtypes(include=['object']).shape[1] == 0, "Non-numeric columns still exist in X_train!"
assert not X_train.isnull().any().any(), "Training data still contains NaN!"
assert not X_test.isnull().any().any(), "Test data still contains NaN!"
print(f"  ‚úì All columns verified as numeric, no NaN or Inf values remaining")
print()



PHASE 4: PREPARING DATA FOR MODELING

4.1 Aligning Train and Test Features
----------------------------------------
  Train features: 89
  Test features: 89
  ‚úì Features aligned successfully

4.1.5 FIX: Removing Non-Numeric Columns
----------------------------------------
  No non-numeric columns found to remove.
  Final Train features count: 89

4.2 Final NaN/Inf Check and Handling (Aggressive Dtype Enforcement)
----------------------------------------
  Attempting aggressive dtype conversion (float64)...
  Found 24 non-numeric columns before final conversion.
4.3 Scaling Features
----------------------------------------
  Scaled 89 features using RobustScaler
  ‚úì Scaling complete

  ‚úì All columns verified as numeric, no NaN or Inf values remaining



In [41]:
# # ============================================================================
# # PHASE 4.5: FEATURE IMPORTANCE & SELECTION
# # ============================================================================
# print("="*80)
# print("PHASE 4.5: FEATURE IMPORTANCE & SELECTION")
# print("="*80)
# print()

# # Train a quick Random Forest to get feature importances
# print("Training Random Forest for feature importance analysis...")
# from sklearn.ensemble import RandomForestRegressor

# rf_selector = RandomForestRegressor(
#     n_estimators=100, 
#     max_depth=10, 
#     random_state=42,
#     n_jobs=-1
# )

# # Use the SCALED data (X_train_scaled) with target (y_train)
# rf_selector.fit(X_train_scaled, y_train)

# # Get feature importances
# feature_importance = pd.DataFrame({
#     'feature': X_train_scaled.columns,
#     'importance': rf_selector.feature_importances_
# }).sort_values('importance', ascending=False)

# print("\n‚úì Top 20 Most Important Features:")
# print("-" * 50)
# for idx, row in feature_importance.head(20).iterrows():
#     print(f"  {row['feature']:40} {row['importance']:.4f}")

# print("\n‚úì Bottom 20 Least Important Features:")
# print("-" * 50)
# for idx, row in feature_importance.tail(20).iterrows():
#     print(f"  {row['feature']:40} {row['importance']:.4f}")

# # OPTIONAL: Remove very low importance features (threshold = 0.001)
# # Uncomment below if you want to actually remove features

# low_importance_threshold = 0.001
# low_importance_features = feature_importance[
#     feature_importance['importance'] < low_importance_threshold
# ]['feature'].tolist()

# if len(low_importance_features) > 0:
#     print(f"\nüóëÔ∏è  Removing {len(low_importance_features)} low-importance features (< {low_importance_threshold}):")
#     for feat in low_importance_features[:10]:  # Show first 10
#         print(f"  ‚Ä¢ {feat}")
    
#     # Remove from scaled DataFrames
#     X_train_scaled = X_train_scaled.drop(columns=low_importance_features)
#     X_test_scaled = X_test_scaled.drop(columns=low_importance_features)
    
#     # Update the final arrays
#     X_train_final = X_train_scaled.values
#     X_test_final = X_test_scaled.values
    
#     print(f"\n‚úì Training data shape after selection: {X_train_scaled.shape}")
#     print(f"‚úì Test data shape after selection: {X_test_scaled.shape}")
# else:
#     print(f"\n‚ÑπÔ∏è  No features below threshold {low_importance_threshold}")

# print("\n" + "="*80)

In [42]:
# ============================================================================
# PHASE 5: MODEL EVALUATION SETUP
# ============================================================================
print("="*80)
print("PHASE 5: MODEL EVALUATION FRAMEWORK")
print("="*80)
print()

def evaluate_model(model, X, y, cv=5):
    """
    Evaluate a model using cross-validation.
    Returns mean and std of RMSE.
    """
    scores = cross_val_score(model, X, y, cv=cv,
                            scoring='neg_mean_squared_error',
                            n_jobs=-1)
    rmse_scores = np.sqrt(-scores)
    return {
        'mean_rmse': rmse_scores.mean(),
        'std_rmse': rmse_scores.std(),
        'scores': rmse_scores
    }

def print_results(model_name, results):
    """Pretty print evaluation results."""
    print(f"{model_name:35} | RMSE: {results['mean_rmse']:.4f} (+/- {results['std_rmse']:.4f})")

print("‚úì Evaluation framework ready")
print()

# ============================================================================
# PHASE 6: BASELINE MODEL EVALUATION
# ============================================================================
print("="*80)
print("PHASE 6: EVALUATING BASELINE MODELS")
print("="*80)
print()

# Define baseline models with class-approved configurations
baseline_models = {
    # Linear Models
   
    #'Linear Regression': LinearRegression(),
    #'Ridge Regression': Ridge(alpha=1.0),

    # Tree-based Models
    #'Decision Tree': DecisionTreeRegressor(max_depth=10, random_state=42),
   

    # Instance-based Models
    #'KNN (k=5)': KNeighborsRegressor(n_neighbors=5, n_jobs=1),
    #'KNN (k=10)': KNeighborsRegressor(n_neighbors=10, n_jobs=1),

    # Support Vector Machines
    #'SVR (Linear)': SVR(kernel='linear', C=1.0),
    #'SVR (RBF)': SVR(kernel='rbf', C=1.0, gamma='scale'),
    
    # Ensemble Methods (Bagging)
    #'Random Forest': RandomForestRegressor(
    #    n_estimators=100,
    #    max_depth=15,
    #    random_state=42,
    #    n_jobs=-1
    #),
    # Ensemble Methods (Boosting)
    
    'CatBoost': CatBoostRegressor(
        iterations=200,
        depth=6,
        learning_rate=0.05,
        random_seed=42,
        verbose=False  # Suppresses training output
    ),

    'Gradient Boosting': GradientBoostingRegressor(
        n_estimators=128,
        max_depth=5,
        learning_rate=0.1,
        random_state=42
    ),
    # Neural Networks (with class-approved configurations)
    'Neural Net (ReLU)': MLPRegressor(
        #hidden_layer_sizes=(100, 50),
        hidden_layer_sizes=(8, 4),
        activation='relu',
        solver='adam',
        alpha=0.01,
        learning_rate='adaptive',
        max_iter=500,
        random_state=42,
        early_stopping=True,
        validation_fraction=0.1
    ),

    'Neural Net (Tanh)': MLPRegressor(
        hidden_layer_sizes=(8, 4),
        activation='tanh',
        solver='adam',
        alpha=0.01,
        learning_rate='adaptive',
        max_iter=500,
        random_state=42,
        early_stopping=True,
        validation_fraction=0.1
    ),


    'Neural Net (Logistic)': MLPRegressor(
        hidden_layer_sizes=(8, 4),
        activation='logistic',
        solver='adam',
        alpha=0.01,
        learning_rate='adaptive',
        max_iter=500,
        random_state=42,
        early_stopping=True,
        validation_fraction=0.1
    )
}

"""
print("NOTE: Neural networks use RobustScaler preprocessing (similar to batch normalization)")
print("Activation functions: ReLU, Tanh, Logistic (Sigmoid) - all covered in class")
print("Optimizer: Adam - class-approved")
print("Learning rate: 'adaptive' - class-approved (reduces LR when validation plateaus)")
print("Regularization: L2 via alpha parameter (weight regularization)")
print()
"""

# Store results
baseline_results = {}

# Evaluate each baseline model
print("Evaluating baseline models with 5-fold cross-validation...")
print("-" * 80)

for name, model in baseline_models.items():
    results = evaluate_model(model, X_train_final, y_train, cv=5)
    results['model'] = model
    baseline_results[name] = results
    print_results(name, results)

print()

# Identify top 5 models for further tuning
sorted_models = sorted(baseline_results.items(), key=lambda x: x[1]['mean_rmse'])
top_k = 5
top_models = [name for name, _ in sorted_models[:top_k]]

print(f"Top {top_k} models for hyperparameter tuning:")
for i, name in enumerate(top_models, 1):
    print(f"  {i}. {name} (RMSE: {baseline_results[name]['mean_rmse']:.4f})")
print()

PHASE 5: MODEL EVALUATION FRAMEWORK

‚úì Evaluation framework ready

PHASE 6: EVALUATING BASELINE MODELS

Evaluating baseline models with 5-fold cross-validation...
--------------------------------------------------------------------------------
CatBoost                            | RMSE: 4.1184 (+/- 0.0144)
Gradient Boosting                   | RMSE: 4.1241 (+/- 0.0144)
Neural Net (ReLU)                   | RMSE: 4.1794 (+/- 0.0220)
Neural Net (Tanh)                   | RMSE: 4.1642 (+/- 0.0139)
Neural Net (Logistic)               | RMSE: 4.1512 (+/- 0.0133)

Top 5 models for hyperparameter tuning:
  1. CatBoost (RMSE: 4.1184)
  2. Gradient Boosting (RMSE: 4.1241)
  3. Neural Net (Logistic) (RMSE: 4.1512)
  4. Neural Net (Tanh) (RMSE: 4.1642)
  5. Neural Net (ReLU) (RMSE: 4.1794)



In [43]:
# ============================================================================
# PHASE 7: ENSEMBLE BUILDING (Revised and Corrected)
# ============================================================================
from sklearn.ensemble import VotingRegressor, StackingRegressor
from sklearn.linear_model import Ridge
# Ensure evaluate_model, baseline_models, top_models, X_train_final, y_train are defined

print("="*80)
print("PHASE 7: ENSEMBLE BUILDING (Stacking & Voting)")
print("="*80)
print("Strategy: Using the Top 3 Untuned Baseline Models to build ensembles.")
print()

# 7.1 Prepare Base Estimators (CORRECTED LOGIC)
# Use the Top 3 models identified in Phase 6
top_models = top_models[:3]
base_estimators = []
for name in top_models:
    # 1. Get the Model Object
    model_object = baseline_models[name]
    
    # 2. Create the Clean Name
    # We strip spaces and parentheses for use in Stacking/Voting tuples.
    clean_name = name.replace(" ", "_").replace("(", "").replace(")", "")
    
    # 3. Append the valid (name, object) tuple
    base_estimators.append((clean_name, model_object))

# Verify the clean names before proceeding
print("VERIFIED Estimators and Cleaned Names:")
for name, _ in base_estimators:
    print(f"  - {name}")

# Using 5-fold CV for ensemble evaluation (matching baseline)
CV_FOLDS = 5 
print("-" * 80)

# ----------------------------------------------------------------------------
# 7.2 Stacking Ensemble (High Performance)
# ----------------------------------------------------------------------------
print("7.2 Training Stacking Regressor (Top 3 Baselines + Ridge Meta-Learner)")

stacking_regressor = StackingRegressor(
    estimators=base_estimators, 
    # Use Ridge as the stable, regularized meta-learner
    final_estimator=Ridge(alpha=1.0, random_state=42), 
    cv=CV_FOLDS, 
    n_jobs=-1,
    passthrough=False 
)

# Evaluate Stacking Ensemble using cross-validation
# This line caused the error, but with clean names, it should now run.
stacking_results = evaluate_model(stacking_regressor, X_train_final, y_train, cv=CV_FOLDS)
print_results("Stacking (Ridge Meta)", stacking_results)

# Store results
baseline_results["Stacking (Ridge Meta)"] = {'mean_rmse': stacking_results['mean_rmse'], 'model': stacking_regressor, 'scores': stacking_results['scores']}

# ----------------------------------------------------------------------------
# 7.3 Weighted Voting Ensemble (Simple and Effective Averaging)
# ----------------------------------------------------------------------------
print("\n7.3 Training Weighted Voting Regressor")

# Calculate weights: 1 / (RMSE^2) gives more aggressive weighting to better models
weights = []
for name in top_models:
    rmse = baseline_results[name]['mean_rmse'] 
    weights.append(1 / (rmse ** 2))
    
if all(w > 0 for w in weights):
    voting_regressor = VotingRegressor(
        estimators=base_estimators,
        weights=weights,
        n_jobs=-1
    )
    
    # Evaluate Voting Ensemble using cross-validation
    voting_results = evaluate_model(voting_regressor, X_train_final, y_train, cv=CV_FOLDS)
    print_results("Voting (Weighted)", voting_results)
    
    # Store results
    baseline_results["Voting (Weighted)"] = {'mean_rmse': voting_results['mean_rmse'], 'model': voting_regressor, 'scores': voting_results['scores']}
else:
    print("WARNING: Could not calculate weights. Skipping Weighted Voting.")

print("\n‚úì Ensemble building complete and names validated.")
print("="*80)

PHASE 7: ENSEMBLE BUILDING (Stacking & Voting)
Strategy: Using the Top 3 Untuned Baseline Models to build ensembles.

VERIFIED Estimators and Cleaned Names:
  - CatBoost
  - Gradient_Boosting
  - Neural_Net_Logistic
--------------------------------------------------------------------------------
7.2 Training Stacking Regressor (Top 3 Baselines + Ridge Meta-Learner)
Stacking (Ridge Meta)               | RMSE: 4.1140 (+/- 0.0141)

7.3 Training Weighted Voting Regressor
Voting (Weighted)                   | RMSE: 4.1178 (+/- 0.0138)

‚úì Ensemble building complete and names validated.


In [44]:
# ============================================================================
# PHASE 9: FINAL MODEL SELECTION AND SUBMISSION
# ============================================================================
print("="*80)
print("PHASE 9: FINAL MODEL SELECTION & SUBMISSION")
print("="*80)

# 9.1 Compile and Rank Results
print("9.1 Final Performance Ranking (Lower RMSE is Better)")
print("-" * 50)

# Compile all current results (baselines and ensembles)
final_ranking = sorted(
    # baseline_results now holds all models (baselines + ensembles)
    [(name, result['mean_rmse'], result['model']) for name, result in baseline_results.items()],
    key=lambda x: x[1]
)

# Identify the best model
best_model_name, best_rmse, final_model = final_ranking[0]

# Get the baseline for comparison (Linear Regression, typically the worst)
try:
    # Use Linear Regression RMSE as the true performance benchmark
    baseline_rmse = baseline_results['Linear Regression']['mean_rmse']
except KeyError:
    # Fallback if Linear Regression was not evaluated
    baseline_rmse = final_ranking[-1][1] 

print(f"Benchmark (Linear Regression): {baseline_rmse:.4f}")
print("-" * 50)

for name, rmse, _ in final_ranking:
    # Calculate improvement percentage
    improvement = (baseline_rmse - rmse) / baseline_rmse * 100
    
    # Add an emoji for the top performing models
    prefix = "‚≠ê" if "Stacking" in name or "Voting" in name else ""
    if name == best_model_name:
         prefix = "ü•á"

    print(f"{prefix} {name:34} | RMSE: {rmse:.4f} | Improvement: {improvement:.2f}%")

print("\n9.2 Final Model Selection")
print("-" * 50)
print(f"ü•á **Best Model Selected: {best_model_name}**")
print(f"   Final Cross-Validation RMSE: {best_rmse:.4f}")
print(f"   Total Improvement over Benchmark: {((baseline_rmse - best_rmse) / baseline_rmse * 100):.2f}%")
print("-" * 50)


# 9.3 Final Training on FULL Data and Prediction
print("\n9.3 Final Training on FULL X_train_final and Prediction")
print("-" * 50)

# The selected final_model is trained one last time on 100% of the preprocessed data.
# This assumes X_train_final, X_test_final, and y_train are available from Phase 4.

# Train the final model using the full scaled dataset 
final_model.fit(X_train_final, y_train)

# Generate predictions on the test set
test_predictions = final_model.predict(X_test_final)
test_predictions[test_predictions < 0] = 0.0 # Clip negative predictions

print(f"  Final model trained successfully on {X_train_final.shape[0]} samples.")
print(f"  Generated {len(test_predictions)} predictions.")

# 9.4 Create Submission File
print("\n9.4 Creating Submission File")
print("-" * 50)

# Note: 'test_data' must be the original un-processed test DataFrame to extract 'Cattle_ID'
submission_df = pd.DataFrame({
    'Cattle_ID': test_data['Cattle_ID'], 
    'Milk_Yield_L': test_predictions
})

submission_file = 'dairy_cow_submission_ensemble_baseline.csv'
submission_df.to_csv(submission_file, index=False)
print(f"‚úì Submission file created: '{submission_file}'")
print("="*80)

PHASE 9: FINAL MODEL SELECTION & SUBMISSION
9.1 Final Performance Ranking (Lower RMSE is Better)
--------------------------------------------------
Benchmark (Linear Regression): 4.1794
--------------------------------------------------
ü•á Stacking (Ridge Meta)              | RMSE: 4.1140 | Improvement: 1.57%
‚≠ê Voting (Weighted)                  | RMSE: 4.1178 | Improvement: 1.47%
 CatBoost                           | RMSE: 4.1184 | Improvement: 1.46%
 Gradient Boosting                  | RMSE: 4.1241 | Improvement: 1.32%
 Neural Net (Logistic)              | RMSE: 4.1512 | Improvement: 0.67%
 Neural Net (Tanh)                  | RMSE: 4.1642 | Improvement: 0.36%
 Neural Net (ReLU)                  | RMSE: 4.1794 | Improvement: 0.00%

9.2 Final Model Selection
--------------------------------------------------
ü•á **Best Model Selected: Stacking (Ridge Meta)**
   Final Cross-Validation RMSE: 4.1140
   Total Improvement over Benchmark: 1.57%
----------------------------------------