# House Prices – Model Inference

Test Set Prediction using the Best Model from Model Registry

## 1. Setup and Imports

In [9]:
%pip install -q dagshub mlflow

import os
import pandas as pd
import numpy as np
import mlflow
import mlflow.sklearn
from mlflow.tracking import MlflowClient
import warnings
warnings.filterwarnings('ignore')

os.environ['MLFLOW_TRACKING_URI'] = 'https://dagshub.com/g-kitiashvili/ML-assignment1.mlflow'
os.environ['MLFLOW_TRACKING_USERNAME'] = 'g-kitiashvili'
os.environ['MLFLOW_TRACKING_PASSWORD'] = '1c2227158cc19daf66bb3b241116a8e8c5f1cd20'

print("House Prices - Model Inference")
print("=" * 50)

Note: you may need to restart the kernel to use updated packages.
House Prices - Model Inference


## 2. Load Test Data and Preprocessing Functions

In [10]:
# Load test data
test_data = pd.read_csv('house-prices-advanced-regression-techniques/test.csv')
test_ids = test_data['Id'].copy()

print(f"Test data shape: {test_data.shape}")

# Define the same preprocessing functions from experiment notebook
def clean_data(df, is_train=True):
    """
    Data cleaning function (same as in experiment)
    """
    df_clean = df.copy()
    
    # Remove outliers (only for training data)
    if is_train:
        # Remove extreme outliers based on GrLivArea and SalePrice
        df_clean = df_clean.drop(df_clean[(df_clean['GrLivArea'] > 4000) & 
                                         (df_clean['SalePrice'] < 300000)].index)
    
    # Handle missing values for specific features
    # Categorical features with meaningful NA
    categorical_na_features = ['Alley', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 
                              'BsmtFinType1', 'BsmtFinType2', 'FireplaceQu',
                              'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
                              'PoolQC', 'Fence', 'MiscFeature']
    
    for feature in categorical_na_features:
        if feature in df_clean.columns:
            df_clean[feature] = df_clean[feature].fillna('None')
    
    # Numerical features with meaningful zero
    numerical_zero_features = ['BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
                              'BsmtFullBath', 'BsmtHalfBath', 'GarageYrBlt',
                              'GarageArea', 'GarageCars', 'MasVnrArea']
    
    for feature in numerical_zero_features:
        if feature in df_clean.columns:
            df_clean[feature] = df_clean[feature].fillna(0)
    
    # Other categorical features - fill with mode
    categorical_features = df_clean.select_dtypes(include=['object']).columns
    for feature in categorical_features:
        if df_clean[feature].isnull().any():
            mode_value = df_clean[feature].mode()[0] if len(df_clean[feature].mode()) > 0 else 'Unknown'
            df_clean[feature] = df_clean[feature].fillna(mode_value)
    
    # Numerical features - fill with median
    numerical_features = df_clean.select_dtypes(include=[np.number]).columns
    for feature in numerical_features:
        if df_clean[feature].isnull().any():
            median_value = df_clean[feature].median()
            df_clean[feature] = df_clean[feature].fillna(median_value)
    
    return df_clean

def feature_engineering(df):
    """
    Feature engineering function (same as in experiment)
    """
    df_fe = df.copy()
    
    # Create new features
    df_fe['TotalSF'] = df_fe['TotalBsmtSF'] + df_fe['1stFlrSF'] + df_fe['2ndFlrSF']
    df_fe['Total_Bathrooms'] = (df_fe['FullBath'] + (0.5 * df_fe['HalfBath']) +
                               df_fe['BsmtFullBath'] + (0.5 * df_fe['BsmtHalfBath']))
    df_fe['Total_porch_sf'] = (df_fe['OpenPorchSF'] + df_fe['3SsnPorch'] +
                              df_fe['EnclosedPorch'] + df_fe['ScreenPorch'] +
                              df_fe['WoodDeckSF'])
    
    # Age of house
    df_fe['HouseAge'] = df_fe['YrSold'] - df_fe['YearBuilt']
    df_fe['RemodAge'] = df_fe['YrSold'] - df_fe['YearRemodAdd']
    
    # Garage age
    df_fe['GarageAge'] = df_fe['YrSold'] - df_fe['GarageYrBlt']
    df_fe['GarageAge'] = df_fe['GarageAge'].fillna(0)
    
    # Has features
    df_fe['HasBasement'] = (df_fe['TotalBsmtSF'] > 0).astype(int)
    df_fe['HasGarage'] = (df_fe['GarageArea'] > 0).astype(int)
    df_fe['HasFireplace'] = (df_fe['Fireplaces'] > 0).astype(int)
    df_fe['HasPool'] = (df_fe['PoolArea'] > 0).astype(int)
    
    # Price per square foot proxies
    df_fe['Price_per_sqft_total'] = df_fe['TotalSF']
    df_fe['Price_per_sqft_living'] = df_fe['GrLivArea']
    
    return df_fe

Test data shape: (1459, 80)


## 3. Load Best Model from MLflow

In [11]:
print("Loading best model and preprocessing artifacts from MLflow...")

try:
    # Load latest version of the registered model
    model_name = "house_prices_final_model"
    model_version = "latest"
    
    model_uri = f"models:/{model_name}/{model_version}"
    model = mlflow.sklearn.load_model(model_uri)
    
    print(f"Successfully loaded model: {model_name}")
    print(f"Model type: {type(model)}")
    
    # Get the run ID to load preprocessing artifacts
    client = MlflowClient()
    model_version_info = client.get_model_version(model_name, model_version)
    run_id = model_version_info.run_id
    
    print(f"Model run ID: {run_id}")
    
except Exception as e:
    print(f"Error loading model from registry: {e}")
    print("Attempting to load from latest run...")
    
    # Fallback: load from latest experiment run
    experiment = mlflow.get_experiment_by_name("house-prices-experiments")
    runs = mlflow.search_runs(experiment_ids=[experiment.experiment_id])
    
    # Find the run with final model registration
    final_run = None
    for _, run in runs.iterrows():
        if run['tags.mlflow.runName'] == 'Final_Model_Registration':
            final_run = run
            break
    
    if final_run is None:
        final_run = runs.iloc[0]  # Use latest run as fallback
    
    run_id = final_run.run_id
    model_uri = f"runs:/{run_id}/final_model"
    model = mlflow.sklearn.load_model(model_uri)
    print(f"Successfully loaded model from run: {run_id}")

Loading best model and preprocessing artifacts from MLflow...


Downloading artifacts: 100%|██████████| 5/5 [00:00<00:00,  7.25it/s]


Successfully loaded model: house_prices_final_model
Model type: <class 'sklearn.linear_model._coordinate_descent.Lasso'>
Error loading model from registry: INVALID_PARAMETER_VALUE: Response: {'error_code': 'INVALID_PARAMETER_VALUE'}
Attempting to load from latest run...


Downloading artifacts: 100%|██████████| 5/5 [00:05<00:00,  1.01s/it]

Successfully loaded model from run: ed7c4e51921249dd84d77527ab25578f





## 4. Recreate Preprocessing Pipeline

In [12]:
print("Recreating preprocessing pipeline...")

# Load training data to recreate the exact preprocessing pipeline
train_data = pd.read_csv('house-prices-advanced-regression-techniques/train.csv')

# Apply same preprocessing steps as training
train_clean = clean_data(train_data, is_train=True)
test_clean = clean_data(test_data, is_train=False)

train_fe = feature_engineering(train_clean)
test_fe = feature_engineering(test_clean)

print(f"Train data shape after feature engineering: {train_fe.shape}")
print(f"Test data shape after feature engineering: {test_fe.shape}")

Recreating preprocessing pipeline...
Train data shape after feature engineering: (1458, 93)
Test data shape after feature engineering: (1459, 92)


## 5. Feature Encoding

In [13]:
def encode_features(train_df, test_df, target_col='SalePrice'):
    """
    Encode categorical features (exact same function as training)
    """
    # Separate features and target
    if target_col in train_df.columns:
        X_train = train_df.drop(columns=[target_col])
        y_train = train_df[target_col]
    else:
        X_train = train_df.copy()
        y_train = None
    
    X_test = test_df.copy()
    
    # Get categorical and numerical features
    categorical_features = X_train.select_dtypes(include=['object']).columns.tolist()
    numerical_features = X_train.select_dtypes(include=[np.number]).columns.tolist()
    
    # Remove ID column if present
    if 'Id' in numerical_features:
        numerical_features.remove('Id')
        X_train = X_train.drop('Id', axis=1)
        X_test = X_test.drop('Id', axis=1)
    
    # One-hot encoding for categorical features
    X_train_encoded = pd.get_dummies(X_train, columns=categorical_features, drop_first=True)
    X_test_encoded = pd.get_dummies(X_test, columns=categorical_features, drop_first=True)
    
    # Align columns between train and test
    missing_cols_test = set(X_train_encoded.columns) - set(X_test_encoded.columns)
    missing_cols_train = set(X_test_encoded.columns) - set(X_train_encoded.columns)
    
    # Add missing columns to test set
    for col in missing_cols_test:
        X_test_encoded[col] = 0
    
    # Add missing columns to train set
    for col in missing_cols_train:
        X_train_encoded[col] = 0
    
    # Reorder columns
    X_test_encoded = X_test_encoded[X_train_encoded.columns]
    
    return X_train_encoded, X_test_encoded, y_train

X_train_encoded, X_test_encoded, y_train = encode_features(train_fe, test_fe)
print(f"Encoded training features shape: {X_train_encoded.shape}")
print(f"Encoded test features shape: {X_test_encoded.shape}")

Encoded training features shape: (1458, 269)
Encoded test features shape: (1459, 269)


## 6. Feature Selection & Prediction

In [14]:
print("Recreating feature selection...")

# We need to recreate the feature selection based on the best method found in training
# From the training script, we need to determine which method was used

# Import required modules for feature selection
from sklearn.feature_selection import SelectKBest, f_regression, RFE, SelectFromModel
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Lasso

# Try to load feature selection info from run parameters
try:
    run = mlflow.get_run(run_id)
    
    # Check if we can get the feature selection method from parameters
    feature_selection_method = None
    for param_key, param_value in run.data.params.items():
        if 'feature_selection' in param_key.lower():
            feature_selection_method = param_value
            break
    
    if feature_selection_method is None:
        # Default to trying all methods and picking the one that gives the right number of features
        print("Feature selection method not found in parameters, recreating...")
        
        # Try different feature selection methods to match the expected number of features
        methods_to_try = ['univariate', 'rfe', 'lasso']
        
        for method in methods_to_try:
            if method == 'univariate':
                selector = SelectKBest(score_func=f_regression, k=50)
            elif method == 'rfe':
                estimator = RandomForestRegressor(n_estimators=50, random_state=42)
                selector = RFE(estimator, n_features_to_select=50)
            elif method == 'lasso':
                lasso = Lasso(alpha=0.01, random_state=42)
                selector = SelectFromModel(lasso)
            
            # Fit on training data
            X_train_selected = selector.fit_transform(X_train_encoded, y_train)
            
            print(f"Method {method}: {X_train_selected.shape[1]} features selected")
            
            # Check if this matches our model's expected input
            if X_train_selected.shape[1] == 268:  # Expected number from error message
                feature_selection_method = method
                print(f"Found matching method: {method}")
                break
        
        if feature_selection_method is None:
            # If still not found, try with different k values for SelectKBest
            for k in [268, 260, 250, 240]:
                selector = SelectKBest(score_func=f_regression, k=k)
                X_train_selected = selector.fit_transform(X_train_encoded, y_train)
                if X_train_selected.shape[1] == 268:
                    feature_selection_method = f'univariate_k{k}'
                    print(f"Found matching method: univariate with k={k}")
                    break
    
    print(f"Using feature selection method: {feature_selection_method}")
    
except Exception as e:
    print(f"Error retrieving run info: {e}")
    feature_selection_method = 'univariate'  # Default fallback

# Apply the identified feature selection method
if 'univariate' in feature_selection_method:
    if 'k' in feature_selection_method:
        k = int(feature_selection_method.split('k')[1])
    else:
        k = 268  # Use the expected number directly
    selector = SelectKBest(score_func=f_regression, k=k)
elif feature_selection_method == 'rfe':
    estimator = RandomForestRegressor(n_estimators=50, random_state=42)
    selector = RFE(estimator, n_features_to_select=268)
elif feature_selection_method == 'lasso':
    lasso = Lasso(alpha=0.01, random_state=42)
    selector = SelectFromModel(lasso)
else:
    # Default fallback
    selector = SelectKBest(score_func=f_regression, k=268)

# Fit selector on training data and transform both train and test
X_train_selected = selector.fit_transform(X_train_encoded, y_train)
X_test_selected = selector.transform(X_test_encoded)

print(f"Selected features shape - Train: {X_train_selected.shape}")
print(f"Selected features shape - Test: {X_test_selected.shape}")


Recreating feature selection...
Using feature selection method: lasso
Selected features shape - Train: (1458, 268)
Selected features shape - Test: (1459, 268)


In [15]:
print("Generating predictions...")

with mlflow.start_run(run_name="Test_Predictions_Fixed") as run:
    # Make predictions
    predictions = model.predict(X_test_selected)
    
    # Log prediction info
    mlflow.log_param("test_samples", len(predictions))
    mlflow.log_param("features_used", X_test_selected.shape[1])
    mlflow.log_param("feature_selection_method", feature_selection_method)
    mlflow.log_metric("prediction_mean", predictions.mean())
    mlflow.log_metric("prediction_std", predictions.std())
    mlflow.log_metric("prediction_min", predictions.min())
    mlflow.log_metric("prediction_max", predictions.max())
    
    print(f"Predictions generated for {len(predictions)} samples")
    print(f"Using {X_test_selected.shape[1]} features")
    print(f"Prediction statistics:")
    print(f"  Mean: ${predictions.mean():,.2f}")
    print(f"  Std:  ${predictions.std():,.2f}")
    print(f"  Min:  ${predictions.min():,.2f}")
    print(f"  Max:  ${predictions.max():,.2f}")

Generating predictions...
Predictions generated for 1459 samples
Using 268 features
Prediction statistics:
  Mean: $175,398.37
  Std:  $79,001.74
  Min:  $16,458.88
  Max:  $725,429.48
🏃 View run Test_Predictions_Fixed at: https://dagshub.com/g-kitiashvili/ML-assignment1.mlflow/#/experiments/0/runs/5693b05d8b1b41efb772b3483a055f0d
🧪 View experiment at: https://dagshub.com/g-kitiashvili/ML-assignment1.mlflow/#/experiments/0


## 7. Create Submission File

In [16]:
print("Creating submission file...")

# Create submission dataframe
submission = pd.DataFrame({
    'Id': test_ids,
    'SalePrice': predictions
})

# Save submission file
submission_filename = 'house_prices_submission.csv'
submission.to_csv(submission_filename, index=False)

print(f"Submission file saved as: {submission_filename}")
print(f"Submission shape: {submission.shape}")
print("\nFirst 10 predictions:")
print(submission.head(10))

print("\n=== INFERENCE COMPLETED SUCCESSFULLY ===")
print(f"Submission file '{submission_filename}' ready for Kaggle upload!")

Creating submission file...
Submission file saved as: house_prices_submission.csv
Submission shape: (1459, 2)

First 10 predictions:
     Id      SalePrice
0  1461  101462.695518
1  1462  152352.762983
2  1463  181233.428427
3  1464  193084.790770
4  1465  198917.694059
5  1466  164474.887398
6  1467  169928.213979
7  1468  151305.291734
8  1469  204763.001443
9  1470  104635.186381

=== INFERENCE COMPLETED SUCCESSFULLY ===
Submission file 'house_prices_submission.csv' ready for Kaggle upload!
