# House Prices – Model Inference

Test-set prediction using the best model from the MLflow Model Registry.

## 1. Setup and Imports

In [None]:
%pip install -q dagshub mlflow

import os
import pandas as pd
import numpy as np
import mlflow
import mlflow.sklearn
from mlflow.tracking import MlflowClient
import warnings
warnings.filterwarnings('ignore')

# Point MLflow at the DagsHub-hosted tracking server used by this project.
os.environ['MLFLOW_TRACKING_URI'] = 'https://dagshub.com/g-kitiashvili/ML-assignment1.mlflow'
os.environ['MLFLOW_TRACKING_USERNAME'] = 'g-kitiashvili'

# SECURITY: never store the access token in the notebook. Provide it through
# the MLFLOW_TRACKING_PASSWORD environment variable, or type it at the prompt
# below. Any token that has ever been committed to version control must be
# treated as leaked and revoked in the DagsHub account settings.
if not os.environ.get('MLFLOW_TRACKING_PASSWORD'):
    from getpass import getpass
    os.environ['MLFLOW_TRACKING_PASSWORD'] = getpass('DagsHub access token: ')

print("House Prices - Model Inference")
print("=" * 50)

## 2. Load Test Data and Preprocessing Functions

In [None]:
# Read the test split and keep its Id column aside; the ids are needed again
# when the submission file is assembled.
test_data = pd.read_csv('house-prices-advanced-regression-techniques/test.csv')
test_ids = test_data.loc[:, 'Id'].copy()
print(f"Test data shape: {test_data.shape}")

def clean_data(df, is_train=True):
    """Return a cleaned copy of ``df`` with outliers removed and gaps filled.

    Steps, in order:

    1. When ``is_train`` is true, drop rows with ``GrLivArea > 4000`` and
       ``SalePrice < 300000``. The step is skipped if either column is
       absent, so calling with the default on a frame without the target
       does not fail.
    2. Fill the listed categorical columns with the string ``'None'``.
    3. Fill the listed numeric columns with ``0``.
    4. Fill any remaining object column with its most frequent value and any
       remaining numeric column with its median. Both statistics are
       computed from ``df`` itself.

    Args:
        df: Input frame; it is not modified.
        is_train: Whether to apply the outlier filter of step 1.

    Returns:
        A new DataFrame with the steps above applied.
    """
    df_clean = df.copy()

    # Outlier removal needs both columns; skip it rather than raise KeyError.
    if is_train and {'GrLivArea', 'SalePrice'}.issubset(df_clean.columns):
        outliers = (df_clean['GrLivArea'] > 4000) & (df_clean['SalePrice'] < 300000)
        df_clean = df_clean.drop(df_clean[outliers].index)

    cat_na = ['Alley', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1',
              'BsmtFinType2', 'FireplaceQu', 'GarageType', 'GarageFinish',
              'GarageQual', 'GarageCond', 'PoolQC', 'Fence', 'MiscFeature']
    for feat in cat_na:
        if feat in df_clean:
            df_clean[feat] = df_clean[feat].fillna('None')

    num_zero = ['BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
                'BsmtFullBath', 'BsmtHalfBath', 'GarageYrBlt', 'GarageArea',
                'GarageCars', 'MasVnrArea']
    for feat in num_zero:
        if feat in df_clean:
            df_clean[feat] = df_clean[feat].fillna(0)

    for feat in df_clean.select_dtypes('object'):
        if df_clean[feat].isnull().any():
            modes = df_clean[feat].mode()
            # An all-missing column has no mode; leave it untouched instead
            # of failing on the lookup of the first mode.
            if not modes.empty:
                df_clean[feat] = df_clean[feat].fillna(modes.iloc[0])

    for feat in df_clean.select_dtypes(include=[np.number]):
        if df_clean[feat].isnull().any():
            df_clean[feat] = df_clean[feat].fillna(df_clean[feat].median())

    return df_clean

def feature_engineering(df):
    """Return a copy of ``df`` extended with twelve derived columns.

    Added columns, in this order: ``TotalSF``, ``Total_Bathrooms``,
    ``Total_porch_sf``, ``HouseAge``, ``RemodAge``, ``GarageAge``,
    ``HasBasement``, ``HasGarage``, ``HasFireplace``, ``HasPool``,
    ``Price_per_sqft_total`` and ``Price_per_sqft_living``.

    Despite their names, the last two are plain copies of ``TotalSF`` and
    ``GrLivArea``; no price is involved. The input frame is not modified.
    """
    out = df.copy()

    # Combined floor area and weighted bathroom count (half baths count 0.5).
    out['TotalSF'] = out['TotalBsmtSF'] + out['1stFlrSF'] + out['2ndFlrSF']
    full_baths = out['FullBath'] + 0.5 * out['HalfBath']
    out['Total_Bathrooms'] = full_baths + out['BsmtFullBath'] + 0.5 * out['BsmtHalfBath']

    # Sum the porch/deck areas left to right.
    porch_cols = ['OpenPorchSF', '3SsnPorch', 'EnclosedPorch', 'ScreenPorch', 'WoodDeckSF']
    porch_total = out[porch_cols[0]]
    for col in porch_cols[1:]:
        porch_total = porch_total + out[col]
    out['Total_porch_sf'] = porch_total

    # Ages relative to the year of sale.
    for age_col, year_col in (('HouseAge', 'YearBuilt'), ('RemodAge', 'YearRemodAdd')):
        out[age_col] = out['YrSold'] - out[year_col]
    garage_age = out['YrSold'] - out['GarageYrBlt']
    out['GarageAge'] = garage_age.fillna(0)

    # 0/1 indicators: 1 when the source quantity is strictly positive.
    flag_sources = (
        ('HasBasement', 'TotalBsmtSF'),
        ('HasGarage', 'GarageArea'),
        ('HasFireplace', 'Fireplaces'),
        ('HasPool', 'PoolArea'),
    )
    for flag_col, source_col in flag_sources:
        out[flag_col] = (out[source_col] > 0).astype(int)

    out['Price_per_sqft_total'] = out['TotalSF']
    out['Price_per_sqft_living'] = out['GrLivArea']
    return out

## 3. Load Best Model from MLflow

In [None]:
# Load the newest registered version of the final model. If anything in the
# registry path fails, fall back to the artifact logged by the run named
# 'Final_Model_Registration'.
print("Loading best model and artifacts...")
try:
    model_name = "house_prices_final_model"
    client = MlflowClient()
    # "latest" is an alias for models:/ URIs, whereas the registry client
    # expects a concrete version. Resolve the highest version explicitly so the
    # loaded model and the reported run id refer to the same version.
    versions = client.search_model_versions(f"name='{model_name}'")
    if not versions:
        raise LookupError(f"No registered versions found for model '{model_name}'")
    info = max(versions, key=lambda mv: int(mv.version))
    model_version = info.version
    model_uri = f"models:/{model_name}/{model_version}"
    model = mlflow.sklearn.load_model(model_uri)
    run_id = info.run_id
    print(f"Loaded model run: {run_id}")
except Exception as registry_error:
    # Deliberately broad: any registry failure should trigger the fallback,
    # but report the cause instead of swallowing it silently.
    print(f"Registry load failed ({registry_error!r}); trying run artifacts instead")
    exp = mlflow.get_experiment_by_name("house-prices-experiments")
    if exp is None:
        raise RuntimeError(
            "Experiment 'house-prices-experiments' not found on the tracking server"
        ) from registry_error
    runs = mlflow.search_runs([exp.experiment_id])
    run_name_col = 'tags.mlflow.runName'
    if run_name_col in runs.columns:
        candidates = runs[runs[run_name_col] == 'Final_Model_Registration']
    else:
        # No run carries a run-name tag, so there is nothing to match.
        candidates = runs.iloc[0:0]
    if candidates.empty:
        raise RuntimeError(
            "No run named 'Final_Model_Registration' found in 'house-prices-experiments'"
        ) from registry_error
    final = candidates.iloc[0]
    run_id = final.run_id
    model = mlflow.sklearn.load_model(f"runs:/{run_id}/final_model")
    print(f"Fallback loaded run: {run_id}")

## 4. Recreate Preprocessing Pipeline

In [None]:
# Rebuild the preprocessing on both splits: the training split goes through
# cleaning with the outlier filter enabled, the test split without it, and
# each cleaned frame is then passed to feature engineering.
train_data = pd.read_csv('house-prices-advanced-regression-techniques/train.csv')

train_clean = clean_data(train_data, is_train=True)
train_fe = feature_engineering(train_clean)

test_clean = clean_data(test_data, is_train=False)
test_fe = feature_engineering(test_clean)

print(f"After FE: train {train_fe.shape}, test {test_fe.shape}")

## 5. Feature Encoding

In [None]:
def encode_features(train_df, test_df, target_col='SalePrice'):
    """One-hot encode train and test so both share the same feature columns.

    The object-dtype columns of the training features are dummy-encoded with
    the first level of each column dropped. The test frame is encoded with
    all of its levels and then aligned to the training columns, so the level
    that is dropped is always the training one. Encoding each frame with
    ``drop_first=True`` on its own would drop a different level whenever the
    test data lacks the training baseline, and would encode the affected
    test rows as the baseline.

    Columns that occur only in the encoded test frame (levels never seen in
    training) are appended to the training matrix as all-zero columns, in
    the order they appear in the test encoding.

    Args:
        train_df: Training frame, optionally containing ``target_col``.
        test_df: Test frame with the same feature columns.
        target_col: Name of the target column to split off ``train_df``.

    Returns:
        ``(X_train_enc, X_test_enc, y_train)``. Both matrices have identical
        columns in identical order; ``y_train`` is ``None`` when
        ``target_col`` is not in ``train_df``.
    """
    if target_col in train_df:
        y_train = train_df[target_col]
        X_train = train_df.drop(columns=[target_col])
    else:
        y_train = None
        X_train = train_df.copy()
    X_test = test_df.copy()

    cats = X_train.select_dtypes('object').columns.tolist()

    if 'Id' in X_train.columns:
        X_train = X_train.drop(columns='Id')
        # The test frame may already be without its Id column.
        X_test = X_test.drop(columns='Id', errors='ignore')

    X_train_enc = pd.get_dummies(X_train, columns=cats, drop_first=True)
    train_all_levels = pd.get_dummies(X_train, columns=cats)
    test_all_levels = pd.get_dummies(X_test, columns=cats)

    # Pad the training matrix with zero columns for anything only the test
    # frame has. Training baseline levels are excluded on purpose: they are
    # represented by all-zero dummies, not by a column of their own.
    known_train_columns = set(train_all_levels.columns)
    for col in test_all_levels.columns:
        if col not in known_train_columns:
            X_train_enc[col] = 0

    # Align test to train: drops the baseline-level columns and fills levels
    # that are missing from the test data with 0.
    X_test_enc = test_all_levels.reindex(columns=X_train_enc.columns, fill_value=0)
    return X_train_enc, X_test_enc, y_train

# Encode the engineered frames; the target is split off the training frame.
encoded = encode_features(train_fe, test_fe)
X_train_enc, X_test_enc, y_train = encoded
print(f"Encoded shapes: train {X_train_enc.shape}, test {X_test_enc.shape}")

## 6. Feature Selection & Prediction

In [None]:
from sklearn.feature_selection import SelectKBest, f_regression

# Number of features kept by the univariate selection.
# NOTE(review): presumably this has to equal the feature count the loaded
# model was trained on - confirm against the training notebook.
k = 268

# Fit the selector on the encoded training data, then apply the same column
# subset to both matrices.
selector = SelectKBest(f_regression, k=k)
selector.fit(X_train_enc, y_train)
X_train_sel = selector.transform(X_train_enc)
X_test_sel = selector.transform(X_test_enc)
print(f"Selected features: {X_train_sel.shape[1]}")

# Score the selected test features with the model loaded earlier.
preds = model.predict(X_test_sel)
print(f"Generated {len(preds)} predictions")

## 7. Create Submission File

In [None]:
# Pair each test Id with its prediction and write the submission CSV.
# The path is kept in one variable so the log line always names the file
# that was actually written.
submission_path = 'house_prices_submission.csv'
submission = pd.DataFrame({'Id': test_ids, 'SalePrice': preds})
submission.to_csv(submission_path, index=False)
print(f"Saved {submission_path} with shape {submission.shape}")
print(submission.head(10))