<a href="https://colab.research.google.com/github/enelene/ML-assignment-1/blob/main/model_inference.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install mlflow dagshub
!pip install kaggle



In [None]:
import os
import pandas as pd
import numpy as np
import mlflow
import mlflow.sklearn

In [None]:
import os

# MLflow reads these two variables for HTTP basic auth against the tracking server.
os.environ["MLFLOW_TRACKING_USERNAME"] = "enelene"
# Never commit a real token. `setdefault` keeps a token that was already
# exported in the environment instead of overwriting it with an empty string.
os.environ.setdefault("MLFLOW_TRACKING_PASSWORD", "")

In [None]:
# Point MLflow at the DagsHub-hosted tracking server for this repository.
mlflow.set_tracking_uri("https://dagshub.com/enelene/ML-assignment-1.mlflow")

In [None]:
from mlflow.tracking import MlflowClient

# Experiment whose runs are searched for the best model.
experiment_name = "house-prices-experiment"

client = MlflowClient()

# get_experiment_by_name returns None for an unknown name; fail with a clear
# message instead of an AttributeError on `.experiment_id`.
experiment = client.get_experiment_by_name(experiment_name)
if experiment is None:
    raise ValueError(
        f"MLflow experiment {experiment_name!r} was not found on the tracking server"
    )
experiment_id = experiment.experiment_id

# Ask the server for the single run with the lowest validation RMSE.
runs = client.search_runs(
    experiment_ids=[experiment_id],
    order_by=["metrics.rmse_val ASC"],
    max_results=1,
)

# Later cells require `best_run`; stop here rather than hit a NameError later.
if not runs:
    raise RuntimeError(f"Experiment {experiment_name!r} contains no runs")

best_run = runs[0]
print(f"Best run ID: {best_run.info.run_id}")
print(f"Best run metrics: {best_run.data.metrics}")
print(f"Best run params: {best_run.data.params}")


Best run ID: f551859f4f20470aa4e87a1a079fbf65
Best run metrics: {'rmse_val': 0.1372468798498211}
Best run params: {'max_depth': '3', 'learning_rate': '0.1', 'n_estimators': '300'}


In [None]:
# Sanity check: show which class the search returned for the selected run.
print(type(best_run))

<class 'mlflow.entities.run.Run'>


In [None]:
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder

In [None]:
def fill_missing_values(df):
    """
    Fill missing values column by column.

      - Text columns (object dtype or a pandas string dtype) get the literal
        string 'None'.
      - Numeric columns get 0.
      - Columns of any other dtype are left untouched.

    The frame is modified in place and also returned, so both
    `df = fill_missing_values(df)` and a bare call work.
    """
    for col in df.columns:
        dtype = df[col].dtype
        # Checking `dtype == 'object'` alone misses pandas' dedicated string
        # dtypes, whose missing values would then never be filled.
        is_text = pd.api.types.is_object_dtype(dtype) or pd.api.types.is_string_dtype(dtype)
        if is_text:
            df[col] = df[col].fillna('None')
        elif pd.api.types.is_numeric_dtype(dtype):
            df[col] = df[col].fillna(0)
    return df

def ensemble_predict(models, weights, X_input):
    """
    Weighted average of the predictions of several fitted models.

      models: list of fitted models exposing `.predict(X)`
      weights: list of floats, one per model
      X_input: feature matrix passed unchanged to every model

    Returns sum(w_i * prediction_i) / sum(w_i).

    Raises:
      AssertionError: models and weights differ in length.
      ValueError: the ensemble is empty or the weights sum to zero; the
        plain division would otherwise return NaN/inf silently.
    """
    assert len(models) == len(weights), "models and weights must have same length"
    if len(models) == 0:
        raise ValueError("ensemble_predict needs at least one model")

    total_weight = np.sum(weights)
    if total_weight == 0:
        raise ValueError("weights must not sum to zero")

    preds = [m.predict(X_input) * w for m, w in zip(models, weights)]
    return np.sum(preds, axis=0) / total_weight


In [None]:
# Mount Google Drive at /content/drive; the data files read below live under it.
# NOTE(review): `google.colab` is presumably only importable inside Colab — confirm
# before running this notebook elsewhere.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Read the test CSV from the mounted Drive.
test_path = "/content/drive/MyDrive/kaggle/house_prices/test.csv"
test_df = pd.read_csv(test_path)
# Keep a copy of the Id column: it is dropped from test_df before prediction
# but is needed again when the submission file is written.
test_ids = test_df["Id"].copy()  # for submission

In [None]:
# Read the training CSV; it is used below to compute per-neighborhood mean prices.
train_path = "/content/drive/MyDrive/kaggle/house_prices/train.csv"
train_df = pd.read_csv(train_path)

In [None]:
def _add_engineered_features(df, neighborhood_means):
    """
    Return a copy of `df` with the engineered feature columns appended.

      df: house-price frame whose missing values were already filled
          (text -> 'None', numeric -> 0)
      neighborhood_means: Series mapping Neighborhood -> mean SalePrice

    All new columns are collected first and attached with a single concat.
    Inserting ~30 columns one by one fragments the frame and triggers pandas'
    PerformanceWarning. Existing columns with the same names are replaced, so
    the cell can be re-run safely.
    """
    total_sf = df["TotalBsmtSF"] + df["1stFlrSF"] + df["2ndFlrSF"]
    house_age = df["YrSold"] - df["YearBuilt"]
    kitchen_qual = df["KitchenQual"].map(
        {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'None': 0}
    )
    # Plain per-neighborhood mean of the training target (no cross-validation).
    # A neighborhood absent from the training data maps to NaN.
    neighborhood_price = df["Neighborhood"].map(neighborhood_means)
    finished_bsmt_sf = df["BsmtFinSF1"] + df["BsmtFinSF2"]

    features = {
        "TotalSF": total_sf,
        "OverallQual_TotSF": df["OverallQual"] * total_sf,
        "House_Age": house_age,
        "Years_Since_Remod": df["YrSold"] - df["YearRemodAdd"],
        "Was_Remodeled": (df["YearRemodAdd"] > df["YearBuilt"]).astype(int),
        "TotalBaths": (
            df["FullBath"] + (0.5 * df["HalfBath"])
            + df["BsmtFullBath"] + (0.5 * df["BsmtHalfBath"])
        ),
        # +1 avoids division by zero for homes with no bedrooms above grade.
        "LivingAreaPerBedroom": df["GrLivArea"] / (df["BedroomAbvGr"] + 1),
        "LotRatio": df["1stFlrSF"] / df["LotArea"],
        "TotalPorchSF": (
            df["OpenPorchSF"] + df["EnclosedPorch"] + df["3SsnPorch"] + df["ScreenPorch"]
        ),
        "Quality_x_Condition": df["OverallQual"] * df["OverallCond"],
        "KitchenQual_Encoded": kitchen_qual,
        "Kitchen_Quality_SF": kitchen_qual * df["1stFlrSF"],
        "Neighborhood_Price_Level": neighborhood_price,
        # Location quality: OverallQual * neighborhood mean price.
        "Location_Quality": df["OverallQual"] * neighborhood_price,
    }

    # Presence flags for columns where 'None' marks an absent feature.
    for col in ['Alley', 'FireplaceQu', 'PoolQC', 'Fence', 'MiscFeature']:
        features[f'Has_{col}'] = (~df[col].isin(['None'])).astype(int)

    features.update({
        "TotalFinishedBsmtSF": finished_bsmt_sf,
        "FinishedBsmtRatio": finished_bsmt_sf / (df["TotalBsmtSF"] + 1),
        # Missing GarageYrBlt was filled with 0, so those rows get an age equal
        # to YrSold. Negative ages (garage year after sale year) are clipped to 0.
        # NOTE(review): presumably mirrors the training pipeline — confirm before changing.
        "Garage_Age": (df["YrSold"] - df["GarageYrBlt"]).clip(lower=0),
        # +0.1 avoids division by zero for homes without a garage.
        "GarageArea_Per_Car": df["GarageArea"] / (df["GarageCars"] + 0.1),
        # Squared living area captures a non-linear relationship with price.
        "GrLivArea_Sq": df["GrLivArea"] ** 2,
        "GrLivArea_x_Quality": df["GrLivArea"] * df["OverallQual"],
        "Age_x_Quality": house_age * df["OverallQual"],
        # Three-month buckets of MoSold; one-hot encoded later by get_dummies.
        "Season_Sold": pd.cut(
            df["MoSold"],
            bins=[0, 3, 6, 9, 12],
            labels=['Winter', 'Spring', 'Summer', 'Fall'],
            include_lowest=True,
        ),
    })

    base = df.drop(columns=[name for name in features if name in df.columns])
    return pd.concat([base, pd.DataFrame(features, index=df.index)], axis=1)


test_df = fill_missing_values(test_df)
train_df = fill_missing_values(train_df)

# Mean sale price per neighborhood, taken from the training data.
neighborhood_means = train_df.groupby('Neighborhood')['SalePrice'].mean()
test_df = _add_engineered_features(test_df, neighborhood_means)

# Id is an identifier, not a feature. errors="ignore" keeps the cell re-runnable.
test_df = test_df.drop(columns=["Id"], errors="ignore")

  test_df['FinishedBsmtRatio'] = test_df['TotalFinishedBsmtSF'] / (test_df['TotalBsmtSF'] + 1)
  test_df['Garage_Age'] = test_df['YrSold'] - test_df['GarageYrBlt']
  test_df['GarageArea_Per_Car'] = test_df['GarageArea'] / (test_df['GarageCars'] + 0.1)
  test_df['GrLivArea_Sq'] = test_df['GrLivArea'] ** 2
  test_df['GrLivArea_x_Quality'] = test_df['GrLivArea'] * test_df['OverallQual']
  test_df['Age_x_Quality'] = test_df['House_Age'] * test_df['OverallQual']
  test_df['Season_Sold'] = pd.cut(


In [None]:
import pickle

# Column list saved by the training notebook; it defines the model's input layout.
with open("/content/drive/MyDrive/kaggle/house_prices/train_columns.pkl", "rb") as columns_file:
    train_columns = pickle.load(columns_file)

# One-hot encode (dropping each first level), then force the training layout:
# columns the test set lacks are added as all-zero, extra ones are discarded.
test_dummies = pd.get_dummies(test_df, drop_first=True).reindex(
    columns=train_columns, fill_value=0
)
X_test = test_dummies.values

In [None]:
from mlflow import sklearn as mlflow_sklearn

# Run id of the best run selected earlier in the notebook.
best_run_id = best_run.info.run_id

# NOTE(review): assumes the run logged its model under the artifact path
# "model" with the sklearn flavor — confirm against the training notebook.
model_uri = f"runs:/{best_run_id}/model"
best_model = mlflow_sklearn.load_model(model_uri)




Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]



In [None]:
import pandas as pd
import pickle

# Load the training columns.
# NOTE: pickle.load can execute arbitrary code; only load a file you produced yourself.
with open("/content/drive/MyDrive/kaggle/house_prices/train_columns.pkl", "rb") as f:
    train_columns = pickle.load(f)

# One-hot encode the test set without drop_first.
test_dummies = pd.get_dummies(test_df)

# Report training columns the test set cannot produce (categories that never
# occur in the test data). Sorted so the output is reproducible.
missing_cols = set(train_columns) - set(test_dummies.columns)
print(sorted(missing_cols))

# A single reindex adds the missing columns as zeros, drops columns unknown to
# training, and puts everything in training order. This replaces the
# column-by-column insert loop, which fragments the frame.
test_dummies = test_dummies.reindex(columns=train_columns, fill_value=0)
X_test = test_dummies.values

{'MiscFeature_TenC', 'Heating_OthW', 'Utilities_NoSeWa', 'HouseStyle_2.5Fin', 'RoofMatl_Membran', 'Exterior1st_Stone', 'Electrical_None', 'Exterior2nd_Other', 'PoolQC_Fa', 'Condition2_RRNn', 'Condition2_RRAn', 'RoofMatl_Roll', 'Exterior1st_ImStucc', 'Condition2_RRAe', 'RoofMatl_Metal', 'Electrical_Mix'}


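**Known issue:** the model input does not line up with the saved training columns. The next cell works around this by padding the feature matrix to 269 columns.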

In [None]:
# Input width used when the estimator does not expose `n_features_in_`.
FALLBACK_N_FEATURES = 269


def _lookup_feature_names(model):
    """
    Return the feature names stored on `model` as a list, or None.

    Tries the scikit-learn style `feature_names_in_` attribute first, then the
    XGBoost booster. A missing attribute is handled explicitly; a failing
    booster lookup is reported instead of being swallowed by a bare `except:`.
    """
    names = getattr(model, "feature_names_in_", None)
    if names is not None:
        print(f"Feature names retrieved from model: {len(names)}")
        return list(names)

    get_booster = getattr(model, "get_booster", None)
    if callable(get_booster):
        try:
            names = get_booster().feature_names
        except Exception as exc:
            print(f"Booster feature-name lookup failed: {exc!r}")
            names = None
        if names is not None:
            print(f"Feature names retrieved from XGBoost model: {len(names)}")
            return list(names)

    print("Could not retrieve feature names directly from model")
    return None


feature_names = _lookup_feature_names(best_model)
# Prefer the width recorded on the fitted estimator over the hard-coded number.
n_expected = getattr(best_model, "n_features_in_", FALLBACK_N_FEATURES)

if feature_names is not None and len(feature_names) == n_expected:
    target_columns = feature_names
else:
    print("Creating fake feature column set with correct dimension")
    target_columns = list(train_columns)
    n_padding = n_expected - len(target_columns)
    if n_padding < 0:
        raise ValueError(
            f"train_columns has {len(target_columns)} entries but the model "
            f"expects only {n_expected} features"
        )
    target_columns += [
        "extra_feature" if i == 0 else f"extra_feature_{i}" for i in range(n_padding)
    ]
    if n_padding:
        # FIXME: the padded columns are all-zero placeholders appended at the
        # end. If the real missing feature sits elsewhere in the training
        # layout, every column after it is shifted and predictions are wrong.
        print(f"WARNING: padded {n_padding} placeholder column(s) to reach {n_expected} features")

# 3. Fill in the values we do have. Columns absent from the test dummies stay 0.
# Cast to float so `.values` yields a numeric array rather than an object array
# of mixed bool/float columns.
test_dummies = pd.get_dummies(test_df)
X_test_fixed = test_dummies.reindex(columns=target_columns, fill_value=0).astype(float)

print(f"Final test data shape: {X_test_fixed.shape}")



Could not retrieve feature names directly from model
Creating fake feature column set with correct dimension
Final test data shape: (1459, 269)
Submission created successfully!


In [None]:
# Predict on the padded/aligned feature matrix as a plain array.
X_test = X_test_fixed.values
test_preds_log = best_model.predict(X_test)
# expm1 inverts log1p. NOTE(review): presumably the model was trained on
# log1p(SalePrice) — confirm against the training notebook.
test_preds = np.expm1(test_preds_log)

In [None]:
# Pair every test-set Id with its predicted price and write the submission CSV.
submission_columns = {"Id": test_ids, "SalePrice": test_preds}
submission = pd.DataFrame(submission_columns)
submission.to_csv("submission.csv", index=False)
print("Submission created successfully!")