In [23]:
# Import necessary libraries

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.impute import SimpleImputer
import joblib
import os

In [24]:
# File paths
#
# The defaults point at the original author's local checkout. Set the
# DSP_DATA_FILENAME / DSP_MODEL_PATH environment variables to run the notebook
# on another machine without editing this cell.
DATA_FILENAME = os.environ.get(
    'DSP_DATA_FILENAME',
    'C:/Users/SOHAM/dsp-soham-chakraborty/data/train.csv',
)
MODEL_PATH = os.environ.get(
    'DSP_MODEL_PATH',
    'C:/Users/SOHAM/dsp-soham-chakraborty/models/',
)

## Model Build Function

In [25]:
def build_model(data: pd.DataFrame) -> dict:
    """Train a linear regression on selected house features and persist it.

    The selected columns are split into train/test partitions (33% held out,
    ``random_state=42``), ``ExterQual`` is mapped onto an ordinal scale, the
    continuous features are standardised using statistics from the training
    partition only, and a ``LinearRegression`` is fitted on the result.

    The fitted model and scaler are written to ``models/model.joblib`` and
    ``models/scaler.joblib`` (relative to the current working directory).

    Args:
        data: Frame containing every column in ``useful_features`` plus the
            ``SalePrice`` label.

    Returns:
        A dict ``{'rmse': value}`` holding the root mean squared error on the
        held-out partition.
    """
    # Select useful features
    label_col = 'SalePrice'
    # NOTE(review): 'OverallQual' is selected here but is not part of the
    # feature matrix built below — confirm whether it was meant to be used.
    useful_features = ['OverallQual', 'GrLivArea', 'GarageCars', 'FullBath', 'YearBuilt', 'YearRemodAdd', 'ExterQual']

    # Select features and label for training data
    useful_data = data[useful_features + [label_col]]

    # Split before any fitting so test rows never influence the scaler
    train_df, test_df = train_test_split(useful_data, test_size=0.33, random_state=42)
    # Explicit copies: the column assignments below must not write into a
    # slice of the caller's frame (avoids SettingWithCopyWarning).
    train_df = train_df.copy()
    test_df = test_df.copy()

    # Preprocess data
    continuous_features = ['GrLivArea', 'GarageCars', 'FullBath', 'YearBuilt', 'YearRemodAdd']
    ordinal_feature = 'ExterQual'

    # Ordinal encoding; values missing from the mapping become NaN
    exterior_quality_dict = {'Ex': 4, 'Gd': 3, 'TA': 2, 'Fa': 1, 'Po': 0}
    train_df[ordinal_feature] = train_df[ordinal_feature].map(exterior_quality_dict)
    test_df[ordinal_feature] = test_df[ordinal_feature].map(exterior_quality_dict)

    # Scaling: fit on the training partition only, apply to both
    scaler = StandardScaler()
    scaler.fit(train_df[continuous_features])
    train_scaled = scaler.transform(train_df[continuous_features])
    test_scaled = scaler.transform(test_df[continuous_features])

    # Combine preprocessed features; the index is reset so the positional
    # scaled frame and the encoded column line up row by row
    train_preprocessed = pd.concat([
        pd.DataFrame(train_scaled, columns=continuous_features),
        train_df[ordinal_feature].reset_index(drop=True)
    ], axis=1)
    test_preprocessed = pd.concat([
        pd.DataFrame(test_scaled, columns=continuous_features),
        test_df[ordinal_feature].reset_index(drop=True)
    ], axis=1)

    # Train the model
    model = LinearRegression()
    model.fit(train_preprocessed, train_df[label_col])

    # Evaluate the model
    y_pred = model.predict(test_preprocessed)
    rmse = np.sqrt(mean_squared_error(test_df[label_col], y_pred))

    # Save model and transformers. Previously an early `return` sat above
    # these lines, so nothing was ever written to disk.
    os.makedirs('models', exist_ok=True)
    joblib.dump(model, 'models/model.joblib')
    joblib.dump(scaler, 'models/scaler.joblib')

    return {'rmse': rmse}

## Prediction Function

In [26]:
# Function for making predictions on new data

def make_predictions(input_data: pd.DataFrame) -> np.ndarray:
    """Predict with the model and scaler stored under ``models/``.

    The same preprocessing as in training is applied: the continuous features
    are standardised with the persisted scaler and ``ExterQual`` is mapped to
    its ordinal code. ``input_data`` itself is left unmodified, so the function
    can be called repeatedly on the same frame.

    Missing continuous values are replaced, after scaling, by ``0.0`` — the
    training mean in standardised space. ``ExterQual`` values that are missing
    or absent from the mapping stay NaN and are left for the model to reject.

    Args:
        input_data: Frame containing the continuous feature columns and
            ``ExterQual``.

    Returns:
        The model's predictions, one per input row.
    """
    continuous_features = ['GrLivArea', 'GarageCars', 'FullBath', 'YearBuilt', 'YearRemodAdd']
    ordinal_feature = 'ExterQual'
    exterior_quality_dict = {'Ex': 4, 'Gd': 3, 'TA': 2, 'Fa': 1, 'Po': 0}

    # joblib files are pickles: only load artefacts produced by a trusted run.
    scaler = joblib.load('models/scaler.joblib')
    model = joblib.load('models/model.joblib')

    # Encode into a new Series instead of overwriting the caller's column;
    # overwriting made a second call on the same frame encode everything to NaN.
    ordinal_encoded = input_data[ordinal_feature].map(exterior_quality_dict).reset_index(drop=True)

    input_scaled = pd.DataFrame(
        scaler.transform(input_data[continuous_features]),
        columns=continuous_features,
    )
    # StandardScaler passes NaN through transform; 0.0 is the training mean
    # after standardisation, so this is mean imputation.
    input_scaled = input_scaled.fillna(0.0)

    input_preprocessed = pd.concat([input_scaled, ordinal_encoded], axis=1)

    predictions = model.predict(input_preprocessed)
    return predictions


# Model Building

In [27]:
# Model Building: read the training CSV, fit the model, and report its
# hold-out RMSE.
training_data_df = pd.read_csv(filepath_or_buffer=DATA_FILENAME)
model_performance_dict = build_model(data=training_data_df)
print(model_performance_dict)

{'rmse': 42629.32431774989}


# Model Inference

In [None]:
# Model Inference: read the unseen test CSV and score it with the persisted
# model and scaler.
test_data_path = 'C:/Users/SOHAM/dsp-soham-chakraborty/data/test.csv'
new_data_df = pd.read_csv(filepath_or_buffer=test_data_path)
predictions = make_predictions(input_data=new_data_df)
print(predictions)