In [43]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.impute import SimpleImputer
import joblib
import os

In [44]:
# File paths

DATA_FILENAME = 'C:/Users/SOHAM/dsp-soham-chakraborty/data/train.csv'
MODEL_PATH = 'C:/Users/SOHAM/dsp-soham-chakraborty/models/'

## Model Build Function

In [45]:
def build_model(data: pd.DataFrame) -> dict:
 
    # Select useful features
    label_col = 'SalePrice'
    useful_features = ['OverallQual', 'GrLivArea', 'GarageCars', 'TotRmsAbvGrd', 'YearBuilt', 'YearRemodAdd', 'ExterQual']
   
    # Select features and label for training data
    useful_data = data[useful_features + [label_col]]
    
    # Split data to avoid leakage
    train_df, test_df = train_test_split(useful_data, test_size=0.33, random_state=42)


    # Preprocess data
    continuous_features = ['GrLivArea', 'GarageCars', 'TotRmsAbvGrd', 'YearBuilt', 'YearRemodAdd']
    ordinal_feature = 'ExterQual'
    
    # Ordinal Encoding
    exterior_quality_dict = {'Ex': 4, 'Gd': 3, 'TA': 2, 'Fa': 1}
    train_df[ordinal_feature] = train_df[ordinal_feature].apply(lambda x: exterior_quality_dict.get(x, np.nan))
    test_df[ordinal_feature] = test_df[ordinal_feature].apply(lambda x: exterior_quality_dict.get(x, np.nan))

    
    # Scaling
    scaler = StandardScaler()
    scaler.fit(train_df[continuous_features])
    train_scaled = scaler.transform(train_df[continuous_features])
    test_scaled = scaler.transform(test_df[continuous_features])

    # Combine preprocessed features
    train_preprocessed = pd.concat([
        pd.DataFrame(train_scaled, columns=continuous_features),
        train_df[ordinal_feature].reset_index(drop=True)
    ], axis=1)
    test_preprocessed = pd.concat([
        pd.DataFrame(test_scaled, columns=continuous_features),
        test_df[ordinal_feature].reset_index(drop=True)
    ], axis=1)

    # Train the model
    model = LinearRegression()
    model.fit(train_preprocessed, train_df[label_col])

    # Evaluate the model
    y_pred = model.predict(test_preprocessed)
    rmse = np.sqrt(mean_squared_error(test_df[label_col], y_pred))
    return {'rmse': rmse}
    #print(f"Root Mean Squared Error: {rmse}")

    # Saving the model and scaler
    joblib.dump(model, os.path.join(MODEL_PATH, 'model.joblib'))
    joblib.dump(scaler, os.path.join(MODEL_PATH, 'scaler.joblib'))

    return {'rmse': rmse}

## Prediction Function

In [46]:
# Function for making predictions on new data

def make_predictions(input_data: pd.DataFrame) -> np.ndarray:
    # Features used during model training
    continuous_features = ['GrLivArea', 'GarageCars', 'FullBath', 'YearBuilt', 'YearRemodAdd']
    ordinal_feature = 'ExterQual'
    exterior_quality_dict = {'Ex': 4, 'Gd': 3, 'TA': 2, 'Fa': 1}

    # Load scaler and model
    scaler = joblib.load(os.path.join(MODEL_PATH, 'scaler.joblib'))
    model = joblib.load(os.path.join(MODEL_PATH, 'model.joblib'))

    # Preprocess ordinal feature
    input_data[ordinal_feature] = input_data[ordinal_feature].apply(lambda x: exterior_quality_dict.get(x, np.nan))

    # Ensure the test data has the same features as the training data
    input_data_subset = input_data[continuous_features + [ordinal_feature]]

    # Handle missing values
    imputer = SimpleImputer(strategy='mean')
    input_data_imputed = pd.DataFrame(imputer.fit_transform(input_data_subset), columns=input_data_subset.columns)

    # Apply the scaler to the continuous features
    input_scaled = scaler.transform(input_data_imputed[continuous_features])

    # Combine scaled continuous features and ordinal feature into a DataFrame
    input_preprocessed = pd.concat([
        pd.DataFrame(input_scaled, columns=continuous_features),
        input_data_imputed[ordinal_feature].reset_index(drop=True)
    ], axis=1)

    # Make predictions
    predictions = model.predict(input_preprocessed)
    return predictions

# Model Building

In [47]:
# Model Building
training_data_df = pd.read_csv(DATA_FILENAME)
model_performance_dict = build_model(training_data_df)
print(model_performance_dict)

{'rmse': 42793.603797757896}


# Model Inference

In [48]:
# Model Inference
test_data_path = 'C:/Users/SOHAM/dsp-soham-chakraborty/data/test.csv'
new_data_df = pd.read_csv(test_data_path)
predictions = make_predictions(new_data_df)
print(predictions)

[101872.6718482  129607.08349072 189739.96063635 ... 157001.48484206
 107617.70951989 234409.30836151]
