In [1]:
# Import necessary libraries

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import joblib
import os

In [2]:
# File paths

DATA_FILENAME = 'C:/Users/SOHAM/dsp-soham-chakraborty/data/train.csv'
MODEL_PATH = 'C:/Users/SOHAM/dsp-soham-chakraborty/models/'

## Model Build Function

In [3]:
# Function for building the model
def build_model(data: pd.DataFrame) -> dict:
    # Split data to avoid leakage
    train_df, test_df = train_test_split(data, test_size=0.33, random_state=42)

    # Persist the trained model
    label_col = 'SalePrice'
    useful_features = ['Foundation', 'KitchenQual', 'TotRmsAbvGrd', 'WoodDeckSF', 'YrSold', '1stFlrSF']

    # Select features and label for training data
    train_df = train_df[useful_features + [label_col]]

    # Feature groups
    continuous_features = ['TotRmsAbvGrd', 'YrSold', '1stFlrSF']
    categorical_features = ['Foundation', 'KitchenQual']
    features_to_one_hot_encode = ['Foundation']
    features_to_ordinal_encode = ['KitchenQual']

    # One Hot Encoding
    one_hot_encoder = OneHotEncoder(sparse_output=False)
    one_hot_encoder.fit(train_df[features_to_one_hot_encode])
    train_one_hot_encoded = one_hot_encoder.transform(train_df[features_to_one_hot_encode])

    # Ordinal Encoding
    kitchen_quality_dict = {'Ex': 4, 'Gd': 3, 'TA': 2, 'Fa': 1}
    train_df['KitchenQual'] = train_df['KitchenQual'].apply(lambda x: kitchen_quality_dict[x])

    # Scaling
    scaler = StandardScaler()
    scaler.fit(train_df[continuous_features])
    train_scaled = scaler.transform(train_df[continuous_features])

    # Convert encoded and scaled data back to DataFrame
    train_one_hot_encoded_df = pd.DataFrame(train_one_hot_encoded, columns=one_hot_encoder.get_feature_names_out())
    train_scaled_df = pd.DataFrame(train_scaled, columns=continuous_features)

    # Combine all processed features
    X_train = pd.concat([train_one_hot_encoded_df, train_df[features_to_ordinal_encode], train_scaled_df], axis=1)
    y_train = train_df[label_col]

    # Save the encoders and scaler
    joblib.dump(one_hot_encoder, os.path.join(MODEL_PATH, 'one_hot_encoder.joblib'))
    joblib.dump(scaler, os.path.join(MODEL_PATH, 'scaler.joblib'))

    # Model training
    model = LinearRegression()
    model.fit(X_train, y_train)

    # Save the trained model
    joblib.dump(model, os.path.join(MODEL_PATH, 'model.joblib'))

    # Model evaluation
    test_df = test_df[useful_features + [label_col]]

    # One Hot Encoding for test data
    test_one_hot_encoded = one_hot_encoder.transform(test_df[features_to_one_hot_encode])

    # Ordinal Encoding for test data
    test_df['KitchenQual'] = test_df['KitchenQual'].apply(lambda x: kitchen_quality_dict[x])

    # Scaling for test data
    test_scaled = scaler.transform(test_df[continuous_features])

    # Convert encoded and scaled data back to DataFrame
    test_one_hot_encoded_df = pd.DataFrame(test_one_hot_encoded, columns=one_hot_encoder.get_feature_names_out())
    test_scaled_df = pd.DataFrame(test_scaled, columns=continuous_features)

    # Combine all processed features
    X_test = pd.concat([test_one_hot_encoded_df, test_df[features_to_ordinal_encode], test_scaled_df], axis=1)
    y_test = test_df[label_col]

    # Make predictions
    y_pred = model.predict(X_test)

    # Evaluate the model
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))

    return {'rmse': rmse}


## Prediction Function

In [4]:
# Function for making predictions on new data

def make_predictions(input_data: pd.DataFrame) -> np.ndarray:
    # Load the saved model and preprocessors
    model = joblib.load(os.path.join(MODEL_PATH, 'model.joblib'))
    one_hot_encoder = joblib.load(os.path.join(MODEL_PATH, 'one_hot_encoder.joblib'))
    scaler = joblib.load(os.path.join(MODEL_PATH, 'scaler.joblib'))

    # Preprocess the data
    useful_features = ['Foundation', 'KitchenQual', 'TotRmsAbvGrd', 'WoodDeckSF', 'YrSold', '1stFlrSF']
    continuous_features = ['TotRmsAbvGrd', 'YrSold', '1stFlrSF']
    features_to_one_hot_encode = ['Foundation']
    features_to_ordinal_encode = ['KitchenQual']
    kitchen_quality_dict = {'Ex': 4, 'Gd': 3, 'TA': 2, 'Fa': 1}

    input_data = input_data[useful_features]

    # One Hot Encoding
    one_hot_encoded = one_hot_encoder.transform(input_data[features_to_one_hot_encode])

    # Ordinal Encoding
    input_data['KitchenQual'] = input_data['KitchenQual'].apply(lambda x: kitchen_quality_dict[x])

    # Scaling
    scaled = scaler.transform(input_data[continuous_features])

    # Convert encoded and scaled data back to DataFrame
    one_hot_encoded_df = pd.DataFrame(one_hot_encoded, columns=one_hot_encoder.get_feature_names_out())
    scaled_df = pd.DataFrame(scaled, columns=continuous_features)

    # Combine all processed features
    X = pd.concat([one_hot_encoded_df, input_data[features_to_ordinal_encode], scaled_df], axis=1)

    # Make predictions
    predictions = model.predict(X)
    return predictions

# Model Building

In [5]:
# Model Building
training_data_df = pd.read_csv(DATA_FILENAME)
model_performance_dict = build_model(training_data_df)
print(model_performance_dict)

ValueError: Input X contains NaN.
LinearRegression does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

# Model Inference

In [6]:
# Model Inference
test_data_path = 'C:/Users/SOHAM/dsp-soham-chakraborty/data/test.csv'
new_data_df = pd.read_csv(test_data_path)
predictions = make_predictions(new_data_df)
print(predictions)

FileNotFoundError: [Errno 2] No such file or directory: 'C:/Users/SOHAM/dsp-soham-chakraborty/data/test.csv'