## 1. Importing Libraries

In [186]:
# Import Necessary Libraries ,
import numpy as np
import pandas as pd
import sklearn as sklearn
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_log_error


# 2. Model Building 

In [187]:
df=pd.read_csv("../data/train.csv")
FEATURES_LIST=['GrLivArea','GarageArea','TotalBsmtSF','Neighborhood','MSZoning', 'OverallQual','OverallCond']
TARGET_VARIABLE='SalePrice'
NUMERIC_FEATURES = ['GrLivArea', 'GarageArea', 'TotalBsmtSF']
CATEGORICAL_FEATURES = ['Neighborhood', 'MSZoning']

In [188]:
def preprocess_data(df, NUMERIC_FEATURES , CATEGORICAL_FEATURES):
    """Preprocess the data by filling missing values."""
    df_processed=df.copy()
    df_processed[NUMERIC_FEATURES] = df_processed[NUMERIC_FEATURES].fillna(df_processed[NUMERIC_FEATURES].mean())
    for col in CATEGORICAL_FEATURES:
        df_processed[col] = df_processed[col].fillna(df_processed[col].mode()[0])
    return df_processed

In [189]:
def standardize_data(df, NUMERIC_FEATURES):
    """Standardize numerical features."""
    scaler = StandardScaler()
    scaler.fit(df[NUMERIC_FEATURES])
    df[NUMERIC_FEATURES] = scaler.transform(df[NUMERIC_FEATURES])
    return df, scaler

In [190]:
def encode_features(df, features):
    """Encode categorical features."""
    encoder = OneHotEncoder(sparse_output=False, dtype=np.int64)
    encoder.fit(df[features])
    encoded_df = encoder.transform(df[features])
    columns=[]
    for feature, categories in zip(features, encoder.categories_):
        columns.extend([f'{feature}_is_{category}' for category in categories])
    encoded_df = pd.DataFrame(encoded_df, columns=columns, index=df.index)
    df = df.drop(features, axis=1)
    df = pd.concat([df, encoded_df], axis=1)
    return df, encoder

In [191]:
def compute_rmsle(y_true: np.ndarray, y_pred: np.ndarray, precision: int = 2) -> float:
    """Compute the Root Mean Squared Logarithmic Error."""
    rmsle = np.sqrt(mean_squared_log_error(y_true, y_pred))
    return round(rmsle, precision)

In [192]:
def prepare_data(df, numeric_features, categorical_features, scaler=None, encoder=None, fit_transform=True):
    """Prepare the data by preprocessing, standardizing, and encoding."""
    
    df = preprocess_data(df, numeric_features, categorical_features)
    
    if fit_transform:
        df, scaler = standardize_data(df, numeric_features)
    else:
        df[numeric_features] = scaler.transform(df[numeric_features])
    
    if fit_transform:
        df, encoder = encode_features(df, categorical_features)
    else:
        df = encode_features(df, categorical_features)[0]

    return df, scaler, encoder

In [195]:
def build_model(data: pd.DataFrame) -> dict[str, str]:
    """Build and evaluate the model from the provided DataFrame."""
    X_train, X_test, y_train, y_test = train_test_split(data[FEATURES_LIST], data[TARGET_VARIABLE], test_size=0.25, random_state=42)

    X_train, scaler, encoder = prepare_data(X_train, NUMERIC_FEATURES, CATEGORICAL_FEATURES)
    
    joblib.dump(scaler, '../models/scaler.joblib')
    joblib.dump(encoder, '../models/ohe_encoder.joblib')
    X_test, _, _ = prepare_data(X_test, NUMERIC_FEATURES, CATEGORICAL_FEATURES, scaler, encoder, fit_transform=False)

    model = LinearRegression()
    model.fit(X_train, y_train)
    joblib.dump(model, '../models/linear_regression_model.joblib')

    y_pred = model.predict(X_test)
    y_pred = np.delete(y_pred, 134)
    y_test = np.delete(y_test, 134)
    print(type(y_test),type(y_pred))
    return {'rmsle': compute_rmsle(y_test, y_pred)}


In [196]:
build_model(df)


<class 'numpy.ndarray'> <class 'numpy.ndarray'>


{'rmsle': 0.17}

## 2.3. Model Inference

In [None]:
testing_data=pd.read_csv("../data/test.csv")

In [None]:
def transform_data(df, numeric_features, categorical_features, scaler, encoder):
    """Transform the data using the provided scaler and encoder."""
    df = df.copy()  
    df.loc[:, numeric_features] = scaler.transform(df[numeric_features])
    encoded_array = encoder.transform(df[categorical_features])
    encoded_df = pd.DataFrame(encoded_array, columns=[f'{feature}_is_{category}' for feature, categories in zip(categorical_features, encoder.categories_) for category in categories], index=df.index)
    df = pd.concat([df.drop(categorical_features, axis=1), encoded_df], axis=1)
    return df

In [None]:
def make_predictions(input_data: pd.DataFrame) -> np.ndarray:
    """Make predictions using the trained model."""
    
    loaded_scaler = joblib.load('../models/scaler.joblib')
    loaded_encoder = joblib.load('../models/ohe_encoder.joblib')
    loaded_model = joblib.load('../models/linear_regression_model.joblib')

    X_testing = input_data.copy()[FEATURES_LIST]
    X_testing = preprocess_data(X_testing, NUMERIC_FEATURES, CATEGORICAL_FEATURES)
    X_testing = transform_data(X_testing, NUMERIC_FEATURES, CATEGORICAL_FEATURES, loaded_scaler, loaded_encoder)

    return loaded_model.predict(X_testing)

In [None]:
make_predictions(testing_data)

  0.88960995]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  df.loc[:, numeric_features] = scaler.transform(df[numeric_features])


array([114049.00060886, 159008.45274182, 170658.99632136, ...,
       159855.81832916, 106267.88056796, 213341.27851392])