## 1. Importing Libraries

In [91]:

# Import Necessary Libraries
import numpy as np
import pandas as pd
import sklearn as sklearn
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_log_error


# 2. Model Building 

In [92]:
df=pd.read_csv("C:/Users/usatoleb/Desktop/Epita/Semester2/DSProduction/House-predictions/train.csv")
FEATURES_LIST=['GrLivArea','GarageArea','TotalBsmtSF','OverallQual','OverallCond','Neighborhood','MSZoning']
TARGET_VARIABLE='SalePrice'
NUMERIC_FEATURES = ['GrLivArea', 'GarageArea', 'TotalBsmtSF']
CATEGORICAL_FEATURES = ['Neighborhood', 'MSZoning']

In [93]:
def preprocess_data(df, NUMERIC_FEATURES , CATEGORICAL_FEATURES):
    """Preprocess the data by filling missing values."""
    df_processed=df.copy()
    df_processed[NUMERIC_FEATURES] = df_processed[NUMERIC_FEATURES].fillna(df_processed[NUMERIC_FEATURES].mean())
    for col in CATEGORICAL_FEATURES:
        df_processed[col] = df_processed[col].fillna(df_processed[col].mode()[0])
    return df_processed

In [94]:
def standardize_data(df, NUMERIC_FEATURES):
    """Standardize numerical features."""
    scaler = StandardScaler()
    scaler.fit(df[NUMERIC_FEATURES])
    df[NUMERIC_FEATURES] = scaler.transform(df[NUMERIC_FEATURES])
    return df, scaler

In [95]:
def encode_features(df, feature):
    """Encode categorical features."""
    encoder = OneHotEncoder(sparse_output=False, dtype=np.int64)
    encoder.fit(df[[feature]])
    encoded_df = encoder.transform(df[[feature]])
    columns = [f'{feature}_is_{category}' for category in encoder.categories_[0]]
    encoded_df = pd.DataFrame(encoded_df, columns=columns, index=df.index)
    df = df.drop([feature], axis=1)
    df = pd.concat([df, encoded_df], axis=1)
    return df, encoder

In [96]:
def compute_rmsle(y_true, y_pred, precision=2):
    """Compute the Root Mean Squared Logarithmic Error."""
    rmsle = np.sqrt(mean_squared_log_error(y_true, y_pred))
    return round(rmsle, precision)

In [97]:
def build_model(data:pd.DataFrame)-> dict[str,str]:
    """Build and evaluate the model from the provided DataFrame."""
    df = data[FEATURES_LIST + [TARGET_VARIABLE]]
    x_data, y_data = df[FEATURES_LIST], df[TARGET_VARIABLE]
    X_train, X_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.25, random_state=42)

    # Preprocessing
    X_train = preprocess_data(X_train, NUMERIC_FEATURES, CATEGORICAL_FEATURES)
    X_train, scaler = standardize_data(X_train, NUMERIC_FEATURES)
    joblib.dump(scaler, 'C:/Users/usatoleb/dsp-charbel-boumalham/models/scaler.joblib')
    for feature in CATEGORICAL_FEATURES:
        X_train, encoder = encode_features(X_train, feature)
        # Assuming you might want to use the encoder later, consider saving it
        joblib.dump(encoder, f'C:/Users/usatoleb/dsp-charbel-boumalham/models/ohe_encoder_{feature}.joblib')

    # Apply preprocessing to test data
    X_test = preprocess_data(X_test, NUMERIC_FEATURES, CATEGORICAL_FEATURES)
    X_test[NUMERIC_FEATURES] = scaler.transform(X_test[NUMERIC_FEATURES])
    for feature in CATEGORICAL_FEATURES:
        X_test, _ = encode_features(X_test, feature)  # Encoder fitted on the training set

    # Model training
    model = LinearRegression()
    model.fit(X_train, y_train)
    joblib.dump(model,'C:/Users/usatoleb/dsp-charbel-boumalham/models/linear_regression_model.joblib')

    # Prediction and evaluation
    y_pred = model.predict(X_test)
    y_pred=np.delete(y_pred,134)
    y_test=np.delete(y_test,134)
    rmsle = compute_rmsle(y_test, y_pred)

    return {'rmsle': rmsle}


In [98]:
build_model(df)

{'rmsle': 0.17}

## 2.3. Model Inference

In [99]:
testing_data=pd.read_csv("C:/Users/usatoleb/Desktop/Epita/Semester2/DSProduction/House-predictions/test.csv")

In [100]:
def make_predictions(input_data: pd.DataFrame) -> np.ndarray:
    """Make predictions using the trained model."""
    
    loaded_scaler = joblib.load('C:/Users/usatoleb/dsp-charbel-boumalham/models/scaler.joblib')
    encoders = {feature: joblib.load(f'C:/Users/usatoleb/dsp-charbel-boumalham/models/ohe_encoder_{feature}.joblib') for feature in CATEGORICAL_FEATURES}
    loaded_model = joblib.load('C:/Users/usatoleb/dsp-charbel-boumalham/models/linear_regression_model.joblib')
    
    X_testing=input_data.copy()[FEATURES_LIST]
    X_testing = preprocess_data(X_testing, NUMERIC_FEATURES, CATEGORICAL_FEATURES)
    
    
    X_testing.loc[:, NUMERIC_FEATURES] = loaded_scaler.transform(X_testing[NUMERIC_FEATURES])
    
    for feature, encoder in encoders.items():
        encoded_df = encoder.transform(X_testing[[feature]])
        columns = [f'{feature}_is_{category}' for category in encoder.categories_[0]]
        encoded_df = pd.DataFrame(encoded_df, columns=columns)
        X_testing = X_testing.drop([feature], axis=1)
        X_testing = pd.concat([X_testing, encoded_df], axis=1)
        
    predictions=loaded_model.predict(X_testing)
    return predictions
    #X = preprocess_data(input_data, NUMERIC_FEATURES, CATEGORICAL_FEATURES)
    #for feature in CATEGORICAL_FEATURES:
    #    X, _ = encode_features(X, feature)  # Encoder fitted on the training set
    #model = joblib.load('C:/Users/usatoleb/dsp-charbel-boumalham/models/linear_regression_model.joblib')
    #predictions = model.predict(X)
    #return predictions

In [101]:
make_predictions(testing_data)

  0.88960995]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  X_testing.loc[:, NUMERIC_FEATURES] = loaded_scaler.transform(X_testing[NUMERIC_FEATURES])


array([114049.00060886, 159008.45274182, 170658.99632136, ...,
       159855.81832916, 106267.88056796, 213341.27851392])