# **Installing Pre-requisites for our Analysis**

In [49]:
!pip install pandas
!pip install scikit-learn
!pip install pyarrow
!pip install joblib



In [50]:
import pandas as pd
import numpy as np
import joblib
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from IPython.display import display

# **MODEL BUILDING**

In [51]:
# Reading the csv file using read_csv function
dataset_for_training = r"C:/Users/edwin victor/git repositories/dsp-edwinvictor-justin/data/train.csv"
Training_dataset = pd.read_csv(dataset_for_training)

In [52]:
# Declaring feature selection variables outside incase if i want to change the features i can do it just here

selected_features = ['LotArea', 'GrLivArea', 'Neighborhood', 'HouseStyle'] # 2 continuos and 2 Categorical features
target_feature = ['SalePrice']
continuos_datatype_features = ['LotArea', 'GrLivArea']
discrete_datatype_features = ['Neighborhood', 'HouseStyle']

In [57]:
def build_model(data: pd.DataFrame) -> dict[str, str]:

    """
    Builds a linear regression model, trains it on the provided data, and evaluates its performance.

    Args:
        data (pd.DataFrame): The dataset to train and evaluate the model on.

    Returns:
        dict[str, str]: A dictionary containing the root mean squared log error of the model.
    """

## Training_set
    
    # 1) splitting the dataset
    
    X = Training_dataset.drop(target_feature, axis=1)
    y = Training_dataset[target_feature]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.35, random_state=50)


    # 2) Extracting the features from training set
    
    Extracted_Selected_Features_For_Training = X_train[selected_features]
    Extracted_Target_Attribute = y_train[target_feature]
    Training_Features = pd.concat([Extracted_Selected_Features_For_Training,y_train], axis=1)

    # 3) Encoding the categorical columns from training set

    encoder = OneHotEncoder(sparse_output= False)
    encoder.fit(Training_Features[discrete_datatype_features])
    Training_encoded_categories = encoder.transform(Training_Features[discrete_datatype_features])
    encoded_discrete_features_training_df = pd.DataFrame(Training_encoded_categories, columns=encoder.get_feature_names_out(discrete_datatype_features))

    # 4) Scaling the continuos columns from training set

    scaler = StandardScaler()
    scaler.fit(X_train[continuos_datatype_features])
    scaled_continuos_features_training_df = scaler.transform(X_train[continuos_datatype_features])

    # 5) Concatenating the processed training set
    
    training_continuous_features_df = pd.DataFrame(scaled_continuos_features_training_df , columns= continuos_datatype_features)
    Processed_Training_Df = pd.concat([training_continuous_features_df, encoded_discrete_features_training_df] , axis=1)

    # 6) Fitting the model
    
    model = LinearRegression()
    model.fit(Processed_Training_Df, y_train)

    #7) SAVING THE MODEL
    joblib.dump(model,r'C:/Users/edwin victor/git repositories/dsp-edwinvictor-justin/models/model.joblib')
    joblib.dump(scaler,r'C:/Users/edwin victor/git repositories/dsp-edwinvictor-justin/models/scaler.joblib')
    joblib.dump(encoder,r'C:/Users/edwin victor/git repositories/dsp-edwinvictor-justin/models/encoder.joblib')

## Testing_set   

    # 1) Extracting the features from testing set
    
    Extracted_Features_Testing = X_test[selected_features]
    Testing_Features = pd.concat([Extracted_Features_Testing,y_test], axis=1)

    # 2) Encoding the categorical columns from testing set
    
    encoder.fit(Testing_Features[discrete_datatype_features])
    Testing_encoded_categories = encoder.transform(Testing_Features[discrete_datatype_features])
    encoded_discrete_features_testing_df = pd.DataFrame(Testing_encoded_categories, columns=encoder.get_feature_names_out(discrete_datatype_features))

    # 3) Scaling the continuos columns from testing set
    
    scaler.fit(X_test[continuos_datatype_features])
    scaled_continuos_features_testing_df = scaler.transform(X_test[continuos_datatype_features])

    # 4) Concatenating the processed testing set
    
    testing_continuous_features_df = pd.DataFrame(scaled_continuos_features_testing_df , columns= continuos_datatype_features)
    Processed_Testing_Df = pd.concat([testing_continuous_features_df, encoded_discrete_features_testing_df] , axis=1)
    
    # 5) Making prediction 
    
    y_pred = model.predict(Processed_Testing_Df)

    # 6) Evaluating the model
    
    Rmsle = np.sqrt(mean_squared_log_error(y_test, y_pred)) 
    return {'Root Mean Squared Error out of': str(Rmsle) }


In [58]:
build_model(Training_dataset)

{'Root Mean Squared Error out of': '0.19357831076666'}

# **MODEL INFERENCE**

In [64]:
def make_predictions(input_data: pd.DataFrame) -> np.ndarray:

    """
    Makes predictions on house prices based on the provided input data using
    pre-trained model, scaler, and encoder.

    Args:
        input_data (pd.DataFrame): DataFrame containing the test data.

    Returns:
        np.ndarray: Array of predicted house prices.
    """
    
    dataset_for_testing = r"C:/Users/edwin victor/git repositories/dsp-edwinvictor-justin/data/test.csv"
    scaler_location = r'C:/Users/edwin victor/git repositories/dsp-edwinvictor-justin/models/scaler.joblib'
    encoder_location = r'C:/Users/edwin victor/git repositories/dsp-edwinvictor-justin/models/encoder.joblib'
    model_location = r'C:/Users/edwin victor/git repositories/dsp-edwinvictor-justin/models/model.joblib'

    model = joblib.load(model_location)
    scaler = joblib.load(scaler_location)
    encoder = joblib.load(encoder_location)

    continuos_datatype_features = ['LotArea', 'GrLivArea']
    discrete_datatype_features = ['Neighborhood', 'HouseStyle']

    test_scaled = scaler.transform(Testing_df[continuos_datatype_features])
    test_encoded = encoder.transform(Testing_df[discrete_datatype_features])

    selected_features = ['LotArea', 'GrLivArea', 'Neighborhood', 'HouseStyle'] # 2 continuos and 2 Categorical features
    target_feature = ['SalePrice']
   
    
    test_scaled_df = pd.DataFrame(test_scaled,columns=continuos_datatype_features)
    test_encoded_df = pd.DataFrame(test_encoded,columns=encoder.get_feature_names_out(discrete_datatype_features))
    
    Transformed_test_df = pd.concat([test_scaled_df,test_encoded_df], axis=1)

    predict_house_price = model.predict(Transformed_test_df)
    return predict_house_price

In [65]:
# Example usage:
dataset_for_testing = r"C:/Users/edwin victor/git repositories/dsp-edwinvictor-justin/data/test.csv"
Testing_df = pd.read_csv(dataset_for_testing)
predictions = make_predictions(Testing_df)
print(predictions)


[[111539.65323364]
 [153677.46598706]
 [185488.24754898]
 ...
 [157965.97784075]
 [144389.60544552]
 [197115.00156427]]
