In [44]:
!pip install pandas
!pip install joblib
!pip install scikit-learn
!pip install pyarrow



In [45]:
import pandas as pd
import numpy as np
import joblib
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from IPython.display import display

# **Model building**

In [46]:
def load_dataset(file_path: str) -> pd.DataFrame:
    """
    This function is used to Load a dataset from the specified file path in the local device.

    Args:
        file_path (str): The path of the dataset file.

    Returns:
        pd.DataFrame: The dataset is loaded as Pandas DataFrame.
    """
    dataset = pd.read_csv(file_path)
    print(f"Dataset loaded from {file_path} with shape {dataset.shape}")
    return dataset


In [47]:
file_path = r"C:/Users/edwin victor/git repositories/dsp-edwinvictor-justin/data/train.csv"
dataset = load_dataset(file_path)

Dataset loaded from C:/Users/edwin victor/git repositories/dsp-edwinvictor-justin/data/train.csv with shape (1460, 81)


In [48]:
def select_features(data: pd.DataFrame, selected_features: list, target_feature: str) -> tuple[pd.DataFrame, pd.Series]:
    """
    selecting the features and target variable for ml training.

    Args:
        data (pd.DataFrame): The dataset to prepare features from.
        selected_features (list): List of feature names to include.
        target_feature (str): The name of the target feature.

    Returns:
        tuple[pd.DataFrame, pd.Series]: The selected features and target variable.
    """
    X = data[selected_features]
    y = data[target_feature]
    return X, y


In [49]:
selected_features = ['LotArea', 'GrLivArea', 'Neighborhood', 'HouseStyle']
target_feature = ['SalePrice']

X, y = select_features(dataset, selected_features, target_feature)

In [50]:
def split_dataset(data: pd.DataFrame, target_feature: str, test_size: float = 0.35, random_state: int = 50) -> tuple[pd.DataFrame, pd.DataFrame, pd.Series, pd.Series]:
    """
    This function is used to Split the dataset into training and testing sets.

    Args:
        data (pd.DataFrame): The dataset to split.
        target_feature (str): The column_name of the target feature.
        test_size (float): The proportion of the dataset to include in the test split. Defaults to 0.35.
        random_state (int): Seed used by the random number generator. Defaults to 50.

    Returns:
        tuple[pd.DataFrame, pd.DataFrame, pd.Series, pd.Series]: The training features, testing features, training target, and testing target.
    """
    X = data.drop(target_feature, axis=1)
    y = data[target_feature]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)
    return X_train, X_test, y_train, y_test


In [51]:
X_train, X_test, y_train, y_test = split_dataset(dataset, target_feature)

In [52]:
def extract_features(X: pd.DataFrame, y: pd.Series, selected_features: list) -> pd.DataFrame:
    return X[selected_features]


In [53]:
X_selected = extract_features(X, y, selected_features)

In [54]:
def prepare_training_features(X_train: pd.DataFrame, y_train: pd.Series, select_features: list, target_features: str) -> pd.DataFrame:
    """
    Extracts selected features and the target attribute from the training data and concatenates them into a single DataFrame.

    Args:
        X_train (pd.DataFrame): The training feature data.
        y_train (pd.Series): The training target data.
        selected_features (list): List of feature names to include.
        target_feature (str): The name of the target feature.

    Returns:
        pd.DataFrame: A DataFrame containing the selected features and the target attribute.
    """
    # Extract the selected features from the training data
    extracted_features = X_train[selected_features]
    
    # Extract the target attribute from the training data
    extracted_target = y_train.reset_index(drop=True)
    
    # Concatenate the extracted features and target attribute into a single DataFrame
    training_features_df = pd.concat([extracted_features, y_train], axis=1)
    
    return training_features_df


In [55]:
target_feature = 'SalePrice'
training_features_df = prepare_training_features(X_train, y_train, selected_features, target_feature)

In [56]:
def encode_categorical_features(df: pd.DataFrame, categorical_features: list) -> pd.DataFrame:
    encoder = OneHotEncoder(sparse_output=False)
    encoder.fit(df[categorical_features])
    encoded_categories = encoder.transform(df[categorical_features])
    encoded_df = pd.DataFrame(encoded_categories, columns=encoder.get_feature_names_out(categorical_features))
    return encoded_df, encoder


In [57]:
categorical_features = ['Neighborhood', 'HouseStyle']
encoded_df, encoder = encode_categorical_features(X_train, categorical_features)

In [58]:
def scale_continuous_features(df: pd.DataFrame, continuous_features: list) -> pd.DataFrame:
    scaler = StandardScaler()
    scaler.fit(df[continuous_features])
    scaled_features = scaler.transform(df[continuous_features])
    scaled_df = pd.DataFrame(scaled_features, columns=continuous_features)
    return scaled_df, scaler


In [59]:
continuos_features = ['LotArea', 'GrLivArea']
scaled_df, scaler = scale_continuous_features(X_train, continuos_features)

In [60]:
def concatenate_features(continuous_features_df: pd.DataFrame, categorical_features_df: pd.DataFrame) -> pd.DataFrame:
    concatenated = pd.concat([continuous_features_df, categorical_features_df], axis=1)
    return concatenated


In [61]:
combined_features_df = concatenate_features(scaled_df, encoded_df)


In [62]:
def train_model(combined_features_df: pd.DataFrame, y_train: pd.Series, model_path: str, scaler_path: str, encoder_path: str):
    model = LinearRegression()
    model.fit(combined_features_df, y_train)
    joblib.dump(model, model_path)
    joblib.dump(scaler, scaler_path)
    joblib.dump(encoder, encoder_path)
    return model


In [63]:
model_path = r"C:\Users\edwin victor\git repositories\dsp-edwinvictor-justin\models\model.joblib"
scaler_path = r"C:\Users\edwin victor\git repositories\dsp-edwinvictor-justin\models\scaler.joblib"
encoder_path = r"C:\Users\edwin victor\git repositories\dsp-edwinvictor-justin\models\encoder.joblib"
model = train_model(combined_features_df, y_train, model_path, scaler_path, encoder_path)

In [64]:
def process_testing_set(X_test: pd.DataFrame, categorical_features: list, continuous_features: list, encoder, scaler) -> pd.DataFrame:
    # Encode categorical features
    
    encoder = joblib.load(encoder)
    scaler = joblib.load(scaler)
    categorical_df = X_test[categorical_features]
    continuous_df = X_test[continuous_features]

    encoded_categories = encoder.transform(categorical_df)
    encoded_df = pd.DataFrame(encoded_categories, columns=encoder.get_feature_names_out(categorical_features))
    
    # Scale continuous features
    scaled_features = scaler.transform(continuous_df)
    scaled_df = pd.DataFrame(scaled_features, columns=continuous_features)
    
    # Concatenate features
    processed_test_df = pd.concat([scaled_df, encoded_df], axis =1)
    y_pred = model.predict(processed_test_df)
    return processed_test_df,y_pred


In [65]:
testing_set, y_pred = process_testing_set(X_test, categorical_features,continuos_features,encoder_path,scaler_path)

In [66]:
def evaluate_model(y_test: pd.Series, y_pred: np.ndarray) -> dict[str, str]:
    Rmsle = np.sqrt(mean_squared_log_error(y_test, y_pred))
    return {'Root Mean Squared Error out of': str(Rmsle)}


In [67]:
evaluate = evaluate_model(y_test, y_pred)
print(evaluate)

{'Root Mean Squared Error out of': '0.19120286564820066'}


# **Model Inference**

In [68]:

def load_model_and_transformers(model_path: str, scaler_path: str, encoder_path: str):
    """
    Loads the pre-trained model, scaler, and encoder.

    Args:
        model_path (str): Path to the saved model.
        scaler_path (str): Path to the saved scaler.
        encoder_path (str): Path to the saved encoder.

    Returns:
        model: Loaded model object.
        scaler: Loaded scaler object.
        encoder: Loaded encoder object.
    """
    model = joblib.load(model_path)
    scaler = joblib.load(scaler_path)
    encoder = joblib.load(encoder_path)
    return model, scaler, encoder




In [69]:
model, scaler, encoder = load_model_and_transformers(model_path, scaler_path, encoder_path)


In [72]:
def preprocess_data_and_predict(Testing_df: pd.DataFrame, scaler, encoder, model, continuos_features: list, categorical_features: list) -> pd.DataFrame:
    """
    Preprocesses the input data by scaling continuous features and encoding categorical features.

    Args:
        input_data (pd.DataFrame): The data to preprocess.
        scaler: Scaler object to scale continuous features.
        encoder: Encoder object to encode categorical features.
        continuous_features (list): List of continuous feature names.
        discrete_features (list): List of discrete feature names.

    Returns:
        pd.DataFrame: Preprocessed data.
    """
    test_scaled = scaler.transform(Testing_df[continuos_features])
    test_encoded = encoder.transform(Testing_df[categorical_features])

    test_scaled_df = pd.DataFrame(test_scaled, columns=continuos_features)
    test_encoded_df = pd.DataFrame(test_encoded, columns=encoder.get_feature_names_out(categorical_features))

    transformed_test_df = pd.concat([test_scaled_df, test_encoded_df], axis=1)
    predict_house_price = model.predict(transformed_test_df)
    return predict_house_price



In [73]:
Test_dataset = r"C:/Users/edwin victor/git repositories/dsp-edwinvictor-justin/data/test.csv"
Testing_df = pd.read_csv(Test_dataset)
predict_house_price = preprocess_data_and_predict(Testing_df, scaler, encoder, model ,continuos_features, categorical_features)
print(predict_house_price)

[[111539.65323364]
 [153677.46598706]
 [185488.24754898]
 ...
 [157965.97784075]
 [144389.60544552]
 [197115.00156427]]
