In [None]:
import pickle, os, sys
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

In [None]:
# TRAIN PIPELINE

# READ DF
# Column labels are supplied explicitly via `names=` below.
column_names = ['MPG', 'Cylinders', 'Displacement', 'Horsepower', 'Weight',
                'Acceleration', 'Model Year', 'Origin']
url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data'

# Space-separated file: '?' is read as NaN, and everything after a tab on a
# line is treated as a comment and dropped.
df = pd.read_csv(
    url,
    sep=' ',
    skipinitialspace=True,
    comment='\t',
    na_values='?',
    names=column_names,
)

# Quick visual check of the target against two of the features.
sns.pairplot(df.loc[:, ['MPG', 'Displacement', 'Weight']], diag_kind='kde')


In [None]:
# INITIALIZE
tar_col = "MPG"

X = df.drop(columns=[tar_col])  # Features
y = df[tar_col]  # Target

# Split data into training and testing sets (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Derive the column groups from the feature frame itself. select_dtypes is
# used instead of comparing against "object" so that string/category dtypes
# are never routed into the numeric (mean-impute + scale) branch.
numerical_cols = X.select_dtypes(include="number").columns.tolist()
categorical_cols = X.select_dtypes(exclude="number").columns.tolist()

# Impute missing values
numerical_imputer = SimpleImputer(strategy='mean')
categorical_imputer = SimpleImputer(strategy='most_frequent')

# Preprocessing for numerical columns: impute, then standardize
numerical_transformer = Pipeline(steps=[
    ('imputer', numerical_imputer),
    ('scaler', StandardScaler())
])

# Preprocessing for categorical columns: impute, then one-hot encode
categorical_transformer = Pipeline(steps=[
    ('imputer', categorical_imputer),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine preprocessing steps for both numerical and categorical columns
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

#########################################################################
######################### FIT ###########################################
#########################################################################

regression_method = LinearRegression()  # RandomForestRegressor, MLPRegressor
# Full pipeline: preprocessing followed by the regressor
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('regressor', regression_method)])

pipeline.fit(X_train, y_train)


#########################################################################
######################### PERFORMANCE ###################################
#########################################################################
# Make predictions on the testing data
predictions = pipeline.predict(X_test)

metr = {}
metr["r2"] = r2_score(y_test, predictions)
metr["mse"] = mean_squared_error(y_test, predictions)
# RMSE is taken as sqrt(MSE) rather than via `squared=False`: that keyword was
# deprecated in scikit-learn 1.4 and removed in 1.6.
metr["rmse"] = metr["mse"] ** 0.5
metr["mae"] = mean_absolute_error(y_test, predictions)
# MSE divided by the range of the test targets.
metr["nmse"] = metr["mse"] / (y_test.max() - y_test.min())
metr["y_test"] = y_test
metr["predictions"] = predictions

fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(y_test, predictions, color='blue', label='True vs Predicted')
# Dashed diagonal marks where predicted == true
ax.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], color='red', linestyle='--')
ax.set_xlabel('True Values')
ax.set_ylabel('Predicted Values')
ax.set_title(f"True vs Predicted Values (RMSE: {metr['rmse']:.2f})")
ax.legend(loc='best')
ax.grid(True)

print()

In [None]:
# Summary statistics of the raw frame, one row per column.
df.describe().T

In [None]:
# Inspect the features as the fitted model sees them.
# Only transform() is called here: `preprocessor` is the same object that sits
# inside the fitted `pipeline`, so calling fit_transform() would refit it on
# the full dataset and silently change the pipeline that is saved afterwards.
# NOTE(review): labelling with X.columns assumes the preprocessor emits one
# output column per input column (i.e. no one-hot expanded columns) - confirm
# if categorical features are added.
X_tr = pd.DataFrame(preprocessor.transform(X), columns=X.columns)
X_tr.describe().transpose()

In [None]:
# SAVE PIPELINE
# Make sure the output directory exists; open(..., "wb") does not create it.
os.makedirs("out", exist_ok=True)

j_pipeline = {"pipeline" : pipeline}
with open(os.path.join("out", "savedRegPipeline.pkl"), "wb") as f:
    pickle.dump(j_pipeline, f)

In [None]:
# Export the raw dataset to Excel so it can be used as a local input file.
url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data'
column_names = ['MPG', 'Cylinders', 'Displacement', 'Horsepower', 'Weight',
                'Acceleration', 'Model Year', 'Origin']
df = pd.read_csv(url, names=column_names, na_values='?', comment='\t', sep=' ', skipinitialspace=True)

# Make sure the input directory exists; to_excel does not create it.
os.makedirs("in", exist_ok=True)
df.to_excel(os.path.join("in", "autoMpg.xlsx"), index=False)

In [None]:
# LOAD PIPELINE AND TEST
import pandas as pd
import pickle, os
import numpy as np

# read data from the local Excel export (no network access needed here)
df = pd.read_excel(os.path.join("in", "autoMpg.xlsx"))

# load pipeline
# WARNING: pickle.load can execute arbitrary code - only load pickle files
# that were produced by this notebook / a trusted source.
with open(os.path.join("out", "savedRegPipeline.pkl"), "rb") as f:
    j_pipeline = pickle.load(f)
pipeline = j_pipeline["pipeline"]

# predict values from the feature columns only; `df` keeps the "MPG" target
# so it can still be compared against the predictions afterwards
y_pred = pipeline.predict(df.drop(columns=["MPG"]))

In [None]:
# Compare the loaded pipeline's predictions with the true targets.
y_tar = df["MPG"].to_numpy()
squared_errors = (y_tar - y_pred) ** 2
# Named `mse`, not `mean_squared_error`, so the scikit-learn function of that
# name imported at the top of the notebook is not overwritten.
mse = np.mean(squared_errors)
rmse = np.sqrt(mse)

print(f"RMSE: {rmse}")