In [49]:
import requests, os, re, tqdm, time, pickle
import numpy as np, pandas as pd, matplotlib as mpl, matplotlib.pyplot as plt, seaborn as sns

from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split 
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error, accuracy_score, f1_score, precision_score, recall_score, \
classification_report, confusion_matrix
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

#ignore warnings
import warnings
warnings.filterwarnings(action= 'ignore')

In [51]:
bmw_df = pd.read_excel('BMW_new.xlsx')

In [53]:
# 1. Data Separating

bmw_df.drop_duplicates(inplace=True)
seperated = bmw_df['Motor'].str.split('/', expand=True).apply(lambda x: x.str.strip())

for i in seperated.columns:
    bmw_df.loc[seperated[i].str.contains(r'\d+\.\d+ L', na=False), 'Motor gucu'] = seperated[i].str.split('L').str[0]
    bmw_df.loc[seperated[i].str.contains(r'\d+ a\.g\.', na=False), 'At gucu'] = seperated[i].str.split('a.g.').str[0]
bmw_df['Yanacaq novu'] = bmw_df['Motor'].str.split('/').str[-1]

bmw_df.drop('Motor', axis=1, inplace=True)

bmw_df.rename(columns={'Yeni ya Kohne': 'Yeni?'}, inplace=True)

bmw_df['Km'] = bmw_df['Km'].str.split('km').str[0]
bmw_df['At gucu'] = bmw_df['At gucu'].str.split(' ').str[0]
bmw_df['At gucu'] = bmw_df['At gucu'].astype(int)

bmw_df['Qiymet'] = bmw_df['Qiymet'].str.extract(r'(\d+\s+\d+)')

# 2. Data type Changing
bmw_df['Motor gucu'] = bmw_df['Motor gucu'].astype(float)
bmw_df['Km'] = bmw_df['Km'].str.replace(' ', '')
bmw_df['Km'] = bmw_df['Km'].astype(int)

bmw_df['Qiymet'] = bmw_df['Qiymet'].str.replace(' ', '')
bmw_df['Qiymet'] = bmw_df['Qiymet'].astype(float)
bmw_df['At gucu'] = bmw_df['At gucu'].astype(int)


bmw_df['Sifarisle'].fillna('Sifarishsiz', inplace=True)
bmw_df['Motor gucu'].fillna(0, inplace=True)
bmw_df['Resmi Satish'].fillna('Shexsi', inplace=True)
bmw_df['Hansi Bazar'].fillna('Yoxdur', inplace=True)

bmw_df.drop(bmw_df[bmw_df['Qiymet'].isnull()].index, axis=0, inplace=True)

target_types = ['Universal, 5 qapı', 'Hetçbek, 5 qapı', 'Kabriolet','Rodster']

new_bmw_df = bmw_df[~bmw_df['Ban novu'].isin(target_types)]

new_bmw_df['Yerlerin sayi'].fillna(5, inplace=True)
new_bmw_df['Elaveler'].fillna('Yüngül lehimli disklər, ABS, Kondisioner', inplace=True)
new_bmw_df['Qiymet'] = new_bmw_df['Qiymet'].astype(int)

new_bmw_df['Kredit'] = np.where(new_bmw_df['Kredit ve Barter'].str.contains('Kredit', na=False), 'var', 'yox')
new_bmw_df['Barter'] = np.where(new_bmw_df['Kredit ve Barter'].str.contains('Barter', na=False), 'var', 'yox')

new_bmw_df.drop('Kredit ve Barter', axis=1, inplace=True)
new_bmw_df.drop(new_bmw_df[new_bmw_df['Veziyyeti'] == 'Vuruğu var, rənglənməyib'].index, axis=0, inplace=True)
new_bmw_df.drop('Veziyyeti', axis=1, inplace=True)

for idx, row in new_bmw_df.iterrows():
    if row['Resmi Satish'] != 'Shexsi':
        new_bmw_df.loc[idx, 'Resmi Satish'] = 'Salon'

new_bmw_df['Yerlerin sayi'] = new_bmw_df['Yerlerin sayi'].astype('int64')
new_bmw_df['Km'] = new_bmw_df['Km'].astype('int64')
new_bmw_df['Qiymet'] = new_bmw_df['Qiymet'].astype('int64')

In [57]:
new_bmw_df.to_excel('BMW for Streamlit filtering.xlsx', index=False)

### Data Modelling

### 1. Deleting useless columns, creating encoders, creating excels for each model

In [29]:
new_bmw_df.drop(columns = {'Bashlig','Sheher', 'Kredit', 'Barter', 'Shekil urli', 
                           'Url', 'Yeni?', 'Sifarisle', 'Elaveler'},
                axis=1, inplace=True)
new_bmw_df = new_bmw_df[new_bmw_df['Model'] != '523d']

df = new_bmw_df.copy()
df['Series'] = [None] * df.shape[0]

df = df[['Series', 'Model', 'Buraxilish ili', 'Ban novu', 'Reng', 'Km', 'Suretler qutusu',
       'Oturucu', 'Resmi Satish', 'Qiymet', 'Yerlerin sayi', 'Hansi Bazar',
       'Motor gucu', 'At gucu', 'Yanacaq novu' ]]

df.loc[df[df['Model'].str.startswith('X')].index, 'Series'] = 'X'
df.loc[df[df['Model'].str.startswith('7')].index, 'Series'] = 7
df.loc[df[df['Model'].str.startswith('5')].index, 'Series'] = 5
df.loc[df[df['Model'].str.startswith('4')].index, 'Series'] = 4
df.loc[df[df['Model'].str.startswith('3')].index, 'Series'] = 3

df = df[~df['Series'].isnull()]
df['Model'] = df['Model'].str.replace('X', '')
df['Model'] = df['Model'].str.replace('e', '')
df = df[~(df['Model'].isin(['5 M', '6 M', 'M', 'M 50']))]
df['Model'] = df['Model'].str.replace(' xDriv', '')
df['Model'] = df['Model'].str.replace('L', '')
 
df['Model'] = df['Model'].str.replace('GT', '')
df['Model'] = df['Model'].astype(int)
df = df[df['Series'] != 4]

In [59]:
df.to_excel('BMW for Streamlit modelling.xlsx', index=False)

### 2. Scaling and modelling with LinearRegression

In [33]:
all_series_score_lr = []

for series in df['Series'].unique():
    single_model_df = df[df['Series'] == series].copy()
    single_model_df.drop(['Series', 'Hansi Bazar', 'Reng', 'Resmi Satish'], axis=1, inplace=True)

    X = single_model_df.drop(columns=['Qiymet'])
    y = single_model_df['Qiymet']

    numerical_features = X.select_dtypes(include=np.number).columns.tolist()
    categorical_features = X.select_dtypes(exclude=np.number).columns.tolist()

    active_categorical_features = [
        col for col in categorical_features if X[col].nunique() > 1
    ]

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', RobustScaler(), numerical_features),
            ('cat', OneHotEncoder(handle_unknown='ignore', drop='first'), active_categorical_features)
        ],
        remainder='drop'
    )

    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', LinearRegression())
    ])

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.15, random_state=42
    )

    pipeline.fit(X_train, y_train)

    filename = f'pipeline_lr_{str(series).replace(" ", "_")}.sav'
    with open(filename, 'wb') as f:
        pickle.dump(pipeline, f)

    predictions_lr = pipeline.predict(X_test)
    train_predictions_lr = pipeline.predict(X_train)

    mae_lr = mean_absolute_error(y_test, predictions_lr)
    mse_lr = mean_squared_error(y_test, predictions_lr)
    rmse_lr = np.sqrt(mse_lr)
    r2_lr = r2_score(y_test, predictions_lr)
    train_r2_lr = r2_score(y_train, train_predictions_lr)

    print('\n--------------------------------------------')
    print(f"\n--- Results for Linear Regression ---") 
    print(f"\nFor series of '{series}'")
    print(f"Pipeline saved as: {filename}")
    print(f"Mean Absolute Error (MAE): {mae_lr:.2f}")
    print(f"Root Mean Squared Error (RMSE): {rmse_lr:.2f}")
    print(f"R-squared (R2): {r2_lr:.2f}\n")

    print(f"--- Overfitting Check ---")
    print(f"R2 on Training Data: {train_r2_lr:.2f}")
    print(f"R2 on Test Data:     {r2_lr:.2f}")

    results_df_lr = pd.DataFrame({
        'Actual Price': y_test,
        'Predicted Price': predictions_lr
    })
    results_df_lr = results_df_lr.reset_index(drop=True)

    print("\nComparison of Actual vs. Predicted Prices:")
    print(results_df_lr.head())

    series_result_lr = {
        'Series': series,
        'R2_Score_LR': round(r2_lr, 2)
    }

    all_series_score_lr.append(series_result_lr)

r2_scores_df_lr = pd.DataFrame(all_series_score_lr)
r2_scores_df_lr = r2_scores_df_lr.sort_values(by='R2_Score_LR', ascending=False, ignore_index=True) 

print("\n=======================================================")
print("=== Final R-squared Scores for All Series (Linear Regression) ===") 
print("=======================================================")
print(r2_scores_df_lr)


--------------------------------------------

--- Results for Linear Regression ---

For series of 'X'
Pipeline saved as: pipeline_lr_X.sav
Mean Absolute Error (MAE): 18562.62
Root Mean Squared Error (RMSE): 25199.48
R-squared (R2): 0.80

--- Overfitting Check ---
R2 on Training Data: 0.81
R2 on Test Data:     0.80

Comparison of Actual vs. Predicted Prices:
   Actual Price  Predicted Price
0         22000    -43920.642392
1         20500      7961.711616
2         42330     51865.699881
3        140250    114530.279910
4         18700     30182.275726

--------------------------------------------

--- Results for Linear Regression ---

For series of '3'
Pipeline saved as: pipeline_lr_3.sav
Mean Absolute Error (MAE): 4384.44
Root Mean Squared Error (RMSE): 6344.70
R-squared (R2): 0.71

--- Overfitting Check ---
R2 on Training Data: 0.83
R2 on Test Data:     0.71

Comparison of Actual vs. Predicted Prices:
   Actual Price  Predicted Price
0         23800     25598.243232
1         2290

### Modelling with DecisionTree

In [35]:
all_series_score_dt = []

for series in df['Series'].unique():
    single_model_df = df[df['Series'] == series].copy()
    single_model_df.drop(['Series', 'Hansi Bazar', 'Reng', 'Resmi Satish'], axis=1, inplace=True)

    X = single_model_df.drop(columns=['Qiymet'])
    y = single_model_df['Qiymet']

    numerical_features = X.select_dtypes(include=np.number).columns.tolist()
    categorical_features = X.select_dtypes(exclude=np.number).columns.tolist()

    active_categorical_features = [
        col for col in categorical_features if X[col].nunique() > 1
    ]

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', RobustScaler(), numerical_features),
            ('cat', OneHotEncoder(handle_unknown='ignore', drop='first'), active_categorical_features)
        ],
        remainder='drop'
    )

    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', DecisionTreeRegressor(random_state=42))
    ])

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.15, random_state=42
    )

    pipeline.fit(X_train, y_train)

    filename = f'pipeline_dt_{str(series).replace(" ", "_")}.sav'
    with open(filename, 'wb') as f:
        pickle.dump(pipeline, f)

    predictions_dt = pipeline.predict(X_test)
    train_predictions_dt = pipeline.predict(X_train)

    mae_dt = mean_absolute_error(y_test, predictions_dt)
    mse_dt = mean_squared_error(y_test, predictions_dt)
    rmse_dt = np.sqrt(mse_dt)
    r2_dt = r2_score(y_test, predictions_dt)
    train_r2_dt = r2_score(y_train, train_predictions_dt)

    print('\n--------------------------------------------')
    print(f"\n--- Results for Decision Tree ---")
    print(f"\nFor series of '{series}'")
    print(f"Pipeline saved as: {filename}")
    print(f"Mean Absolute Error (MAE): {mae_dt:.2f}")
    print(f"Root Mean Squared Error (RMSE): {rmse_dt:.2f}")
    print(f"R-squared (R2): {r2_dt:.2f}\n")

    print(f"--- Overfitting Check ---")
    print(f"R2 on Training Data: {train_r2_dt:.2f}")
    print(f"R2 on Test Data:     {r2_dt:.2f}")

    results_df_dt = pd.DataFrame({
        'Actual Price': y_test,
        'Predicted Price': predictions_dt
    })
    results_df_dt = results_df_dt.reset_index(drop=True)

    print("\nComparison of Actual vs. Predicted Prices:")
    print(results_df_dt.head())

    series_result_dt = {
        'Series': series,
        'R2_Score_DT': round(r2_dt, 2)
    }

    all_series_score_dt.append(series_result_dt)

r2_scores_df_dt = pd.DataFrame(all_series_score_dt)
r2_scores_df_dt = r2_scores_df_dt.sort_values(by='R2_Score_DT', ascending=False, ignore_index=True) # <-- Changed key

print("\n=======================================================")
print("=== Final R-squared Scores for All Series (Decision Tree) ===")
print("=======================================================")
print(r2_scores_df_dt)


--------------------------------------------

--- Results for Decision Tree ---

For series of 'X'
Pipeline saved as: pipeline_dt_X.sav
Mean Absolute Error (MAE): 15800.56
Root Mean Squared Error (RMSE): 23938.13
R-squared (R2): 0.82

--- Overfitting Check ---
R2 on Training Data: 1.00
R2 on Test Data:     0.82

Comparison of Actual vs. Predicted Prices:
   Actual Price  Predicted Price
0         22000          21800.0
1         20500          26800.0
2         42330          40700.0
3        140250         185300.0
4         18700          34000.0

--------------------------------------------

--- Results for Decision Tree ---

For series of '3'
Pipeline saved as: pipeline_dt_3.sav
Mean Absolute Error (MAE): 3572.00
Root Mean Squared Error (RMSE): 4956.86
R-squared (R2): 0.82

--- Overfitting Check ---
R2 on Training Data: 1.00
R2 on Test Data:     0.82

Comparison of Actual vs. Predicted Prices:
   Actual Price  Predicted Price
0         23800          20900.0
1         22900       

### Modelling with RandomForest

In [37]:
all_series_score_rf = []

for series in df['Series'].unique():
    single_model_df = df[df['Series'] == series].copy()
    single_model_df.drop(['Series', 'Hansi Bazar', 'Reng', 'Resmi Satish'], axis=1, inplace=True)

    X = single_model_df.drop(columns=['Qiymet'])
    y = single_model_df['Qiymet']

    numerical_features = X.select_dtypes(include=np.number).columns.tolist()
    categorical_features = X.select_dtypes(exclude=np.number).columns.tolist()

    active_categorical_features = [
        col for col in categorical_features if X[col].nunique() > 1
    ]

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', RobustScaler(), numerical_features),
            ('cat', OneHotEncoder(handle_unknown='ignore', drop='first'), active_categorical_features)
        ],
        remainder='drop'
    )

    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', RandomForestRegressor(random_state=42))
    ])

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.15, random_state=42
    )

    pipeline.fit(X_train, y_train)

    filename = f'pipeline_rf_{str(series).replace(" ", "_")}.sav'
    with open(filename, 'wb') as f:
        pickle.dump(pipeline, f)

    predictions_rf = pipeline.predict(X_test)
    train_predictions_rf = pipeline.predict(X_train)

    mae_rf = mean_absolute_error(y_test, predictions_rf)
    mse_rf = mean_squared_error(y_test, predictions_rf)
    rmse_rf = np.sqrt(mse_rf)
    r2_rf = r2_score(y_test, predictions_rf)
    train_r2_rf = r2_score(y_train, train_predictions_rf)

    print('\n--------------------------------------------')
    print(f"\n--- Results for Random Forest ---")
    print(f"\nFor series of '{series}'")
    print(f"Pipeline saved as: {filename}")
    print(f"Mean Absolute Error (MAE): {mae_rf:.2f}")
    print(f"Root Mean Squared Error (RMSE): {rmse_rf:.2f}")
    print(f"R-squared (R2): {r2_rf:.2f}\n")

    print(f"--- Overfitting Check ---")
    print(f"R2 on Training Data: {train_r2_rf:.2f}")
    print(f"R2 on Test Data:     {r2_rf:.2f}")

    results_df_rf = pd.DataFrame({
        'Actual Price': y_test,
        'Predicted Price': predictions_rf
    })
    results_df_rf = results_df_rf.reset_index(drop=True)

    print("\nComparison of Actual vs. Predicted Prices:")
    print(results_df_rf.head())

    series_result_rf = {
        'Series': series,
        'R2_Score_RF': round(r2_rf, 2)
    }

    all_series_score_rf.append(series_result_rf)

r2_scores_df_rf = pd.DataFrame(all_series_score_rf)
r2_scores_df_rf = r2_scores_df_rf.sort_values(by='R2_Score_RF', ascending=False, ignore_index=True)

print("\n=======================================================")
print("=== Final R-squared Scores for All Series ===")
print("=======================================================")
print(r2_scores_df_rf)


--------------------------------------------

--- Results for Random Forest ---

For series of 'X'
Pipeline saved as: pipeline_rf_X.sav
Mean Absolute Error (MAE): 11486.65
Root Mean Squared Error (RMSE): 16630.67
R-squared (R2): 0.91

--- Overfitting Check ---
R2 on Training Data: 0.98
R2 on Test Data:     0.91

Comparison of Actual vs. Predicted Prices:
   Actual Price  Predicted Price
0         22000         24127.97
1         20500         25226.29
2         42330         46163.70
3        140250        176114.54
4         18700         32155.00

--------------------------------------------

--- Results for Random Forest ---

For series of '3'
Pipeline saved as: pipeline_rf_3.sav
Mean Absolute Error (MAE): 2786.44
Root Mean Squared Error (RMSE): 3751.18
R-squared (R2): 0.90

--- Overfitting Check ---
R2 on Training Data: 0.99
R2 on Test Data:     0.90

Comparison of Actual vs. Predicted Prices:
   Actual Price  Predicted Price
0         23800     22271.010000
1         22900     28

### Modelling with GradientBoost

In [39]:
warnings.filterwarnings('ignore')

all_series_score_gb = []

for series in df['Series'].unique():
    single_model_df = df[df['Series'] == series].copy()
    single_model_df.drop(['Series', 'Hansi Bazar', 'Reng', 'Resmi Satish' ], axis=1, inplace=True)

    X = single_model_df.drop(columns=['Qiymet'])
    y = single_model_df['Qiymet']

    numerical_features = X.select_dtypes(include=np.number).columns.tolist()
    categorical_features = X.select_dtypes(exclude=np.number).columns.tolist()

    active_categorical_features = [
        col for col in categorical_features if X[col].nunique() > 1
    ]

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', RobustScaler(), numerical_features),
            ('cat', OneHotEncoder(handle_unknown='ignore', drop='first'), active_categorical_features)
        ],
        remainder='drop'
    )

    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', GradientBoostingRegressor(random_state=42))
    ])

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.15, random_state=42
    )

    pipeline.fit(X_train, y_train)
    
    filename = f'pipeline_gb_{str(series).replace(" ", "_")}.sav'
    with open(filename, 'wb') as f:
        pickle.dump(pipeline, f)

    predictions_gb = pipeline.predict(X_test)
    train_predictions_gb = pipeline.predict(X_train)

    mae_gb = mean_absolute_error(y_test, predictions_gb)
    mse_gb = mean_squared_error(y_test, predictions_gb)
    rmse_gb = np.sqrt(mse_gb)
    r2_gb = r2_score(y_test, predictions_gb)
    train_r2_gb = r2_score(y_train, train_predictions_gb)

    print('\n--------------------------------------------')
    print(f"\n--- Results for Gradient Boosting ---")
    print(f"\nFor series of '{series}'")
    print(f"Pipeline saved as: {filename}")
    print(f"Mean Absolute Error (MAE): {mae_gb:.2f}")
    print(f"Root Mean Squared Error (RMSE): {rmse_gb:.2f}")
    print(f"R-squared (R2): {r2_gb:.2f}\n")
    
    print(f"--- Overfitting Check ---")
    print(f"R2 on Training Data: {train_r2_gb:.2f}")
    print(f"R2 on Test Data:     {r2_gb:.2f}")

    results_df_gb = pd.DataFrame({
        'Actual Price': y_test,
        'Predicted Price': predictions_gb
    })
    results_df_gb = results_df_gb.reset_index(drop=True)

    print("\nComparison of Actual vs. Predicted Prices:")
    print(results_df_gb.head())

    series_result_gb = {
        'Series': series,
        'R2_Score_GB': round(r2_gb, 2)
    }
    
    all_series_score_gb.append(series_result_gb)

r2_scores_df_gb = pd.DataFrame(all_series_score_gb)
r2_scores_df_gb = r2_scores_df_gb.sort_values(by='R2_Score_GB', ascending=False, ignore_index=True)

print("\n=======================================================")
print("=== Final R-squared Scores for All Series ===")
print("=======================================================")
print(r2_scores_df_gb)


--------------------------------------------

--- Results for Gradient Boosting ---

For series of 'X'
Pipeline saved as: pipeline_gb_X.sav
Mean Absolute Error (MAE): 9594.48
Root Mean Squared Error (RMSE): 14928.72
R-squared (R2): 0.93

--- Overfitting Check ---
R2 on Training Data: 0.99
R2 on Test Data:     0.93

Comparison of Actual vs. Predicted Prices:
   Actual Price  Predicted Price
0         22000     20827.024848
1         20500     24017.368472
2         42330     46278.204722
3        140250    139141.350189
4         18700     28252.941467

--------------------------------------------

--- Results for Gradient Boosting ---

For series of '3'
Pipeline saved as: pipeline_gb_3.sav
Mean Absolute Error (MAE): 2334.41
Root Mean Squared Error (RMSE): 2911.87
R-squared (R2): 0.94

--- Overfitting Check ---
R2 on Training Data: 0.98
R2 on Test Data:     0.94

Comparison of Actual vs. Predicted Prices:
   Actual Price  Predicted Price
0         23800     23371.485175
1         22900

### Modelling with XGBoost

In [43]:
all_series_score_xgb = []

for series in df['Series'].unique():
    single_model_df = df[df['Series'] == series].copy()
    single_model_df.drop(['Series', 'Hansi Bazar', 'Reng', 'Resmi Satish'], axis=1, inplace=True)

    X = single_model_df.drop(columns=['Qiymet'])
    y = single_model_df['Qiymet']

    numerical_features = X.select_dtypes(include=np.number).columns.tolist()
    categorical_features = X.select_dtypes(exclude=np.number).columns.tolist()

    active_categorical_features = [
        col for col in categorical_features if X[col].nunique() > 1
    ]

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', RobustScaler(), numerical_features),
            ('cat', OneHotEncoder(handle_unknown='ignore', drop='first'), active_categorical_features)
        ],
        remainder='drop'
    )

    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', XGBRegressor(random_state=42))
    ])

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.15, random_state=42
    )

    pipeline.fit(X_train, y_train)

    # Changed filename to avoid overwriting other models
    filename = f'pipeline_xgb_{str(series).replace(" ", "_")}.sav'
    with open(filename, 'wb') as f:
        pickle.dump(pipeline, f)

    predictions_xgb = pipeline.predict(X_test)
    train_predictions_xgb = pipeline.predict(X_train)

    mae_xgb = mean_absolute_error(y_test, predictions_xgb)
    mse_xgb = mean_squared_error(y_test, predictions_xgb)
    rmse_xgb = np.sqrt(mse_xgb)
    r2_xgb = r2_score(y_test, predictions_xgb)
    train_r2_xgb = r2_score(y_train, train_predictions_xgb)

    print('\n--------------------------------------------')
    print(f"\n--- Results for XGBoost Regressor ---")
    print(f"\nFor series of '{series}'")
    print(f"Pipeline saved as: {filename}")
    print(f"Mean Absolute Error (MAE): {mae_xgb:.2f}")
    print(f"Root Mean Squared Error (RMSE): {rmse_xgb:.2f}")
    print(f"R-squared (R2): {r2_xgb:.2f}\n")

    print(f"--- Overfitting Check ---")
    print(f"R2 on Training Data: {train_r2_xgb:.2f}")
    print(f"R2 on Test Data:     {r2_xgb:.2f}")

    results_df_xgb = pd.DataFrame({
        'Actual Price': y_test,
        'Predicted Price': predictions_xgb
    })
    results_df_xgb = results_df_xgb.reset_index(drop=True)

    print("\nComparison of Actual vs. Predicted Prices:")
    print(results_df_xgb.head())

    series_result_xgb = {
        'Series': series,
        'R2_Score_XGB': round(r2_xgb, 2)
    }

    all_series_score_xgb.append(series_result_xgb)

r2_scores_df_xgb = pd.DataFrame(all_series_score_xgb)
r2_scores_df_xgb = r2_scores_df_xgb.sort_values(by='R2_Score_XGB', ascending=False, ignore_index=True)

print("\n=======================================================")
print("=== Final R-squared Scores for All Series (XGBoost) ===")
print("=======================================================")
print(r2_scores_df_xgb)


--------------------------------------------

--- Results for XGBoost Regressor ---

For series of 'X'
Pipeline saved as: pipeline_xgb_X.sav
Mean Absolute Error (MAE): 12076.67
Root Mean Squared Error (RMSE): 18367.88
R-squared (R2): 0.89

--- Overfitting Check ---
R2 on Training Data: 1.00
R2 on Test Data:     0.89

Comparison of Actual vs. Predicted Prices:
   Actual Price  Predicted Price
0         22000     18312.542969
1         20500     24939.437500
2         42330     48910.050781
3        140250    172299.140625
4         18700     26073.714844

--------------------------------------------

--- Results for XGBoost Regressor ---

For series of '3'
Pipeline saved as: pipeline_xgb_3.sav
Mean Absolute Error (MAE): 3600.45
Root Mean Squared Error (RMSE): 4875.13
R-squared (R2): 0.83

--- Overfitting Check ---
R2 on Training Data: 1.00
R2 on Test Data:     0.83

Comparison of Actual vs. Predicted Prices:
   Actual Price  Predicted Price
0         23800     20963.255859
1         22

In [45]:
all_df_results = pd.concat([r2_scores_df_lr, r2_scores_df_dt, r2_scores_df_rf, r2_scores_df_gb, r2_scores_df_xgb], axis=1)

all_df_results = all_df_results.loc[:, ~all_df_results.columns.duplicated()]

all_df_results

Unnamed: 0,Series,R2_Score_LR,R2_Score_DT,R2_Score_RF,R2_Score_GB,R2_Score_XGB
0,X,0.8,0.94,0.98,0.97,0.97
1,5,0.75,0.84,0.97,0.94,0.95
2,3,0.71,0.82,0.91,0.94,0.89
3,7,0.62,0.82,0.9,0.93,0.83


In [47]:
# save the model to disk

model_lr = LinearRegression()
filename = 'finalized_model_bmw_lr.sav'
pickle.dump(model_lr, open(filename, 'wb'))

model_dt = DecisionTreeRegressor(random_state=42)
filename = 'finalized_model_bmw_dt.sav'
pickle.dump(model_dt, open(filename, 'wb'))

model_rf = RandomForestRegressor(random_state=42)
filename = 'finalized_model_bmw_rf.sav'
pickle.dump(model_rf, open(filename, 'wb'))

model_gb = GradientBoostingRegressor(random_state=42)
filename = 'finalized_model_bmw_gb_new.sav'
pickle.dump(model_gb, open(filename, 'wb'))

model_kn = KNeighborsRegressor()
filename = 'finalized_model_bmw_kn.sav'
pickle.dump(model_kn, open(filename, 'wb'))

model_xgb = XGBRegressor(random_state=42)
filename = 'finalized_model_bmw_xgb.sav'
pickle.dump(model_xgb, open(filename, 'wb'))
 
# some time later...
 
# load the model from disk
#loaded_model = pickle.load(open(filename, 'rb'))
#result = loaded_model.score(X_test, Y_test)
#print(result)