In [1]:
import requests, os, re, tqdm, time, pickle
import numpy as np, pandas as pd, matplotlib as mpl, matplotlib.pyplot as plt, seaborn as sns

from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split 
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error, accuracy_score, f1_score, precision_score, recall_score, \
classification_report, confusion_matrix
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

#ignore warnings
import warnings
warnings.filterwarnings(action= 'ignore')

In [2]:
mercedes_df = pd.read_excel('Mercedes_new.xlsx')

In [3]:
mercedes_df.head(2)

Unnamed: 0,Bashlig,Sheher,Model,Buraxilish ili,Ban novu,Reng,Motor,Km,Suretler qutusu,Oturucu,...,Kredit ve Barter,Resmi Satish,Qiymet,Elaveler,Sifarisle,Shekil urli,Yerlerin sayi,Veziyyeti,Hansi Bazar,Url
0,"Mercedes S 500, 4.7 L, 2015 il, 159 200 km",Bakı,S 500,2015,Sedan,Boz,4.7 L / 455 a.g. / Benzin,159 200 km,Avtomat (AT),Arxa,...,"Barter,None",Abu Dabi Cars №1,≈ 64 430 ₼,"Yüngül lehimli disklər, ABS, Lyuk, Yağış senso...",,https://turbo.azstatic.com/uploads/thumbnail/2...,5.0,"Vuruğu yoxdur, rənglənməyib",Avropa,https://turbo.az/autos/9530942-mercedes-s-500
1,"Mercedes E 350 e , 2.0 L, 2025 il, yeni",Bakı,E 350 e,2025,Sedan,Ağ,2.0 L / 320 a.g. / Plug-in Hibrid,0 km,Avtomat (AT),Arxa,...,"Kredit,Barter",Mercedes-Benz AutoStar Kaukasus GmbH,≈ 151 092 ₼,"Yüngül lehimli disklər, ABS, Lyuk, Yağış senso...",,https://turbo.azstatic.com/uploads/thumbnail/2...,,"Vuruğu yoxdur, rənglənməyib",,https://turbo.az/autos/9724042-mercedes-e-350-e


In [7]:
# 1. Data Separating

mercedes_df.drop_duplicates(inplace=True)
seperated = mercedes_df['Motor'].str.split('/', expand=True).apply(lambda x: x.str.strip())

for i in seperated.columns:
    mercedes_df.loc[seperated[i].str.contains(r'\d+\.\d+ L', na=False), 'Motor gucu'] = seperated[i].str.split('L').str[0]
    mercedes_df.loc[seperated[i].str.contains(r'\d+ a\.g\.', na=False), 'At gucu'] = seperated[i].str.split('a.g.').str[0]
mercedes_df['Yanacaq novu'] = mercedes_df['Motor'].str.split('/').str[-1]

mercedes_df.drop('Motor', axis=1, inplace=True)

mercedes_df.rename(columns={'Yeni ya Kohne': 'Yeni?'}, inplace=True)

mercedes_df['Km'] = mercedes_df['Km'].str.split('km').str[0]
mercedes_df['At gucu'] = mercedes_df['At gucu'].str.split(' ').str[0]
mercedes_df['At gucu'] = mercedes_df['At gucu'].astype(int)

mercedes_df['Qiymet'] = mercedes_df['Qiymet'].str.extract(r'(\d+\s+\d+)')


mercedes_df['Motor gucu'] = mercedes_df['Motor gucu'].astype(float)
mercedes_df['Km'] = mercedes_df['Km'].str.replace(' ', '')
mercedes_df['Km'] = mercedes_df['Km'].astype(int)

mercedes_df['Qiymet'] = mercedes_df['Qiymet'].str.replace(' ', '')
mercedes_df['Qiymet'] = mercedes_df['Qiymet'].astype(float)

mercedes_df['Sifarisle'].fillna('Sifarishsiz', inplace=True)
mercedes_df['Motor gucu'].fillna(0, inplace=True)
mercedes_df['Resmi Satish'].fillna('Shexsi', inplace=True)
mercedes_df['Hansi Bazar'].fillna('Yoxdur', inplace=True)

mercedes_df.drop(mercedes_df[mercedes_df['Qiymet'].isnull()].index, axis=0, inplace=True)

target_types = ['Universal, 5 qapı', 'Hetçbek, 5 qapı', 'Kabriolet','Rodster', 'Kompakt-Van', 'Avtobus', 'Dartqı', 
                'Liftbek', 'Mikroavtobus', 'Yük maşını', 'Kupe']
new_mercedes_df = mercedes_df[~mercedes_df['Ban novu'].isin(target_types)].copy()


new_mercedes_df['Elaveler'].fillna('Yüngül lehimli disklər, ABS, Kondisioner', inplace=True)
new_mercedes_df['Qiymet'] = new_mercedes_df['Qiymet'].astype(int)


new_mercedes_df['Kredit'] = np.where(new_mercedes_df['Kredit ve Barter'].str.contains('Kredit', na=False), 'var', 'yox')
new_mercedes_df['Barter'] = np.where(new_mercedes_df['Kredit ve Barter'].str.contains('Barter', na=False), 'var', 'yox')

new_mercedes_df.drop('Kredit ve Barter', axis=1, inplace=True)
new_mercedes_df.drop(new_mercedes_df[new_mercedes_df['Veziyyeti'] == 'Vuruğu var, rənglənməyib'].index, axis=0, inplace=True)
new_mercedes_df.drop('Veziyyeti', axis=1, inplace=True)

new_mercedes_df.loc[new_mercedes_df['Resmi Satish'] != 'Shexsi', 'Resmi Satish'] = 'Salon'

new_mercedes_df = new_mercedes_df[~new_mercedes_df['Yerlerin sayi'].isin(['1', '2'])]


new_mercedes_df.loc[new_mercedes_df['Yerlerin sayi'].isnull() & new_mercedes_df['Yerlerin sayi'].str.startswith('F', na=False), 'Yerlerin sayi'] = '8'
new_mercedes_df.loc[new_mercedes_df['Yerlerin sayi'].isnull() & new_mercedes_df['Yerlerin sayi'].str.startswith('S', na=False), 'Yerlerin sayi'] = '4'
new_mercedes_df.loc[new_mercedes_df['Yerlerin sayi'].isnull() & new_mercedes_df['Yerlerin sayi'].str.startswith('O', na=False), 'Yerlerin sayi'] = '5'
new_mercedes_df.loc[new_mercedes_df['Yerlerin sayi'].isnull() & new_mercedes_df['Yerlerin sayi'].str.startswith('SUV', na=False), 'Yerlerin sayi'] = '5'
new_mercedes_df.loc[new_mercedes_df['Yerlerin sayi'] == '8+', 'Yerlerin sayi'] = '8'
new_mercedes_df.loc[new_mercedes_df['Yerlerin sayi'].isnull(), 'Yerlerin sayi'] = '5'

In [9]:
new_mercedes_df.to_excel('Mercedes for Streamlit filtering.xlsx', index=False)

In [11]:
new_mercedes_df.drop(columns = {'Bashlig','Sheher', 'Kredit', 'Barter', 'Shekil urli', 
                           'Url', 'Yeni?', 'Sifarisle', 'Elaveler'},
                axis=1, inplace=True)

df = new_mercedes_df.copy()
df['Series'] = [None] * df.shape[0]

df = df[['Series', 'Model', 'Buraxilish ili', 'Ban novu', 'Reng', 'Km', 'Suretler qutusu',
       'Oturucu', 'Resmi Satish', 'Qiymet', 'Yerlerin sayi', 'Hansi Bazar',
       'Motor gucu', 'At gucu', 'Yanacaq novu' ]]

df = df[~df['Model'].str.startswith('Sprinter')]
df = df[~df['Model'].str.startswith('CLA')]

df.loc[df[df['Model'].str.startswith('E')].index, 'Series'] = 'E'
df.loc[df[df['Model'].str.startswith('C')].index, 'Series'] = 'C'
df.loc[df[df['Model'].str.startswith('S')].index, 'Series'] = 'S'
#df.loc[df['Model'].str.startswith(('G ', 'GLS', 'ML', 'GL ', 'GLE', 'GLC'), na=False), 'Series'] = 'SUV'
df.loc[df[df['Model'].str.startswith('V')].index, 'Series'] = 'V'

df = df[~df['Series'].isnull()]
df['Model'] = df['Model'].str.replace('E', '')
df['Model'] = df['Model'].str.replace('C', '')
df['Model'] = df['Model'].str.replace('S', '')
df['Model'] = df['Model'].str.replace('Vito', '')
df['Model'] = df['Model'].str.replace(r'^(G|GLS|ML|GL|GLE|GLC|4MATIC)', '', regex=True)

df['Model'] = df['Model'].str.extract(r'(\d+)')
df.drop(df[df['Model'].isnull()].index, axis=0, inplace=True)

df['Model'] = df['Model'].astype(int)
df['Yerlerin sayi'] = df['Yerlerin sayi'].astype(int)
df = df[df['Series'] != 'V']

In [13]:
df.to_excel('Mercedes for Streamlit modelling.xlsx', index=False)

### Scaling and Modelling for LinearRegression

In [15]:
all_series_score_lr = []

for series in df['Series'].unique():
    single_model_df = df[df['Series'] == series].copy()
    single_model_df.drop(['Series', 'Hansi Bazar', 'Reng', 'Resmi Satish'], axis=1, inplace=True)

    X = single_model_df.drop(columns=['Qiymet'])
    y = single_model_df['Qiymet']

    numerical_features = X.select_dtypes(include=np.number).columns.tolist()
    categorical_features = X.select_dtypes(exclude=np.number).columns.tolist()

    active_categorical_features = [
        col for col in categorical_features if X[col].nunique() > 1
    ]

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', RobustScaler(), numerical_features),
            ('cat', OneHotEncoder(handle_unknown='ignore', drop='first'), active_categorical_features)
        ],
        remainder='drop'
    )

    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', LinearRegression())
    ])

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.15, random_state=42
    )

    pipeline.fit(X_train, y_train)

    filename = f'pipeline_lr_{str(series).replace(" ", "_")}.sav'
    with open(filename, 'wb') as f:
        pickle.dump(pipeline, f)

    predictions_lr = pipeline.predict(X_test)
    train_predictions_lr = pipeline.predict(X_train)

    mae_lr = mean_absolute_error(y_test, predictions_lr)
    mse_lr = mean_squared_error(y_test, predictions_lr)
    rmse_lr = np.sqrt(mse_lr)
    r2_lr = r2_score(y_test, predictions_lr)
    train_r2_lr = r2_score(y_train, train_predictions_lr)

    print('\n--------------------------------------------')
    print(f"\n--- Results for Linear Regression ---")
    print(f"\nFor series of '{series}'")
    print(f"Pipeline saved as: {filename}")
    print(f"Mean Absolute Error (MAE): {mae_lr:.2f}")
    print(f"Root Mean Squared Error (RMSE): {rmse_lr:.2f}")
    print(f"R-squared (R2): {r2_lr:.2f}\n")

    print(f"--- Overfitting Check ---")
    print(f"R2 on Training Data: {train_r2_lr:.2f}")
    print(f"R2 on Test Data:     {r2_lr:.2f}")

    results_df_lr = pd.DataFrame({
        'Actual Price': y_test,
        'Predicted Price': predictions_lr
    })
    results_df_lr = results_df_lr.reset_index(drop=True)

    print("\nComparison of Actual vs. Predicted Prices:")
    print(results_df_lr.head())

    series_result_lr = {
        'Series': series,
        'R2_Score_LR': round(r2_lr, 2)
    }

    all_series_score_lr.append(series_result_lr)

r2_scores_df_lr = pd.DataFrame(all_series_score_lr)
r2_scores_df_lr = r2_scores_df_lr.sort_values(by='R2_Score_LR', ascending=False, ignore_index=True)

print("\n=======================================================")
print("=== Final R-squared Scores for All Series (Linear Regression) ===") 
print("=======================================================")
print(r2_scores_df_lr)


--------------------------------------------

--- Results for Linear Regression ---

For series of 'S'
Pipeline saved as: pipeline_lr_S.sav
Mean Absolute Error (MAE): 23800.48
Root Mean Squared Error (RMSE): 44706.13
R-squared (R2): 0.68

--- Overfitting Check ---
R2 on Training Data: 0.82
R2 on Test Data:     0.68

Comparison of Actual vs. Predicted Prices:
   Actual Price  Predicted Price
0         57630     60714.777644
1         64430     58449.401532
2         51340     62749.290658
3         69700     68231.538061
4          8800      5551.924346

--------------------------------------------

--- Results for Linear Regression ---

For series of 'E'
Pipeline saved as: pipeline_lr_E.sav
Mean Absolute Error (MAE): 7979.65
Root Mean Squared Error (RMSE): 11616.78
R-squared (R2): 0.81

--- Overfitting Check ---
R2 on Training Data: 0.86
R2 on Test Data:     0.81

Comparison of Actual vs. Predicted Prices:
   Actual Price  Predicted Price
0         14500     15473.364617
1         415

### Modelling for DecisionTree

In [17]:
all_series_score_dt = []

for series in df['Series'].unique():
    single_model_df = df[df['Series'] == series].copy()
    single_model_df.drop(['Series', 'Hansi Bazar', 'Reng', 'Resmi Satish'], axis=1, inplace=True)

    X = single_model_df.drop(columns=['Qiymet'])
    y = single_model_df['Qiymet']

    numerical_features = X.select_dtypes(include=np.number).columns.tolist()
    categorical_features = X.select_dtypes(exclude=np.number).columns.tolist()

    active_categorical_features = [
        col for col in categorical_features if X[col].nunique() > 1
    ]

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', RobustScaler(), numerical_features),
            ('cat', OneHotEncoder(handle_unknown='ignore', drop='first'), active_categorical_features)
        ],
        remainder='drop'
    )

    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', DecisionTreeRegressor(random_state=42))
    ])

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.15, random_state=42
    )

    pipeline.fit(X_train, y_train)

    filename = f'pipeline_dt_{str(series).replace(" ", "_")}.sav'
    with open(filename, 'wb') as f:
        pickle.dump(pipeline, f)

    predictions_dt = pipeline.predict(X_test)
    train_predictions_dt = pipeline.predict(X_train)

    mae_dt = mean_absolute_error(y_test, predictions_dt)
    mse_dt = mean_squared_error(y_test, predictions_dt)
    rmse_dt = np.sqrt(mse_dt)
    r2_dt = r2_score(y_test, predictions_dt)
    train_r2_dt = r2_score(y_train, train_predictions_dt)

    print('\n--------------------------------------------')
    print(f"\n--- Results for Decision Tree ---")
    print(f"\nFor series of '{series}'")
    print(f"Pipeline saved as: {filename}")
    print(f"Mean Absolute Error (MAE): {mae_dt:.2f}")
    print(f"Root Mean Squared Error (RMSE): {rmse_dt:.2f}")
    print(f"R-squared (R2): {r2_dt:.2f}\n")

    print(f"--- Overfitting Check ---")
    print(f"R2 on Training Data: {train_r2_dt:.2f}")
    print(f"R2 on Test Data:     {r2_dt:.2f}")

    results_df_dt = pd.DataFrame({
        'Actual Price': y_test,
        'Predicted Price': predictions_dt
    })
    results_df_dt = results_df_dt.reset_index(drop=True)

    print("\nComparison of Actual vs. Predicted Prices:")
    print(results_df_dt.head())

    series_result_dt = {
        'Series': series,
        'R2_Score_DT': round(r2_dt, 2)
    }

    all_series_score_dt.append(series_result_dt)

r2_scores_df_dt = pd.DataFrame(all_series_score_dt)
r2_scores_df_dt = r2_scores_df_dt.sort_values(by='R2_Score_DT', ascending=False, ignore_index=True) # <-- Changed key

print("\n=======================================================")
print("=== Final R-squared Scores for All Series (Decision Tree) ===")
print("=======================================================")
print(r2_scores_df_dt)


--------------------------------------------

--- Results for Decision Tree ---

For series of 'S'
Pipeline saved as: pipeline_dt_S.sav
Mean Absolute Error (MAE): 8642.15
Root Mean Squared Error (RMSE): 11075.45
R-squared (R2): 0.98

--- Overfitting Check ---
R2 on Training Data: 1.00
R2 on Test Data:     0.98

Comparison of Actual vs. Predicted Prices:
   Actual Price  Predicted Price
0         57630          69700.0
1         64430          71400.0
2         51340          37000.0
3         69700          69700.0
4          8800           5500.0

--------------------------------------------

--- Results for Decision Tree ---

For series of 'E'
Pipeline saved as: pipeline_dt_E.sav
Mean Absolute Error (MAE): 5180.96
Root Mean Squared Error (RMSE): 10044.55
R-squared (R2): 0.85

--- Overfitting Check ---
R2 on Training Data: 1.00
R2 on Test Data:     0.85

Comparison of Actual vs. Predicted Prices:
   Actual Price  Predicted Price
0         14500          15450.0
1         41500       

### Modelling for RandomForest

In [19]:
all_series_score_rf = []

for series in df['Series'].unique():
    single_model_df = df[df['Series'] == series].copy()
    single_model_df.drop(['Series', 'Hansi Bazar', 'Reng', 'Resmi Satish'], axis=1, inplace=True)

    X = single_model_df.drop(columns=['Qiymet'])
    y = single_model_df['Qiymet']

    numerical_features = X.select_dtypes(include=np.number).columns.tolist()
    categorical_features = X.select_dtypes(exclude=np.number).columns.tolist()

    active_categorical_features = [
        col for col in categorical_features if X[col].nunique() > 1
    ]

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', RobustScaler(), numerical_features),
            ('cat', OneHotEncoder(handle_unknown='ignore', drop='first'), active_categorical_features)
        ],
        remainder='drop'
    )

    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', RandomForestRegressor(random_state=42))
    ])

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.15, random_state=42
    )

    pipeline.fit(X_train, y_train)

    filename = f'pipeline_rf_{str(series).replace(" ", "_")}.sav'
    with open(filename, 'wb') as f:
        pickle.dump(pipeline, f)

    predictions_rf = pipeline.predict(X_test)
    train_predictions_rf = pipeline.predict(X_train)

    mae_rf = mean_absolute_error(y_test, predictions_rf)
    mse_rf = mean_squared_error(y_test, predictions_rf)
    rmse_rf = np.sqrt(mse_rf)
    r2_rf = r2_score(y_test, predictions_rf)
    train_r2_rf = r2_score(y_train, train_predictions_rf)

    print('\n--------------------------------------------')
    print(f"\n--- Results for Random Forest ---")
    print(f"\nFor series of '{series}'")
    print(f"Pipeline saved as: {filename}")
    print(f"Mean Absolute Error (MAE): {mae_rf:.2f}")
    print(f"Root Mean Squared Error (RMSE): {rmse_rf:.2f}")
    print(f"R-squared (R2): {r2_rf:.2f}\n")

    print(f"--- Overfitting Check ---")
    print(f"R2 on Training Data: {train_r2_rf:.2f}")
    print(f"R2 on Test Data:     {r2_rf:.2f}")

    results_df_rf = pd.DataFrame({
        'Actual Price': y_test,
        'Predicted Price': predictions_rf
    })
    results_df_rf = results_df_rf.reset_index(drop=True)

    print("\nComparison of Actual vs. Predicted Prices:")
    print(results_df_rf.head())

    series_result_rf = {
        'Series': series,
        'R2_Score_RF': round(r2_rf, 2)
    }

    all_series_score_rf.append(series_result_rf)

r2_scores_df_rf = pd.DataFrame(all_series_score_rf)
r2_scores_df_rf = r2_scores_df_rf.sort_values(by='R2_Score_RF', ascending=False, ignore_index=True)

print("\n=======================================================")
print("=== Final R-squared Scores for All Series ===")
print("=======================================================")
print(r2_scores_df_rf)


--------------------------------------------

--- Results for Random Forest ---

For series of 'S'
Pipeline saved as: pipeline_rf_S.sav
Mean Absolute Error (MAE): 11788.85
Root Mean Squared Error (RMSE): 23734.17
R-squared (R2): 0.91

--- Overfitting Check ---
R2 on Training Data: 0.99
R2 on Test Data:     0.91

Comparison of Actual vs. Predicted Prices:
   Actual Price  Predicted Price
0         57630         67968.60
1         64430         65123.60
2         51340         34831.60
3         69700         70689.87
4          8800          7275.95

--------------------------------------------

--- Results for Random Forest ---

For series of 'E'
Pipeline saved as: pipeline_rf_E.sav
Mean Absolute Error (MAE): 3730.33
Root Mean Squared Error (RMSE): 6175.21
R-squared (R2): 0.95

--- Overfitting Check ---
R2 on Training Data: 0.99
R2 on Test Data:     0.95

Comparison of Actual vs. Predicted Prices:
   Actual Price  Predicted Price
0         14500          13805.5
1         41500       

### Modelling with GradientBoost

In [21]:
all_series_score_gb = []

for series in df['Series'].unique():
    single_model_df = df[df['Series'] == series].copy()
    single_model_df.drop(['Series', 'Hansi Bazar', 'Reng', 'Resmi Satish' ], axis=1, inplace=True)

    X = single_model_df.drop(columns=['Qiymet'])
    y = single_model_df['Qiymet']

    numerical_features = X.select_dtypes(include=np.number).columns.tolist()
    categorical_features = X.select_dtypes(exclude=np.number).columns.tolist()

    active_categorical_features = [
        col for col in categorical_features if X[col].nunique() > 1
    ]

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', RobustScaler(), numerical_features),
            ('cat', OneHotEncoder(handle_unknown='ignore', drop='first'), active_categorical_features)
        ],
        remainder='drop'
    )

    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', GradientBoostingRegressor(random_state=42))
    ])

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.15, random_state=42
    )

    pipeline.fit(X_train, y_train)
    
    filename = f'pipeline_gb_{str(series).replace(" ", "_")}.sav'
    with open(filename, 'wb') as f:
        pickle.dump(pipeline, f)

    predictions_gb = pipeline.predict(X_test)
    train_predictions_gb = pipeline.predict(X_train)

    mae_gb = mean_absolute_error(y_test, predictions_gb)
    mse_gb = mean_squared_error(y_test, predictions_gb)
    rmse_gb = np.sqrt(mse_gb)
    r2_gb = r2_score(y_test, predictions_gb)
    train_r2_gb = r2_score(y_train, train_predictions_gb)

    print('\n--------------------------------------------')
    print(f"\n--- Results for Gradient Boosting ---")
    print(f"\nFor series of '{series}'")
    print(f"Pipeline saved as: {filename}")
    print(f"Mean Absolute Error (MAE): {mae_gb:.2f}")
    print(f"Root Mean Squared Error (RMSE): {rmse_gb:.2f}")
    print(f"R-squared (R2): {r2_gb:.2f}\n")
    
    print(f"--- Overfitting Check ---")
    print(f"R2 on Training Data: {train_r2_gb:.2f}")
    print(f"R2 on Test Data:     {r2_gb:.2f}")

    results_df_gb = pd.DataFrame({
        'Actual Price': y_test,
        'Predicted Price': predictions_gb
    })
    results_df_gb = results_df_gb.reset_index(drop=True)

    print("\nComparison of Actual vs. Predicted Prices:")
    print(results_df_gb.head())

    series_result_gb = {
        'Series': series,
        'R2_Score_GB': round(r2_gb, 2)
    }
    
    all_series_score_gb.append(series_result_gb)

r2_scores_df_gb = pd.DataFrame(all_series_score_gb)
r2_scores_df_gb = r2_scores_df_gb.sort_values(by='R2_Score_GB', ascending=False, ignore_index=True)

print("\n=======================================================")
print("=== Final R-squared Scores for All Series ===")
print("=======================================================")
print(r2_scores_df_gb)


--------------------------------------------

--- Results for Gradient Boosting ---

For series of 'S'
Pipeline saved as: pipeline_gb_S.sav
Mean Absolute Error (MAE): 10268.11
Root Mean Squared Error (RMSE): 17490.18
R-squared (R2): 0.95

--- Overfitting Check ---
R2 on Training Data: 1.00
R2 on Test Data:     0.95

Comparison of Actual vs. Predicted Prices:
   Actual Price  Predicted Price
0         57630     68470.629156
1         64430     65335.260017
2         51340     31612.897452
3         69700     73507.559901
4          8800      6698.538600

--------------------------------------------

--- Results for Gradient Boosting ---

For series of 'E'
Pipeline saved as: pipeline_gb_E.sav
Mean Absolute Error (MAE): 3529.10
Root Mean Squared Error (RMSE): 5796.66
R-squared (R2): 0.95

--- Overfitting Check ---
R2 on Training Data: 0.99
R2 on Test Data:     0.95

Comparison of Actual vs. Predicted Prices:
   Actual Price  Predicted Price
0         14500     14357.297717
1         4150

### Modelling with XGBOOST

In [23]:
all_series_score_xgb = []

for series in df['Series'].unique():
    single_model_df = df[df['Series'] == series].copy()
    single_model_df.drop(['Series', 'Hansi Bazar', 'Reng', 'Resmi Satish'], axis=1, inplace=True)

    X = single_model_df.drop(columns=['Qiymet'])
    y = single_model_df['Qiymet']

    numerical_features = X.select_dtypes(include=np.number).columns.tolist()
    categorical_features = X.select_dtypes(exclude=np.number).columns.tolist()

    active_categorical_features = [
        col for col in categorical_features if X[col].nunique() > 1
    ]

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', RobustScaler(), numerical_features),
            ('cat', OneHotEncoder(handle_unknown='ignore', drop='first'), active_categorical_features)
        ],
        remainder='drop'
    )

    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', XGBRegressor(random_state=42))
    ])

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.15, random_state=42
    )

    pipeline.fit(X_train, y_train)

    # Changed filename to avoid overwriting other models
    filename = f'pipeline_xgb_{str(series).replace(" ", "_")}.sav'
    with open(filename, 'wb') as f:
        pickle.dump(pipeline, f)

    predictions_xgb = pipeline.predict(X_test)
    train_predictions_xgb = pipeline.predict(X_train)

    mae_xgb = mean_absolute_error(y_test, predictions_xgb)
    mse_xgb = mean_squared_error(y_test, predictions_xgb)
    rmse_xgb = np.sqrt(mse_xgb)
    r2_xgb = r2_score(y_test, predictions_xgb)
    train_r2_xgb = r2_score(y_train, train_predictions_xgb)

    print('\n--------------------------------------------')
    print(f"\n--- Results for XGBoost Regressor ---")
    print(f"\nFor series of '{series}'")
    print(f"Pipeline saved as: {filename}")
    print(f"Mean Absolute Error (MAE): {mae_xgb:.2f}")
    print(f"Root Mean Squared Error (RMSE): {rmse_xgb:.2f}")
    print(f"R-squared (R2): {r2_xgb:.2f}\n")

    print(f"--- Overfitting Check ---")
    print(f"R2 on Training Data: {train_r2_xgb:.2f}")
    print(f"R2 on Test Data:     {r2_xgb:.2f}")

    results_df_xgb = pd.DataFrame({
        'Actual Price': y_test,
        'Predicted Price': predictions_xgb
    })
    results_df_xgb = results_df_xgb.reset_index(drop=True)

    print("\nComparison of Actual vs. Predicted Prices:")
    print(results_df_xgb.head())

    series_result_xgb = {
        'Series': series,
        'R2_Score_XGB': round(r2_xgb, 2)
    }

    all_series_score_xgb.append(series_result_xgb)

r2_scores_df_xgb = pd.DataFrame(all_series_score_xgb)
r2_scores_df_xgb = r2_scores_df_xgb.sort_values(by='R2_Score_XGB', ascending=False, ignore_index=True)

print("\n=======================================================")
print("=== Final R-squared Scores for All Series (XGBoost) ===")
print("=======================================================")
print(r2_scores_df_xgb)


--------------------------------------------

--- Results for XGBoost Regressor ---

For series of 'S'
Pipeline saved as: pipeline_xgb_S.sav
Mean Absolute Error (MAE): 13388.60
Root Mean Squared Error (RMSE): 25250.37
R-squared (R2): 0.90

--- Overfitting Check ---
R2 on Training Data: 1.00
R2 on Test Data:     0.90

Comparison of Actual vs. Predicted Prices:
   Actual Price  Predicted Price
0         57630     68428.546875
1         64430     62194.437500
2         51340     36003.769531
3         69700     74704.453125
4          8800      6826.104004

--------------------------------------------

--- Results for XGBoost Regressor ---

For series of 'E'
Pipeline saved as: pipeline_xgb_E.sav
Mean Absolute Error (MAE): 3911.21
Root Mean Squared Error (RMSE): 6171.97
R-squared (R2): 0.95

--- Overfitting Check ---
R2 on Training Data: 1.00
R2 on Test Data:     0.95

Comparison of Actual vs. Predicted Prices:
   Actual Price  Predicted Price
0         14500     13307.185547
1         41

In [25]:
all_df_results = pd.concat([r2_scores_df_lr, r2_scores_df_dt, r2_scores_df_rf, r2_scores_df_gb, r2_scores_df_xgb], axis=1)

all_df_results = all_df_results.loc[:, ~all_df_results.columns.duplicated()]

all_df_results

Unnamed: 0,Series,R2_Score_LR,R2_Score_DT,R2_Score_RF,R2_Score_GB,R2_Score_XGB
0,E,0.81,0.98,0.95,0.95,0.95
1,C,0.7,0.93,0.91,0.95,0.95
2,S,0.68,0.85,0.76,0.84,0.9


In [27]:
#save the model to disk
model_lr = LinearRegression()
filename = 'finalized_model_mercedes_lr.sav'
pickle.dump(model_lr, open(filename, 'wb'))

model_dt = DecisionTreeRegressor(random_state=42)
filename = 'finalized_model_mercedes_dt.sav'
pickle.dump(model_dt, open(filename, 'wb'))

model_rf = RandomForestRegressor(random_state=42)
filename = 'finalized_model_mercedes_rf_new.sav'
pickle.dump(model_rf, open(filename, 'wb'))

model_gb = GradientBoostingRegressor(random_state=42)
filename = 'finalized_model_mercedes_gb.sav'
pickle.dump(model_gb, open(filename, 'wb'))

model_xgb = XGBRegressor(random_state=42)
filename = 'finalized_model_mercedes_xgb.sav'
pickle.dump(model_xgb, open(filename, 'wb'))
 
# some time later...
 
# load the model from disk
#loaded_model = pickle.load(open(filename, 'rb'))
#result = loaded_model.score(X_test, Y_test)
#print(result)