In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import yaml

In [None]:
# Load the MediaMarkt product spreadsheet into a DataFrame.
# A forward slash is used as the separator: "\m" is not a valid string escape
# (SyntaxWarning on recent Python versions) and a backslash separator only
# resolves on Windows, while "/" works on every platform.
df = pd.read_excel('Data/mediamarkt_products.xlsx')

In [None]:
# Columns of the source sheet (Hungarian headers, plus "price") that are kept.
# Currently left out on purpose: "Modell azonosító", "reviews".
selected_cols = [
    "price",
    "Termék típusa",
    "Operációs rendszer",
    "Kapacitás",
    "Memóriaméret",
    "Képernyőátló",
    "Felbontás szélesség",
    "Felbontás magasság",
    "Hátsó kamera",
    "Hátsó kamera felbontás",
    "Előlapi kamera",
    "Előlapi kamera felbontása",
    "Arcfelismerés",
    "Dual SIM",
    "Szélesség",
    "Magasság",
    "Mélység",
    "Tömeg",
    "Akkumulátor kapacitás",
]

In [None]:
# Keep only the selected columns and replace the Hungarian headers with
# short English names ("price" already has its final name).
df = df[selected_cols].rename(columns={
    "Termék típusa": "type",
    "Operációs rendszer": "op",
    "Kapacitás": "capacity",
    "Memóriaméret": "memory",
    "Képernyőátló": "screen_diagonal",
    "Felbontás szélesség": "pixel_w",
    "Felbontás magasság": "pixel_h",
    "Hátsó kamera": "back_camera",
    "Hátsó kamera felbontás": "back_camera_resolution",
    "Előlapi kamera": "front_camera",
    "Előlapi kamera felbontása": "front_camera_resolution",
    "Arcfelismerés": "face_id",
    "Dual SIM": "dual_sim",
    "Szélesség": "w",
    "Magasság": "h",
    "Mélység": "d",
    "Tömeg": "mass",
    "Akkumulátor kapacitás": "battery",
    # "Modell azonosító": "model_id",  (column is not selected at the moment)
})

In [None]:
# Quick audit of every column: its distinct values and how many entries are
# missing, followed by a 40-character separator line.
for col in df.columns:
    print(
        f"Column: {col}",
        f"  Unique Values: {df[col].unique().tolist()}",
        f"  Number of Missing: {df[col].isnull().sum()}",
        "-" * 40,
        sep="\n",
    )

In [None]:
# Price: trim commas at the ends, delete every whitespace character,
# non-breaking space and comma inside the text, then cast to integer.
df['price'] = (
    df['price']
    .str.strip(',')
    .str.replace(r'[\s\u00A0,]', '', regex=True)
    .astype(int)
)

In [None]:
# Reviews
#df['reviews'] = df['reviews'].str.split(' ').str[0].astype(int)
#df.reviews.isna().sum() # none missing

In [None]:
# Type
# Rows without a product type are removed (original note: 4 rows, missing in all columns).
df = df.dropna(subset=['type']).reset_index(drop=True)

# Lower-case the label and keep only rows whose type mentions "okostelefon".
df['type'] = df['type'].str.lower()
df = df[df['type'].str.contains('okostelefon', case=False, na=False)].copy()

# Binary flag: 1 when the label contains "iphone", 0 for every other remaining row.
df['type'] = df['type'].str.contains('iphone', regex=False).map({True: 1, False: 0})

In [None]:
# Operating system: removed; per the original note it carries the same information as `type`.
df = df.drop(columns=['op'])

In [None]:
# Capacity: rewrite the '1 TB' label as '1024 GB', then keep the number in
# front of the first space as an integer.
df = df.assign(capacity=df['capacity'].replace({'1 TB': '1024 GB'}))
df['capacity'] = df['capacity'].str.split(' ').str.get(0).astype(int)

In [None]:
# Memory: the token before the first space is the size; store it as an integer.
memory_tokens = df['memory'].str.split(' ')
df['memory'] = memory_tokens.str.get(0).astype(int)

In [None]:
# Screen diagonal: the token before the first space is the value; store it as a float.
diagonal_tokens = df['screen_diagonal'].str.split(' ')
df['screen_diagonal'] = diagonal_tokens.str.get(0).astype(float)

In [None]:
# Pixel width & height: rows missing either value are dropped, then the
# leading number of each text value is kept as an integer.
df = df.dropna(subset=['pixel_w', 'pixel_h'])
for pixel_col in ('pixel_w', 'pixel_h'):
    df[pixel_col] = df[pixel_col].str.split(' ').str[0].astype(int)

In [None]:
# Back camera & resolution
# Camera setup label -> number of lenses; labels outside the mapping become NaN.
df['back_camera'] = df['back_camera'].map(
    {'Single Cam': 1, 'Dual Cam': 2, 'Triple Cam': 3, 'Quad Cam': 4}
)

# Resolution: the first run of digits in the text, as float; missing values
# are filled with the column median.
back_resolution = (
    df['back_camera_resolution'].str.extract(r'(\d+)', expand=False).astype(float)
)
df['back_camera_resolution'] = back_resolution.fillna(back_resolution.median())

In [None]:
# Front camera & resolution
# Camera setup label -> number of lenses; labels outside the mapping become NaN.
df['front_camera'] = df['front_camera'].map({'Single Cam': 1, 'Dual Cam': 2})

# Resolution: the token before the first space, as an integer.
df['front_camera_resolution'] = (
    df['front_camera_resolution'].str.split(' ').str.get(0).astype(int)
)


In [None]:
# Face ID & Dual Sim
# 'Igen' -> 1 and 'Nem' -> 0; missing entries are counted as 0.
for flag_col in ('face_id', 'dual_sim'):
    df[flag_col] = (
        df[flag_col]
        .replace({'Igen': 1, 'Nem': 0})
        .fillna(0)
        .astype(int)
    )

In [None]:
# Width & Height & Depth: the token before the first space, as a float.
for dim_col in ('w', 'h', 'd'):
    df[dim_col] = df[dim_col].str.split(' ').str[0].astype(float)

# Keep only rows with height above 100 and width below 100
# (presumably millimetres, filtering out implausible entries -- TODO confirm).
# Rows with a missing height or width fail the comparison and are removed too.
df = df[(df['h'] > 100) & (df['w'] < 100)]

In [None]:
# Mass: the token before the first space, as a float.
mass_values = df['mass'].str.split(' ').str[0].astype(float)

# Values below 1 are multiplied by 1000
# (presumably entries given in kg rather than g -- TODO confirm).
df['mass'] = mass_values.mask(mass_values < 1, mass_values * 1000)

In [None]:
# Battery: the first run of digits in the text, as float; missing values are
# filled with the column median.
battery_values = df['battery'].str.extract(r'(\d+)', expand=False).astype(float)
df['battery'] = battery_values.fillna(battery_values.median())

In [None]:
# Number of missing values left in each column after cleaning.
df.isnull().sum()

In [None]:
# One histogram per column: 12x8 figure, 20 bins, sky-blue bars with black
# edges, grid lines switched off.
ax = df.hist(figsize=(12, 8), bins=20, edgecolor='black', color='skyblue', grid=False)

# Re-apply each subplot's own title and axis labels with larger fonts.
for subplot in ax.flat:
    subplot.set_title(subplot.get_title(), fontsize=14)
    subplot.set_xlabel(subplot.get_xlabel(), fontsize=12)
    subplot.set_ylabel(subplot.get_ylabel(), fontsize=12)
    subplot.tick_params(axis='both', labelsize=10)

plt.tight_layout()
plt.show()

In [None]:
# front_camera is not used as a model feature, so the column is removed here.
df = df.drop(columns=['front_camera'])

In [None]:
# Display the cleaned frame as the cell output.
df

In [None]:
from pycaret.regression import *
from sklearn.model_selection import train_test_split

In [None]:
# Specify column types
binary_cat = ['type', 'face_id', 'dual_sim']
ordinal_cat = ['back_camera']
numeric_columns = [
 'capacity', 'memory', 'screen_diagonal', 
 'pixel_w', 'pixel_h', 'back_camera_resolution',
 'front_camera_resolution', 'w', 'h', 
 'd', 'mass', 'battery'
 ]

In [None]:
# min / max / mean / std (as plain floats) for every numeric feature that is
# present in the frame. These are taken from the values as they are at this
# point, i.e. before the min-max scaling cell further down.
stats = {
    col: {
        stat_name: float(getattr(df[col], stat_name)())
        for stat_name in ('min', 'max', 'mean', 'std')
    }
    for col in numeric_columns
    if col in df.columns
}

# Save to YAML, keeping the insertion order of the keys.
with open('numeric_stats.yaml', 'w') as f:
    yaml.dump(stats, f, sort_keys=False)

In [None]:
# Apply min-max normalization: every numeric column is rescaled to [0, 1]
# using that column's own minimum and maximum.
# NOTE(review): the minimum and maximum come from the whole frame, so rows that
# later end up in the held-out test split also influence the scaling.
column_min = df[numeric_columns].min()
column_range = df[numeric_columns].max() - column_min
df[numeric_columns] = (df[numeric_columns] - column_min) / column_range

In [None]:
# Hold out 10% of the rows as the final test set; the fixed seed keeps the
# split reproducible.
train_val_df, test_df = train_test_split(
    df,
    test_size=0.1,
    random_state=42,
)

In [None]:
# Configure the PyCaret regression experiment on the train/validation part.
# - 80% of train_val_df is used for training, the rest for validation.
# - The 0/1 flags are declared as categorical features.
# - back_camera is declared as ordinal only. Its levels are the integer codes
#   1..4 produced by the camera mapping during cleaning; the previous string
#   levels ('1'..'4') never matched the numeric column values. It is no longer
#   listed as a plain categorical feature as well, so that only the ordinal
#   encoding applies to it.
# - normalize=False because the numeric features were min-max scaled beforehand.
regression_setup = setup(
    data=train_val_df,
    target='price',
    train_size=0.8,
    numeric_features=numeric_columns,
    categorical_features=binary_cat,
    ordinal_features={
        'back_camera': [1, 2, 3, 4]
    },
    normalize=False,  # already normalized
    session_id=42,
    verbose=False
)

In [None]:
# Rank the candidate models by MAE and keep the best one.
best_model = compare_models(n_select=1, sort='MAE')

# Tune on the same metric that was used for the ranking (tune_model would
# otherwise optimize its own default metric), and keep the untuned model if
# tuning does not improve on it.
tuned_model = tune_model(best_model, optimize='MAE', choose_better=True)

# Refit the chosen model on the complete setup data (training + hold-out part).
final_model = finalize_model(tuned_model)

In [None]:
# Persist the finalized model under Models/.
# The folder is created first (no-op if it already exists) so that saving does
# not fail on a fresh checkout where Models/ is missing.
os.makedirs('Models', exist_ok=True)
save_model(final_model, 'Models/best_regression_model')

In [None]:
# Score the held-out test rows with the final model.
predictions = predict_model(final_model, data=test_df)

actual_price = predictions['price']
price_span = [actual_price.min(), actual_price.max()]

# Actual vs. predicted price; the dashed red diagonal marks perfect predictions.
fig, axis = plt.subplots(figsize=(8, 6))
axis.scatter(actual_price, predictions['prediction_label'], alpha=0.7)
axis.plot(price_span, price_span, 'r--')
axis.set_xlabel('Actual Price')
axis.set_ylabel('Predicted Price')
axis.set_title('Actual vs Predicted Price')
axis.grid(True)
fig.tight_layout()
plt.show()