In [46]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler


In [47]:
# Ganti path sesuai lokasi file kamu
data = pd.read_csv("MEN_SHOES.csv")

# Cek 5 baris pertama
data.head()

Unnamed: 0,Brand_Name,How_Many_Sold,Current_Price,Product_details,RATING
0,ASIAN,2242,"₹1,098","Oxygen-01 Sports Running,Walking & Gym Shoes w...",3.8
1,ASIAN,240,₹674,"Men's Express-08 Sports Running,Walking,Gym,Tr...",4.0
2,ASIAN,16662,₹588,"Men's Cosko Sports Running,Walking,Gym,Trainin...",3.8
3,ASIAN,135,₹599,"Wind-03 Sports Running,Walking & Gym Shoes wit...",4.0
4,Reebok,240,₹982,Men's Velocity Runner Lp Running Shoe,4.0


In [48]:
print(data.columns)
print(data.dtypes)


Index(['Brand_Name', 'How_Many_Sold', 'Current_Price', 'Product_details',
       'RATING'],
      dtype='object')
Brand_Name          object
How_Many_Sold       object
Current_Price       object
Product_details     object
RATING             float64
dtype: object


In [49]:
# Cek missing value pada setiap kolom
print(data.isnull().sum())

Brand_Name           0
How_Many_Sold        0
Current_Price      798
Product_details      0
RATING               0
dtype: int64


In [50]:
from sklearn.impute import SimpleImputer

# Imputasi nilai hilang pada kolom numerik
num_cols = data.select_dtypes(include=['int64', 'float64']).columns
num_imputer = SimpleImputer(strategy='mean')
data[num_cols] = num_imputer.fit_transform(data[num_cols])

# Imputasi nilai hilang pada kolom kategorik
cat_cols_missing = [col for col in data.select_dtypes(include='object').columns if data[col].isnull().any()]

# Tambahkan baris ini untuk mendefinisikan cat_imputer
cat_imputer = SimpleImputer(strategy='most_frequent')

data[cat_cols_missing] = cat_imputer.fit_transform(data[cat_cols_missing])

# Imputasi nilai hilang pada kolom harga (jika ada)
if 'Current_Price_clean' in data.columns and data['Current_Price_clean'].isnull().any():
    data['Current_Price_clean'] = num_imputer.fit_transform(data[['Current_Price_clean']])

In [51]:
print(data.isnull().sum())

Brand_Name         0
How_Many_Sold      0
Current_Price      0
Product_details    0
RATING             0
dtype: int64


## Encoding Variabel Kategorik ##


In [52]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

# Asumsikan 'data' adalah DataFrame Anda
# Kolom target: 'Brand_Name'
# Kolom kategorik untuk OneHotEncoder: 'Product_details'
# Kolom numerik: 'How_Many_Sold', 'Current_Price', 'RATING'

# Definisikan fitur (X) dan target (y)
X = data.drop('Brand_Name', axis=1)
y = data['Brand_Name']

# --- LabelEncoder untuk Kolom Target ---
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# --- OneHotEncoder untuk Fitur Kategorik ---
# OneHotEncoder digunakan pada kolom 'Product_details'
ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
X_ohe = ohe.fit_transform(X[['Product_details']])

# Dapatkan nama-nama kolom baru dari one-hot encoding
ohe_feature_names = ohe.get_feature_names_out(['Product_details'])

# Gabungkan hasil one-hot encoding dengan fitur numerik lainnya
X_encoded = pd.DataFrame(X_ohe, columns=ohe_feature_names, index=X.index)
X_final = pd.concat([X.drop('Product_details', axis=1), X_encoded], axis=1)

print("Shape X_final:", X_final.shape)
print("Shape y_encoded:", y_encoded.shape)

Shape X_final: (23940, 62)
Shape y_encoded: (23940,)


##  Scaling Fitur Numerik ##

In [53]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# Contoh data loading (ganti dengan data asli kamu)
# data = pd.read_csv("filemu.csv")

# --- 1. Bersihkan kolom numerik dari simbol dan koma ---
# How_Many_Sold: hilangkan koma, ubah ke float
data['How_Many_Sold'] = data['How_Many_Sold'].replace(',', '', regex=True).astype(float)

# Current_Price: hilangkan simbol ₹ dan koma, ubah ke float
data['Current_Price'] = data['Current_Price'].replace('[₹,]', '', regex=True).astype(float)

# --- 2. Pilih kolom numerik (yang akan di-scale) ---
numeric_features = ['How_Many_Sold', 'Current_Price', 'RATING']

# --- 3. Imputasi NaN (jika ada) ---
imputer = SimpleImputer(strategy='mean')
numeric_imputed = imputer.fit_transform(data[numeric_features])

# --- 4. Standard Scaler (Z-score scaling) ---
scaler = StandardScaler()
numeric_scaled = scaler.fit_transform(numeric_imputed)

# --- 5. Buat DataFrame hasil scaling ---
df_numeric_scaled = pd.DataFrame(numeric_scaled, columns=numeric_features)

# --- (Opsional) Gabungkan dengan kolom non-numerik ---
non_numeric = data.drop(columns=numeric_features).reset_index(drop=True)
df_final = pd.concat([df_numeric_scaled, non_numeric], axis=1)

# --- Lihat hasil ---
print(df_final.head())


   How_Many_Sold  Current_Price    RATING Brand_Name  \
0      -0.116968       0.619604 -0.089283      ASIAN   
1      -0.303546      -0.470111  0.397717      ASIAN   
2       1.226914      -0.691138 -0.089283      ASIAN   
3      -0.313332      -0.662867  0.397717      ASIAN   
4      -0.303546       0.321474  0.397717     Reebok   

                                     Product_details  
0  Oxygen-01 Sports Running,Walking & Gym Shoes w...  
1  Men's Express-08 Sports Running,Walking,Gym,Tr...  
2  Men's Cosko Sports Running,Walking,Gym,Trainin...  
3  Wind-03 Sports Running,Walking & Gym Shoes wit...  
4            Men's Velocity Runner Lp Running Shoe    


## Membagi Data Menjadi Training dan Testing Set ##

In [54]:
from sklearn.model_selection import train_test_split

# Asumsikan X adalah fitur dan y adalah target Anda
# Pisahkan data menjadi set pelatihan (80%) dan set pengujian (20%)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Anda juga dapat menggunakan stratified sampling jika target memiliki kelas yang tidak seimbang
# Misalnya, jika y adalah kolom target yang telah di-encode (y_encoded)
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded)

# Cetak bentuk (shape) dari setiap set untuk verifikasi
print(f"Bentuk X_train: {X_train.shape}")
print(f"Bentuk X_test: {X_test.shape}")
print(f"Bentuk y_train: {y_train.shape}")
print(f"Bentuk y_test: {y_test.shape}")

Bentuk X_train: (19152, 4)
Bentuk X_test: (4788, 4)
Bentuk y_train: (19152,)
Bentuk y_test: (4788,)


## Pelatihan dan Perbandingan Model ##

In [77]:
#pelatihan model
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV    
# === Definisikan model dan parameter tuning ===
models = {
    'Linear Regression': LinearRegression(),
    'Random Forest': RandomForestRegressor(),
    'Support Vector Regression': SVR()
}

In [75]:
# === Definisikan parameter grid untuk GridSearchCV ===
param_grids = {
    'Linear Regression': {},
    'Random Forest': {
        'n_estimators': [50, 100],
        'max_depth': [None, 10, 20]
    },
    'Support Vector Regression': {
        'kernel': ['linear', 'rbf'],
        'C': [0.1, 1, 10]
    }
}


In [None]:
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

results = []
best_models = {}
best_params = {}

for name, model in models.items():
    print(f"Training model: {name}")

    if name in param_grids:
        grid = GridSearchCV(model, param_grids[name], cv=5, scoring="neg_mean_squared_error", n_jobs=-1)
        grid.fit(X_train, y_train)
        best_model = grid.best_estimator_
        best_params[name] = grid.best_params_
    else:
        model.fit(X_train, y_train)
        best_model = model
        best_params[name] = "Default Parameters"

    best_models[name] = best_model

    # Prediksi dan evaluasi
    y_pred = best_model.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)

    # Cross-validation (R2)
    cv_r2 = cross_val_score(best_model, X_scaled, y, cv=5, scoring='r2').mean()

    results.append({
        "Model": name,
        "MAE": mae,
        "MSE": mse,
        "RMSE": rmse,
        "R2": r2,
        "CV R2": cv_r2
    })

results_df = pd.DataFrame(results)
print("\n=== Perbandingan Kinerja Model ===")
print(results_df)

print("\n=== Parameter Terbaik Tiap Model ===")
for name, params in best_params.items():
    print(f"{name}: {params}")


In [None]:
# Barplot untuk R2 dan RMSE
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Plot R²
axes[0].bar(results_df["Model"], results_df["R²"], color='skyblue')
axes[0].set_title("R² Score Comparison")
axes[0].set_ylabel("R² Score")
axes[0].set_ylim(0, 1)

# Plot RMSE
axes[1].bar(results_df["Model"], results_df["RMSE"], color='salmon')
axes[1].set_title("RMSE Comparison")
axes[1].set_ylabel("RMSE")

plt.tight_layout()
plt.show()
