In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.preprocessing import OneHotEncoder
import pickle

# Train an XGBoost regressor that predicts `price` from the Toyota listings,
# using four numeric columns plus one-hot encoded categorical columns, then
# persist the fitted model, the fitted encoder and the feature order to disk.

# Load dataset
df = pd.read_csv('toyota.csv')

# Select only the specified columns
selected_columns = ['model', 'year', 'transmission', 'fuelType', 'mpg', 'mileage', 'engineSize', 'price']
df = df[selected_columns]

# Define categorical columns
categorical_columns = ['model', 'transmission', 'fuelType']

# Create and fit encoder.
# handle_unknown='ignore' makes categories unseen at fit time encode as an
# all-zero group at transform time instead of raising.
# NOTE(review): the encoder is fit on the full frame before the train/test
# split, so the category vocabulary includes values from the test rows.
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
encoder.fit(df[categorical_columns])

# Transform the categorical data.
# The index is passed explicitly so the axis=1 concat below lines rows up with
# `df` even if `df` ever stops having a default 0..n-1 index (e.g. after
# filtering rows); without it, mismatched labels would silently produce NaNs.
encoded_categories = pd.DataFrame(
    encoder.transform(df[categorical_columns]),
    columns=encoder.get_feature_names_out(categorical_columns),
    index=df.index,
)

# Combine numerical and encoded features
numerical_features = df[['year', 'mpg', 'mileage', 'engineSize']]
features = pd.concat([numerical_features, encoded_categories], axis=1)
target = df['price']

# Print feature information
print("Number of numerical features:", len(numerical_features.columns))
print("Number of encoded features:", len(encoded_categories.columns))
print("Total features:", len(features.columns))
print("\nFeature names:")
for i, name in enumerate(features.columns):
    print(f"{i+1}. {name}")

# Split the data (80/20, fixed seed so the split is reproducible)
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=70)

# Train model
model = xgb.XGBRegressor(objective='reg:squarederror', colsample_bytree=0.3, learning_rate=0.1,
                         max_depth=5, alpha=10, n_estimators=100)
model.fit(X_train, y_train)

# Save feature names for verification (column order the model was trained on)
feature_names = list(features.columns)
with open('feature_names.pkl', 'wb') as f:
    pickle.dump(feature_names, f)

# Save model and encoder.
# Context managers guarantee each file is flushed and closed before the cell
# finishes; a bare open() passed to pickle.dump leaves the handle dangling.
with open('prediksi_hargamobil.sav', 'wb') as f:
    pickle.dump(model, f)
with open('encoder.pkl', 'wb') as f:
    pickle.dump(encoder, f)

# Print model performance.
# `pred` is not used again in this cell; it is kept so the name stays
# available in the kernel namespace for interactive inspection.
pred = model.predict(X_test)
print('\nR² Score:', model.score(X_test, y_test))

Number of numerical features: 4
Number of encoded features: 26
Total features: 30

Feature names:
1. year
2. mpg
3. mileage
4. engineSize
5. model_ Auris
6. model_ Avensis
7. model_ Aygo
8. model_ C-HR
9. model_ Camry
10. model_ Corolla
11. model_ GT86
12. model_ Hilux
13. model_ IQ
14. model_ Land Cruiser
15. model_ PROACE VERSO
16. model_ Prius
17. model_ RAV4
18. model_ Supra
19. model_ Urban Cruiser
20. model_ Verso
21. model_ Verso-S
22. model_ Yaris
23. transmission_Automatic
24. transmission_Manual
25. transmission_Other
26. transmission_Semi-Auto
27. fuelType_Diesel
28. fuelType_Hybrid
29. fuelType_Other
30. fuelType_Petrol

R² Score: 0.9642820358276367


In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_absolute_error
import pickle

# Train an XGBoost regressor that predicts `price` from a reduced feature set
# (year, mileage and one-hot encoded model/transmission/fuelType), report
# MAE / MAPE on the held-out split, and persist the model, encoder, feature
# order, metrics and test split as pickle files.

# Load dataset
df = pd.read_csv('toyota.csv')

# Select only the specified columns
selected_columns = ['model', 'year', 'transmission', 'fuelType', 'mileage', 'price']
df = df[selected_columns]

# Define categorical columns
categorical_columns = ['model', 'transmission', 'fuelType']

# Create and fit encoder.
# handle_unknown='ignore' makes categories unseen at fit time encode as an
# all-zero group at transform time instead of raising.
# NOTE(review): the encoder is fit on the full frame before the train/test
# split, so the category vocabulary includes values from the test rows.
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
encoder.fit(df[categorical_columns])

# Transform the categorical data.
# The index is passed explicitly so the axis=1 concat below lines rows up with
# `df` even if `df` ever stops having a default 0..n-1 index (e.g. after
# filtering rows); without it, mismatched labels would silently produce NaNs.
encoded_categories = pd.DataFrame(
    encoder.transform(df[categorical_columns]),
    columns=encoder.get_feature_names_out(categorical_columns),
    index=df.index,
)

# Combine numerical and encoded features
numerical_features = df[['year', 'mileage']]
features = pd.concat([numerical_features, encoded_categories], axis=1)
target = df['price']

# Split the data (80/20, fixed seed so the split is reproducible)
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=70)

# Train model
model = xgb.XGBRegressor(objective='reg:squarederror', colsample_bytree=0.3, learning_rate=0.1,
                         max_depth=5, alpha=10, n_estimators=100)
model.fit(X_train, y_train)

# Evaluate model
pred_test = model.predict(X_test)
mae = mean_absolute_error(y_test, pred_test)
# MAPE divides by the true price, so a price of 0 in y_test would yield
# inf/NaN here. NOTE(review): confirm the dataset has no zero prices.
mape = np.mean(np.abs((y_test - pred_test) / y_test)) * 100
# "Accuracy" here is simply 100 - MAPE, not a classification accuracy.
accuracy = 100 - mape

print("\nModel Evaluation:")
print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"Mean Absolute Percentage Error (MAPE): {mape:.2f}%")
print(f"Accuracy: {accuracy:.2f}%")

# Save feature names, model, encoder, metrics, and test data.
# NOTE: 'feature_names.pkl', 'prediksi_hargamobil.sav' and 'encoder.pkl' are
# also written by the earlier training cell (In [1]); running this cell
# overwrites them with the reduced-feature versions.
feature_names = list(features.columns)
with open('feature_names.pkl', 'wb') as f:
    pickle.dump(feature_names, f)

# Context managers guarantee each file is flushed and closed before the cell
# finishes; a bare open() passed to pickle.dump leaves the handle dangling.
with open('prediksi_hargamobil.sav', 'wb') as f:
    pickle.dump(model, f)
with open('encoder.pkl', 'wb') as f:
    pickle.dump(encoder, f)

# Save metrics
metrics = {
    'mae': mae,
    'mape': mape,
    'accuracy': accuracy
}
with open('metrics.pkl', 'wb') as f:
    pickle.dump(metrics, f)

# Save test data for validation
with open('X_test.pkl', 'wb') as f:
    pickle.dump(X_test, f)
with open('y_test.pkl', 'wb') as f:
    pickle.dump(y_test, f)



Model Evaluation:
Mean Absolute Error (MAE): 887.60
Mean Absolute Percentage Error (MAPE): 8.08%
Accuracy: 91.92%
