In [2]:
import pandas as pd
import numpy as np

# Load the raw dataset
print("Loading dataset...")
df = pd.read_csv("smartphone_data.csv")

# Display basic info about the dataset
print("Original dataset shape:", df.shape)
print("\nMissing values before cleaning:")
print(df.isnull().sum())

# Drop rows with any missing value, then exact duplicate rows.
# The explicit .copy() gives df_clean its own data, so the column assignments
# below never write into a filtered slice of `df` (this is what triggers
# pandas' SettingWithCopyWarning) and the raw frame stays untouched.
df_clean = df.dropna().drop_duplicates().copy()

print("\nDataset shape after removing missing values and duplicates:", df_clean.shape)

# NOTE(review): the captured sample output shows Storage equal to RAM (6.0 for a
# listing named "... RAM 6GB ROM 128GB", 8.0 for "... RAM 8GB ROM 256GB").
# This looks like an upstream parsing problem in the Storage column -- confirm
# against the raw data before trusting Storage as a feature.


def _brand_from_name(name):
    """Return the first whitespace-separated token of ``name``, or 'Unknown'.

    'Unknown' is returned for non-string values and for strings that contain
    no tokens (empty or whitespace-only).
    """
    if not isinstance(name, str):
        return 'Unknown'
    tokens = name.split()
    return tokens[0] if tokens else 'Unknown'


# Extract brand from 'Name' (the first word of the product name)
df_clean['Brand'] = df_clean['Name'].apply(_brand_from_name)

# Get unique brands and their counts
brand_counts = df_clean['Brand'].value_counts()
print(f"\nNumber of unique brands: {len(brand_counts)}")
print("\nTop 10 brands by frequency:")
print(brand_counts.head(10))

# Create a brand mapping dictionary (Brand name to ID).
# IDs follow the order in which brands first appear in df_clean, so they are
# only stable as long as the input row order does not change.
unique_brands = df_clean['Brand'].unique()
brand_to_id = {brand: idx for idx, brand in enumerate(unique_brands)}

# Create a new column 'BrandID' with the numeric ID
df_clean['BrandID'] = df_clean['Brand'].map(brand_to_id)

# Save the brand mapping for future reference
brand_mapping_df = pd.DataFrame({
    'Brand': list(brand_to_id.keys()),
    'BrandID': list(brand_to_id.values())
})
brand_mapping_df.to_csv('brand_mapping.csv', index=False)
print("\nBrand mapping saved to 'brand_mapping.csv'")

# Save the cleaned dataset
df_clean.to_csv('smartphone_data_remove_null_duplicate.csv', index=False)
print("\nCleaned dataset saved to 'smartphone_data_remove_null_duplicate.csv'")

# Create a version of the dataset for modeling.
# Only the free-text 'Name' column is removed here; the text 'Brand' column is
# kept next to 'BrandID', so consumers must drop or encode 'Brand' themselves.
model_df = df_clean.drop(columns=['Name'])
model_df.to_csv('smartphone_data_model_ready.csv', index=False)
print("\nModel-ready dataset saved to 'smartphone_data_model_ready.csv'")

# Print sample of the cleaned data
print("\nSample of the cleaned dataset:")
print(df_clean.head())

# Print summary statistics
print("\nSummary statistics of numeric columns:")
print(df_clean.describe())

print("\nData preprocessing complete!")


Loading dataset...
Original dataset shape: (3183, 8)

Missing values before cleaning:
Name              0
Price          1244
RAM              21
Storage           7
Camera         1383
ScreenSize     1383
Battery        1440
ReleaseYear    1386
dtype: int64

Dataset shape after removing missing values and duplicates: (1709, 8)

Number of unique brands: 64

Top 10 brands by frequency:
Brand
Samsung    303
Xiaomi     162
Vivo       162
OPPO       149
Realme     147
Infinix    109
Apple       75
Tecno       50
Huawei      49
ASUS        47
Name: count, dtype: int64

Brand mapping saved to 'brand_mapping.csv'

Cleaned dataset saved to 'smartphone_data_remove_null_duplicate.csv'

Model-ready dataset saved to 'smartphone_data_model_ready.csv'

Sample of the cleaned dataset:
                                   Name      Price  RAM  Storage  Camera  \
0            OPPO A3s RAM 6GB ROM 128GB   532000.0  6.0      6.0    13.0   
1  Infinix Hot 40 Pro RAM 8GB ROM 256GB  1704000.0  8.0      8.0   1

In [4]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import joblib

# Set random seed for reproducibility
np.random.seed(42)

# Load the preprocessed dataset
print("Loading the preprocessed dataset...")
df = pd.read_csv("smartphone_data_model_ready.csv")
print(f"Dataset shape: {df.shape}")

# Display basic information about the dataset
print("\nDataset info:")
print(df.info())

print("\nSample data:")
print(df.head())

# Check for any remaining missing values
print("\nMissing values:")
print(df.isnull().sum())

# Exploratory Data Analysis
print("\nPerforming Exploratory Data Analysis...")

# Distribution of the target variable (Price)
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(df['Price'], kde=True, ax=ax)
ax.set_title('Distribution of Smartphone Prices')
ax.set_xlabel('Price')
ax.set_ylabel('Frequency')
fig.savefig('price_distribution.png')
plt.close(fig)

# Check for outliers in Price
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(y=df['Price'], ax=ax)
ax.set_title('Boxplot of Smartphone Prices')
ax.set_ylabel('Price')
fig.savefig('price_boxplot.png')
plt.close(fig)

# Correlation matrix - use only numeric columns
numeric_df = df.select_dtypes(include=[np.number])
correlation_matrix = numeric_df.corr()
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", ax=ax)
ax.set_title('Correlation Matrix')
fig.savefig('correlation_matrix.png')
plt.close(fig)

print("Top correlations with Price:")
price_correlations = correlation_matrix['Price'].sort_values(ascending=False)
print(price_correlations)

# Feature Engineering
print("\nPerforming Feature Engineering...")

# Work on a copy so the loaded frame `df` stays as read from disk
model_df = df.copy()

# Log transform the target variable if it's right-skewed.
# `original_target` doubles as the "was log-transformed" flag used later on.
if model_df['Price'].skew() > 0.5:
    print("Applying log transformation to Price due to right skew")
    model_df['Price_Log'] = np.log1p(model_df['Price'])
    target, original_target = 'Price_Log', 'Price'
else:
    target, original_target = 'Price', None

# Replace a text 'Brand' column by its numeric 'BrandID'.
# The check is "not numeric" rather than "dtype == object" so that string or
# categorical brand columns are handled too; any of them would otherwise reach
# StandardScaler and fail.
if 'Brand' in model_df.columns and not pd.api.types.is_numeric_dtype(model_df['Brand']):
    print("Converting 'Brand' column to numeric using BrandID")
    if 'BrandID' not in model_df.columns:
        # No precomputed IDs: number brands by order of first appearance so the
        # brand signal is not silently thrown away with the text column.
        model_df['BrandID'] = pd.factorize(model_df['Brand'])[0]
    model_df = model_df.drop(columns=['Brand'])

# Prepare features and target: drop the modeled target and, when the target was
# log-transformed, the raw price as well (it would leak the answer).
excluded_columns = [target] + ([original_target] if original_target else [])
X = model_df.drop(columns=excluded_columns)
y = model_df[target]

# Get feature names
feature_names = X.columns.tolist()
print(f"Features used for modeling: {feature_names}")

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(f"Training set size: {X_train.shape}")
print(f"Testing set size: {X_test.shape}")

# Feature scaling: statistics are fitted on the training split only and then
# applied to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Save the scaler for future use
joblib.dump(scaler, 'smartphone_price_scaler.pkl')
print("Scaler saved as 'smartphone_price_scaler.pkl'")

# Define a function to evaluate models
def evaluate_model(model, X_train, X_test, y_train, y_test, model_name, original_target=None):
    """Fit ``model``, print train/test metrics and save an actual-vs-predicted plot.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``; it is fitted
            in place on the training data.
        X_train, X_test: Feature matrices for the two splits.
        y_train, y_test: Targets for the two splits.
        model_name: Display name; also used (lower-cased, spaces replaced by
            underscores) as the prefix of the saved plot file.
        original_target: Truthy when the targets are log1p-transformed. In that
            case targets and predictions are mapped back with ``expm1`` before
            any metric is computed.

    Returns:
        dict with the fitted ``model``, its ``name`` and the keys
        ``train_rmse``, ``test_rmse``, ``train_mae``, ``test_mae``,
        ``train_r2``, ``test_r2``.
    """
    model.fit(X_train, y_train)

    # Undo the log1p transform only when the target was log-transformed.
    restore = np.expm1 if original_target else (lambda values: values)

    actual = {'train': restore(y_train), 'test': restore(y_test)}
    predicted = {
        'train': restore(model.predict(X_train)),
        'test': restore(model.predict(X_test)),
    }

    def _rmse(truth, estimate):
        # Root of the mean squared error.
        return np.sqrt(mean_squared_error(truth, estimate))

    # (key suffix, printed label, decimals, scoring function)
    metric_table = (
        ('rmse', 'RMSE', 2, _rmse),
        ('mae', 'MAE', 2, mean_absolute_error),
        ('r2', 'R²', 4, r2_score),
    )

    scores = {}
    for suffix, _label, _decimals, scorer in metric_table:
        for split in ('train', 'test'):
            scores[f'{split}_{suffix}'] = scorer(actual[split], predicted[split])

    print(f"\n{model_name} Performance:")
    for suffix, label, decimals, _scorer in metric_table:
        train_value = scores[f'train_{suffix}']
        test_value = scores[f'test_{suffix}']
        print(f"Training {label}: {train_value:.{decimals}f}")
        print(f"Testing {label}: {test_value:.{decimals}f}")

    # Scatter of test predictions against the truth, with the y = x reference line.
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.scatter(actual['test'], predicted['test'], alpha=0.5)
    lowest, highest = actual['test'].min(), actual['test'].max()
    ax.plot([lowest, highest], [lowest, highest], 'r--', lw=2)
    ax.set_title(f'{model_name}: Actual vs Predicted Prices')
    ax.set_xlabel('Actual Price')
    ax.set_ylabel('Predicted Price')
    file_stem = model_name.lower().replace(" ", "_")
    fig.savefig(f'{file_stem}_predictions.png')
    plt.close(fig)

    return {'model': model, 'name': model_name, **scores}

# Train and evaluate multiple models
print("\nTraining and evaluating models...")

lr_model = LinearRegression()
ridge_model = Ridge(alpha=1.0)
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
gb_model = GradientBoostingRegressor(n_estimators=100, random_state=42)

# Models that are trained on the standardized feature matrices.
scaled_model_names = ("Linear Regression", "Ridge Regression")

# (display name, estimator) in the order they are trained and reported.
candidates = [
    ("Linear Regression", lr_model),
    ("Ridge Regression", ridge_model),
    ("Random Forest Regression", rf_model),
    ("Gradient Boosting Regression", gb_model),
]

models = []
for candidate_name, estimator in candidates:
    print(f"\nTraining {candidate_name}...")
    # Linear models get the scaled matrices; tree ensembles get the raw frames.
    if candidate_name in scaled_model_names:
        train_features, test_features = X_train_scaled, X_test_scaled
    else:
        train_features, test_features = X_train, X_test
    models.append(
        evaluate_model(estimator, train_features, test_features, y_train, y_test,
                       candidate_name, original_target)
    )

# Keep the per-model result names available for later cells.
lr_results, ridge_results, rf_results, gb_results = models

# Try XGBoost if available
try:
    import xgboost as xgb
    print("\nTraining XGBoost Regression...")
    xgb_model = xgb.XGBRegressor(n_estimators=100, random_state=42)
    xgb_results = evaluate_model(xgb_model, X_train, X_test, y_train, y_test,
                                 "XGBoost Regression", original_target)
    models.append(xgb_results)
except ImportError:
    print("XGBoost not available. Skipping XGBoost model.")

# Compare models
print("\nModel Comparison (Test Set Performance):")
model_comparison = pd.DataFrame({
    'Model': [result['name'] for result in models],
    'RMSE': [result['test_rmse'] for result in models],
    'MAE': [result['test_mae'] for result in models],
    'R²': [result['test_r2'] for result in models]
})
print(model_comparison.sort_values('RMSE'))

# Find the best model based on test RMSE.
# model_comparison has the default 0..n-1 index, so the label returned by
# idxmin() is also the position in `models`.
# NOTE(review): the winner is picked on the same test split that is reported,
# so its test metrics are optimistic; consider selecting with cross-validation
# on the training split instead.
best_model_idx = model_comparison['RMSE'].idxmin()
best_model_name = model_comparison.loc[best_model_idx, 'Model']
best_model = models[best_model_idx]['model']
print(f"\nBest model based on Test RMSE: {best_model_name}")

# Feature importance for the best model (if applicable)
if hasattr(best_model, 'feature_importances_'):
    print("\nFeature Importance:")
    feature_importance = pd.DataFrame({
        'Feature': feature_names,
        'Importance': best_model.feature_importances_
    })
    feature_importance = feature_importance.sort_values('Importance', ascending=False)
    print(feature_importance)

    # Plot feature importance
    plt.figure(figsize=(12, 8))
    sns.barplot(x='Importance', y='Feature', data=feature_importance)
    plt.title(f'Feature Importance ({best_model_name})')
    plt.tight_layout()
    plt.savefig('feature_importance.png')
    plt.close()
elif hasattr(best_model, 'coef_'):
    # Linear models expose coefficients instead of importances. They refer to
    # the standardized features these models were trained on.
    print("\nFeature Coefficients:")
    feature_coefs = pd.DataFrame({
        'Feature': feature_names,
        'Coefficient': best_model.coef_
    })
    feature_coefs = feature_coefs.sort_values('Coefficient', ascending=False)
    print(feature_coefs)

    # Plot coefficients
    plt.figure(figsize=(12, 8))
    sns.barplot(x='Coefficient', y='Feature', data=feature_coefs)
    plt.title(f'Feature Coefficients ({best_model_name})')
    plt.tight_layout()
    plt.savefig('feature_coefficients.png')
    plt.close()

# Save the best model
model_filename = 'smartphone_price_prediction_model.pkl'
joblib.dump(best_model, model_filename)
print(f"\nBest model saved as '{model_filename}'")

# Save what a consumer needs to use the pickled model correctly: the feature
# order, whether inputs must go through the saved scaler first, and whether
# predictions are on the log1p scale (and so need np.expm1).
metadata_filename = 'smartphone_price_model_metadata.pkl'
joblib.dump(
    {
        'model_name': best_model_name,
        'feature_names': feature_names,
        'requires_scaling': best_model_name in scaled_model_names,
        'log_transformed_target': bool(original_target),
    },
    metadata_filename,
)
print(f"Model metadata saved as '{metadata_filename}'")

# Create a simple prediction function
def predict_smartphone_price(features_dict, *, model=None, feature_order=None,
                             feature_scaler=None, log_target=None):
    """
    Predict smartphone price based on input features.

    Called with only ``features_dict`` it uses the notebook-level objects
    (``best_model``, ``best_model_name``, ``feature_names``, ``scaler``,
    ``original_target``). The keyword-only arguments let the same function be
    used with artifacts reloaded from disk.

    Args:
        features_dict: Dictionary with keys matching the feature names
                      (RAM, Storage, Camera, ScreenSize, Battery, ReleaseYear, BrandID).
                      Keys that are not model features are ignored.
        model: Fitted estimator with ``predict``. Defaults to ``best_model``.
        feature_order: Feature names in training order. Defaults to
                      ``feature_names``.
        feature_scaler: Fitted transformer applied to the inputs before
                      prediction. When given it is always applied. When omitted
                      and ``model`` is also omitted, the notebook ``scaler`` is
                      applied only if the best model is Linear or Ridge
                      Regression; otherwise no scaling happens.
        log_target: Whether the model predicts log1p(price). Defaults to
                      ``bool(original_target)``.

    Returns:
        Predicted price

    Warns:
        UserWarning: if any model feature is missing from ``features_dict``.
            The missing features are filled with 0, as before, which is rarely
            a meaningful value (e.g. ReleaseYear = 0).
    """
    import warnings  # local import keeps the function usable on its own

    if model is None:
        model = best_model
        # The linear models were trained on standardized inputs.
        if feature_scaler is None and best_model_name in ["Linear Regression", "Ridge Regression"]:
            feature_scaler = scaler
    if feature_order is None:
        feature_order = feature_names
    if log_target is None:
        log_target = bool(original_target)

    # Create a single-row DataFrame with the input features
    input_df = pd.DataFrame([features_dict])

    # Ensure all required features are present; keep the zero-fill fallback
    # but make it visible instead of silent.
    missing_features = [name for name in feature_order if name not in input_df.columns]
    if missing_features:
        warnings.warn(
            f"Missing feature(s) {missing_features} filled with 0; "
            "the prediction may be unreliable.",
            UserWarning,
            stacklevel=2,
        )
        for name in missing_features:
            input_df[name] = 0

    # Reorder columns to match the training data (also drops unknown keys)
    input_df = input_df[list(feature_order)]

    # Scale the features if needed
    if feature_scaler is not None:
        model_input = feature_scaler.transform(input_df)
    else:
        model_input = input_df

    # Make prediction
    prediction = model.predict(model_input)[0]

    # Transform prediction back if log transformation was applied
    if log_target:
        prediction = np.expm1(prediction)

    return prediction

# Example prediction
# Brand IDs are assigned by order of first appearance during preprocessing, so
# a hard-coded number can silently point at a different brand after the data
# changes. Resolve the ID by name from the mapping file written at that stage.
brand_ids = pd.read_csv('brand_mapping.csv').set_index('Brand')['BrandID']
example_brand = 'Samsung'

example_phone = {
    'RAM': 8.0,
    'Storage': 8.0,
    'Camera': 64.0,
    'ScreenSize': 6.5,
    'Battery': 5000.0,
    'ReleaseYear': 2023.0,
    'BrandID': int(brand_ids[example_brand])  # raises KeyError for an unknown brand
}

predicted_price = predict_smartphone_price(example_phone)
print(f"\nExample prediction for a {example_brand} phone with 8GB RAM, 64MP camera, etc.:")
print(f"Predicted price: Rp {predicted_price:,.2f}")

print("\nModel building and evaluation complete!")


Loading the preprocessed dataset...
Dataset shape: (1709, 9)

Dataset info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1709 entries, 0 to 1708
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Price        1709 non-null   float64
 1   RAM          1709 non-null   float64
 2   Storage      1709 non-null   float64
 3   Camera       1709 non-null   float64
 4   ScreenSize   1709 non-null   float64
 5   Battery      1709 non-null   float64
 6   ReleaseYear  1709 non-null   float64
 7   Brand        1709 non-null   object 
 8   BrandID      1709 non-null   int64  
dtypes: float64(7), int64(1), object(1)
memory usage: 120.3+ KB
None

Sample data:
       Price  RAM  Storage  Camera  ScreenSize  Battery  ReleaseYear    Brand  \
0   532000.0  6.0      6.0    13.0        6.20   4230.0       2018.0     OPPO   
1  1704000.0  8.0      8.0   108.0        6.78   5000.0       2023.0  Infinix   
2  2719000.0  8.0      8.0    

In [11]:
# Example prediction
# Look the brand ID up by name in the mapping file written during
# preprocessing instead of hard-coding it: IDs depend on the row order of the
# cleaned data and may change when the dataset does.
brand_ids = pd.read_csv('brand_mapping.csv').set_index('Brand')['BrandID']
example_brand = 'OPPO'

example_phone = {
    'RAM': 16.0,
    'Storage': 8.0,
    'Camera': 64.0,
    'ScreenSize': 6.5,
    'Battery': 5000.0,
    'ReleaseYear': 2023.0,
    'BrandID': int(brand_ids[example_brand])  # raises KeyError for an unknown brand
}

predicted_price = predict_smartphone_price(example_phone)
print(f"\nPrediction for a {example_brand} phone with 16GB RAM, 64MP camera, etc.:")
print(f"Predicted price: Rp {predicted_price:,.2f}")


Prediction for a OPPO phone with 16GB RAM, 64MP camera, etc.:
Predicted price: Rp 4,277,298.85
