In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import xgboost as xgb
import time

# Custom feature engineering function
def feature_engineering(df):
    """
    Build the engineered feature table (plus the price_range target) from raw phone data.

    Parameters:
    df: DataFrame with the raw phone columns used below ('Price', 'Brand',
        'Screen size (inches)', 'Resolution x', 'Resolution y', 'RAM (MB)',
        'Processor', 'Rear camera', 'Front camera', 'Internal storage (GB)',
        'Operating system' and the Yes/No connectivity columns). It is not modified.

    Returns:
    A new DataFrame that keeps the row index of `df`, contains the derived
    features and the categorical 'price_range' target, and has the raw
    identifier/price columns removed.
    """
    df_processed = df.copy()

    # Target: bucket the price into four left-closed ranges [low, high).
    price_ranges = [0, 5000, 10000, 20000, float('inf')]
    labels = ['budget', 'low_mid', 'high_mid', 'premium']
    df_processed['price_range'] = pd.cut(df_processed['Price'], bins=price_ranges, labels=labels, right=False)

    # Display-related features.
    df_processed['screen_area'] = df_processed['Screen size (inches)'] ** 2
    df_processed['resolution_total'] = df_processed['Resolution x'] * df_processed['Resolution y']
    df_processed['pixel_density'] = np.sqrt(df_processed['resolution_total']) / df_processed['Screen size (inches)']

    # Hardware features.
    df_processed['RAM_GB'] = df_processed['RAM (MB)'] / 1000
    df_processed['performance_score'] = df_processed['Processor'] * df_processed['RAM_GB']
    df_processed['camera_score'] = df_processed['Rear camera'] + 0.5 * df_processed['Front camera']

    # Yes/No flags become 1/0; any other value becomes NaN and is zero-filled below.
    binary_features = ['Touchscreen', 'Wi-Fi', 'Bluetooth', 'GPS', '3G', '4G/ LTE']
    for feature in binary_features:
        df_processed[feature] = df_processed[feature].map({'Yes': 1, 'No': 0})

    connectivity_features = ['Wi-Fi', 'Bluetooth', 'GPS', '3G', '4G/ LTE']
    df_processed['connectivity_score'] = df_processed[connectivity_features].sum(axis=1)

    # Per-brand mean price, looked up row by row. Using map() instead of a merge
    # keeps the caller's row index, so rows can still be matched back to `df`.
    # NOTE: this feature is derived from 'Price' over whatever rows are passed in.
    brand_means = df.groupby('Brand')['Price'].mean()
    df_processed['brand_avg_price'] = df_processed['Brand'].map(brand_means)

    df_processed['storage_ram_ratio'] = df_processed['Internal storage (GB)'] / df_processed['RAM_GB']
    df_processed['OS_category'] = df_processed['Operating system'].map(
        lambda x: 'iOS' if x == 'iOS' else ('Android' if x == 'Android' else 'Other')
    )

    # Remove identifiers and the raw columns that the derived features replace.
    drop_cols = ['Unnamed: 0', 'Name', 'Model', 'Resolution x', 'Resolution y',
                 'RAM (MB)', 'Price', 'Operating system']
    drop_cols = [col for col in drop_cols if col in df_processed.columns]
    df_processed = df_processed.drop(columns=drop_cols)

    # Divisions by zero (e.g. a zero screen size or zero RAM) produce +/-inf;
    # treat those like missing values so every ratio feature ends up finite.
    float_cols = df_processed.select_dtypes(include=['float64']).columns
    if len(float_cols) > 0:
        df_processed[float_cols] = df_processed[float_cols].replace([np.inf, -np.inf], np.nan)

    numeric_cols = df_processed.select_dtypes(include=['int64', 'float64']).columns
    df_processed[numeric_cols] = df_processed[numeric_cols].fillna(0)

    cat_cols = df_processed.select_dtypes(include=['object', 'category']).columns
    for col in cat_cols:
        if col == 'price_range':  # Don't modify the target variable
            continue
        modes = df_processed[col].mode()
        if modes.empty:
            # Column is entirely missing: there is no most-frequent value to fill with.
            continue
        df_processed[col] = df_processed[col].fillna(modes.iloc[0])

    return df_processed

# Read the raw phone dataset from disk
df = pd.read_csv('/Users/marno/Documents/Coding/MachineLearningGroupProject_TXB_8/data/ndtv_data_final.csv')

# Run the raw table through the custom feature engineering step
print("Applying custom feature engineering...")
df_processed = feature_engineering(df)

# Report the shape and column names of the engineered table
print(f"\nDataset after feature engineering: {df_processed.shape}")
print(f"Features: {df_processed.columns.tolist()}")

# Separate the target column from the predictors
y = df_processed['price_range']
X = df_processed.drop(columns=['price_range'])

# Encode the categorical target as integer class ids for the classifier
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
price_range_names = label_encoder.classes_.tolist()

print(f"\nPrice Range Categories: {price_range_names}")
print("Distribution of price ranges:")
# One count per encoded class id, in the same order as price_range_names
class_counts = np.bincount(y_encoded, minlength=len(price_range_names))
for category, count in zip(price_range_names, class_counts):
    percentage = count / len(y_encoded) * 100
    print(f"{category}: {count} phones ({percentage:.1f}%)")

# Hold out 20% of the rows as a test set, stratified on the encoded class
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    stratify=y_encoded,
    random_state=42,
)

# Column groups that drive the preprocessing below
numeric_columns = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_columns = X.select_dtypes(include=['object', 'category']).columns.tolist()

# Scale numeric columns; one-hot encode categorical ones (unseen categories are ignored)
numeric_step = ('num', StandardScaler(), numeric_columns)
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_columns)
preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

# Preprocessing followed by the XGBoost classifier
xgb_pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', xgb.XGBClassifier(random_state=42)),
    ]
)

# Hyper-parameter values to try; the "classifier__" prefix routes each one to the XGBoost step
param_grid = {
    'classifier__n_estimators': [100, 200, 300],
    'classifier__max_depth': [3, 5, 7],
    'classifier__learning_rate': [0.05, 0.1, 0.2],
    'classifier__subsample': [0.8, 1.0],
    'classifier__colsample_bytree': [0.8, 1.0],
    'classifier__min_child_weight': [1, 3],
}

# Exhaustive 5-fold cross-validated search, scored by accuracy, using all cores
print("\nPerforming Grid Search for XGBoost parameters...")
start_time = time.time()

grid_search = GridSearchCV(
    estimator=xgb_pipeline,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train, y_train)

# Summarise the search: elapsed time, winning parameters and their CV score
print(f"\nGrid Search completed in {time.time() - start_time:.2f} seconds.")
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best cross-validation score: {grid_search.best_score_:.4f}")

# Pipeline refit with the best parameter combination found by the search
best_model = grid_search.best_estimator_

# Score the refit pipeline on the held-out rows
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"\nTest Accuracy with optimal parameters: {accuracy:.4f}")

# Per-class precision / recall / F1
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=price_range_names))

# Confusion matrix rendered as an annotated heatmap and written to disk
cm = confusion_matrix(y_test, y_pred)
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=price_range_names,
    yticklabels=price_range_names,
    ax=ax,
)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix - XGBoost with Custom Features')
fig.tight_layout()
fig.savefig('xgboost_custom_features_confusion_matrix.png')
plt.close(fig)

# Pull the fitted steps back out of the winning pipeline
model = best_model.named_steps['classifier']
preprocessor = best_model.named_steps['preprocessor']

# The second fitted transformer entry is the one-hot encoder ('cat')
_, ohe, _ = preprocessor.transformers_[1]

# Feature names in the order the classifier sees them: numeric columns first,
# then the expanded one-hot columns
if categorical_columns:
    categorical_feature_names = ohe.get_feature_names_out(categorical_columns).tolist()
else:
    categorical_feature_names = []
all_feature_names = numeric_columns + categorical_feature_names

# Importance scores reported by the fitted classifier
feature_importances = model.feature_importances_

# Pair names with scores and rank from most to least important
feature_imp_df = pd.DataFrame(
    {
        'Feature': all_feature_names[:len(feature_importances)],
        'Importance': feature_importances,
    }
)
feature_imp_df = feature_imp_df.sort_values('Importance', ascending=False)

# Show the 15 highest-ranked features
top_features = feature_imp_df.head(15)
print("\nTop 15 Features for Price Range Prediction:")
print(top_features)

# Horizontal bar chart of the same 15 features, written to disk
fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(data=top_features, x='Importance', y='Feature', ax=ax)
ax.set_title('Top 15 Features for Price Range Prediction - Custom Engineered Features')
fig.tight_layout()
fig.savefig('feature_importance_custom_features.png')
plt.close(fig)

# Persist the whole fitted pipeline (preprocessing + classifier)
import joblib
joblib.dump(best_model, 'xgboost_custom_features_model.pkl')

print("\nOptimized model saved as 'xgboost_custom_features_model.pkl'")

# Rebuild the original 4-category price ranges on the raw frame, for reference
original_bins = [0, 5000, 15000, 30000, float('inf')]
original_labels = ['Budget (<5K)', 'Mid-range (5K-15K)', 'Premium (15K-30K)', 'Flagship (>30K)']
df['original_price_range'] = pd.cut(
    df['Price'],
    bins=original_bins,
    labels=original_labels,
    right=False,
)

# Attach predicted and actual category names to the raw rows of the test split
test_indices = X_test.index
df_test = df.loc[test_indices].copy()
df_test['predicted_category'] = label_encoder.inverse_transform(y_pred)
df_test['actual_category'] = label_encoder.inverse_transform(y_test)

# Row-wise hit/miss flag, reused for every original range below
is_correct = df_test['predicted_category'] == df_test['actual_category']

# Share of correct predictions within each original price range
original_ranges = df_test['original_price_range'].unique()
print("\nAccuracy by Original Price Range:")
for orig_range in original_ranges:
    in_range = df_test['original_price_range'] == orig_range
    subset = df_test[in_range]
    correct = is_correct[in_range].sum()
    if len(subset) > 0:
        accuracy_range = correct / len(subset)
    else:
        accuracy_range = 0
    print(f"{orig_range}: {accuracy_range:.4f} ({correct}/{len(subset)})")

# Create a function for predicting with the new model
def predict_price_range(phone_features, model, label_encoder, brand_avg_price=None):
    """
    Predict price range for a new phone using the custom feature-engineered model

    Parameters:
    phone_features: Dictionary with original phone features
    model: Trained model pipeline
    label_encoder: Encoder used for price range categories
    brand_avg_price: Optional mapping (dict or Series) from brand name to the
        average brand price the model was trained with, e.g.
        df.groupby('Brand')['Price'].mean(). When given, it replaces the
        'brand_avg_price' feature computed from the single input row.

    Returns:
    Predicted price range and probabilities
    """
    # One-row frame holding the raw features of the phone
    phone_df = pd.DataFrame([phone_features])

    # feature_engineering reads a 'Price' column, so supply a placeholder when
    # the caller did not pass one; the column is dropped again by that function.
    if 'Price' not in phone_df.columns:
        phone_df['Price'] = 0

    processed_df = feature_engineering(phone_df)

    # The model was fitted without the target column
    processed_df = processed_df.drop(columns=['price_range'], errors='ignore')

    # On a single row, feature_engineering can only derive 'brand_avg_price'
    # from that row's own 'Price' (or from the placeholder 0 set above), which
    # is not the brand-level average computed over the training table. Use the
    # caller-supplied lookup when available; brands missing from it get 0,
    # matching the zero-fill used for missing numeric features.
    # NOTE: without a lookup the previous behaviour is kept unchanged.
    if brand_avg_price is not None:
        processed_df['brand_avg_price'] = (
            processed_df['Brand'].map(brand_avg_price).fillna(0).astype('float64')
        )

    # Predicted class id and the per-class probabilities for the single row
    predicted_class = model.predict(processed_df)[0]
    predicted_probas = model.predict_proba(processed_df)[0]

    # Translate the class id back to its category name
    predicted_category = label_encoder.inverse_transform([predicted_class])[0]

    # Category name -> probability, in the encoder's class order
    probability_dict = dict(zip(label_encoder.classes_, predicted_probas))

    return predicted_category, probability_dict

# Test prediction with a sample phone
print("\nExample prediction with custom feature-engineered model:")
# X_test.index holds index labels, not positions, so the row is looked up with
# .loc; .iloc would only pick the same row when the labels happen to be 0..n-1.
# NOTE: the dict still carries the row's 'Price', so feature_engineering derives
# 'brand_avg_price' for this one-row prediction from the phone's own price.
sample_phone = df.loc[X_test.index[0]].to_dict()
print(f"Sample Phone Brand: {sample_phone.get('Brand', 'Unknown')}")
print(f"Screen Size: {sample_phone.get('Screen size (inches)', 0)} inches")
print(f"Internal Storage: {sample_phone.get('Internal storage (GB)', 0)} GB")
print(f"Processor Cores: {sample_phone.get('Processor', 0)}")
print(f"Actual Price: ₹{sample_phone.get('Price', 0)}")

# Run the sample through the same feature engineering + model pipeline
predicted_category, probabilities = predict_price_range(sample_phone, best_model, label_encoder)
print(f"\nPredicted Price Range: {predicted_category}")
print("\nProbability Distribution:")
for category, probability in probabilities.items():
    print(f"  {category}: {probability:.4f}")

Applying custom feature engineering...

Dataset after feature engineering: (1359, 25)
Features: ['Brand', 'Battery capacity (mAh)', 'Screen size (inches)', 'Touchscreen', 'Processor', 'Internal storage (GB)', 'Rear camera', 'Front camera', 'Wi-Fi', 'Bluetooth', 'GPS', 'Number of SIMs', '3G', '4G/ LTE', 'price_range', 'screen_area', 'resolution_total', 'pixel_density', 'RAM_GB', 'performance_score', 'camera_score', 'connectivity_score', 'brand_avg_price', 'storage_ram_ratio', 'OS_category']

Price Range Categories: ['budget', 'high_mid', 'low_mid', 'premium']
Distribution of price ranges:
budget: 429 phones (31.6%)
high_mid: 244 phones (18.0%)
low_mid: 526 phones (38.7%)
premium: 160 phones (11.8%)

Performing Grid Search for XGBoost parameters...
Fitting 5 folds for each of 216 candidates, totalling 1080 fits

Grid Search completed in 37.08 seconds.
Best parameters: {'classifier__colsample_bytree': 0.8, 'classifier__learning_rate': 0.05, 'classifier__max_depth': 3, 'classifier__min_chi