In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.cluster import KMeans, DBSCAN
from sklearn.metrics import (accuracy_score, precision_score, recall_score, 
                             f1_score, roc_auc_score, roc_curve, silhouette_score,
                             classification_report, confusion_matrix)
from sklearn.decomposition import PCA
import warnings
warnings.filterwarnings('ignore')


In [None]:

# 1. Data Loading & Preprocessing

# Read the CSV directly from its remote location into a DataFrame.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00292/Wholesale%20customers%20data.csv"
df = pd.read_csv(filepath_or_buffer=url)

# Preview the first five rows as a quick sanity check of the parse.
df.head(n=5)

In [None]:
# Print the frame's column dtypes, non-null counts and memory usage.
df.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

In [None]:
# Row count per distinct value of 'Channel', the column used as the target below.
df['Channel'].value_counts()

In [None]:
# Target is the 'Channel' column; every remaining column is a feature.
y = df['Channel']
X = df.drop(columns='Channel')

# Standardize each feature to zero mean / unit variance.
# NOTE(review): this scaler is fitted on every row. That suits the unsupervised
# steps, but any train/test evaluation should fit its own scaler on the
# training rows only.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_scaled_df = pd.DataFrame(X_scaled, columns=X.columns)

print("\nFeatures after standardization:")
# Reuse the frame built above rather than constructing an identical one.
X_scaled_df.describe()

In [None]:

# 2. Ensemble Methods

# Hold out 30% of the rows for the final evaluation.
#
# The split is made on the *raw* features and a dedicated scaler is then fitted
# on the training rows only. Splitting the already-standardized X_scaled would
# leak the held-out rows' means/variances into training, because that scaler
# was fitted on the full dataset. The row partition itself depends only on the
# number of rows and random_state, so it is the same as splitting X_scaled.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)
split_scaler = StandardScaler().fit(X_train_raw)
X_train = split_scaler.transform(X_train_raw)
X_test = split_scaler.transform(X_test_raw)

# Random Forest
rf_50 = RandomForestClassifier(n_estimators=50, random_state=42)
rf_200 = RandomForestClassifier(n_estimators=200, random_state=42)

# cross_val_score fits clones, so the estimators above stay unfitted.
rf_50_scores = cross_val_score(rf_50, X_train, y_train, cv=5, scoring='accuracy')
rf_200_scores = cross_val_score(rf_200, X_train, y_train, cv=5, scoring='accuracy')

# Gradient Boosting
gb_50 = GradientBoostingClassifier(n_estimators=50, learning_rate=0.1, random_state=42)
gb_200 = GradientBoostingClassifier(n_estimators=200, learning_rate=0.1, random_state=42)

gb_50_scores = cross_val_score(gb_50, X_train, y_train, cv=5, scoring='accuracy')
gb_200_scores = cross_val_score(gb_200, X_train, y_train, cv=5, scoring='accuracy')

# Create results table: one row per candidate, mean and spread over the 5 folds.
ensemble_results = pd.DataFrame({
    'Model': ['Random Forest (50)', 'Random Forest (200)',
              'Gradient Boosting (50)', 'Gradient Boosting (200)'],
    'Mean CV Accuracy': [rf_50_scores.mean(), rf_200_scores.mean(),
                        gb_50_scores.mean(), gb_200_scores.mean()],
    'Std CV Accuracy': [rf_50_scores.std(), rf_200_scores.std(),
                       gb_50_scores.std(), gb_200_scores.std()]
})

print("Ensemble Methods - 5-fold CV Results:")
print(ensemble_results)

In [None]:
# 3. Support Vector Machine

# One SVC per (kernel, C) pair, generated in the order
# linear/0.1, linear/1.0, rbf/0.1, rbf/1.0.
svm_linear_01, svm_linear_10, svm_rbf_01, svm_rbf_10 = (
    SVC(kernel=kernel, C=C, random_state=42, probability=True)
    for kernel in ('linear', 'rbf')
    for C in (0.1, 1.0)
)

# Display label -> estimator; the labels become the 'Model' column below.
svm_models = dict(zip(
    ('Linear C=0.1', 'Linear C=1.0', 'RBF C=0.1', 'RBF C=1.0'),
    (svm_linear_01, svm_linear_10, svm_rbf_01, svm_rbf_10),
))

# 5-fold CV accuracy on the training split, summarised as mean and std per model.
svm_results = [
    {
        'Model': label,
        'Mean CV Accuracy': fold_acc.mean(),
        'Std CV Accuracy': fold_acc.std(),
    }
    for label, fold_acc in (
        (label, cross_val_score(estimator, X_train, y_train, cv=5, scoring='accuracy'))
        for label, estimator in svm_models.items()
    )
]

svm_results_df = pd.DataFrame(svm_results)
print("SVM - 5-fold CV Results:")
print(svm_results_df)
print("\n" + "-"*30)

In [None]:

# 4. Clustering
# K-Means Clustering
k_values = [2, 3, 4]
kmeans_silhouette_scores = []
kmeans_fits = {}  # k -> fitted KMeans, so the best model need not be refit

# The 2-D PCA projection depends only on X_scaled, not on k, so it is computed
# once and shared by every panel and by the cluster-centre plot below.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

# One panel per candidate k; the grid width follows k_values instead of a
# hard-coded 3 so that editing k_values cannot break the subplot layout.
fig, axes = plt.subplots(1, len(k_values), figsize=(15, 5), squeeze=False)

for ax, k in zip(axes.ravel(), k_values):
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    cluster_labels = kmeans.fit_predict(X_scaled)
    silhouette_avg = silhouette_score(X_scaled, cluster_labels)
    kmeans_silhouette_scores.append(silhouette_avg)
    kmeans_fits[k] = kmeans

    scatter = ax.scatter(X_pca[:, 0], X_pca[:, 1], c=cluster_labels, cmap='viridis')
    fig.colorbar(scatter, ax=ax)
    ax.set_title(f'K-Means (k={k})\nSilhouette Score: {silhouette_avg:.3f}')
    ax.set_xlabel('PC1')
    ax.set_ylabel('PC2')

fig.tight_layout()
plt.show()

print("K-Means Silhouette Scores:")
for k, score in zip(k_values, kmeans_silhouette_scores):
    print(f"k={k}: Silhouette Score = {score:.3f}")

# Find best k (highest mean silhouette; ties resolve to the first/smallest k)
best_k = k_values[np.argmax(kmeans_silhouette_scores)]
print(f"\nBest k: {best_k}")

# Plot cluster centers for best k, reusing the model already fitted in the loop
# (same data, n_init and random_state as a fresh fit would use).
kmeans_best = kmeans_fits[best_k]
centers_pca = pca.transform(kmeans_best.cluster_centers_)

fig, ax = plt.subplots(figsize=(10, 6))
scatter = ax.scatter(X_pca[:, 0], X_pca[:, 1], c=kmeans_best.labels_, cmap='viridis', alpha=0.6)
ax.scatter(centers_pca[:, 0], centers_pca[:, 1], c='red', marker='X', s=200, label='Cluster Centers')
fig.colorbar(scatter, ax=ax)
ax.set_title(f'K-Means Clustering (k={best_k}) - Cluster Centers')
ax.set_xlabel('PC1')
ax.set_ylabel('PC2')
ax.legend()
plt.show()

In [None]:

# DBSCAN
print("\nDBSCAN Results:")
eps_values = [0.5, 1.0]
dbscan_results = []

for eps in eps_values:
    dbscan = DBSCAN(eps=eps, min_samples=5)
    dbscan_labels = dbscan.fit_predict(X_scaled)

    # Label -1 is treated as noise: it is counted separately and never
    # counted as a cluster.
    non_noise_mask = dbscan_labels != -1
    n_noise = int(np.count_nonzero(~non_noise_mask))
    n_clusters = len(set(dbscan_labels[non_noise_mask]))

    # The silhouette is only computed with at least two clusters, and only
    # over the non-noise points; otherwise the placeholder -1 is reported.
    silhouette_avg = -1
    if n_clusters > 1:
        silhouette_avg = silhouette_score(
            X_scaled[non_noise_mask], dbscan_labels[non_noise_mask]
        )

    dbscan_results.append(dict(
        eps=eps,
        n_clusters=n_clusters,
        n_noise=n_noise,
        silhouette_score=silhouette_avg,
    ))

    print(f"eps={eps}: Clusters={n_clusters}, Noise points={n_noise}, Silhouette Score={silhouette_avg:.3f}")


In [None]:

# 5. Final Evaluation

# Identify best models from CV results (row with the highest mean CV accuracy)
best_ensemble = ensemble_results.loc[ensemble_results['Mean CV Accuracy'].idxmax()]
best_svm = svm_results_df.loc[svm_results_df['Mean CV Accuracy'].idxmax()]

print(f"Best Ensemble: {best_ensemble['Model']} (CV Acc: {best_ensemble['Mean CV Accuracy']:.3f})")
print(f"Best SVM: {best_svm['Model']} (CV Acc: {best_svm['Mean CV Accuracy']:.3f})")

# Select the winning estimators by exact label lookup instead of re-deriving
# hyperparameters from substrings of the display name. Substring parsing falls
# through silently to a default (e.g. any label lacking 'C=0.1' became C=1.0);
# a lookup raises KeyError as soon as labels and candidates drift apart, and
# guarantees the evaluated model has the hyperparameters that were cross-validated.
ensemble_candidates = {
    'Random Forest (50)': rf_50,
    'Random Forest (200)': rf_200,
    'Gradient Boosting (50)': gb_50,
    'Gradient Boosting (200)': gb_200,
}
best_ensemble_model = ensemble_candidates[best_ensemble['Model']]
# svm_models is already keyed by the labels used in svm_results_df, and its
# SVCs were created with probability=True, which predict_proba below requires.
best_svm_model = svm_models[best_svm['Model']]

# Chosen hyperparameters, kept under their previous names in case other cells
# report them.
n_est = best_ensemble_model.n_estimators
kernel_type = best_svm_model.kernel
c_value = best_svm_model.C

# Train models on the training split only
best_ensemble_model.fit(X_train, y_train)
best_svm_model.fit(X_train, y_train)

# Predictions
y_pred_ensemble = best_ensemble_model.predict(X_test)
y_pred_svm = best_svm_model.predict(X_test)

# Probabilities for ROC AUC: column 1 is the probability of classes_[1]
y_proba_ensemble = best_ensemble_model.predict_proba(X_test)[:, 1]
y_proba_svm = best_svm_model.predict_proba(X_test)[:, 1]

# Calculate metrics
def calculate_metrics(y_true, y_pred, y_proba, model_name):
    """Collect held-out classification metrics for one model.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels for the same rows.
    y_proba : array-like
        Score passed to ``roc_auc_score``; callers here pass column 1 of
        ``predict_proba``.
    model_name : str
        Label stored under the ``'Model'`` key.

    Returns
    -------
    dict
        Keys ``'Model'``, ``'Accuracy'``, ``'Precision'``, ``'Recall'``,
        ``'F1-Score'`` and ``'ROC AUC'``. Precision, recall and F1 use
        ``average='weighted'``.
    """
    return {
        'Model': model_name,
        'Accuracy': accuracy_score(y_true, y_pred),
        'Precision': precision_score(y_true, y_pred, average='weighted'),
        'Recall': recall_score(y_true, y_pred, average='weighted'),
        'F1-Score': f1_score(y_true, y_pred, average='weighted'),
        'ROC AUC': roc_auc_score(y_true, y_proba)
    }

ensemble_metrics = calculate_metrics(y_test, y_pred_ensemble, y_proba_ensemble, best_ensemble['Model'])
svm_metrics = calculate_metrics(y_test, y_pred_svm, y_proba_svm, best_svm['Model'])

# Create results table: one row per model
final_results = pd.DataFrame([ensemble_metrics, svm_metrics])
print("\nFinal Test Set Results:")
print(final_results.round(3))

In [None]:

# Plot ROC curves

# roc_curve only infers the positive class when the labels are {0, 1} or
# {-1, 1}; for any other encoding it raises ValueError unless pos_label is
# given. The scores are predict_proba[:, 1], i.e. the probability of
# classes_[1], so that label is passed explicitly as the positive class.
# NOTE(review): the UCI 'Channel' target is documented as taking values 1/2 —
# confirm against df['Channel'].value_counts().
pos_label_ensemble = best_ensemble_model.classes_[1]
pos_label_svm = best_svm_model.classes_[1]

fpr_ensemble, tpr_ensemble, _ = roc_curve(y_test, y_proba_ensemble, pos_label=pos_label_ensemble)
fpr_svm, tpr_svm, _ = roc_curve(y_test, y_proba_svm, pos_label=pos_label_svm)

# Compute each AUC once and reuse it in the legend labels.
auc_ensemble = roc_auc_score(y_test, y_proba_ensemble)
auc_svm = roc_auc_score(y_test, y_proba_svm)

fig, ax = plt.subplots(figsize=(10, 8))
ax.plot(fpr_ensemble, tpr_ensemble, label=f'{best_ensemble["Model"]} (AUC = {auc_ensemble:.3f})', linewidth=2)
ax.plot(fpr_svm, tpr_svm, label=f'{best_svm["Model"]} (AUC = {auc_svm:.3f})', linewidth=2)
# Diagonal reference line: the ROC of a classifier that scores at random.
ax.plot([0, 1], [0, 1], 'k--', alpha=0.5)
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curves - Model Comparison')
ax.legend()
ax.grid(True, alpha=0.3)
plt.show()

# Detailed classification reports
print("\n" + "=" * 50)
print("DETAILED CLASSIFICATION REPORTS")
print("=" * 50)

print(f"\nBest Ensemble Model ({best_ensemble['Model']}):")
print(classification_report(y_test, y_pred_ensemble))

print(f"\nBest SVM Model ({best_svm['Model']}):")
print(classification_report(y_test, y_pred_svm))