In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Load the datasets with proper encoding
try:
    categories = pd.read_csv('categories.csv', encoding='latin1')
    employees = pd.read_csv('employees.csv', encoding='latin1')
    order_details = pd.read_csv('order_details.csv', encoding='latin1')
    orders = pd.read_csv('orders.csv', encoding='latin1')
    products = pd.read_csv('products.csv', encoding='latin1')
    customers = pd.read_csv('customers.csv', encoding='latin1')
except Exception as e:
    print(f"Error loading CSV files: {e}")
    # Try alternative encoding if the first attempt fails
    try:
        customers = pd.read_csv('customers.csv', encoding='ISO-8859-1')
    except Exception as e:
        print(f"Still couldn't load customers.csv: {e}")

# Merge datasets to create a comprehensive view
# Join orders with order_details
orders_with_details = pd.merge(
    orders, 
    order_details,
    on='orderID',
    suffixes=('', '_order_details')
)

# Join with products to get product information
orders_with_products = pd.merge(
    orders_with_details,
    products,
    on='productID',
    suffixes=('', '_products')
)

# Join with customers to get customer information
full_data = pd.merge(
    orders_with_products,
    customers,
    on='customerID'
)

# Join with categories to get category information
full_data = pd.merge(
    full_data,
    categories,
    on='categoryID'
)

# Feature engineering
# Calculate total price per order item - using unitPrice from order_details
full_data['totalPrice'] = full_data['unitPrice'] * full_data['quantity'] * (1 - full_data['discount'])

# Extract date features
full_data['orderDate'] = pd.to_datetime(full_data['orderDate'])
full_data['year'] = full_data['orderDate'].dt.year
full_data['month'] = full_data['orderDate'].dt.month
full_data['day'] = full_data['orderDate'].dt.day
full_data['dayOfWeek'] = full_data['orderDate'].dt.dayofweek

# Encode categorical variables
le_country = LabelEncoder()
le_category = LabelEncoder()
full_data['countryCode'] = le_country.fit_transform(full_data['country'])
full_data['categoryCode'] = le_category.fit_transform(full_data['categoryName'])

# Select relevant features for clustering and classification
features_for_model = full_data[['unitPrice', 'quantity', 'discount', 'totalPrice', 
                               'freight', 'categoryCode', 'countryCode', 
                               'month', 'dayOfWeek']]

# Handle missing values
features_for_model = features_for_model.fillna(0)

# Normalize the data
scaler = StandardScaler()
normalized_features = scaler.fit_transform(features_for_model)

print("Data preprocessing completed successfully.")
print(f"Shape of preprocessed data: {normalized_features.shape}")
print(f"Selected features: {features_for_model.columns.tolist()}")


Data preprocessing completed successfully.
Shape of preprocessed data: (2155, 9)
Selected features: ['unitPrice', 'quantity', 'discount', 'totalPrice', 'freight', 'categoryCode', 'countryCode', 'month', 'dayOfWeek']


In [4]:
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from sklearn.metrics import silhouette_score

# Define the number of clusters to try
cluster_range = [2, 8, 10, 12]  # As mentioned in the paper

# Store results
inertia_values = []
silhouette_scores = []

# Calculate inertia and silhouette score for each number of clusters
for k in cluster_range:
    kmeans = KMeans(n_clusters=k, random_state=42, max_iter=500)
    cluster_labels = kmeans.fit_predict(normalized_features)
    
    # Store inertia
    inertia_values.append(kmeans.inertia_)
    
    # Calculate silhouette score
    silhouette_avg = silhouette_score(normalized_features, cluster_labels)
    silhouette_scores.append(silhouette_avg)
    
    print(f"For n_clusters = {k}, the silhouette score is {silhouette_avg:.3f}")

# Plot inertia values (Elbow method)
plt.figure(figsize=(10, 6))
plt.plot(cluster_range, inertia_values, 'o-')
plt.xlabel('Number of clusters')
plt.ylabel('Inertia')
plt.title('Elbow Method for Optimal k')
plt.grid(True)
plt.savefig('elbow_method.png')
plt.close()

# Plot silhouette scores
plt.figure(figsize=(10, 6))
plt.plot(cluster_range, silhouette_scores, 'o-')
plt.xlabel('Number of clusters')
plt.ylabel('Silhouette Score')
plt.title('Silhouette Score for Optimal k')
plt.grid(True)
plt.savefig('silhouette_scores.png')
plt.close()

# Choose the optimal number of clusters (based on the paper, we'll use 8)
optimal_k = 8
kmeans = KMeans(n_clusters=optimal_k, random_state=42, max_iter=500)
cluster_labels = kmeans.fit_predict(normalized_features)

# Add cluster labels to the original data
full_data['cluster'] = cluster_labels

# Analyze clusters
cluster_stats = full_data.groupby('cluster').agg({
    'totalPrice': ['mean', 'count'],
    'quantity': 'mean',
    'discount': 'mean',
    'categoryName': lambda x: x.mode()[0] if not x.mode().empty else 'Unknown'
}).reset_index()

print("\nCluster Statistics:")
print(cluster_stats)

print("\nK-means clustering completed successfully.")


For n_clusters = 2, the silhouette score is 0.338
For n_clusters = 8, the silhouette score is 0.115
For n_clusters = 10, the silhouette score is 0.134
For n_clusters = 12, the silhouette score is 0.136

Cluster Statistics:
  cluster   totalPrice         quantity  discount    categoryName
                  mean count       mean      mean        <lambda>
0       0   357.500596   391  17.941176  0.027621       Beverages
1       1   365.052170   405  19.264198  0.017160       Beverages
2       2  1564.197767   178  64.988764  0.059551  Dairy Products
3       3   444.586255   424  16.966981  0.019670         Seafood
4       4  6051.997400    25  28.120000  0.044000       Beverages
5       5  1547.762390    68  44.985294  0.065441  Dairy Products
6       6   343.266899   318  18.182390  0.019025     Confections
7       7   420.594658   346  23.682081  0.210260       Beverages

K-means clustering completed successfully.


In [5]:
from sklearn.model_selection import train_test_split

# Prepare features and target
X = normalized_features
y = cluster_labels

# Split data into training and testing sets (80/20)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Data split completed successfully.")
print(f"Training set shape: {X_train.shape}")
print(f"Testing set shape: {X_test.shape}")


Data split completed successfully.
Training set shape: (1724, 9)
Testing set shape: (431, 9)


In [6]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score
import time

# Initialize classifiers
classifiers = {
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'SVM': SVC(kernel='rbf', random_state=42),
    'KNN': KNeighborsClassifier(n_neighbors=5)
}

# Store results
results = {}

# Train and evaluate each classifier
for name, clf in classifiers.items():
    print(f"\nTraining {name}...")
    
    # Measure training time
    start_time = time.time()
    
    # Train the classifier
    clf.fit(X_train, y_train)
    
    # Measure training time
    train_time = time.time() - start_time
    
    # Predict on test set
    y_pred = clf.predict(X_test)
    
    # Calculate accuracy
    accuracy = accuracy_score(y_test, y_pred)
    
    # Calculate error rate
    error_rate = 1 - accuracy
    
    # Perform 10-fold cross-validation
    cv_scores = cross_val_score(clf, X, y, cv=10)
    cv_accuracy = cv_scores.mean()
    
    # Store results
    results[name] = {
        'accuracy': accuracy,
        'error_rate': error_rate,
        'cv_accuracy': cv_accuracy,
        'train_time': train_time,
        'confusion_matrix': confusion_matrix(y_test, y_pred),
        'classification_report': classification_report(y_test, y_pred)
    }
    
    print(f"{name} Results:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Error Rate: {error_rate:.4f}")
    print(f"10-fold CV Accuracy: {cv_accuracy:.4f}")
    print(f"Training Time: {train_time:.2f} seconds")
    print("\nClassification Report:")
    print(results[name]['classification_report'])

# Compare classifiers
print("\nClassifier Comparison:")
for name, result in results.items():
    print(f"{name}: Accuracy = {result['accuracy']:.4f}, Error Rate = {result['error_rate']:.4f}")



Training Random Forest...
Random Forest Results:
Accuracy: 0.9281
Error Rate: 0.0719
10-fold CV Accuracy: 0.9183
Training Time: 0.67 seconds

Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.89      0.89        87
           1       0.93      0.92      0.93        62
           2       0.87      0.94      0.91        36
           3       0.93      0.94      0.94        86
           4       1.00      0.25      0.40         4
           5       0.94      0.88      0.91        17
           6       0.98      0.95      0.97        64
           7       0.94      0.99      0.96        75

    accuracy                           0.93       431
   macro avg       0.94      0.85      0.86       431
weighted avg       0.93      0.93      0.93       431


Training SVM...
SVM Results:
Accuracy: 0.9490
Error Rate: 0.0510
10-fold CV Accuracy: 0.9439
Training Time: 0.08 seconds

Classification Report:
              precision    recall  f1

In [7]:
import matplotlib.pyplot as plt
import seaborn as sns

# Create a comparison bar chart
plt.figure(figsize=(12, 6))

# Plot accuracy
names = list(results.keys())
accuracies = [results[name]['accuracy'] for name in names]
error_rates = [results[name]['error_rate'] for name in names]
cv_accuracies = [results[name]['cv_accuracy'] for name in names]

x = np.arange(len(names))
width = 0.25

plt.bar(x - width, accuracies, width, label='Test Accuracy')
plt.bar(x, cv_accuracies, width, label='CV Accuracy')
plt.bar(x + width, error_rates, width, label='Error Rate')

plt.xlabel('Classifier')
plt.ylabel('Score')
plt.title('Classifier Performance Comparison')
plt.xticks(x, names)
plt.legend()
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.savefig('classifier_comparison.png')
plt.close()

# Find the best classifier based on accuracy
best_classifier_name = max(results, key=lambda x: results[x]['accuracy'])
best_classifier = classifiers[best_classifier_name]

print(f"\nThe best classifier is: {best_classifier_name}")
print(f"Accuracy: {results[best_classifier_name]['accuracy']:.4f}")
print(f"Error Rate: {results[best_classifier_name]['error_rate']:.4f}")

# Visualize confusion matrix for the best classifier
plt.figure(figsize=(10, 8))
cm = results[best_classifier_name]['confusion_matrix']
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.title(f'Confusion Matrix - {best_classifier_name}')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.savefig(f'{best_classifier_name}_confusion_matrix.png')
plt.close()



The best classifier is: SVM
Accuracy: 0.9490
Error Rate: 0.0510
