📘 Section 1: Importing Required Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score, classification_report,
    confusion_matrix, ConfusionMatrixDisplay
)
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score

📘 Section 2: Loading and Restructuring the Dataset

In [None]:
# Load the raw survey export. The file has no header row, so it is read as
# plain values and labelled further down.
csv_file = "Dados-SLA.csv"
raw_data = pd.read_csv(csv_file, header=None)

# Layout of one client record: 1 ID followed by 11 survey responses.
columns = [
    "Client ID", "Service Rate", "Completed Orders", "Speed", "Consistency",
    "Flexibility", "Failure Recovery", "Information", "Correct Invoices",
    "Conforming Products", "Correct Quantity", "Overall Satisfaction"
]

# Derive the record width from the column list and the number of clients from
# the data itself, so an export with a different client count still loads
# instead of failing on a hard-coded row count.
num_factors = len(columns)
if raw_data.size == 0 or raw_data.size % num_factors != 0:
    raise ValueError(
        "{} holds {} values, which cannot be split into records of {} fields".format(
            csv_file, raw_data.size, num_factors
        )
    )
num_clients = raw_data.size // num_factors

# Reshape the flat values into one row per client (numpy's default row-major
# order, i.e. consecutive values fill a record left to right).
reshaped_data = raw_data.values.reshape(num_clients, num_factors)

# Create a DataFrame with the structured data
data = pd.DataFrame(reshaped_data, columns=columns)

📘 Section 3: Creating the Target Variable

In [None]:
# Define binary target: 1 for satisfied clients (rating >= 4), 0 otherwise
data["Satisfied"] = (data["Overall Satisfaction"].astype(int) >= 4).astype(int)

# Columns that are never features: the identifier, the rating the target is
# derived from, and the target itself. These must exist, so a missing one
# still raises.
X = data.drop(columns=["Client ID", "Overall Satisfaction", "Satisfied"])

# Section 6 appends "Predicted" and "Correct" to `data`. If this cell is
# re-run afterwards they would otherwise leak into the features, so drop them
# when present; errors="ignore" makes this a no-op on a fresh kernel.
X = X.drop(columns=["Predicted", "Correct"], errors="ignore")

# Dependent variable (target)
y = data["Satisfied"]

📘 Section 4: Preprocessing - Data Normalization

In [None]:
# Count how many clients carry each target label (position 0 = label 0,
# position 1 = label 1) and report it before any preprocessing.
label_counts = np.bincount(y)
print("Original label distribution:", label_counts)

# Rescale every feature to the [0, 1] range: learn each column's min/max
# first, then apply the transformation to the same data.
scaler = MinMaxScaler().fit(X)
X_scaled = scaler.transform(X)

📘 Section 5: Training the Gradient Boosting Classifier

In [None]:
# Initialize Gradient Boosting model with specified hyperparameters
gb_model = GradientBoostingClassifier(
    n_estimators=300,
    learning_rate=0.02,
    max_depth=4,
    subsample=0.7,
    random_state=42
)

# Fit the model to the entire dataset
gb_model.fit(X_scaled, y)

# Predict on the same rows the model was trained on (in-sample predictions)
y_pred = gb_model.predict(X_scaled)

# Show predicted label distribution
print("Predicted label distribution:", np.bincount(y_pred))

# In-sample metrics: these describe how well the model fits the data it has
# already seen, so they are labelled as training metrics rather than as an
# estimate of performance on new clients.
print("Training accuracy (Gradient Boosting):", accuracy_score(y, y_pred))
print("\nClassification Report (training data):\n", classification_report(y, y_pred))

# Out-of-sample estimate: stratified 5-fold cross-validation. cross_val_score
# fits fresh clones of gb_model on each training fold, so the fitted gb_model
# and y_pred above are left untouched.
# NOTE(review): if one class has fewer than 5 members scikit-learn warns about
# the stratified split; check the label distribution printed earlier.
cv_splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(
    gb_model, X_scaled, y, cv=cv_splitter, scoring="accuracy"
)
print(
    "\nCross-validated accuracy (5-fold): %.3f +/- %.3f"
    % (cv_scores.mean(), cv_scores.std())
)

📘 Section 6: Error Analysis - Misclassified Clients

In [None]:
# Store the model's prediction next to each client's true label
data["Predicted"] = y_pred

# Flag each row: 1 where prediction and label agree, 0 where they differ
is_correct = data["Satisfied"].eq(data["Predicted"])
data["Correct"] = is_correct.astype(int)

# Keep only the rows the model got wrong, with the columns needed to
# identify the client and compare label against prediction
wrong_rows = data["Correct"].eq(0)
misclassified = data.loc[wrong_rows, ["Client ID", "Satisfied", "Predicted"]]
print("\nClients that reduced the model's accuracy:")
print(misclassified)