In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score
import joblib

In [21]:
# Load Dataset
# The relative path is resolved against the notebook's working directory.
DATA_PATH = 'customer_support_tickets.csv'
df = pd.read_csv(DATA_PATH)
# Trailing expression so the cell displays (rows, columns).
df.shape

(8469, 17)

In [5]:
# Print each column's name, non-null count and dtype for the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8469 entries, 0 to 8468
Data columns (total 17 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Ticket ID                     8469 non-null   int64  
 1   Customer Name                 8469 non-null   object 
 2   Customer Email                8469 non-null   object 
 3   Customer Age                  8469 non-null   int64  
 4   Customer Gender               8469 non-null   object 
 5   Product Purchased             8469 non-null   object 
 6   Date of Purchase              8469 non-null   object 
 7   Ticket Type                   8469 non-null   object 
 8   Ticket Subject                8469 non-null   object 
 9   Ticket Description            8469 non-null   object 
 10  Ticket Status                 8469 non-null   object 
 11  Resolution                    2769 non-null   object 
 12  Ticket Priority               8469 non-null   object 
 13  Tic

In [23]:
# Target Column
target = "Customer Satisfaction Rating"

# Keep only the rows that actually carry a target value.
has_target = df[target].notna()
df = df[has_target]
print("Rows after removing missing target:", df.shape)

Rows after removing missing target: (2769, 17)


In [24]:
# Split Features and Target
# y is the target column on its own; X is every other column of df.
y = df[target]
X = df.drop(columns=target)

In [22]:
# Remove unwanted identifier/time columns if present
# NOTE(review): matching is a case-insensitive *substring* test, so "ticket" removes
# every "Ticket ..." column and "id" would hit any name that merely contains those
# two letters -- confirm this breadth is intended.
unwanted_words = ("id", "ticket", "date", "time", "email", "name")


def _has_unwanted_word(column_name):
    """Return True when the lower-cased column name contains any unwanted word."""
    lowered = column_name.lower()
    return any(word in lowered for word in unwanted_words)


drop_cols = [column for column in X.columns if _has_unwanted_word(column)]
X = X.drop(columns=drop_cols, errors='ignore')
print("Features used:", X.columns.tolist())

Features used: ['Customer Age', 'Customer Gender', 'Product Purchased', 'Resolution']


In [25]:
# Identify numeric and categorical features
# Columns are grouped purely by dtype: object/bool columns are treated as
# categorical, numeric dtypes as numeric.
categorical_features = X.select_dtypes(include=['object', 'bool']).columns
numeric_features = X.select_dtypes(include='number').columns

In [26]:
# Preprocessing Pipeline
# Numeric columns: fill gaps with the column median, then standardize.
numeric_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
]
numeric_transformer = Pipeline(numeric_steps)

# Categorical columns: fill gaps with the literal "MISSING", then one-hot encode.
# handle_unknown="ignore" lets transform() accept categories not seen during fit.
categorical_steps = [
    ("imputer", SimpleImputer(strategy="constant", fill_value="MISSING")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
]
categorical_transformer = Pipeline(categorical_steps)

# Apply each sub-pipeline to its own column group; numeric output comes first.
preprocessor = ColumnTransformer([
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features),
])

In [27]:
# Model
# Random forest with a fixed seed; class_weight='balanced' reweights classes
# inversely to their frequency in the training labels.
forest_params = dict(n_estimators=200, random_state=42, class_weight='balanced')
model = RandomForestClassifier(**forest_params)

# Chain preprocessing and the classifier so both are fitted and applied together.
pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("model", model),
])

In [28]:
# Split train & test data
# 80/20 split with a fixed seed; stratify=y keeps the class proportions of y
# in both partitions.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split_parts

print("Train:", X_train.shape, "| Test:", X_test.shape)

Train: (2215, 16) | Test: (554, 16)


In [29]:
# Train the model
# Fit the pipeline on the training split only; the test split stays unseen.
pipeline.fit(X_train, y_train)
print("Model Training Completed!")

Model Training Completed!


In [30]:
# Evaluation
# Score the fitted pipeline on the held-out test split.
y_pred = pipeline.predict(X_test)

acc = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
matrix = confusion_matrix(y_test, y_pred)

print("\nAccuracy:", acc)
print("\nClassification Report:\n", report)
print("\nConfusion Matrix:\n", matrix)


Accuracy: 0.19855595667870035

Classification Report:
               precision    recall  f1-score   support

         1.0       0.26      0.21      0.23       111
         2.0       0.17      0.15      0.16       110
         3.0       0.19      0.38      0.26       116
         4.0       0.21      0.17      0.19       108
         5.0       0.17      0.07      0.10       109

    accuracy                           0.20       554
   macro avg       0.20      0.20      0.19       554
weighted avg       0.20      0.20      0.19       554


Confusion Matrix:
 [[23 16 53  9 10]
 [15 17 47 23  8]
 [20 27 44 15 10]
 [16 25 37 18 12]
 [15 18 47 21  8]]


In [31]:
# ROC AUC (binary, or one-vs-rest macro average for multiclass)
# The binary/multiclass decision is taken from the classes the fitted model knows,
# not from the labels that happen to occur in y_test: predict_proba returns one
# column per fitted class, so column 1 is "the positive class" only when the model
# itself is binary.
model_classes = pipeline.named_steps['model'].classes_

# roc_auc_score needs every scored class to be present in y_test.
test_covers_all_classes = np.array_equal(np.unique(y_test), model_classes)

if test_covers_all_classes and len(model_classes) == 2:
    y_prob = pipeline.predict_proba(X_test)[:, 1]
    auc = roc_auc_score(y_test, y_prob)
    print("\nROC AUC:", auc)
elif test_covers_all_classes and len(model_classes) > 2:
    # Full probability matrix; `labels` pins the column order to the model's classes.
    y_prob = pipeline.predict_proba(X_test)
    auc = roc_auc_score(y_test, y_prob, multi_class="ovr", labels=model_classes)
    print("\nROC AUC (one-vs-rest, macro average):", auc)
else:
    # Say so explicitly instead of silently producing no metric.
    print("\nROC AUC skipped: needs at least two classes, all of them present in y_test.")

In [32]:
# Feature Importance
# Extract transformed feature names
fitted_steps = pipeline.named_steps
cat_pipeline = fitted_steps['preprocessor'].named_transformers_['cat']
onehot = cat_pipeline.named_steps['onehot']
cat_feature_names = onehot.get_feature_names_out(categorical_features)

# Names are assembled as numeric columns first, then the one-hot columns; this
# relies on the preprocessor emitting its "num" block before its "cat" block.
feature_names = [*numeric_features, *cat_feature_names]
importances = fitted_steps['model'].feature_importances_

# Pair each transformed feature with its importance, highest first.
feat_imp = pd.DataFrame({"feature": feature_names, "importance": importances})
feat_imp = feat_imp.sort_values("importance", ascending=False)

print("\nTop 10 Important Features:\n")
print(feat_imp.head(10))


Top 10 Important Features:

                          feature  importance
0                       Ticket ID    0.019073
1                    Customer Age    0.016627
4380       Customer Gender_Female    0.004358
4381         Customer Gender_Male    0.004294
9504          Ticket Priority_Low    0.004195
9505       Ticket Priority_Medium    0.004108
9506          Ticket Channel_Chat    0.004035
9502     Ticket Priority_Critical    0.004035
9508         Ticket Channel_Phone    0.004030
9509  Ticket Channel_Social media    0.004012


In [33]:
# Save Model
# NOTE(review): absolute, machine-specific path -- this cell only works where this
# directory exists; consider a path relative to the notebook.
model_path = "C:/Users/NewAdmin/OneDrive/Desktop/DS/Customer Satisfaction Prediction/customer_satisfaction_model.joblib"

# Persist the whole fitted pipeline (preprocessing + classifier) as one artifact.
joblib.dump(pipeline, model_path)
print("\nModel Saved as:" + model_path)


Model Saved as:C:/Users/NewAdmin/OneDrive/Desktop/DS/Customer Satisfaction Prediction/customer_satisfaction_model.joblib
