In [1]:
import pandas as pd


df=pd.read_csv('dataset/customer_support_classification_dataset.csv')
df.head()

Unnamed: 0,ticket_text,category,template_id
0,Technic@l 3rror during ch3ckout,Technical,Technical_6
1,p@yment successful but order missing,Payment,Payment_7
2,5YSTEM FA1LURE WH1LE 0RDER1NG REALLY FRUSTRATED,Technical,Technical_7
3,R3fund proc3s5 taking too long,Refund,Refund_5
4,refund not rece1ved after c@ncellat1on as@p,Refund,Refund_0


In [2]:
from sklearn.model_selection import train_test_split

template_ids=df['template_id'].unique()

In [3]:
train_ids,test_ids=train_test_split(template_ids,test_size=0.25,random_state=42)

train_df=df[df['template_id'].isin(train_ids)]
test_df=df[df['template_id'].isin(test_ids)]

In [4]:
train_df.shape,test_df.shape

((370, 3), (130, 3))

In [5]:
set(train_df["template_id"])&set(test_df["template_id"])

set()

In [6]:
train_df["category"].value_counts()



category
Technical    80
Refund       80
Other        80
Payment      70
Account      60
Name: count, dtype: int64

In [7]:
test_df["category"].value_counts()

category
Account      40
Payment      30
Refund       20
Technical    20
Other        20
Name: count, dtype: int64

In [8]:
X_train_text=train_df["ticket_text"]
X_test_text=test_df["ticket_text"]
y_train_text=train_df["category"]
y_test_text=test_df["category"]

In [9]:
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()
y_train_enc=le.fit_transform(y_train_text)
y_test_enc=le.transform(y_test_text)

In [10]:
import re
def clean_text(text):
    text=text.lower()
    text=re.sub(r"\s+", " ", text)
    return text

In [11]:
X_train_cleaned=X_train_text.apply(clean_text)
X_test_cleaned=X_test_text.apply(clean_text)

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_word=TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=10000,
    sublinear_tf=True,
)

In [13]:
X_train_word=tfidf_word.fit_transform(X_train_cleaned)
X_test_word=tfidf_word.transform(X_test_cleaned)

In [14]:
tfidf_char=TfidfVectorizer(
    analyzer='char',
    ngram_range=(3, 5),
    max_features=5000,
)

In [15]:
X_train_char=tfidf_char.fit_transform(X_train_cleaned)
X_test_char=tfidf_char.transform(X_test_cleaned)

In [16]:
from scipy.sparse import hstack
X_train=hstack((X_train_word, X_train_char))
X_test=hstack((X_test_word, X_test_char))



In [17]:
X_train.shape, X_test.shape

((370, 6724), (130, 6724))

In [18]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
lr=LogisticRegression(
    max_iter=1000,
    n_jobs=-1
)

lr.fit(X_train, y_train_enc)
lr_pred=lr.predict(X_test)
print("logistic regression results")
print(classification_report(y_test_enc, lr_pred, target_names=le.classes_))

logistic regression results
              precision    recall  f1-score   support

     Account       1.00      0.75      0.86        40
       Other       0.59      0.95      0.73        20
     Payment       0.76      0.93      0.84        30
      Refund       0.95      1.00      0.98        20
   Technical       0.80      0.40      0.53        20

    accuracy                           0.81       130
   macro avg       0.82      0.81      0.79       130
weighted avg       0.84      0.81      0.80       130



In [19]:
from sklearn.svm import LinearSVC
svm=LinearSVC()
svm.fit(X_train, y_train_enc)
svm_pred=svm.predict(X_test)
print("SVM results")
print(classification_report(y_test_enc, svm_pred, target_names=le.classes_))

SVM results
              precision    recall  f1-score   support

     Account       0.97      0.90      0.94        40
       Other       0.70      0.95      0.81        20
     Payment       0.77      1.00      0.87        30
      Refund       1.00      1.00      1.00        20
   Technical       1.00      0.35      0.52        20

    accuracy                           0.86       130
   macro avg       0.89      0.84      0.83       130
weighted avg       0.89      0.86      0.85       130



In [20]:
from sklearn.naive_bayes import MultinomialNB
mnb=MultinomialNB()
mnb.fit(X_train, y_train_enc)
mnb_pred=mnb.predict(X_test)
print("Naive Bayes results")
print(classification_report(y_test_enc, mnb_pred, target_names=le.classes_))

Naive Bayes results
              precision    recall  f1-score   support

     Account       0.88      0.70      0.78        40
       Other       0.40      0.50      0.44        20
     Payment       0.77      1.00      0.87        30
      Refund       0.66      0.95      0.78        20
   Technical       0.80      0.20      0.32        20

    accuracy                           0.70       130
   macro avg       0.70      0.67      0.64       130
weighted avg       0.73      0.70      0.68       130



In [21]:
from sklearn.metrics import f1_score
models={
    'Logistic Regression':lr_pred,
    'SVM':svm_pred,
    'Multinomial NB':mnb_pred,
}
for name, preds in models.items():
    score=f1_score(y_test_enc, preds, average='macro')
    print(f"{name} Macro F1: {score:.3f}")

Logistic Regression Macro F1: 0.787
SVM Macro F1: 0.826
Multinomial NB Macro F1: 0.637


In [22]:
from sklearn.utils import shuffle
y_train_shuffled=shuffle(y_train_enc, random_state=42)

lr.fit(X_train, y_train_shuffled)
shuffled_preds=lr.predict(X_test)
print("Shuffled results f1:",f1_score(y_test_enc, shuffled_preds, average='macro'))


Shuffled results f1: 0.2793845525116757


In [23]:
from sklearn.metrics import confusion_matrix
cm=confusion_matrix(y_test_enc, svm_pred)
cm_df=pd.DataFrame(cm,
                   index=le.classes_,
                   columns=le.classes_
                   )
cm_df

Unnamed: 0,Account,Other,Payment,Refund,Technical
Account,36,4,0,0,0
Other,1,19,0,0,0
Payment,0,0,30,0,0
Refund,0,0,0,20,0
Technical,0,4,9,0,7


In [24]:
test_results=test_df.copy()
test_results["truee_label"]=y_test_text
test_results["predicted_label"]=le.inverse_transform(svm_pred)

errors=test_results[
    test_results["truee_label"]!=test_results["predicted_label"]
]

In [25]:
errors.head(10)

Unnamed: 0,ticket_text,category,template_id,truee_label,predicted_label
71,SCREEN GOES BLANK 0N PAYMENT NOT @ SCAM,Technical,Technical_4,Technical,Payment
87,h0w t0 update profile inf0rmation,Other,Other_1,Other,Account
95,screen goes blank on p@yment th15 i5 unacceptable,Technical,Technical_4,Technical,Other
107,account blocked without re@5on,Account,Account_2,Account,Other
147,ACCOUNT BLOCKED WITHOUT RE@SON,Account,Account_2,Account,Other
217,@ccount blocked without re@50n,Account,Account_2,Account,Other
220,Page k3ep5 r3freshing 1mm3diately,Technical,Technical_8,Technical,Other
235,SCREEN G0E5 BL@NK ON PAYMENT NOT @ SCAM,Technical,Technical_4,Technical,Payment
276,P@ge ke3ps refr3shing kindly ch3ck,Technical,Technical_8,Technical,Other
281,@CCOUNT BLOCK3D WITHOUT R3@SON KINDLY CHECK,Account,Account_2,Account,Other


In [26]:
errors.groupby(
    ["truee_label", "predicted_label"]
).size().sort_values(ascending=False)


truee_label  predicted_label
Technical    Payment            9
Account      Other              4
Technical    Other              4
Other        Account            1
dtype: int64

In [27]:
URGENT_KEYWORDS = [
    "urgent",
    "asap",
    "immediately",
    "need this resolved today",
    "seriously",
    "not a scam",
    "really frustrated",
    "this is unacceptable"
]


In [28]:
def detect_urgency(text):
    text=text.lower()
    for keyword in URGENT_KEYWORDS:
        if keyword in text:
            return 1
    return 0

In [29]:
df["urgency"]=df["ticket_text"].apply(detect_urgency)

In [30]:
def assign_priority(category,urgency):
    if urgency==1:
        return "High"
    if category in ["Payment","Account"]:
        return "Medium"
    return "Low"

In [31]:
df.head(10)

Unnamed: 0,ticket_text,category,template_id,urgency
0,Technic@l 3rror during ch3ckout,Technical,Technical_6,0
1,p@yment successful but order missing,Payment,Payment_7,0
2,5YSTEM FA1LURE WH1LE 0RDER1NG REALLY FRUSTRATED,Technical,Technical_7,1
3,R3fund proc3s5 taking too long,Refund,Refund_5,0
4,refund not rece1ved after c@ncellat1on as@p,Refund,Refund_0,0
5,unexpected 3rror m355age @ppe@r5 seriously,Technical,Technical_9,1
6,System f@ilure while ordering seriously,Technical,Technical_7,1
7,r3fund promi5ed but not credit3d,Refund,Refund_2,0
8,trans@ction failed @fter p@ym3nt,Payment,Payment_6,0
9,H0W TO CHANGE EMA1L ADDRES5,Other,Other_5,0


In [32]:
df['priority']=df.apply(
    lambda row: assign_priority(row['category'],row['urgency']),
    axis=1
)

In [33]:
df.head(10)

Unnamed: 0,ticket_text,category,template_id,urgency,priority
0,Technic@l 3rror during ch3ckout,Technical,Technical_6,0,Low
1,p@yment successful but order missing,Payment,Payment_7,0,Medium
2,5YSTEM FA1LURE WH1LE 0RDER1NG REALLY FRUSTRATED,Technical,Technical_7,1,High
3,R3fund proc3s5 taking too long,Refund,Refund_5,0,Low
4,refund not rece1ved after c@ncellat1on as@p,Refund,Refund_0,0,Low
5,unexpected 3rror m355age @ppe@r5 seriously,Technical,Technical_9,1,High
6,System f@ilure while ordering seriously,Technical,Technical_7,1,High
7,r3fund promi5ed but not credit3d,Refund,Refund_2,0,Low
8,trans@ction failed @fter p@ym3nt,Payment,Payment_6,0,Medium
9,H0W TO CHANGE EMA1L ADDRES5,Other,Other_5,0,Low


In [34]:
def route_department(category):
    if category in ["Payment","Refund"]:
        return "Finance"
    elif category in ["Technical","Account"]:
        return "Technical Support"
    else:
        return "Customer Support"

In [35]:
df["department"]=df["category"].apply(route_department)

In [36]:
import joblib
joblib.dump(svm,"models/svm.pkl")
joblib.dump(lr,"models/lr.pkl")
joblib.dump(tfidf_word,"models/tfidf_word.pkl")
joblib.dump(tfidf_char,"models/tfidf_char.pkl")
joblib.dump(le,"models/label_encoder.pkl")

['models/label_encoder.pkl']