In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import classification_report

In [9]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

# Text-classification pipeline: TF-IDF features feeding one binary
# logistic-regression classifier per tag (one-vs-rest).
# NOTE(review): `clf_pipe` is also assigned in the
# "# 5. TF-IDF + Logistic Regression" cell further down; whichever cell runs
# last determines the object the later cells use.
clf_pipe = Pipeline(
    steps=[
        ('tfidf', TfidfVectorizer()),
        (
            'clf',
            OneVsRestClassifier(estimator=LogisticRegression(solver='liblinear')),
        ),
    ]
)


In [2]:
# 1. Sample Dataset (Simulated)
# Each record pairs one support-ticket text with its list of tags; the
# column-oriented `data` dict and the DataFrame are derived from these records
# so a ticket and its tags always stay on the same line.
ticket_records = [
    ("I need help with my bill", ["Billing"]),
    ("How do I update my billing information?", ["Billing"]),
    ("Can someone assist me with technical difficulties?", ["Technical Issue"]),
    ("My account was suspended without reason", ["Account Management"]),
    ("Interested in purchasing your services", ["Sales"]),
    ("The login button does not work", ["Technical Issue"]),
    ("Please explain this invoice charge", ["Billing"]),
    ("The app crashes every time I open it", ["Technical Issue"]),
    ("I want to upgrade my plan", ["Sales"]),
    ("How do I reset my password?", ["Account Management"]),
]

data = {
    "ticket": [text for text, _ in ticket_records],
    "tags": [tags for _, tags in ticket_records],
}

df = pd.DataFrame(data)

In [3]:
# 2. Features and multi-label targets
# X keeps the raw ticket text; vectorisation happens inside the pipeline.
X = df["ticket"]

# Turn each ticket's tag list into a binary indicator row; the column order
# is recorded in mlb.classes_ and is used later to decode predictions.
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(df["tags"])

In [10]:
# Fit the TF-IDF + one-vs-rest pipeline on the training split.
# NOTE(review): X_train / y_train are assigned in the "# 3. Split Data" cell,
# which appears later in this file; on a top-to-bottom run this line would
# hit a NameError, so the split cell has to run first.
clf_pipe.fit(X_train, y_train)

0,1,2
,steps,"[('tfidf', ...), ('clf', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,estimator,LogisticRegre...r='liblinear')
,n_jobs,
,verbose,0

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'liblinear'
,max_iter,100


In [11]:
# Predicted tag indicators for the held-out tickets.
# NOTE(review): `y_pred` is not read by any later cell shown here (the
# evaluation cell calls clf_pipe.predict(X_test) again), and X_test comes
# from the "# 3. Split Data" cell that appears further down in this file.
y_pred = clf_pipe.predict(X_test)


In [4]:
# 3. Split Data
# Hold out 30% of the tickets for evaluation; the fixed seed keeps the split
# identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [5]:
# 4. Few-shot subset: the first four training examples and their targets.
# NOTE(review): none of the cells shown here read few_shot_X / few_shot_y;
# the pipeline is fitted on the full X_train / y_train instead.
few_shot_X, few_shot_y = X_train[:4], y_train[:4]


In [7]:
# 5. TF-IDF + Logistic Regression
from sklearn.multiclass import OneVsRestClassifier

# Build the pipeline and fit it in the same cell. Rebinding `clf_pipe` to a
# fresh Pipeline without fitting leaves an unfitted estimator behind, and the
# predict_proba / predict calls in the following cells would then fail with
# NotFittedError when the notebook is run top to bottom. X_train / y_train
# come from the "# 3. Split Data" cell above.
clf_pipe = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', OneVsRestClassifier(LogisticRegression(solver='liblinear')))
])
clf_pipe.fit(X_train, y_train)

In [12]:
# 6. Predict Top-3 Tags using Fine-tuned Model
# Per-tag probabilities, one row per held-out ticket.
y_pred_prob = clf_pipe.predict_proba(X_test)

# argsort is ascending, so flip each row to descending order and keep the
# three most probable tag columns.
top_3_indices = np.argsort(y_pred_prob, axis=1)[:, ::-1][:, :3]

# Map column positions back to tag names via the binarizer's class order.
fine_tuned_top_3_tags = [list(mlb.classes_[row]) for row in top_3_indices]

In [13]:
# 7. Zero-Shot (Heuristic-based)
def zero_shot_predict(text):
    """Tag a ticket with a keyword heuristic (no trained model involved).

    The text is lower-cased and each tag's keywords are searched as plain
    substrings, so 'bill' also matches 'billing' and 'crash' matches
    'crashes'.

    Args:
        text: Ticket text as a string.

    Returns:
        List of matching tag names in the fixed order Billing, Technical
        Issue, Account Management, Sales, truncated to at most three entries.
        Empty when no keyword matches.
    """
    keyword_rules = (
        ('Billing', ('bill', 'invoice', 'charge')),
        ('Technical Issue', ('login', 'crash', 'technical', 'button', 'error')),
        ('Account Management', ('account', 'reset', 'password', 'suspended')),
        ('Sales', ('buy', 'purchase', 'upgrade', 'plan', 'interested')),
    )
    lowered = text.lower()
    matched = [
        tag
        for tag, keywords in keyword_rules
        if any(keyword in lowered for keyword in keywords)
    ]
    # Four tags can match, but only the first three are reported.
    return matched[:3]

# Run the keyword heuristic on every held-out ticket; the result is a Series
# of tag lists that shares X_test's index.
zero_shot_preds = X_test.apply(zero_shot_predict)

In [14]:
# 8. Evaluation
# Per-tag precision / recall / F1 of the fitted pipeline on the held-out
# tickets. With so few samples some tags have no predicted or no true
# positives, which makes precision/recall undefined; zero_division=0 reports
# those cells as 0.00 (the same value as the default) without emitting one
# UndefinedMetricWarning per average.
print("\nClassification Report (Fine-tuned):")
print(
    classification_report(
        y_test,
        clf_pipe.predict(X_test),
        target_names=mlb.classes_,
        zero_division=0,
    )
)


Classification Report (Fine-tuned):
                    precision    recall  f1-score   support

Account Management       0.00      0.00      0.00         0
           Billing       0.00      0.00      0.00         1
             Sales       0.00      0.00      0.00         1
   Technical Issue       0.00      0.00      0.00         1

         micro avg       0.00      0.00      0.00         3
         macro avg       0.00      0.00      0.00         3
      weighted avg       0.00      0.00      0.00         3
       samples avg       0.00      0.00      0.00         3



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [15]:
# 9. Comparison Table
# Decode each indicator row of y_test back into its list of tag names.
class_names = np.array(mlb.classes_)
true_tag_lists = [list(class_names[row.astype(bool)]) for row in y_test]

# One row per held-out ticket: ground truth next to both predictors.
comparison_columns = {
    "Ticket": X_test.values,
    "True Tags": true_tag_lists,
    "Zero-Shot Tags": zero_shot_preds,
    "Fine-Tuned Top 3 Tags": fine_tuned_top_3_tags,
}
comparison_df = pd.DataFrame(comparison_columns)

print("\nComparison of Zero-shot vs Fine-tuned:")
# to_string avoids column truncation; the index is omitted from the printout.
print(comparison_df.to_string(index=False))


Comparison of Zero-shot vs Fine-tuned:
                                 Ticket         True Tags    Zero-Shot Tags                          Fine-Tuned Top 3 Tags
              I want to upgrade my plan           [Sales]           [Sales] [Account Management, Billing, Technical Issue]
How do I update my billing information?         [Billing]         [Billing] [Account Management, Billing, Technical Issue]
         The login button does not work [Technical Issue] [Technical Issue] [Technical Issue, Billing, Account Management]
