In [None]:
# Upload the dataset file through Colab's file helper.
# NOTE(review): the load cell reads 'all_tickets_processed_improved_v3.csv' by
# relative path, so the uploaded file presumably needs that exact name — confirm.
from google.colab import files
# The value returned by the upload call is kept in `uploads`.
uploads=files.upload()

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

# Read the processed ticket export into a DataFrame.
df = pd.read_csv('all_tickets_processed_improved_v3.csv')

# Structural summary: dimensions first, then one titled section per view.
# Each section prints its heading and its value on separate lines.
print("Dataset shape: ", df.shape)
print("\nDataset columns: ", df.columns.to_list(), sep="\n")
print("\nFirst few rows:", df.head(), sep="\n")
print("\nDatatypes:", df.dtypes, sep="\n")
print("\nMissing values:", df.isna().sum(), sep="\n")

In [None]:
# Headline numbers: total ticket count, then the per-category breakdown.
print("="*70, "SUPPORT TICKET DATASET - OVERVIEW", "="*70, sep="\n")

print(f"\nDataset size: {len(df):,} tickets")

# Category table (most frequent first) followed by the number of distinct categories.
print("\n" + "="*70, "TICKET CATEGORIES", "="*70, sep="\n")
print(df['Topic_group'].value_counts())
print(f"\nTotal categories: {df['Topic_group'].nunique()}")

# Bar chart of ticket volume per category, drawn through an explicit Axes handle.
fig, ax = plt.subplots(figsize=(10, 6))
df['Topic_group'].value_counts().plot.bar(color='skyblue', ax=ax)
ax.set_title('Support Ticket Category Distribution')
ax.set_xlabel('Category')
ax.set_ylabel('Number of Tickets')
# `ax` is still the current axes here, so this rotates its tick labels.
plt.xticks(rotation=45, ha='right')
fig.tight_layout()
plt.show()

# Sample tickets per category
print("\n" + "="*70)
print("SAMPLE TICKETS BY CATEGORY")
print("="*70)
# Show one example ticket for each of the first three categories, in order of
# first appearance. Missing labels and missing ticket text are dropped first:
# a NaN label matches no rows (so .iloc[0] would raise IndexError) and a NaN
# document is a float (so slicing it would raise TypeError).
for category in df['Topic_group'].dropna().unique()[:3]:
    print(f"\n{category}:")
    category_docs = df.loc[df['Topic_group'] == category, 'Document'].dropna()
    if category_docs.empty:
        print("  (no ticket text available)")
        continue
    # str() keeps the slice safe even if the column holds non-string values.
    sample = str(category_docs.iloc[0])
    print(f"  {sample[:150]}...")

In [None]:
import re
import string

# Section banner for the text-cleaning step.
print("="*70, "STEP 2: TEXT CLEANING", "="*70, sep="\n")

# Clean text function
# Translation table that deletes every ASCII punctuation character; built once
# here instead of on every call.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def clean_text(text):
    """Normalize one ticket's text before vectorization.

    Lowercases the text, deletes ASCII punctuation (``string.punctuation``),
    and collapses every run of whitespace into a single space, trimming both
    ends.

    Args:
        text: Raw ticket text. ``None`` and float NaN (a missing cell) produce
            an empty string; any other non-string value is converted with
            ``str()`` before cleaning.

    Returns:
        The cleaned string.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)
    # Lowercase
    text = text.lower()
    # Remove punctuation (characters are deleted, not replaced by spaces)
    text = text.translate(_PUNCTUATION_TABLE)
    # Remove extra whitespace
    return ' '.join(text.split())

# Apply cleaning
df['text_clean'] = df['Document'].apply(clean_text)

# Show before/after for two example categories. Each category is filtered once,
# and a category with no rows is reported instead of raising IndexError on
# .iloc[0].
for sample_number, category in enumerate(('Hardware', 'HR Support'), start=1):
    print(f"\nSample {sample_number} ({category}):")
    category_rows = df.loc[df['Topic_group'] == category, ['Document', 'text_clean']]
    if category_rows.empty:
        print("  (no tickets in this category)")
        continue
    first_ticket = category_rows.iloc[0]
    # str() keeps the slice safe if the raw column holds a non-string value.
    print(f"BEFORE: {str(first_ticket['Document'])[:150]}...")
    print(f"AFTER:  {first_ticket['text_clean'][:150]}...")

# The check mark was previously emitted as mis-encoded bytes.
print(f"\n✓ {len(df):,} tickets cleaned")

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# Section banner for the feature-extraction step.
print("="*70, "STEP 3: TF-IDF FEATURE EXTRACTION", "="*70, sep="\n")

# Model input is the cleaned text; the label to predict is the topic group.
X, y = df['text_clean'], df['Topic_group']

# Hold out 20% of the tickets for testing. The split is stratified on the
# label and uses a fixed seed so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

print(
    f"\nData split:",
    f"  Training: {len(X_train):,} tickets",
    f"  Testing:  {len(X_test):,} tickets",
    sep="\n",
)

# TF-IDF Vectorization
vectorizer = TfidfVectorizer(
    max_features=5000,      # cap on vocabulary size
    min_df=5,               # integer: minimum number of documents a term must appear in
    max_df=0.8,             # float: maximum share of documents a term may appear in
    ngram_range=(1, 2),     # single words and two-word sequences
    stop_words='english'    # scikit-learn's built-in English stop-word list
)

print("\nVectorizing...")
# The vocabulary and IDF weights are fitted on the training split only; the
# test split is transformed with that fitted vocabulary.
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# The check mark was previously emitted as mis-encoded bytes.
print("\n✓ Feature extraction complete:")
print(f"  Vocabulary: {len(vectorizer.vocabulary_):,} features")
print(f"  Training matrix: {X_train_tfidf.shape}")
print(f"  Testing matrix: {X_test_tfidf.shape}")

# Sample features
features = vectorizer.get_feature_names_out()
print(f"\nSample features: {list(features[:20])}")

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

print("="*70)
print("STEP 4: MODEL TRAINING - TICKET CATEGORY CLASSIFICATION")
print("="*70)

# Candidate classifiers, keyed by display name. All are trained on the same
# TF-IDF training matrix and scored on the same TF-IDF test matrix.
models = {
    'Naive Bayes': MultinomialNB(),
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
}

# Per-model outcome: fitted estimator, test-set predictions, test-set accuracy.
results = {}

for name, model in models.items():
    print(f"\nTraining {name}...")
    model.fit(X_train_tfidf, y_train)

    # Predict
    y_pred = model.predict(X_test_tfidf)

    # Accuracy
    acc = accuracy_score(y_test, y_pred)
    results[name] = {
        'model': model,
        'predictions': y_pred,
        'accuracy': acc
    }

    # The check mark was previously emitted as mis-encoded bytes.
    print(f"  ✓ Accuracy: {acc:.4f} ({acc*100:.2f}%)")

# Best model: the entry in `results` with the highest test-set accuracy.
# NOTE(review): the winner is chosen on the same test split that is later used
# to report its performance, so the reported accuracy is an optimistic
# estimate; a separate validation split would avoid that.
best_name = max(results, key=lambda x: results[x]['accuracy'])
best_entry = results[best_name]
best_model = best_entry['model']
best_pred = best_entry['predictions']
best_acc = best_entry['accuracy']

print("\n" + "="*70)
# The trophy emoji was previously emitted as mis-encoded bytes.
print(f"🏆 BEST MODEL: {best_name}")
print(f"   Accuracy: {best_acc:.4f} ({best_acc*100:.2f}%)")
print("="*70)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns

print("="*70)
print("STEP 5: MODEL EVALUATION")
print("="*70)

# Classification Report (Precision, Recall, F1-Score)
print("\nCLASSIFICATION REPORT:")
print("="*70)
print(classification_report(y_test, best_pred))

# Confusion Matrix
print("\n" + "="*70)
print("CONFUSION MATRIX")
print("="*70)

# Pin the row/column order to the model's own class list. Without `labels`,
# confusion_matrix orders rows and columns by the labels that occur in
# y_test/best_pred, which need not match the tick labels used on the heatmap
# (a class missing from both would shift every later row and column).
class_labels = best_model.classes_
cm = confusion_matrix(y_test, best_pred, labels=class_labels)

# Plot confusion matrix: rows are actual categories, columns are predictions.
plt.figure(figsize=(12, 10))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_labels,
            yticklabels=class_labels,
            cbar_kws={'label': 'Number of Tickets'})
plt.title(f'Confusion Matrix - {best_name}\nOverall Accuracy: {best_acc*100:.2f}%',
          fontsize=14, fontweight='bold')
plt.ylabel('Actual Category', fontsize=12)
plt.xlabel('Predicted Category', fontsize=12)
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()

# Class-wise performance analysis
print("\n" + "="*70, "CLASS-WISE PERFORMANCE ANALYSIS", "="*70, sep="\n")

from sklearn.metrics import precision_recall_fscore_support

# Per-class metrics, requested in the order of best_model.classes_ so each
# array lines up with the 'Category' column built below.
precision, recall, f1, support = precision_recall_fscore_support(
    y_test,
    best_pred,
    labels=best_model.classes_,
)

# One row per category.
performance_df = pd.DataFrame(
    {
        'Category': best_model.classes_,
        'Precision': precision,
        'Recall': recall,
        'F1-Score': f1,
        'Support': support,
    }
)
# Highest F1 first.
performance_df = performance_df.sort_values('F1-Score', ascending=False)

print(performance_df.to_string(index=False))

# Visualize class-wise F1 scores as horizontal bars, one per category, in the
# row order of performance_df.
plt.figure(figsize=(12, 6))
plt.barh(performance_df['Category'], performance_df['F1-Score'], color='green', alpha=0.7)
plt.xlabel('F1-Score', fontsize=12)
plt.ylabel('Category', fontsize=12)
plt.title('F1-Score by Category', fontsize=14, fontweight='bold')
plt.xlim(0, 1)
# Write each score just past the end of its bar; bar i sits at y-position i.
for i, score in enumerate(performance_df['F1-Score']):
    plt.text(score + 0.01, i, f'{score:.3f}', va='center')
plt.tight_layout()
plt.show()

# The check mark was previously emitted as mis-encoded bytes.
print("\n✓ Step 5 Complete: Model fully evaluated")

In [None]:
import joblib

print("="*70)
print("SAVING MODEL FOR DEPLOYMENT")
print("="*70)

# Save category labels alongside the model so they can be loaded on their own.
categories = list(best_model.classes_)

# (display label, object to persist, target filename) for every artifact.
# Keeping the filenames in one place keeps the dump calls and the listing
# below in sync.
artifacts = [
    ('Model', best_model, 'ticket_classifier_model.pkl'),      # best model from Step 4
    ('Vectorizer', vectorizer, 'tfidf_vectorizer.pkl'),        # fitted vectorizer from Step 3
    ('Categories', categories, 'categories.pkl'),
]

for artifact_label, artifact_obj, artifact_file in artifacts:
    joblib.dump(artifact_obj, artifact_file)
    # The check mark was previously emitted as mis-encoded bytes.
    print(f"✓ {artifact_label} saved: {artifact_file}")

print("\n" + "="*70)
print("FILES CREATED - DOWNLOAD THEM NOW")
print("="*70)
for position, (_, _, artifact_file) in enumerate(artifacts, start=1):
    print(f"{position}. {artifact_file}")

In [17]:
# Download the three saved artifact files through Colab's file helper.
from google.colab import files

for artifact_path in (
    'ticket_classifier_model.pkl',
    'tfidf_vectorizer.pkl',
    'categories.pkl',
):
    files.download(artifact_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>