In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder

# Read the ticket spreadsheet into a DataFrame (first sheet by default)
file_path = 'path_to_your_excel_file.xlsx'
df = pd.read_excel(io=file_path)

# Keyword rules used to label tickets. A ticket receives a category only when
# EVERY keyword in that category's list occurs in its lower-cased description.
# Order matters: label_data returns the FIRST matching category, so a rule
# whose keyword contains another rule's keyword as a substring must be listed
# before it ('memory/swap' and 'out of memory' both contain 'memory').
categories = {
    'Hadoop': ['hadoop'],
    'Firewall': ['firewall'],
    # NOTE(review): requires both 'read' AND 'write' to be present -- confirm
    # that "either" was not intended.
    'Access': ['read', 'write'],
    'Alert - Space': ['cloudview', 'space'],
    'Alert - Memory/Swap': ['cloudview', 'memory/swap'],
    'Alert - Out of Memory': ['cloudview', 'out of memory'],
    'Alert - Memory': ['cloudview', 'memory'],
}


def label_data(description):
    """Return the first category whose keywords all occur in *description*.

    Matching is a case-insensitive substring test against the rules in
    ``categories``, tried in insertion order.

    Args:
        description: Ticket text. Non-string values (for example the NaN that
            pandas uses for an empty Excel cell) cannot match any keyword.

    Returns:
        The matching category name, or 'Unclassified' when no rule matches or
        *description* is not a string.
    """
    # Blank cells arrive as float NaN; calling .lower() on them would raise.
    if not isinstance(description, str):
        return 'Unclassified'
    text = description.lower()
    for category, keywords in categories.items():
        if all(keyword in text for keyword in keywords):
            return category
    return 'Unclassified'

# Label every ticket with the keyword rules. Blank cells come back from
# read_excel as NaN (a float), so labelling runs on a string view of the
# column; the stored descriptions themselves are left untouched.
descriptions = df['ticket description'].fillna('').astype(str)
df['Category'] = descriptions.apply(label_data)

# Keep only the tickets that matched a rule. .copy() makes the result an
# independent frame, so adding 'Predicted Category' below does not trigger
# pandas' SettingWithCopyWarning.
df = df[df['Category'] != 'Unclassified'].copy()

# train_test_split cannot split fewer than two rows; fail with a message that
# points at the real cause instead of its generic one.
if len(df) < 2:
    raise ValueError(
        "Need at least 2 tickets matching a keyword category to train a model, "
        f"found {len(df)}"
    )

# Features are the raw descriptions, targets the keyword-derived categories
X = df['ticket description']
y = df['Category']

# Encode labels as integers; classes_ keeps the integer -> name mapping
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Hold out 20% of the labelled tickets for evaluation (fixed seed for
# reproducible splits)
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# Create a text processing and classification pipeline
model = make_pipeline(TfidfVectorizer(), MultinomialNB())

# Train the model
model.fit(X_train, y_train)

# Make predictions on the held-out tickets
y_pred = model.predict(X_test)

# Output the classification report. labels is passed explicitly because a
# small test split may not contain every class, and classification_report
# raises when target_names is longer than the set of classes it observes.
class_ids = list(range(len(label_encoder.classes_)))
print(classification_report(y_test, y_pred, labels=class_ids, target_names=label_encoder.classes_))

# Predict a category for every labelled ticket (training and test rows alike).
# NOTE(review): rows labelled 'Unclassified' were dropped above, so they never
# receive a prediction -- confirm that is intended.
df['Predicted Category'] = label_encoder.inverse_transform(model.predict(X))

# Save the results to a new Excel file
output_file_path = 'classified_tickets.xlsx'
df.to_excel(output_file_path, index=False)

print(f"Classification completed. The results are saved in '{output_file_path}'")
