In [3]:
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report

# Load datasets
true_events = pd.read_csv(r"C:\Users\chirag choudhary\event_ai model\true_event_2.0_final.csv")
false_events = pd.read_csv(r"C:\Users\chirag choudhary\event_ai model\false_event_2.0_final.csv")

# Standardize column names
true_events.columns = true_events.columns.str.lower().str.strip()
false_events.columns = false_events.columns.str.lower().str.strip()

# Add labels
true_events["alert created"] = 1  # True events
false_events["alert created"] = 0  # False events

# Combine datasets
data = pd.concat([true_events, false_events], ignore_index=True)

# Select only message key and description as features
selected_features = ["message key", "description"]
X = data[selected_features]  # Features
y = data["alert created"]    # Target variable

# Handle missing values
X = X.fillna("")

# Define preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ("tfidf", TfidfVectorizer(ngram_range=(1, 2)), "description"),  # Apply n-grams (Unigram + Bigram)
        ("tfidf_key", TfidfVectorizer(ngram_range=(1, 2)), "message key")
    ],
    remainder="drop"  # Drop other columns
)

# Define the model
model = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric="logloss")

# Create a pipeline
pipeline = Pipeline(steps=[("preprocessor", preprocessor), ("classifier", model)])

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
print("Training the XGBoost model...")
pipeline.fit(X_train, y_train)

# Make predictions
y_pred = pipeline.predict(X_test)

# Evaluate the model
print("\nModel Evaluation:")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Save the trained model
model_filename = "alert_prediction_xgboost.pkl"
joblib.dump(pipeline, model_filename)
print(f"\nModel saved as {model_filename}")


Training the XGBoost model...


Parameters: { "use_label_encoder" } are not used.




Model Evaluation:
Accuracy: 1.0

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      2952
           1       1.00      1.00      1.00      3048

    accuracy                           1.00      6000
   macro avg       1.00      1.00      1.00      6000
weighted avg       1.00      1.00      1.00      6000


Model saved as alert_prediction_xgboost.pkl


In [2]:
!pip install xgboost


Collecting xgboost
  Downloading xgboost-2.1.4-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-2.1.4-py3-none-win_amd64.whl (124.9 MB)
   ---------------------------------------- 0.0/124.9 MB ? eta -:--:--
   ---------------------------------------- 0.0/124.9 MB 660.6 kB/s eta 0:03:10
   ---------------------------------------- 0.0/124.9 MB 326.8 kB/s eta 0:06:23
   ---------------------------------------- 0.1/124.9 MB 573.4 kB/s eta 0:03:38
   ---------------------------------------- 0.2/124.9 MB 1.1 MB/s eta 0:01:51
   ---------------------------------------- 0.2/124.9 MB 1.1 MB/s eta 0:01:51
   ---------------------------------------- 0.2/124.9 MB 1.1 MB/s eta 0:01:51
   ---------------------------------------- 0.3/124.9 MB 948.8 kB/s eta 0:02:12
   ---------------------------------------- 0.3/124.9 MB 948.8 kB/s eta 0:02:12
   ---------------------------------------- 0.3/124.9 MB 948.8 kB/s eta 0:02:12
   ---------------------------------------- 0.3/124.9 MB 948.8 kB/s


[notice] A new release of pip is available: 24.0 -> 25.0.1
[notice] To update, run: C:\Users\chirag choudhary\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [8]:
import joblib
import pandas as pd

# Load the saved model
model = joblib.load("alert_prediction_xgboost.pkl")

# Define test cases
test_cases = [
    {
        "message key": "High Queue Length - Process or network queues are excessively long, delaying operations. Azure Cloud , 10.0.0.20 prdalbllb Node status is Faulty. default prdbmswmid01-vnet",
        "description": "High Database issue is generated by the server "
    },
    {
        "message key": "Thread count marginally elevated, keeping under observation. - system_load_near_average, 10.0.0.235 alb-monitor06 Node status is Under Monitoring.",
        "description": "thread count elevated under observation "
    }
]

# Convert to DataFrame
test_df = pd.DataFrame(test_cases)

# Make predictions
predictions = model.predict(test_df)
prediction_probabilities = model.predict_proba(test_df)

# Print results
print("Alert Prediction Results:\n")
for i, (case, prediction, probabilities) in enumerate(zip(test_cases, predictions, prediction_probabilities)):
    print(f"Test Case {i+1}:")
    print(f"Message Key: {case['message key']}")
    print(f"Description: {case['description']}")
    print(f"Predicted Alert Created: {'True' if prediction == 1 else 'False'}")
    print("Confidence Scores:")
    print(f"- False Alert: {probabilities[0]:.2%}")
    print(f"- True Alert: {probabilities[1]:.2%}")
    print("-" * 80 + "\n")

# ✅ Function properly indented and structured
def test_new_case(message_key, description):
    new_case = pd.DataFrame({
        "message key": [message_key],
        "description": [description]
    })
    prediction = model.predict(new_case)
    probabilities = model.predict_proba(new_case)

    print("\nNew Test Case Results:")
    print(f"Message Key: {message_key}")
    print(f"Description: {description}")
    print(f"Predicted Alert Created: {'True' if prediction[0] == 1 else 'False'}")
    print("Confidence Scores:")
    print(f"- False Alert: {probabilities[0][0]:.2%}")
    print(f"- True Alert: {probabilities[0][1]:.2%}")

# Example usage:
# test_new_case("Your message key here", "Your description here")


Alert Prediction Results:

Test Case 1:
Message Key: High Queue Length - Process or network queues are excessively long, delaying operations. Azure Cloud , 10.0.0.20 prdalbllb Node status is Faulty. default prdbmswmid01-vnet
Description: High Database issue is generated by the server 
Predicted Alert Created: True
Confidence Scores:
- False Alert: 0.01%
- True Alert: 99.99%
--------------------------------------------------------------------------------

Test Case 2:
Message Key: Thread count marginally elevated, keeping under observation. - system_load_near_average, 10.0.0.235 alb-monitor06 Node status is Under Monitoring.
Description: thread count elevated under observation 
Predicted Alert Created: False
Confidence Scores:
- False Alert: 99.99%
- True Alert: 0.01%
--------------------------------------------------------------------------------

