In [2]:
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report

# Load datasets
true_events = pd.read_csv(r"F:\eventAI\event-prediction\model\true_event_2.0_final.csv")
false_events = pd.read_csv(r"F:\eventAI\event-prediction\model\false_event_2.0_final.csv")

# Standardize column names
true_events.columns = true_events.columns.str.lower().str.strip()
false_events.columns = false_events.columns.str.lower().str.strip()

# Add labels
true_events["alert created"] = 1  # True events
false_events["alert created"] = 0  # False events

# Combine datasets
data = pd.concat([true_events, false_events], ignore_index=True)

# Select only message key and description as features
selected_features = ["message key", "description"]
X = data[selected_features]  # Features
y = data["alert created"]    # Target variable

# Handle missing values
X = X.fillna("")

# Define preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ("tfidf", TfidfVectorizer(ngram_range=(1, 2)), "description"),  # Apply n-grams (Unigram + Bigram)
        ("tfidf_key", TfidfVectorizer(ngram_range=(1, 2)), "message key")
    ],
    remainder="drop"  # Drop other columns
)

# Define the model
model = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric="logloss")

# Create a pipeline
pipeline = Pipeline(steps=[("preprocessor", preprocessor), ("classifier", model)])

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
print("Training the XGBoost model...")
pipeline.fit(X_train, y_train)

# Make predictions
y_pred = pipeline.predict(X_test)

# Evaluate the model
print("\nModel Evaluation:")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Save the trained model
model_filename = "alert_prediction_xgboost.pkl"
joblib.dump(pipeline, model_filename)
print(f"\nModel saved as {model_filename}")


Training the XGBoost model...


Parameters: { "use_label_encoder" } are not used.




Model Evaluation:
Accuracy: 1.0

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      2952
           1       1.00      1.00      1.00      3048

    accuracy                           1.00      6000
   macro avg       1.00      1.00      1.00      6000
weighted avg       1.00      1.00      1.00      6000


Model saved as alert_prediction_xgboost.pkl


In [1]:
!pip install xgboost




[notice] A new release of pip is available: 24.0 -> 25.0.1
[notice] To update, run: C:\Users\chirag choudhary\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip
