In [33]:
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

import pandas as pd
from sklearn.preprocessing import LabelEncoder


data = pd.read_csv('master_data_clean.csv').drop(columns=['Agency_Type_grouped'])

label_encoders = {}
for col in data.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    label_encoders[col] = le

# Separate features and target
target_column = 'Weapon Category'
X = data.drop(columns=[target_column])
y = data[target_column]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

scale_pos_weight = len(y_train[y_train == 0]) / len(y_train[y_train == 1])

# Initialize and train the XGBoost classifier
xgb_model = XGBClassifier(eval_metric='aucpr', random_state=42, scale_pos_weight=scale_pos_weight)
xgb_model.fit(X_train, y_train)

# Make predictions
y_pred = xgb_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy:.2f}")
print("\nClassification Report:\n")
print(report)



Accuracy: 0.66

Classification Report:

              precision    recall  f1-score   support

           0       0.78      0.68      0.73     30901
           1       0.48      0.60      0.53     15159

    accuracy                           0.66     46060
   macro avg       0.63      0.64      0.63     46060
weighted avg       0.68      0.66      0.66     46060



In [29]:
confusion_matrix(y_test, y_pred)

array([[28391,  2510],
       [10859,  4300]])

In [34]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

xgb_model = XGBClassifier(eval_metric='aucpr', random_state=42)
# Step 3: Train the XGBoost classifier
xgb_model.fit(X_train_resampled, y_train_resampled)

# Step 4: Evaluate on the test set
y_pred = xgb_model.predict(X_test)
# Evaluate the model's performance
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

# Display the classification report
print(classification_report(y_test, y_pred))


Accuracy: 0.6543421623968736
              precision    recall  f1-score   support

           0       0.77      0.69      0.73     30901
           1       0.48      0.58      0.53     15159

    accuracy                           0.65     46060
   macro avg       0.63      0.64      0.63     46060
weighted avg       0.68      0.65      0.66     46060



In [32]:
confusion_matrix(y_test, y_pred)

array([[21309,  9592],
       [ 6329,  8830]])

In [35]:
from imblearn.over_sampling import ADASYN

adasyn = ADASYN(sampling_strategy='auto', random_state=42)
X_train_resampled, y_train_resampled = adasyn.fit_resample(X_train, y_train)

xgb_model = XGBClassifier(eval_metric='aucpr', random_state=42)
# Step 3: Train the XGBoost classifier
xgb_model.fit(X_train_resampled, y_train_resampled)

# Step 4: Evaluate on the test set
y_pred = xgb_model.predict(X_test)
# Evaluate the model's performance
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

# Display the classification report
print(classification_report(y_test, y_pred))

Accuracy: 0.6542770299609205
              precision    recall  f1-score   support

           0       0.77      0.69      0.73     30901
           1       0.48      0.58      0.52     15159

    accuracy                           0.65     46060
   macro avg       0.62      0.64      0.63     46060
weighted avg       0.67      0.65      0.66     46060

