In [3]:
import pandas as pd
from imblearn.over_sampling import SMOTE, SMOTEN, SMOTENC
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [4]:
# Read the input CSV into `df` and show its first rows as the cell output.
input_csv = 'clean_df_3_single_weapon.csv'
df = pd.read_csv(input_csv)
df.head()

Unnamed: 0,Agency Type,Victim Sex,Victim Age,Victim Ethnicity,Perpetrator Sex,Perpetrator Ethnicity,Weapon Category,Relationship Category,Region,Season
0,Other Police,Female,26,Not Hispanic,Male,Not Hispanic,Non-Firearm,Lover,West,Autumn
1,Sheriff,Male,23,Not Hispanic,Male,Not Hispanic,Firearm,Acquaintance,South,Summer
2,Sheriff,Male,42,Not Hispanic,Female,Not Hispanic,Firearm,Lover,South,Summer
3,Sheriff,Male,33,Not Hispanic,Male,Not Hispanic,Firearm,Acquaintance,South,Summer
4,Sheriff,Male,46,Not Hispanic,Male,Not Hispanic,Firearm,Family,South,Autumn


In [5]:
# Rebalance the classes of `target_column` by oversampling, then save both the
# untouched input frame and the rebalanced frame as CSV files.
#
# Every encoded column gets its OWN LabelEncoder. Re-fitting a single encoder
# in a loop leaves it holding only the classes of the last column, so decoding
# with it turns every other column (target included) into wrong labels or NaN.

# Target variable
target_column = 'Weapon Category'

# Columns pandas auto-named "Unnamed: N" are a row index written by an earlier
# to_csv call. They are not real features and would distort the
# nearest-neighbour search used for oversampling, so they are excluded from X.
index_artifacts = [col for col in df.columns if str(col).startswith('Unnamed:')]
feature_columns = [
    col for col in df.columns
    if col != target_column and col not in index_artifacts
]

# Only non-numeric features are label-encoded; numeric features keep their
# real values so synthetic rows interpolate meaningful numbers.
categorical_columns = (
    df[feature_columns].select_dtypes(exclude='number').columns.tolist()
)

# Encode categorical features and the target, one encoder per column
encoders = {}
encoded_df = df.copy()
for col in categorical_columns + [target_column]:
    encoders[col] = LabelEncoder()
    encoded_df[col] = encoders[col].fit_transform(df[col])

# `encoder` now refers to the target's encoder; the name is kept in case other
# cells still reference it.
encoder = encoders[target_column]

# Features and target split
X = encoded_df[feature_columns]
y = encoded_df[target_column]

# Pick the SMOTE variant that matches the feature types. Plain SMOTE
# interpolates between integer category codes, which invents rows whose codes
# point at unrelated categories; SMOTENC / SMOTEN pick categories by
# neighbourhood vote instead.
categorical_positions = [X.columns.get_loc(col) for col in categorical_columns]
if not categorical_positions:
    smote = SMOTE(random_state=42)
elif len(categorical_positions) == X.shape[1]:
    # SMOTENC rejects inputs with no continuous feature at all.
    smote = SMOTEN(random_state=42)
else:
    smote = SMOTENC(categorical_features=categorical_positions, random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Map target labels back to original categories
target_mapping = dict(enumerate(encoder.classes_))
resampled_df = pd.DataFrame(X_resampled, columns=X.columns)
resampled_df[target_column] = encoder.inverse_transform(y_resampled)

# Decode categorical features back to original values, each with the encoder
# that was fitted on that column
for col in categorical_columns:
    codes = resampled_df[col].round().astype(int)
    resampled_df[col] = encoders[col].inverse_transform(codes)

# Sanity check: decoding must only ever produce labels present in the input.
assert resampled_df[target_column].isin(df[target_column].unique()).all(), \
    "decoded target contains labels that are not in the original data"

# Save datasets
original_file = "original_dataset.csv"
resampled_file = "resampled_dataset.csv"

df.to_csv(original_file, index=False)
resampled_df.to_csv(resampled_file, index=False)

print(f"Original dataset saved as: {original_file}")
print(f"Resampled dataset (with SMOTE) saved as: {resampled_file}")

Original dataset saved as: original_dataset.csv
Resampled dataset (with SMOTE) saved as: resampled_dataset.csv
