In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from imblearn.over_sampling import SMOTE

# Load data
df = pd.read_csv("dataset.csv")

# Separate features from the target; "Unnamed: 0" and "label" are not used as features
X = df.drop(columns=["Unnamed: 0", "label"])
y = df["label"]

# Show class distribution before resampling
print("Class distribution before resampling:\n", y.value_counts())

# Train-test split FIRST, on the raw features, so that every fitted
# preprocessing step below only ever sees training rows. The split is
# stratified on the label so both parts keep the same class proportions.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Standardize features: learn mean/variance from the training split only,
# then apply that same transform to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_raw)
X_test_scaled = scaler.transform(X_test_raw)

# Apply PCA to reduce dimensionality: the components are learned from the
# training split only and then used to project the test split.
pca = PCA(n_components=0.95, random_state=42)  # Retain 95% variance
X_train = pca.fit_transform(X_train_scaled)
X_test = pca.transform(X_test_scaled)

# Full-dataset projections, kept under their original names for any later
# cells that reference them. They are produced with the train-fitted
# scaler/PCA and must not be used to fit or evaluate a model.
X_scaled = scaler.transform(X)
X_pca = pca.transform(X_scaled)
print(f"PCA reduced features from {X.shape[1]} to {X_pca.shape[1]}")

# Apply SMOTE to the training split only; the test split keeps its
# original class distribution so the evaluation is on real samples.
smote = SMOTE(random_state=42, k_neighbors=3)
X_res, y_res = smote.fit_resample(X_train, y_train)

# Show class distribution after resampling
print("Class distribution after resampling:\n", pd.Series(y_res).value_counts())

# Train Random Forest Classifier on the resampled training data
rf = RandomForestClassifier(random_state=42)
rf.fit(X_res, y_res)

# Predictions on the untouched test split
y_pred = rf.predict(X_test)

# Evaluation
print("Classification Report:\n", classification_report(y_test, y_pred))


Class distribution before resampling:
 label
0    335847
2     55390
3     34134
1      2254
Name: count, dtype: int64
PCA reduced features from 23 to 11
Class distribution after resampling:
 label
3    268678
0    268678
2    268678
1    268678
Name: count, dtype: int64
Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99     67169
           1       0.94      0.93      0.93       451
           2       1.00      1.00      1.00     11078
           3       0.89      0.90      0.90      6827

    accuracy                           0.98     85525
   macro avg       0.95      0.96      0.95     85525
weighted avg       0.98      0.98      0.98     85525

