In [1]:
import pandas as pd
from ctgan import CTGAN
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier

# Load data
df = pd.read_csv("dataset.csv")
X = df.drop(columns=["Unnamed: 0", "label"])
y = df["label"]

# Combine for CTGAN
data = X.copy()
data["label"] = y

# Focus on minority class generation
minority_class = 1
minority_data = data[data["label"] == minority_class]

# Train CTGAN
ctgan = CTGAN(epochs=100)
ctgan.fit(minority_data, discrete_columns=["label"])

# Generate synthetic samples
synthetic_data = ctgan.sample(len(minority_data) * 10)
synthetic_data = synthetic_data[synthetic_data["label"] == minority_class]

# Combine synthetic with original data
augmented_data = pd.concat([data, synthetic_data], ignore_index=True)

# Train/test split
X_aug = augmented_data.drop(columns=["label"])
y_aug = augmented_data["label"]

scaler = StandardScaler()
X_aug_scaled = scaler.fit_transform(X_aug)

X_train, X_test, y_train, y_test = train_test_split(X_aug_scaled, y_aug, test_size=0.3, stratify=y_aug, random_state=42)

# Classifier
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Report
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.99      0.99      0.99    100755
           1       1.00      1.00      1.00      7438
           2       1.00      1.00      1.00     16617
           3       0.92      0.90      0.91     10240

    accuracy                           0.99    135050
   macro avg       0.98      0.97      0.97    135050
weighted avg       0.99      0.99      0.99    135050

