In [8]:
import pandas as pd
from ctgan import CTGAN
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Load data
df = pd.read_csv("dataset.csv")
df = df.drop(columns=["Unnamed: 0"])

# Split features and label
X = df.drop(columns=["label"])
y = df["label"]

# Combine for CTGAN
df_full = X.copy()
df_full["label"] = y

# Separate majority and minority class
minority_class = 1
df_minority = df_full[df_full["label"] == minority_class]
df_majority = df_full[df_full["label"] != minority_class]

# Train CTGAN on minority class
ctgan = CTGAN(epochs=300)
ctgan.fit(df_minority, discrete_columns=["label"])

# Generate synthetic samples
synthetic_data = ctgan.sample(len(df_minority))
synthetic_data = synthetic_data[synthetic_data["label"] == minority_class]

# Combine synthetic and real data
df_balanced = pd.concat([df_majority, df_minority, synthetic_data], ignore_index=True)

# Split into train-test
X_bal = df_balanced.drop(columns=["label"])
y_bal = df_balanced["label"]
X_train, X_test, y_train, y_test = train_test_split(X_bal, y_bal, test_size=0.2, random_state=42)

# Train classifier
rf = RandomForestClassifier()
rf.fit(X_train, y_train)

# Predictions
y_pred = rf.predict(X_test)

# Evaluation
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.99      0.99      0.99     67211
           1       0.99      1.00      1.00       885
           2       1.00      1.00      1.00     10976
           3       0.91      0.89      0.90      6904

    accuracy                           0.98     85976
   macro avg       0.97      0.97      0.97     85976
weighted avg       0.98      0.98      0.98     85976

