In [None]:
# Experiment: oversampling the minority classes of an imbalanced dataset with SMOTE.
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

In [None]:
# Load the combined dataset; the relative path is resolved against the
# notebook's working directory.
df = pd.read_csv(filepath_or_buffer=r'combined_dataset.csv')

In [None]:
# Show the first five rows (head's default) as a quick sanity check of the load.
print(df.head(n=5))

In [None]:
# Discard the two right-most columns by position, then preview the result.
# NOTE(review): `df` is rebound from itself, so re-running this cell strips two
# more columns each time -- restart and run from the top when iterating.
n_trailing_columns = 2
df = df.iloc[:, :-n_trailing_columns]
print(df.head(n=5))

In [5]:
# Encode the values of the ' Label' column as integer class codes.
# The column name starts with a space; it is used verbatim everywhere below.
encoder = LabelEncoder()
label_codes = encoder.fit_transform(df[' Label'])
df[' Label'] = label_codes

In [None]:
# Row count per encoded class, most frequent first.
label_counts = df[' Label'].value_counts()
print(label_counts)

In [7]:
# Replace missing values first, then +/- infinity, with 0.
df = df.fillna(0).replace([np.inf, -np.inf], 0)

In [8]:
# Cast every column to the integer dtype.
# NOTE(review): if any feature column holds non-integer floats, this cast
# discards the fractional part -- confirm that is intended.
df = df.astype(dtype=int)

In [None]:
from sklearn.utils.class_weight import compute_class_weight

# Separate the feature matrix from the encoded target column.
X = df.drop(columns=[' Label'])
y = df[' Label']

# Calculate 'balanced' class weights.
# compute_class_weight returns one weight per entry of `classes`, in that same
# order, so each weight has to be paired with its label. Pairing weights with
# positions (enumerate) is only correct when the labels happen to be 0..n-1 in
# exactly that order, which the first-appearance order of y.unique() does not
# guarantee. Using the sorted unique labels and zipping them with the weights
# makes the mapping correct regardless of row order.
classes = np.unique(y)
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=classes,
    y=y
)
# Plain Python int/float keys and values keep the dict easy to print and to
# hand to APIs that expect {class_index: weight}.
class_weights_dict = {
    int(label): float(weight) for label, weight in zip(classes, class_weights)
}

print("Class Weights:", class_weights_dict)


In [None]:
# Print the per-class row counts again, right before the preprocessing steps.
print(df.loc[:, ' Label'].value_counts())

In [10]:
# Standardise the feature columns with StandardScaler.
# NOTE(review): the scaler is fitted on all rows of X, i.e. before the
# train/test split further down, so held-out rows contribute to its statistics.
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

In [11]:
# Fill any NaNs present in the scaled matrix with the column mean.
imputer = SimpleImputer(strategy='mean')
imputer.fit(X_scaled)
X_imputed = imputer.transform(X_scaled)

In [None]:
# Keep the k features that score highest on the f_classif test against the label.
k = 10
k_best = SelectKBest(f_classif, k=k)
k_best.fit(X_imputed, y)
X_new = k_best.transform(X_imputed)

In [None]:
# Map the selector's boolean support mask back onto the original column names.
selected_features_mask = k_best.get_support()
selected_feature_names = X.columns[selected_features_mask]
print("Selected Features:", selected_feature_names, sep="\n")

In [None]:
# Rebuild a labelled frame from the selected features. The target is attached
# as 'Label' (no leading space, unlike the ' Label' column of `df`) and is
# aligned to the new frame by index.
df_new = pd.DataFrame(X_new, columns=selected_feature_names).assign(Label=y)
print(df_new.head(n=5))

In [None]:
from imblearn.over_sampling import SMOTE

# Hold out 30% of the rows for testing.
# - Split X_new (the scaled, k-best feature matrix) rather than the raw X, so
#   the preprocessing done above actually feeds SMOTE and the network.
# - stratify=y keeps every class at the same proportion in both partitions,
#   which matters on an imbalanced target: a purely random split can leave a
#   rare class almost absent from one side.
# train_test_split is already imported in the notebook's first cell.
# NOTE(review): the scaler and feature selector were fitted on the full dataset
# before this split, so the test rows are not strictly unseen; fit them on the
# training partition only if a leakage-free estimate is required.
X_train, X_test, y_train, y_test = train_test_split(
    X_new, y, test_size=0.3, random_state=42, stratify=y
)

# Use SMOTE to oversample the minority classes. Only the training partition is
# resampled; the test partition keeps its original class distribution.
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)

print("Balanced Class Distribution:")
print(pd.Series(y_train_balanced).value_counts())

In [None]:
# Report the dimensions of the train/test partitions.
print(
    "Shapes:",
    f"X_train: {X_train.shape}, X_test: {X_test.shape}",
    f"y_train: {y_train.shape}, y_test: {y_test.shape}",
    sep="\n",
)

In [None]:
# Start from an empty Sequential model; the cells below append its layers.
# NOTE(review): re-running the layer cells without re-running this one stacks
# duplicate layers onto the same model.
ann = Sequential(layers=None)

In [18]:
# First hidden layer: 16 ReLU units, input width taken from the training matrix.
n_input_features = X_train.shape[1]
ann.add(Dense(16, activation='relu', input_dim=n_input_features))

In [19]:
# Two 8-unit ReLU layers with a 20% dropout layer between them.
for hidden_layer in (
    Dense(8, activation='relu'),
    Dropout(rate=0.2),
    Dense(8, activation='relu'),
):
    ann.add(hidden_layer)

In [20]:
# Output layer for binary classification: a single sigmoid unit.
# NOTE(review): assumes the encoded label has exactly two classes -- confirm
# against the class counts printed earlier.
output_layer = Dense(1, activation='sigmoid')
ann.add(output_layer)

In [None]:
# Configure training: Adam optimiser, binary cross-entropy loss, accuracy metric.
ann.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [22]:
# Stop training once validation loss has not improved for 5 epochs, and roll
# the model back to the weights of its best epoch.
early_stopping = EarlyStopping(
    monitor='val_loss',
    restore_best_weights=True,
    patience=5,
)


In [None]:
# Train on the SMOTE-balanced training set. Fitting on X_train / y_train would
# silently discard the oversampling done above and train on the original,
# imbalanced class distribution.
# NOTE(review): the test partition doubles as the validation set that drives
# early stopping and best-weight restoration, so the metrics reported on it
# afterwards are optimistic; a separate validation split would avoid that.
history = ann.fit(
    X_train_balanced, y_train_balanced,
    batch_size=32,
    epochs=20,
    validation_data=(X_test, y_test),
    callbacks=[early_stopping],
    verbose=1
)

In [None]:
# Model outputs on the test partition, then hard 0/1 predictions obtained by
# thresholding those outputs at 0.5 (strictly greater counts as 1).
y_pred_prob = ann.predict(X_test)
above_threshold = y_pred_prob > 0.5
y_pred = above_threshold.astype(int)


In [None]:
# Per-class precision / recall / F1 computed from the thresholded predictions.
print("Classification Report:")
report_text = classification_report(y_test, y_pred)
print(report_text)

# Counts of true vs. predicted classes.
print("Confusion Matrix:")
conf_matrix = confusion_matrix(y_test, y_pred)
print(conf_matrix)

# ROC-AUC is computed from the raw model outputs, not the thresholded labels.
roc_auc = roc_auc_score(y_test, y_pred_prob)
print(f"ROC-AUC Score: {roc_auc}")

In [None]:
# Persist the trained network to an .h5 file in the working directory.
ann.save(filepath=r'trained_ann_model.h5')
print("Model saved successfully.")