In [1]:
# use autokeras to find a model for the sonar dataset
from numpy import asarray
from pandas import read_csv
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from autokeras import StructuredDataClassifier


In [2]:
import os
import pandas as pd
from scipy.io import arff


# Path to the directory containing your CSV files
directory_path = '/home/danish/Datasets/ICS/multiclass'

# Get a list of all CSV files in the directory
csv_files = [file for file in os.listdir(directory_path) if file.endswith('.arff')]

# Initialize an empty DataFrame to store the combined data
combined_df = pd.DataFrame()

# Loop through each CSV file and concatenate its data to the combined DataFrame
for csv_file in csv_files:
    file_path = os.path.join(directory_path, csv_file)
    df , _ = arff.loadarff(file_path)
    df = pd.DataFrame(df)
    combined_df = pd.concat([combined_df, df], ignore_index=True)


# Display the combined DataFrame
data = combined_df # .sample(frac=0.5, random_state=42)
X = data.drop('marker', axis=1)
y = data['marker'] 

# Optionally, you can save the combined DataFrame to a new CSV file
# combined_df.to_csv('/path/to/combined_data.csv', index=False)

In [3]:

# # split into input and output elements
# data = dataframe.values
# X, y = data[:, :-1], data[:, -1]
print(X.shape, y.shape)


(78377, 128) (78377,)


In [4]:
# basic data preparation
X = X.astype('float32')
y = LabelEncoder().fit_transform(y)


In [5]:
# separate into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)


(52512, 128) (25865, 128) (52512,) (25865,)


In [6]:
# define the search
search = StructuredDataClassifier(max_trials=50)



INFO:tensorflow:Reloading Oracle from existing project ./structured_data_classifier/oracle.json
INFO:tensorflow:Reloading Tuner from ./structured_data_classifier/tuner0.json


In [7]:
# perform the search
search.fit(x=X_train, y=y_train, verbose=1)


INFO:tensorflow:Oracle triggered exit
Epoch 1/2
Epoch 2/2
INFO:tensorflow:Assets written to: ./structured_data_classifier/best_model/assets


<tensorflow.python.keras.callbacks.History at 0x7ff90ed94588>

In [8]:
# evaluate the model
loss, acc = search.evaluate(X_test, y_test, verbose=0)
print('Accuracy: %.3f' % acc)


Accuracy: 0.022


In [9]:
# get the best performing model
model = search.export_model()


In [10]:
# summarize the loaded model
model.summary()


Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 128)]             0         
_________________________________________________________________
multi_category_encoding (Mul (None, 128)               0         
_________________________________________________________________
normalization (Normalization (None, 128)               257       
_________________________________________________________________
dense (Dense)                (None, 32)                4128      
_________________________________________________________________
re_lu (ReLU)                 (None, 32)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 1024)              33792     
_________________________________________________________________
re_lu_1 (ReLU)               (None, 1024)              0     

In [11]:
# save the best performing model to file
model.save('/home/danish/Codes/Auto_Keras_Dataset_3.h5')

NotImplementedError: Save or restore weights that is not an instance of `tf.Variable` is not supported in h5, use `save_format='tf'` instead. Got a model or layer MultiCategoryEncoding with weights [<tensorflow.python.keras.engine.base_layer_utils.TrackableWeightHandler object at 0x7ff8677f9d68>, <tensorflow.python.keras.engine.base_layer_utils.TrackableWeightHandler object at 0x7ff8677fc358>, <tensorflow.python.keras.engine.base_layer_utils.TrackableWeightHandler object at 0x7ff8677fc908>, <tensorflow.python.keras.engine.base_layer_utils.TrackableWeightHandler object at 0x7ff8677fceb8>, <tensorflow.python.keras.engine.base_layer_utils.TrackableWeightHandler object at 0x7ff8677fa4a8>, <tensorflow.python.keras.engine.base_layer_utils.TrackableWeightHandler object at 0x7ff8677faa90>, <tensorflow.python.keras.engine.base_layer_utils.TrackableWeightHandler object at 0x7ff8677800b8>, <tensorflow.python.keras.engine.base_layer_utils.TrackableWeightHandler object at 0x7ff8677806a0>, <tensorflow.python.keras.engine.base_layer_utils.TrackableWeightHandler object at 0x7ff867780c88>, <tensorflow.python.keras.engine.base_layer_utils.TrackableWeightHandler object at 0x7ff8677832b0>, <tensorflow.python.keras.engine.base_layer_utils.TrackableWeightHandler object at 0x7ff867783898>, <tensorflow.python.keras.engine.base_layer_utils.TrackableWeightHandler object at 0x7ff867783e80>, <tensorflow.python.keras.engine.base_layer_utils.TrackableWeightHandler object at 0x7ff8677874a8>, <tensorflow.python.keras.engine.base_layer_utils.TrackableWeightHandler object at 0x7ff867787a90>, <tensorflow.python.keras.engine.base_layer_utils.TrackableWeightHandler object at 0x7ff8677880b8>, <tensorflow.python.keras.engine.base_layer_utils.TrackableWeightHandler object at 0x7ff8677886a0>, <tensorflow.python.keras.engine.base_layer_utils.TrackableWeightHandler object at 0x7ff867788c88>, <tensorflow.python.keras.engine.base_layer_utils.TrackableWeightHandler object at 0x7ff8677942b0>, <tensorflow.python.keras.engine.base_layer_utils.TrackableWeightHandler object at 0x7ff867794898>, <tensorflow.python.keras.engine.base_layer_utils.TrackableWeightHandler object at 0x7ff867794e80>, <tensorflow.python.keras.engine.base_layer_utils.TrackableWeightHandler object at 0x7ff8677994a8>, <tensorflow.python.keras.engine.base_layer_utils.TrackableWeightHandler object at 0x7ff867799a90>, <tensorflow.python.keras.engine.base_layer_utils.TrackableWeightHandler object at 0x7ff8677a50b8>, <tensorflow.python.keras.engine.base_layer_utils.TrackableWeightHandler object at 0x7ff8677a56a0>, <tensorflow.python.keras.engine.base_layer_utils.TrackableWeightHandler object at 0x7ff8677a5c88>, <tensorflow.python.keras.engine.base_layer_utils.TrackableWeightHandler object at 0x7ff8677ad2b0>, <tensorflow.python.keras.engine.base_layer_utils.TrackableWeightHandler object at 0x7ff8677ad898>, <tensorflow.python.keras.engine.base_layer_utils.TrackableWeightHandler object at 0x7ff8677ade80>, <tensorflow.python.keras.engine.base_layer_utils.TrackableWeightHandler object at 0x7ff8677b24a8>, <tensorflow.python.keras.engine.base_layer_utils.TrackableWeightHandler object at 0x7ff8677b2a90>, <tensorflow.python.keras.engine.base_layer_utils.TrackableWeightHandler object at 0x7ff8677bb0b8>, <tensorflow.python.keras.engine.base_layer_utils.TrackableWeightHandler object at 0x7ff8677bb6a0>, <tensorflow.python.keras.engine.base_layer_utils.TrackableWeightHandler object at 0x7ff8677bbc88>, <tensorflow.python.keras.engine.base_layer_utils.TrackableWeightHandler object at 0x7ff8677452b0>, <tensorflow.python.keras.engine.base_layer_utils.TrackableWeightHandler object at 0x7ff867745898>, <tensorflow.python.keras.engine.base_layer_utils.TrackableWeightHandler object at 0x7ff867745e80>, <tensorflow.python.keras.engine.base_layer_utils.TrackableWeightHandler object at 0x7ff86774b4a8>, <tensorflow.python.keras.engine.base_layer_utils.TrackableWeightHandler object at 0x7ff86774ba90>, <tensorflow.python.keras.engine.base_layer_utils.TrackableWeightHandler object at 0x7ff8677500b8>, <tensorflow.python.keras.engine.base_layer_utils.TrackableWeightHandler object at 0x7ff8677506a0>, <tensorflow.python.keras.engine.base_layer_utils.TrackableWeightHandler object at 0x7ff867750c88>, <tensorflow.python.keras.engine.base_layer_utils.TrackableWeightHandler object at 0x7ff86775a2b0>, <tensorflow.python.keras.engine.base_layer_utils.TrackableWeightHandler object at 0x7ff86775a898>, <tensorflow.python.keras.engine.base_layer_utils.TrackableWeightHandler object at 0x7ff86775ae80>, <tensorflow.python.keras.engine.base_layer_utils.TrackableWeightHandler object at 0x7ff8677614a8>, <tensorflow.python.keras.engine.base_layer_utils.TrackableWeightHandler object at 0x7ff867761a90>, <tensorflow.python.keras.engine.base_layer_utils.TrackableWeightHandler object at 0x7ff8677680b8>, <tensorflow.python.keras.engine.base_layer_utils.TrackableWeightHandler object at 0x7ff8677686a0>, <tensorflow.python.keras.engine.base_layer_utils.TrackableWeightHandler object at 0x7ff867768c88>, <tensorflow.python.keras.engine.base_layer_utils.TrackableWeightHandler object at 0x7ff8677712b0>, <tensorflow.python.keras.engine.base_layer_utils.TrackableWeightHandler object at 0x7ff867771898>, <tensorflow.python.keras.engine.base_layer_utils.TrackableWeightHandler object at 0x7ff867771e80>]

In [None]:
# pprint(automl.show_models(), indent=4)


In [None]:
predictions = automl.predict(X_test)
print("Accuracy score:", sklearn.metrics.accuracy_score(y_test, predictions))

In [None]:
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=1, stratify=y)

# skf = StratifiedKFold(n_splits=5)
  
# clf = AutoSklearnClassifier(time_left_for_this_task=600,
#                             max_models_on_disc=5,
#                             memory_limit = 102400,
#                             resampling_strategy=skf,
#                             ensemble_size = 3,
#                             metric = average_precision,
#                             scoring_functions=[roc_auc, average_precision, accuracy, f1, precision, recall, log_loss])    

# clf.fit(X = X_train, y = y_train)

In [None]:
import os
import pandas as pd

# Path to the directory containing your CSV files
directory_path = '/home/danish/Datasets/ICS/binaryAllNaturalPlusNormalVsAttacks'
# Get a list of all CSV files in the directory
csv_files = [file for file in os.listdir(directory_path) if file.endswith('.csv')]

# Initialize an empty DataFrame to store the combined data
combined_df = pd.DataFrame()

# Loop through each CSV file and concatenate its data to the combined DataFrame
for csv_file in csv_files:
    file_path = os.path.join(directory_path, csv_file)
    df = pd.read_csv(file_path).dropna()
    combined_df = pd.concat([combined_df, df], ignore_index=True)

# Display the combined DataFrame
data = combined_df # .sample(frac=0.5, random_state=42)
data.shape
# Optionally, you can save the combined DataFrame to a new CSV file
# combined_df.to_csv('/path/to/combined_data.csv', index=False)

In [None]:
# List of bad or constant columns 
columns_to_drop = ['R3-PA:Z', 'R1-PA:Z', 'R2-PA:Z', 'R4-PA:Z', 'snort_log1', 'snort_log2', 'control_panel_log2', 'control_panel_log1']

# Remove specified columns
data = data.drop(columns=columns_to_drop)

In [None]:

# Load your dataset (replace 'your_dataset.csv' with your actual dataset file)
# data = pd.read_csv('/home/danish/Downloads/Datasets/ICS/binaryAllNaturalPlusNormalVsAttacks/data1.csv')
data.replace([np.inf, -np.inf], 1e15, inplace=True)
# data

# Extract the numerical columns for normalization
numerical_cols = data.select_dtypes(include=['float64', 'int64']).columns
data[numerical_cols]
data

In [None]:

corr = pd.DataFrame(data.drop('marker', axis=1)).corr()

plt.figure(figsize=(30,30))

sns.heatmap(corr)

plt.show()

In [None]:
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import ExtraTreesClassifier

et = ExtraTreesClassifier(n_estimators = 100, class_weight='balanced', random_state=42)
et.fit(data.drop('marker', axis=1), data['marker'].tolist())
importances = et.feature_importances_
indices = np.argsort(importances)[::-1]
plt.figure(figsize=(75,50))
plt.title("Feature importances")
plt.bar(range(data.drop('marker', axis=1).shape[1]), importances[indices],
        color="lightsalmon", align="center")
plt.xticks(range(data.drop('marker', axis=1).shape[1]), data.drop('marker', axis=1).columns[indices], rotation=90)
plt.xlim([-1, data.drop('marker', axis=1).shape[1]])
plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

# Assuming 'data' contains your DataFrame with features and labels
features1 = data.drop('marker', axis=1)  # Features
labels = data['marker']  # Labels

# Perform PCA for dimensionality reduction
pca = PCA(n_components=2)  # Reduce to 2 dimensions for visualization
components = pca.fit_transform(features1)
reduced_df = pd.DataFrame(components, columns=['Component 1', 'Component 2'])

# Plot PCA visualization with colored points based on labels
plt.figure(figsize=(10, 8))
plt.scatter(reduced_df['Component 1'], reduced_df['Component 2'], c=range(0,78377), cmap='viridis')
plt.title('PCA Visualization of Feature Vectors with Colorful Labels')
plt.xlabel('Component 1')
plt.ylabel('Component 2')
plt.colorbar(label='Label (Marker)')
plt.show()


In [None]:
# Assuming 'df' contains your data
features1 = data.drop('marker', axis=1)  # Features
label = data['marker']  # Labels

from sklearn.decomposition import PCA

pca = PCA(n_components=3)  # Reduce to 2 dimensions for visualization
components = pca.fit_transform(features1)
print("Explained Variance Ratio:", pca.explained_variance_ratio_)

import matplotlib.pyplot as plt

# Plotting 3D PCA visualization
fig = plt.figure(figsize=(14, 10))
ax = fig.add_subplot(111, projection='3d')
ax.scatter(components[:, 0], components[:, 1], components[:, 2], c=range(0,78377), cmap='viridis')
ax.set_title('PCA Visualization (3D)')
ax.set_xlabel('Component 1')
ax.set_ylabel('Component 2')
ax.set_zlabel('Component 3')
plt.show()


In [None]:
et = ExtraTreesClassifier(n_estimators = 100, class_weight='balanced', random_state=42)

# sfm = SelectFromModel(rf, threshold=0.00025)
sfm = SelectFromModel(et)
sfm.fit(data.drop('marker', axis=1), data['marker'].tolist())
# X_important_train = sfm.transform(X_train)
# X_important_test = sfm.transform(X_test)

feature_vector_1 = sfm.transform(data.drop('marker', axis=1))

# rf = RandomForestClassifier(n_estimators = 100, class_weight='balanced', random_state=42)
# rf.fit(X_important_train, y_train)
# y_pred = rf.predict(X_important_test)
# print(classification_report(y_test, y_pred))
print(feature_vector_1.shape)




In [None]:
cols = sfm.get_feature_names_out()
feature_vector_1 =pd.DataFrame(feature_vector_1)
feature_vector_1.columns = cols
feature_vector_1



corr = pd.DataFrame(feature_vector_1).corr()

plt.figure(figsize=(30,30))

sns.heatmap(corr)

plt.show()

In [None]:
et = ExtraTreesClassifier(n_estimators = 100, class_weight='balanced', random_state=42)
et.fit(feature_vector_1, data['marker'].tolist())
importances = et.feature_importances_
indices = np.argsort(importances)[::-1]
plt.figure(figsize=(75,50))
plt.title("Feature importances")
plt.bar(range(feature_vector_1.shape[1]), importances[indices],
        color="lightsalmon", align="center")
plt.xticks(range(feature_vector_1.shape[1]), feature_vector_1.columns[indices], rotation=90)
plt.xlim([-1, feature_vector_1.shape[1]])
plt.show()

In [None]:
numerical_cols = feature_vector_1.select_dtypes(include=['float64', 'int64']).columns


scaler = RobustScaler()

# Normalize the numerical columns
feature_vector_1[numerical_cols] = scaler.fit_transform(feature_vector_1[numerical_cols])
feature_vector_1
# # Display the normalized DataFrame
# print(data)


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

# Assuming 'data' contains your DataFrame with features and labels
features1 = data.drop('marker', axis=1)  # Features
labels = data['marker']  # Labels

# Perform PCA for dimensionality reduction
pca = PCA(n_components=2)  # Reduce to 2 dimensions for visualization
components = pca.fit_transform(features1)
reduced_df = pd.DataFrame(components, columns=['Component 1', 'Component 2'])

# Plot PCA visualization with colored points based on labels
plt.figure(figsize=(10, 8))
plt.scatter(reduced_df['Component 1'], reduced_df['Component 2'], c=range(0,78377), cmap='viridis')
plt.title('PCA Visualization of Feature Vectors with Colorful Labels')
plt.xlabel('Component 1')
plt.ylabel('Component 2')
plt.colorbar(label='Label (Marker)')
plt.show()


In [None]:
# Assuming 'df' contains your data
features1 = data.drop('marker', axis=1)  # Features
label = data['marker']  # Labels

from sklearn.decomposition import PCA

pca = PCA(n_components=3)  # Reduce to 2 dimensions for visualization
components = pca.fit_transform(features1)
print("Explained Variance Ratio:", pca.explained_variance_ratio_)

import matplotlib.pyplot as plt

# Plotting 3D PCA visualization
fig = plt.figure(figsize=(14, 10))
ax = fig.add_subplot(111, projection='3d')
ax.scatter(components[:, 0], components[:, 1], components[:, 2], c=range(0,78377), cmap='viridis')
ax.set_title('PCA Visualization (3D)')
ax.set_xlabel('Component 1')
ax.set_ylabel('Component 2')
ax.set_zlabel('Component 3')
plt.show()


In [None]:
from sklearn.preprocessing import LabelEncoder

# Assuming the target variable is in a column named 'target'
# X = data.drop('marker', axis=1)
X = feature_vector_1
y = data['marker']
# y.value_counts()
le = LabelEncoder()
y_encoded = le.fit_transform(y)
y_encoded

In [None]:


# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
import autokeras as ak



In [None]:

# Train AutoKeras CNN model
clf = ak.StructuredDataClassifier(max_trials=10, overwrite=True)
clf.fit(X_train, y_train, epochs=10)


In [None]:

# Evaluate AutoKeras model
y_pred = clf.predict(X_test)
print("AutoKeras CNN Model Accuracy: ", accuracy_score(y_test, y_pred))

# Train traditional Machine Learning model (Random Forest)
rf_clf = RandomForestClassifier(n_estimators=100, random_state=42)
rf_clf.fit(X_train, y_train)

# Evaluate Random Forest model
y_pred_rf = rf_clf.predict(X_test)
print("Random Forest Model Accuracy: ", accuracy_score(y_test, y_pred_rf))


In [None]:

# Get predictions
y_pred = automl.predict(X_test)

# Evaluate performance
accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {accuracy:.4f}")

# Generate classification report
report = classification_report(y_test, y_pred)
print("Classification Report:\n", report)

# Generate confusion matrix
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", cm)

# Plot ROC curve and calculate AUC
y_proba = automl.predict_proba(X_test)[:, 1]  # Probability of the positive class
fpr, tpr, thresholds = roc_curve(y_test, y_proba)
roc_auc = auc(fpr, tpr)

# Plot ROC curve
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC)')
plt.legend(loc='lower right')
plt.show()
