In [75]:
import numpy as np
import pandas as pd
import tensorflow as tf

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from keras.models import Sequential
from keras.layers import Dense, Input

from sklearn.metrics import confusion_matrix, accuracy_score

In [7]:
df=pd.read_csv('HIGGS_8K.csv',header=None)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,19,20,21,22,23,24,25,26,27,28
0,1.0,0.869293,-0.635082,0.22569,0.32747,-0.689993,0.754202,-0.248573,-1.092064,0.0,...,-0.010455,-0.045767,3.101961,1.35376,0.979563,0.978076,0.920005,0.721657,0.988751,0.876678
1,0.0,1.630428,0.404143,0.401026,2.722409,0.355644,1.362199,0.015818,-1.174111,0.0,...,0.387635,1.596321,1.550981,0.859235,0.827233,0.995416,0.764766,0.520597,0.850413,0.929865
2,1.0,0.326305,-0.797735,0.223471,1.248234,-1.427857,0.923767,-0.27828,-0.470052,0.0,...,-0.426866,1.153501,0.0,0.887944,0.924601,0.987189,0.854094,0.385013,0.789912,0.724627
3,1.0,1.38117,0.521993,0.574698,0.357347,0.037136,0.413057,-0.607036,-1.30827,0.0,...,0.567525,-1.475778,0.0,0.936186,1.107217,0.983808,0.693327,0.562045,0.825387,0.846233
4,1.0,0.304161,-0.736375,0.686225,0.477172,-1.274736,0.907003,0.32477,-0.463954,2.173076,...,0.904818,1.165154,0.0,1.18179,1.033701,0.993943,0.648399,0.936097,0.732592,0.661779


In [9]:
# While examining the data, some string elements were noticed
print(df.dtypes[df.dtypes != 'float64'])

17    object
dtype: object


In [11]:
# After locating the issue, we fix it 
print(f'String value: {df.iloc[0,17]} (type: {type(df.iloc[0, 17])})')
df.iloc[:, 17] = pd.to_numeric(df.iloc[:, 17], errors='coerce').fillna(0) 
print(f'Fixed value: {df.iloc[0, 17]} (type: {type(df.iloc[0, 17])})')

String value: 0.000000000000000000e+00.1 (type: <class 'str'>)
Fixed value: 0.0 (type: <class 'float'>)


In [21]:
# Separation of the data, according to the instructions
classification=df.iloc[:,0]
low_level=df.iloc[:,1:21]
high_level=df.iloc[:,22:]

### Low level

In [139]:
# Splitting the dataset
low_level_train, low_level_test, classification_train, classification_test = train_test_split(low_level, classification, test_size=0.25, random_state=42)

# Feature scaling
scaler = StandardScaler()
low_level_train_scaled = scaler.fit_transform(low_level_train)
low_level_test_scaled =scaler.transform(low_level_test)

In [141]:
# Building the ANN
ann = tf.keras.models.Sequential()

# Input layer
ann.add(Input(shape=(20,)))
ann.add(tf.keras.layers.Dense(12,activation='relu'))

#Second hidden layer
ann.add(tf.keras.layers.Dense(8, activation='relu'))

# Output layer
ann.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))

In [143]:
# Training the ANN

ann.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])
ann.summary()

In [126]:
ann.fit(low_level_train_scaled, classification_train, batch_size=32, epochs=200)

Epoch 1/200
[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.5022 - loss: 0.7594
Epoch 2/200
[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.5017 - loss: 0.7067
Epoch 3/200
[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.5374 - loss: 0.6893
Epoch 4/200
[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.5466 - loss: 0.6854
Epoch 5/200
[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.5536 - loss: 0.6814
Epoch 6/200
[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.5732 - loss: 0.6771
Epoch 7/200
[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.5679 - loss: 0.6766
Epoch 8/200
[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.5821 - loss: 0.6702
Epoch 9/200
[1m188/188[0m [32

<keras.src.callbacks.history.History at 0x14b1595c710>

In [128]:
# Predicting the test set results
classification_pred=ann.predict(low_level_test_scaled)
classification_pred=(classification_pred>0.5)

# Making the confusion matrix
conf_matr = confusion_matrix(classification_test, classification_pred)
print(f'Confusion matrix for the low level is: \n{conf_matr}')
accuracy_score(classification_test, classification_pred)
print(f'Accuracy score is: {accuracy_score(classification_test, classification_pred)}')


[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step
Confusion matrix for the low level is: 
[[486 492]
 [331 692]]
Accuracy score is: 0.5887056471764118


In [129]:
# Evaluate the model on test data
loss, accuracy = ann.evaluate(low_level_test_scaled, classification_test)
print(f'Test loss: {loss}')
print(f'Test accuracy: {accuracy}')

[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.6002 - loss: 0.6943
Test loss: 0.6993393301963806
Test accuracy: 0.5887056589126587
