In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report

In [51]:
# Column labels handed to read_csv via `names=`; the last entry is the target column.
names = [
    'fLength', 'fWidth', 'fSize', 'fConc', 'fConc1',
    'fAsym', 'fM3Long', 'fM3Trans', 'fAlpha', 'fDist',
    'class',
]

# Load the raw table and preview the first rows as the cell's output.
df = pd.read_csv('magic04.data', names=names)
df.head()

Unnamed: 0,fLength,fWidth,fSize,fConc,fConc1,fAsym,fM3Long,fM3Trans,fAlpha,fDist,class
0,28.7967,16.0021,2.6449,0.3918,0.1982,27.7004,22.011,-8.2027,40.092,81.8828,g
1,31.6036,11.7235,2.5185,0.5303,0.3773,26.2722,23.8238,-9.9574,6.3609,205.261,g
2,162.052,136.031,4.0612,0.0374,0.0187,116.741,-64.858,-45.216,76.96,256.788,g
3,23.8172,9.5728,2.3385,0.6147,0.3922,27.2107,-6.4633,-7.1513,10.449,116.737,g
4,75.1362,30.9205,3.1611,0.3168,0.1832,-5.5277,28.5525,21.8393,4.648,356.462,g


In [52]:
# Encode the target as integers: 1 where the raw label is 'g', 0 otherwise.
# Guarded on dtype so that re-running this cell is a no-op; without the guard a
# second run would compare the already-encoded ints to 'g' and turn every label into 0.
if not pd.api.types.is_integer_dtype(df['class']):
    df['class'] = (df['class'] == 'g').astype(int)

In [53]:
# for label in names[:-1]:
#     plt.hist(df[df['class'] == 1][label], color='orange', label='gamma', alpha=0.7, density=True)
#     plt.hist(df[df['class'] == 0][label], color='green', label='hadron', alpha=0.7, density=True)
#     plt.title(label)
#     plt.ylabel('Probability')
#     plt.xlabel(label)
#     plt.legend()
#     plt.show()

In [54]:
# Shuffle once with a fixed seed so the 60/20/20 split (and every metric computed
# from it) is reproducible across kernel restarts.
SPLIT_SEED = 42
shuffled = df.sample(frac=1, random_state=SPLIT_SEED)

# Same cut points as before (60% and 80% of the rows), but sliced positionally with
# iloc instead of routing a DataFrame through np.split.
train_end = int(0.6 * len(shuffled))
valid_end = int(0.8 * len(shuffled))
train = shuffled.iloc[:train_end]
valid = shuffled.iloc[train_end:valid_end]
test = shuffled.iloc[valid_end:]

def scaledataset(dataframe, oversample = False, scaler = None):
    """Standardize the feature columns of a frame and optionally oversample it.

    All columns except the last are treated as features; the last column is the
    target.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Table whose last column holds the labels.
    oversample : bool, default False
        When True, the scaled data is resampled with ``RandomOverSampler``.
    scaler : object with a ``transform`` method, optional
        An already-fitted scaler to apply. When omitted, a new ``StandardScaler``
        is fitted on ``dataframe`` itself (the original behaviour). Passing a
        scaler fitted on the training split lets validation/test data be scaled
        with the training statistics instead of their own.

    Returns
    -------
    data : numpy.ndarray
        Scaled features with the labels appended as the last column.
    X : numpy.ndarray
        Scaled (and possibly resampled) features.
    y : numpy.ndarray
        Labels aligned with ``X``.
    """
    X = dataframe[dataframe.columns[:-1]].values
    y = dataframe[dataframe.columns[-1]].values

    if scaler is None:
        # No scaler supplied: fit one on this frame, as the function always did.
        X = StandardScaler().fit_transform(X)
    else:
        # Reuse the caller's fitted scaler without refitting it.
        X = scaler.transform(X)

    if oversample:
        ros = RandomOverSampler()
        X, y = ros.fit_resample(X, y) # type: ignore

    # y is reshaped to a column so it can be stacked next to the 2-D feature matrix.
    data = np.hstack((X, np.reshape(y, (-1, 1))))

    return data, X, y

In [55]:
# Turn each split into (stacked array, features, labels); only the training split is oversampled.
# NOTE(review): each call fits its own StandardScaler inside scaledataset, so valid and test
# are standardized with their own statistics rather than the training split's — confirm intended.
# NOTE(review): train/valid/test are rebound from DataFrames to arrays here, so re-running this
# cell without re-running the split cell fails (scaledataset reads dataframe.columns).
train, X_train, y_train = scaledataset(train, oversample=True)
valid, X_valid, y_valid = scaledataset(valid, oversample=False)
test, X_test, y_test = scaledataset(test, oversample=False)

K-Nearest Neighbors

In [56]:
# knn_model = KNeighborsClassifier(n_neighbors=5)
# knn_model.fit(X_train, y_train)

In [57]:
# y_pred = knn_model.predict(X_test)
# print(classification_report(y_test, y_pred))

Naive Bayes

In [58]:
from sklearn.naive_bayes import GaussianNB

# nb_model = GaussianNB()
# nb_model = nb_model.fit(X_train, y_train)
# y_pred = nb_model.predict(X_test)

In [59]:
# print(classification_report(y_test, y_pred))

Logistic Regression


In [60]:
from sklearn.linear_model import LogisticRegression

# lg_model = LogisticRegression()
# lg_model = lg_model.fit(X_train, y_train)
# y_pred = lg_model.predict(X_test)
# print(classification_report(y_test, y_pred))

In [61]:
from sklearn.svm import SVC

# svm_model = SVC()
# svm_model = svm_model.fit(X_train, y_train)
# y_pred = svm_model.predict(X_test)
# print(classification_report(y_test, y_pred))

Neural Net

In [62]:
def plot_history(history):
    """Plot training vs. validation loss and accuracy side by side.

    Parameters
    ----------
    history : object
        Must expose a ``history`` mapping with the keys ``'loss'``,
        ``'val_loss'``, ``'accuracy'`` and ``'val_accuracy'``, each holding one
        value per epoch.

    The figure is displayed with ``plt.show()``; nothing is returned.
    """
    curves = history.history
    _, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 4))

    ax1.plot(curves['loss'], label='loss')
    ax1.plot(curves['val_loss'], label='val_loss')
    ax1.set_xlabel('Epoch')
    ax1.set_ylabel('Binary crossentropy')
    ax1.grid(True)
    # Without legend() the label= values above are never rendered.
    ax1.legend()

    ax2.plot(curves['accuracy'], label='accuracy')
    ax2.plot(curves['val_accuracy'], label='val_accuracy')
    ax2.set_xlabel('Epoch')
    ax2.set_ylabel('Accuracy')
    ax2.grid(True)
    ax2.legend()

    plt.show()

In [3]:
import tensorflow as tf

def train_model(X_train, y_train, num_nodes, dropout, lr, batch_size, epochs, validation_data=None):
    """Build, compile and fit a two-hidden-layer binary classifier.

    Parameters
    ----------
    X_train, y_train : array-like
        Training features and labels. ``X_train`` must expose ``.shape``; its
        second dimension sets the network's input width.
    num_nodes : int
        Units in each of the two hidden ``Dense`` layers.
    dropout : float
        Rate passed to the ``Dropout`` layer after each hidden layer.
    lr : float
        Learning rate passed to the Adam optimizer.
    batch_size : int
        Batch size passed to ``fit``.
    epochs : int
        Number of epochs passed to ``fit``.
    validation_data : tuple, optional
        ``(X, y)`` pair evaluated after every epoch. When omitted, the
        notebook-level ``X_valid`` / ``y_valid`` are used, which is what this
        function always did, so existing calls behave the same.

    Returns
    -------
    nn_model
        The fitted ``tf.keras.Sequential`` model.
    history
        The object returned by ``nn_model.fit``.
    """
    if validation_data is None:
        # Backward-compatible fallback to the notebook globals.
        validation_data = (X_valid, y_valid)

    # Derive the input width from the data instead of hard-coding 10 features.
    n_features = X_train.shape[1]

    nn_model = tf.keras.Sequential([
        tf.keras.layers.Dense(num_nodes, activation='relu', input_shape=(n_features,)),
        tf.keras.layers.Dropout(dropout),
        tf.keras.layers.Dense(num_nodes, activation='relu'),
        tf.keras.layers.Dropout(dropout),
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])

    nn_model.compile(
        optimizer=tf.keras.optimizers.Adam(lr),
        loss='binary_crossentropy',
        metrics=['accuracy']
    )

    history = nn_model.fit(
        X_train, y_train,
        epochs=epochs, batch_size=batch_size, validation_data=validation_data,
        verbose=0 # type: ignore
    )

    return nn_model, history

In [None]:
# least_val_loss = float('inf')
# least_loss_model = None

# epochs = 100
# for num_nodes in [16, 32, 64]:
#     for dropout in [0, 0.2]:
#         for lr in [0.01, 0.005, 0.001]:
#             for batch_size in [32, 64, 128]:
#                 model, history = train_model(X_train, y_train, num_nodes, dropout, lr, batch_size, epochs)

#                 plot_history(history)

#                 val_loss = model.evaluate(X_valid, y_valid)[0]

#                 if val_loss < least_val_loss:
#                     least_val_loss = val_loss
#                     least_loss_model = model

In [65]:
# y_pred = least_loss_model.predict(X_test)
# y_pred = (y_pred > 0.5).astype(int).reshape(-1,)
# print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.82      0.81      0.82      1313
           1       0.90      0.91      0.91      2491

    accuracy                           0.88      3804
   macro avg       0.86      0.86      0.86      3804
weighted avg       0.88      0.88      0.88      3804

