In [189]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from imblearn.over_sampling import RandomOverSampler

In [190]:
# Labels applied to the CSV columns on load. The target 'class' is kept last:
# later cells select features with `[:-1]` and the target with `[-1]`.
columns = [
    'fLength',
    'fWidth',
    'fSize',
    'fConc',
    'fConc1',
    'fAsym',
    'fM3Long',
    'fM3Trans',
    'fAlpha',
    'fDist',
    'class',
]

In [191]:
# Load the dataset. Because `names` is passed explicitly, pandas reads the
# first line of the file as data rather than as a header row.
df = pd.read_csv('Data/magic04.csv', names=columns)

In [192]:
# Preview the first rows to check that the columns were labelled as intended.
df.head()

Unnamed: 0,fLength,fWidth,fSize,fConc,fConc1,fAsym,fM3Long,fM3Trans,fAlpha,fDist,class
0,28.7967,16.0021,2.6449,0.3918,0.1982,27.7004,22.011,-8.2027,40.092,81.8828,g
1,31.6036,11.7235,2.5185,0.5303,0.3773,26.2722,23.8238,-9.9574,6.3609,205.261,g
2,162.052,136.031,4.0612,0.0374,0.0187,116.741,-64.858,-45.216,76.96,256.788,g
3,23.8172,9.5728,2.3385,0.6147,0.3922,27.2107,-6.4633,-7.1513,10.449,116.737,g
4,75.1362,30.9205,3.1611,0.3168,0.1832,-5.5277,28.5525,21.8393,4.648,356.462,g


In [193]:
# Encode the target as integers: 'g' -> 1, every other label -> 0.
# The guard makes the cell safe to re-run: without it, a second execution
# would compare the already-encoded integers with 'g' and silently turn
# every label into 0.
if not pd.api.types.is_integer_dtype(df['class']):
    df['class'] = (df['class'] == 'g').astype(int)
df['class'].unique()

array([1, 0])

In [194]:
# Preview again to confirm that 'class' now holds the integer encoding.
df.head()

Unnamed: 0,fLength,fWidth,fSize,fConc,fConc1,fAsym,fM3Long,fM3Trans,fAlpha,fDist,class
0,28.7967,16.0021,2.6449,0.3918,0.1982,27.7004,22.011,-8.2027,40.092,81.8828,1
1,31.6036,11.7235,2.5185,0.5303,0.3773,26.2722,23.8238,-9.9574,6.3609,205.261,1
2,162.052,136.031,4.0612,0.0374,0.0187,116.741,-64.858,-45.216,76.96,256.788,1
3,23.8172,9.5728,2.3385,0.6147,0.3922,27.2107,-6.4633,-7.1513,10.449,116.737,1
4,75.1362,30.9205,3.1611,0.3168,0.1832,-5.5277,28.5525,21.8393,4.648,356.462,1


In [195]:
# Distinct target values (repeats the check made right after the encoding).
df['class'].unique()

array([1, 0])

# One figure per feature, overlaying the normalised histograms of the two
# classes (class 1 drawn as 'gamma', class 0 as 'hadron').
is_gamma = df['class'] == 1
is_hadron = df['class'] == 0

for feature in columns[:-1]:  # every column except the trailing 'class' target
    gamma_values = df.loc[is_gamma, feature]
    hadron_values = df.loc[is_hadron, feature]

    plt.hist(gamma_values, color='red', label='gamma', alpha=0.7, density=True)
    plt.hist(hadron_values, color='blue', label='hadron', alpha=0.7, density=True)

    plt.title(feature)
    plt.ylabel('Probability')
    plt.xlabel(feature)
    plt.legend()
    plt.show()

In [197]:
# Shuffle once with a fixed seed so the split, and every metric computed from
# it, is reproducible; then cut 60% / 20% / 20% into train / validation / test.
# Positional slicing replaces np.split(DataFrame, ...), which goes through the
# deprecated DataFrame.swapaxes and warns on recent pandas versions.
shuffled = df.sample(frac=1, random_state=42)
train_end = int(0.6 * len(shuffled))
valid_end = int(0.8 * len(shuffled))

train = shuffled.iloc[:train_end]
validation = shuffled.iloc[train_end:valid_end]
testing = shuffled.iloc[valid_end:]

In [198]:
from sklearn.preprocessing import StandardScaler

In [199]:
# Shared scaler instance; scale_dataset() refers to it by this global name.
scaler = StandardScaler()

In [200]:
def scale_dataset(dataframe, oversampler=False):
    x = dataframe[dataframe.columns[:-1]].values
    y = dataframe[dataframe.columns[-1]].values
    
    if oversampler == True:
        ros = RandomOverSampler()
        x, y = ros.fit_resample(x, y)
        
    x = scaler.fit_transform(x)
    data = np.hstack((x, np.reshape(y,(-1,1))))
    return data, x, y

In [201]:
train, x_train, y_train = scale_dataset(train, oversampler=True)
validation, x_valid, y_valid = scale_dataset(validation, oversampler=False)
testing, x_testing, y_testing = scale_dataset(testing, oversampler=False)

K-Nearest Neighbors (KNN)

In [202]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report

In [203]:
# k-nearest-neighbours classifier with k=5, fitted on the training split.
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(x_train, y_train)

In [204]:
# Predict labels for the test-split features with the fitted KNN model.
y_prediction_KNN = knn_model.predict(x_testing)

In [205]:
# Display the predicted labels.
y_prediction_KNN

array([0, 1, 1, ..., 1, 1, 1])

In [206]:
# Display the true test labels for a quick side-by-side look at the predictions.
y_testing

array([0, 0, 1, ..., 0, 0, 1])

In [207]:
# Per-class precision, recall and F1 of the KNN predictions on the test split.
print(classification_report(y_testing, y_prediction_KNN))

              precision    recall  f1-score   support

           0       0.71      0.77      0.74      1327
           1       0.87      0.83      0.85      2477

    accuracy                           0.81      3804
   macro avg       0.79      0.80      0.79      3804
weighted avg       0.81      0.81      0.81      3804



Naive Bayes

In [208]:
from sklearn.naive_bayes import GaussianNB

In [209]:
# Gaussian naive Bayes classifier with default settings.
nb_model = GaussianNB()

In [210]:
# Fit the naive Bayes model on the training split.
nb_model.fit(x_train, y_train)

In [211]:
# Predict labels for the test-split features with the fitted naive Bayes model.
y_predition_naive_bayes = nb_model.predict(x_testing)

In [213]:
# Per-class precision, recall and F1 of the naive Bayes predictions on the test split.
print(classification_report(y_testing, y_predition_naive_bayes))

              precision    recall  f1-score   support

           0       0.60      0.45      0.51      1327
           1       0.74      0.84      0.79      2477

    accuracy                           0.70      3804
   macro avg       0.67      0.64      0.65      3804
weighted avg       0.69      0.70      0.69      3804



Logistic Regression

In [214]:
from sklearn.linear_model import LogisticRegression

In [215]:
# Logistic regression with default settings: fit on the training split,
# predict the test split, and print the per-class metrics.
lr_model = LogisticRegression().fit(x_train, y_train)
y_predition_logistic_regression = lr_model.predict(x_testing)
lr_report = classification_report(y_testing, y_predition_logistic_regression)
print(lr_report)

              precision    recall  f1-score   support

           0       0.63      0.77      0.70      1327
           1       0.86      0.76      0.81      2477

    accuracy                           0.76      3804
   macro avg       0.75      0.77      0.75      3804
weighted avg       0.78      0.76      0.77      3804



Support Vector Machine (SVM)

In [216]:
from sklearn.svm import SVC

In [218]:
# Support-vector classifier with scikit-learn's default settings.
# Bound to `svm_model` rather than `lr_model`, so the logistic-regression
# model fitted earlier is no longer overwritten by an estimator of a
# different kind.
svm_model = SVC()
svm_model.fit(x_train, y_train)
y_predition_SVM = svm_model.predict(x_testing)
print(classification_report(y_testing, y_predition_SVM))

              precision    recall  f1-score   support

           0       0.77      0.83      0.80      1327
           1       0.91      0.86      0.89      2477

    accuracy                           0.85      3804
   macro avg       0.84      0.85      0.84      3804
weighted avg       0.86      0.85      0.86      3804

