In [1]:
import numpy as np               # used for multidimensional arrays
import pandas as pd              # used for import the dataset

In [3]:
adult = pd.read_csv("adult_num.csv")

In [4]:
X = adult.iloc[:,:-1].values    # predictor attributes
y = adult.iloc[:,-1].values     # target attribute

In [17]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

In [18]:
# import naive bayes classifier
from sklearn.naive_bayes import GaussianNB

# initialize the Naive Bayes classifier
NB = GaussianNB()

#fit the classifier to the data
NB.fit(X_train,y_train)

#use the resulting model to predict value on the test sey
y_pred = NB.predict(X_test)

#performance metrics
from sklearn import metrics
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score

print(confusion_matrix(y_test, y_pred), '\n')

print(classification_report(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))
print("AUC:", roc_auc_score(y_test, y_pred))

[[5862  297]
 [1329  653]] 

              precision    recall  f1-score   support

           0       0.82      0.95      0.88      6159
           1       0.69      0.33      0.45      1982

    accuracy                           0.80      8141
   macro avg       0.75      0.64      0.66      8141
weighted avg       0.78      0.80      0.77      8141

Accuracy: 0.8002702370716128
AUC: 0.6406215363502894


In [20]:
#cross-validation
from sklearn.model_selection import cross_val_score
NB1 = GaussianNB()

#accuracy scores for each fold
scores = cross_val_score(NB1, X, y, cv=10)
print(scores)

#print the average accuracy across all folds
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

[0.79828063 0.80128993 0.79422604 0.79883292 0.79821867 0.80681818
 0.78900491 0.8022113  0.80251843 0.81572482]
Accuracy: 0.80 (+/- 0.01)


In [9]:
#performance metrics

from sklearn.model_selection import cross_val_predict

#predicted values
y_pred = cross_val_predict(NB1, X, y, cv=10)

print(confusion_matrix(y, y_pred), '\n')

print(classification_report(y, y_pred))
print("Accuracy:", accuracy_score(y, y_pred))
print("AUC:", roc_auc_score(y, y_pred))

[[23536  1184]
 [ 5305  2536]] 

              precision    recall  f1-score   support

           0       0.82      0.95      0.88     24720
           1       0.68      0.32      0.44      7841

    accuracy                           0.80     32561
   macro avg       0.75      0.64      0.66     32561
weighted avg       0.78      0.80      0.77     32561

Accuracy: 0.8007125088295814
AUC: 0.6377658470185552


In [None]:
############################################################
###  Neural Networks exmple with Multi-Layer Perceptron  ###
############################################################

#MLP is very sensitive to wide data ranges, so scaling the data is recommended
#scaling the data to [0,1]
from sklearn import preprocessing

min_max_scaler = preprocessing.MinMaxScaler() # normalise data

scaled_features = min_max_scaler.fit_transform(adult.values)
scaled_features_adult = pd.DataFrame(scaled_features, index=adult.index, columns=adult.columns)

In [23]:
scaled_features_adult.describe()

Unnamed: 0.1,Unnamed: 0,age,workclass,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
count,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0
mean,0.5,0.295639,0.422046,0.686547,0.605379,0.435306,0.458697,0.289272,0.916464,0.669205,0.010777,0.020042,0.402423,0.889533,0.24081
std,0.288688,0.186855,0.197755,0.258018,0.171515,0.251037,0.309187,0.321354,0.212201,0.470506,0.073854,0.092507,0.125994,0.148368,0.427581
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.25,0.150685,0.375,0.6,0.533333,0.333333,0.214286,0.0,1.0,0.0,0.0,0.0,0.397959,0.926829,0.0
50%,0.5,0.273973,0.375,0.733333,0.6,0.333333,0.428571,0.2,1.0,1.0,0.0,0.0,0.397959,0.926829,0.0
75%,0.75,0.424658,0.375,0.8,0.733333,0.666667,0.714286,0.6,1.0,1.0,0.0,0.0,0.44898,0.926829,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [None]:
#import the classifier
from sklearn.neural_network import MLPClassifier

# initialize the classifier
# MLP = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1)
MLP = MLPClassifier(random_state=2)

#fit the classifier to the data
MLP.fit(X, y)

#use the resulting model to predict value on the test sey
y_pred = MLP.predict(X_test)

#performance metrics
from sklearn import metrics
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score

print(confusion_matrix(y_test, y_pred), '\n')

print(classification_report(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))
print("AUC:", roc_auc_score(y_test, y_pred))

[[6093   66]
 [1611  371]] 

              precision    recall  f1-score   support

           0       0.79      0.99      0.88      6159
           1       0.85      0.19      0.31      1982

    accuracy                           0.79      8141
   macro avg       0.82      0.59      0.59      8141
weighted avg       0.81      0.79      0.74      8141

Accuracy: 0.7940056504114974
AUC: 0.5882343183144156


In [41]:
#cross-validation
from sklearn.model_selection import cross_val_score
# MLP1 =  MLPClassifier(random_state=3)
MLP1 = MLPClassifier(hidden_layer_sizes=(5 , 2), random_state=2)

#accuracy scores for each fold
scores = cross_val_score(MLP1, X, y, cv=10)
print(scores)

#print the average accuracy across all folds
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

[0.78753454 0.79699017 0.78931204 0.79207617 0.79668305 0.8046683
 0.79484029 0.80620393 0.80128993 0.80036855]
Accuracy: 0.80 (+/- 0.01)


In [37]:
#performance metrics

from sklearn.model_selection import cross_val_predict

#predicted values
y_pred = cross_val_predict(MLP1, X, y, cv=10)

print(confusion_matrix(y, y_pred), '\n')

print(classification_report(y, y_pred))
print("Accuracy:", accuracy_score(y, y_pred))
print("AUC:", roc_auc_score(y, y_pred))

[[24662    58]
 [ 7482   359]] 

              precision    recall  f1-score   support

           0       0.77      1.00      0.87     24720
           1       0.86      0.05      0.09      7841

    accuracy                           0.77     32561
   macro avg       0.81      0.52      0.48     32561
weighted avg       0.79      0.77      0.68     32561

Accuracy: 0.7684346303860446
AUC: 0.5217193490444593


In [None]:
######################################
###  Support Vector Machine (SVM)  ###
######################################

# import classifier
from sklearn.svm import SVC

# initialize the classifier
SVM = SVC(kernel = 'linear')

#SVM is computationally expensive
#use a sample of the data for it to run in a reasonable time
adult_sample=adult.sample(frac=0.01, replace=False, random_state=1)
adult_sample.describe()

Unnamed: 0,age,workclass,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
count,326.0,326.0,326.0,326.0,326.0,326.0,326.0,326.0,326.0,326.0,326.0,326.0,326.0,326.0
mean,37.794479,3.377301,10.53681,10.070552,2.708589,6.276074,1.337423,3.671779,0.653374,1027.220859,45.739264,40.90184,36.466258,0.208589
std,13.367841,1.545805,3.606429,2.407282,1.473161,4.292087,1.538114,0.844676,0.476627,6233.764378,316.087908,12.478518,5.92962,0.406924
min,17.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0,0.0
25%,27.0,3.0,9.0,9.0,2.0,3.0,0.0,4.0,0.0,0.0,0.0,40.0,38.0,0.0
50%,36.0,3.0,11.0,10.0,2.0,6.0,1.0,4.0,1.0,0.0,0.0,40.0,38.0,0.0
75%,46.0,3.0,12.0,12.0,4.0,9.0,3.0,4.0,1.0,0.0,0.0,45.0,38.0,0.0
max,76.0,8.0,15.0,16.0,6.0,14.0,5.0,4.0,1.0,99999.0,2824.0,90.0,41.0,1.0


In [None]:
X = adult_sample.iloc[:,:-1].values    # predictor attributes
y = adult_sample.iloc[:,-1].values     # target attribute

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

#fit the classifier to the data
SVM.fit(X_train,y_train)

#use the resulting model to predict value on the test sey
y_pred = SVM.predict(X_test)

#performance metrics
from sklearn import metrics
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score

print(confusion_matrix(y_test, y_pred), '\n')

print(classification_report(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))
print("AUC:", roc_auc_score(y_test, y_pred))

[[59  8]
 [ 7  8]] 

              precision    recall  f1-score   support

           0       0.89      0.88      0.89        67
           1       0.50      0.53      0.52        15

    accuracy                           0.82        82
   macro avg       0.70      0.71      0.70        82
weighted avg       0.82      0.82      0.82        82

Accuracy: 0.8170731707317073
AUC: 0.7069651741293532


In [None]:
#cross-validation
from sklearn.model_selection import cross_val_score
SVM1 = SVC(kernel = 'linear')

#accuracy scores for each fold
scores = cross_val_score(SVM1, X, y, cv=10)
print(scores)

#print the average accuracy across all folds
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

In [None]:
#performance metrics

from sklearn.model_selection import cross_val_predict

#predicted values
y_pred = cross_val_predict(SVM1, X, y, cv=10)

print(confusion_matrix(y, y_pred), '\n')

print(classification_report(y, y_pred))
print("Accuracy:", accuracy_score(y, y_pred))
print("AUC:", roc_auc_score(y, y_pred))