In [None]:
# import modules 
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt 
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import CategoricalNB
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

In [None]:
# Read the graduation-status workbook into a DataFrame and preview its first rows.
# NOTE(review): the leading '...' in the path looks like a placeholder directory - confirm.
dataset = pd.read_excel(io='.../status_kelulusan.xlsx')
print(dataset.head(n=5))

In [None]:
# Dataset overview: dimensions first, then per-column dtypes and non-null counts.
print(dataset.shape)
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would append a stray "None" line).
dataset.info()

In [None]:
# Count the missing values in every column.
print(dataset.isna().sum())

In [None]:
# Impute missing 'IPK' entries with the column mean; other columns are left untouched
# because only 'IPK' appears in the fill mapping.
dataset = dataset.fillna({'IPK': dataset['IPK'].mean()})

In [None]:
# Repeat the per-column missing-value count after the 'IPK' imputation.
print(dataset.isna().sum())

In [None]:
# Show each column's dtype, followed by the first two rows for reference.
print(dataset.dtypes)
print(dataset.iloc[:2])

In [None]:
# Integer-encode the three categorical feature columns in place.
# A single LabelEncoder is re-fitted for every column, so once the loop ends
# `le` only holds the mapping learned from 'Status Pernikahan'.
le = LabelEncoder()
for column in ('Jenis Kelamin', 'Status Mahasiswa', 'Status Pernikahan'):
    dataset[column] = le.fit_transform(dataset[column])
print(dataset.head())

In [None]:
# Class distribution: number of rows for each 'Status Kelulusan' value.
print(dataset.groupby(by="Status Kelulusan").size())

In [None]:
# Hold out a validation set.
# The frame is split by position: columns 0-3 become the features and
# column 4 the target; 20% of the rows go to validation with a fixed seed.
array = dataset.to_numpy()
X, y = array[:, :4], array[:, 4]
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.20, random_state=8
)

In [None]:
# Fit a categorical naive Bayes classifier on the training split, then
# predict labels for the validation split.
# NOTE(review): 'IPK' is mean-imputed and the sample rows later in this
# notebook use fractional values (e.g. 3.43); CategoricalNB is documented for
# integer-encoded categories - confirm how IPK is meant to be discretized.
model = CategoricalNB().fit(X_train, y_train)
pred = model.predict(X_val)

In [None]:
# Evaluation helper
def evaluate(X, y):
    """Print the accuracy, confusion matrix and classification report.

    Despite the parameter names, both arguments are label sequences: every
    metric is invoked as ``metric(X, y)`` and this notebook calls the helper
    as ``evaluate(y_val, pred)``.

    Args:
        X: Labels passed as the first argument of each metric (the reference
            labels in this notebook's call).
        y: Labels passed as the second argument of each metric (the
            predictions in this notebook's call).

    Returns:
        None. The three results are written to stdout in order.
    """
    sections = (
        ('Accuracy score: ', accuracy_score),
        ('Confussion Matrix: \n', confusion_matrix),
        ('Classifiation Report: \n', classification_report),
    )
    # Headings are emitted verbatim so the printed output stays unchanged.
    for heading, metric in sections:
        print(heading, metric(X, y))

In [None]:
# Report validation metrics: held-out labels first, model predictions second.
evaluate(y_val, pred)

In [None]:
# Preview the first five rows of the (now label-encoded) frame.
dataset.head(n=5)

In [None]:
# Predict new data
# Example: three new students described with the raw (string) categories.
new_data = pd.DataFrame({
    'Jenis Kelamin': ['Laki-laki', 'Perempuan', 'Laki-laki'],
    'Status Mahasiswa': ['Bekerja', 'Mahasiswa', 'Mahasiswa'],
    'Status Pernikahan': ['Belum', 'Belum', 'Belum'],
    'IPK': [3.43, 3.71, 2.90],
})

# Encode the categorical columns with the SAME label -> integer mapping that
# was used for the training data. Fitting a fresh LabelEncoder on the new rows
# alone numbers only the labels present in those rows, so the codes can
# silently differ from the training codes (e.g. a batch that contains only
# 'Perempuan' would encode it as 0). The training frame has already been
# overwritten with integer codes, so the mapping is re-learned from the raw
# spreadsheet.
raw_training = pd.read_excel('.../status_kelulusan.xlsx')
for column in ['Jenis Kelamin', 'Status Mahasiswa', 'Status Pernikahan']:
    le = LabelEncoder()
    le.fit(raw_training[column])
    # transform() raises ValueError for a label never seen during fit, which
    # is preferable to handing the model a wrong code.
    new_data[column] = le.transform(new_data[column])

# NOTE(review): the column order above must match the four feature columns the
# model was trained on - confirm against the spreadsheet layout.
arr_new_data = new_data.values

In [None]:
# make a prediction
# Feed the plain array prepared for this purpose: the model was fitted on a
# NumPy array, so passing the DataFrame instead makes recent scikit-learn
# versions warn about feature names that were absent at fit time.
ynew_data = model.predict(arr_new_data)

# show the inputs and predicted outputs
for features, predicted in zip(arr_new_data, ynew_data):
    print("X=%s, Predicted=%s" % (features, predicted))