# Klasifikasi Penerimaan Murid Prasekolah menggunakan Support Vector Machine (SVM)

KELOMPOK 3
- 2210511046 Hanifah Az-Zahra
- 2210511054 Dinda Cantika Putri
- 2210511070 Choirunnisa Zalfaa Nabilah
- 2210511072 Edwina Martha Putri

## Preprocessing Data

In [168]:
import pandas as pd #untuk manipulasi data
from sklearn.preprocessing import LabelEncoder #mengubah kategorikal menjadi numerik
from sklearn.model_selection import train_test_split #membagi dataset menjadi data training dan testing

In [169]:
# Column names applied to the data file when it is read.
headers = [
    "parents", "has_nurs", "form", "children", "housing",
    "finance", "social", "health", "class",
]

# Read the CSV; header=None is what pandas already implies when names= is given.
df = pd.read_csv('nursery/nursery.data', header=None, names=headers)

# Preview the first five rows.
df.head(5)

Unnamed: 0,parents,has_nurs,form,children,housing,finance,social,health,class
0,usual,proper,complete,1,convenient,convenient,nonprob,recommended,recommend
1,usual,proper,complete,1,convenient,convenient,nonprob,priority,priority
2,usual,proper,complete,1,convenient,convenient,nonprob,not_recom,not_recom
3,usual,proper,complete,1,convenient,convenient,slightly_prob,recommended,recommend
4,usual,proper,complete,1,convenient,convenient,slightly_prob,priority,priority


In [170]:
# Show summary statistics for each column of df.
df.describe()

Unnamed: 0,parents,has_nurs,form,children,housing,finance,social,health,class
count,12960,12960,12960,12960,12960,12960,12960,12960,12960
unique,3,5,4,4,3,2,3,3,5
top,usual,proper,complete,1,convenient,convenient,nonprob,recommended,not_recom
freq,4320,2592,3240,3240,4320,6480,4320,4320,4320


In [171]:
# Print column names, non-null counts and dtypes of df.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12960 entries, 0 to 12959
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   parents   12960 non-null  object
 1   has_nurs  12960 non-null  object
 2   form      12960 non-null  object
 3   children  12960 non-null  object
 4   housing   12960 non-null  object
 5   finance   12960 non-null  object
 6   social    12960 non-null  object
 7   health    12960 non-null  object
 8   class     12960 non-null  object
dtypes: object(9)
memory usage: 911.4+ KB


*missing value*

In [172]:
# Count missing values per column (isna is the same check as isnull).
df.isna().sum()

parents     0
has_nurs    0
form        0
children    0
housing     0
finance     0
social      0
health      0
class       0
dtype: int64

*duplicate*

In [173]:
# Count the rows that pandas flags as duplicates of another row.
df.duplicated().sum()

0

*outlier check (distribusi frekuensi tiap kategori)*

In [174]:
# Print the frequency of every category, one column at a time.
for nama_kolom, isi_kolom in df.items():
    print(nama_kolom)
    print(isi_kolom.value_counts())
    print("\n")

parents
usual          4320
pretentious    4320
great_pret     4320
Name: parents, dtype: int64


has_nurs
proper         2592
less_proper    2592
improper       2592
critical       2592
very_crit      2592
Name: has_nurs, dtype: int64


form
complete      3240
completed     3240
incomplete    3240
foster        3240
Name: form, dtype: int64


children
1       3240
2       3240
3       3240
more    3240
Name: children, dtype: int64


housing
convenient    4320
less_conv     4320
critical      4320
Name: housing, dtype: int64


finance
convenient    6480
inconv        6480
Name: finance, dtype: int64


social
nonprob          4320
slightly_prob    4320
problematic      4320
Name: social, dtype: int64


health
recommended    4320
priority       4320
not_recom      4320
Name: health, dtype: int64


class
not_recom     4320
priority      4266
spec_prior    4044
very_recom     328
recommend        2
Name: class, dtype: int64




*feature selection*

In [175]:
# Keep only the rows whose target is one of the two classes being modelled.
kelas_dipakai = ['not_recom', 'priority']
baris_terpilih = df['class'].isin(kelas_dipakai)
df = df.loc[baris_terpilih]

# Show how many rows remain for each of the two classes.
print(df['class'].value_counts())

not_recom    4320
priority     4266
Name: class, dtype: int64


In [176]:
# Columns kept for modelling: three features plus the target.
# NOTE(review): in the previewed rows, health == 'not_recom' always coincides
# with class == 'not_recom'. If that holds for every row, `health` alone
# separates the two classes, which would explain perfect scores - verify.
kolom_pilihan = ['social', 'finance', 'health', 'class']

# Take an independent copy so later edits do not touch df.
df_pilihan = df.loc[:, kolom_pilihan].copy()

# Preview the first five rows of the selection.
df_pilihan.head(5)

Unnamed: 0,social,finance,health,class
1,nonprob,convenient,priority,priority
2,nonprob,convenient,not_recom,not_recom
4,slightly_prob,convenient,priority,priority
5,slightly_prob,convenient,not_recom,not_recom
6,problematic,convenient,recommended,priority


*encoding*

In [177]:

# Encode every selected column to integer codes, keeping ONE fitted encoder per
# column. Refitting a single LabelEncoder for each column would leave only the
# last mapping available, so features of new data could not be encoded
# consistently and codes could not be inverse-transformed.
encoders = {}
for kolom in ['social', 'finance', 'health', 'class']:
    encoders[kolom] = LabelEncoder()
    # Plain column assignment replaces the column with the integer codes.
    # A `.loc[:, kolom] = ...` assignment emits a pandas warning and, on newer
    # pandas versions, may write in place and keep the object dtype.
    df_pilihan[kolom] = encoders[kolom].fit_transform(df_pilihan[kolom])

# NOTE(review): LabelEncoder numbers labels in sorted order (e.g. social:
# nonprob=0, problematic=1, slightly_prob=2), which is not the natural severity
# order. Consider OrdinalEncoder with explicit categories if order matters.

# Keep the historical name `le` bound to the encoder fitted on the target.
le = encoders['class']

df_pilihan.head()

  df_pilihan.loc[:, 'social'] = le.fit_transform(df_pilihan['social'])
  df_pilihan.loc[:, 'finance'] = le.fit_transform(df_pilihan['finance'])
  df_pilihan.loc[:, 'health'] = le.fit_transform(df_pilihan['health'])
  df_pilihan.loc[:, 'class'] = le.fit_transform(df_pilihan['class'])


Unnamed: 0,social,finance,health,class
1,0,0,1,1
2,0,0,0,0
4,2,0,1,1
5,2,0,0,0
6,1,0,2,1


*splitting*

In [178]:
# Separate the feature columns from the target column.
kolom_fitur = ['social', 'finance', 'health']
X = df_pilihan[kolom_fitur]
y = df_pilihan['class']

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the shape of each partition.
print("Data Training: ", X_train.shape)
print("Data Testing: ", X_test.shape)

Data Training:  (6868, 3)
Data Testing:  (1718, 3)


## Membuat Model 

*Standarisasi*

In [179]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training features only, then apply
# the same transformation to both partitions.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

*Pemodelan dan Evaluasi*

In [180]:
from sklearn.svm import SVC

# Train a linear-kernel SVM on the scaled training data (fit returns the
# estimator itself), then predict labels for the scaled test data.
svm_model = SVC(kernel='linear').fit(X_train, y_train)
svm_predict = svm_model.predict(X_test)

In [181]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Compute each metric for the linear model first, then print them in turn.
akurasi_linear = accuracy_score(y_test, svm_predict)
laporan_linear = classification_report(y_test, svm_predict)
matriks_linear = confusion_matrix(y_test, svm_predict)

print(f"Accuracy Score: {akurasi_linear}")
print(f"Classification Report:\n {laporan_linear}")
print(f"Confusion Matrix:\n {matriks_linear}")

Accuracy Score: 1.0
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       854
           1       1.00      1.00      1.00       864

    accuracy                           1.00      1718
   macro avg       1.00      1.00      1.00      1718
weighted avg       1.00      1.00      1.00      1718

Confusion Matrix:
 [[854   0]
 [  0 864]]


In [182]:
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# 5-fold stratified cross-validation of the linear SVM.
sk_fold = StratifiedKFold(n_splits=5)

# X holds the unscaled features, while the model evaluated on the test split
# was trained on standardized data. Wrapping scaler + SVM in a pipeline makes
# each fold fit the scaler on its own training part, so the CV score measures
# the same preprocessing + model combination without leaking fold statistics.
# cross_val_score clones the pipeline, so the fitted svm_model is not refit.
cv_pipeline_linear = make_pipeline(StandardScaler(), svm_model)
scores = cross_val_score(cv_pipeline_linear, X, y, cv=sk_fold)

print("Cross Validation Score: ", scores)

Cross Validation Score:  [1. 1. 1. 1. 1.]


In [183]:
from sklearn.svm import SVC

# Train an RBF-kernel SVM on the scaled training data (fit returns the
# estimator itself), then predict labels for the scaled test data.
svm_modelrbf = SVC(kernel='rbf').fit(X_train, y_train)
svm_pred = svm_modelrbf.predict(X_test)

In [184]:
# Compute each metric for the RBF model first, then print them in turn.
akurasi_rbf = accuracy_score(y_test, svm_pred)
# zero_division=1 sets the reported score to 1 where a metric is undefined.
laporan_rbf = classification_report(y_test, svm_pred, zero_division=1)
matriks_rbf = confusion_matrix(y_test, svm_pred)

print(f"Accuracy Score: {akurasi_rbf}")
print(f"Classification Report:\n {laporan_rbf}")
print(f"Confusion Matrix:\n {matriks_rbf}")

Accuracy Score: 1.0
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       854
           1       1.00      1.00      1.00       864

    accuracy                           1.00      1718
   macro avg       1.00      1.00      1.00      1718
weighted avg       1.00      1.00      1.00      1718

Confusion Matrix:
 [[854   0]
 [  0 864]]


In [185]:
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# 5-fold stratified cross-validation of the RBF SVM.
sk_fold = StratifiedKFold(n_splits=5)

# X holds the unscaled features, while the model evaluated on the test split
# was trained on standardized data. Wrapping scaler + SVM in a pipeline makes
# each fold fit the scaler on its own training part, so the CV score measures
# the same preprocessing + model combination without leaking fold statistics.
# cross_val_score clones the pipeline, so the fitted svm_modelrbf is not refit.
cv_pipeline_rbf = make_pipeline(StandardScaler(), svm_modelrbf)
scores = cross_val_score(cv_pipeline_rbf, X, y, cv=sk_fold)

print("Cross Validation Score: ", scores)

Cross Validation Score:  [1. 1. 1. 1. 1.]


*Simpan Model ke Pickle*

In [186]:
import pickle

# Serialize the trained linear SVM to disk. The context manager closes (and
# therefore flushes) the file even if pickling raises.
with open('linear_model', 'wb') as berkas_model:
    pickle.dump(svm_model, berkas_model)

In [187]:
# Reload the pickled linear SVM and confirm it scores the same on the test set.
# NOTE: pickle.load can execute arbitrary code; only load files you created
# yourself or otherwise trust.
with open('linear_model', 'rb') as berkas_model:
    loaded_model_linear = pickle.load(berkas_model)

result_linear = loaded_model_linear.score(X_test, y_test)

print(result_linear)

1.0


In [188]:
# Serialize the trained RBF SVM to disk. The context manager closes (and
# therefore flushes) the file even if pickling raises.
with open('rbf_model', 'wb') as berkas_model:
    pickle.dump(svm_modelrbf, berkas_model)

In [189]:
# Reload the pickled RBF SVM and confirm it scores the same on the test set.
# NOTE: pickle.load can execute arbitrary code; only load files you created
# yourself or otherwise trust.
with open('rbf_model', 'rb') as berkas_model:
    loaded_model_rbf = pickle.load(berkas_model)

result_rbf = loaded_model_rbf.score(X_test, y_test)

print(result_rbf)

1.0
