In [None]:
import warnings
warnings.filterwarnings('ignore')


import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline


In [None]:
# Location of the cell-samples CSV, relative to the notebook's working directory.
data = 'dataset/svm/cell_samples.csv'

# Read the raw samples into the DataFrame that the later cells operate on.
df = pd.read_csv(filepath_or_buffer=data)

In [None]:
# Preview the first five rows (head() returns 5 rows by default).
df.head()

In [None]:
# Dimensions of the loaded DataFrame as (rows, columns).
df.shape

In [None]:
# Keep the DataFrame's column labels (a pandas Index) and display them.
col_names = df.columns
col_names

In [None]:
# Missing entries are encoded in the CSV as the literal string '?'
# (16 of them were found on inspection). Convert them to pandas' NA marker
# so that the usual isna()/fillna() tooling recognises them.
df = df.replace(to_replace='?', value=pd.NA)

# Per-column count of missing values after the conversion.
df.isna().sum()

In [None]:
# Share of missing values per column, expressed in percent of all rows.
df.isna().sum().div(len(df)).mul(100)

In [None]:
# Replace the missing 'BareNuc' values with 0 and make the column numeric.
#
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on df['BareNuc']: that chained in-place form acts
# on a temporary Series and does not update `df` under pandas Copy-on-Write.
# pd.to_numeric(..., errors='coerce') turns every entry that cannot be parsed
# as a number (including any remaining '?') into NaN, so the column ends up
# with a float dtype rather than `object` holding a mix of strings and ints.
# NOTE(review): 0 is kept as the fill value from the original notebook;
# consider whether a median/mode imputation would be more appropriate.
df['BareNuc'] = pd.to_numeric(df['BareNuc'], errors='coerce').fillna(0)

In [None]:
# Confirm that no missing values remain after the fill
# (isna() is the same check as isnull()).
df.isna().sum()

In [None]:
# Number of samples per distinct value of the target column 'Class'.
df['Class'].value_counts()

In [None]:
# Fraction of all rows that falls into each value of the target column 'Class'.
df['Class'].value_counts().div(len(df))

In [None]:
# Concise summary of the DataFrame: columns, non-null counts and dtypes.
df.info()

In [None]:
# Per-column count of missing values (isna() is the same check as isnull()).
df.isna().sum()

In [None]:
# Summary statistics of the numerical columns, rounded to two decimals.
df.describe().round(2)

In [None]:
# Draw one boxplot per column to visualise potential outliers.
#
# The panels are produced in a loop over an explicit 4x2 grid of Axes instead
# of eight copy-pasted plt.subplot() stanzas. The figure is given an explicit
# size and tight_layout() so the eight panels do not overlap.
# NOTE(review): 'ID' is kept from the original selection, but it looks like a
# sample identifier, so its boxplot is probably not informative.
boxplot_columns = [
    'ID', 'Clump', 'UnifSize', 'UnifShape',
    'MargAdh', 'SingEpiSize', 'BlandChrom', 'NormNucl',
]

fig, axes = plt.subplots(4, 2, figsize=(12, 16))

for ax, column in zip(axes.ravel(), boxplot_columns):
    df.boxplot(column=column, ax=ax)
    ax.set_title('')          # no per-panel title; the y-label names the column
    ax.set_ylabel(column)

fig.tight_layout()
plt.show()


In [None]:
# Declare the feature matrix and the target vector; the target is 'Class'.
#
# 'ID' is dropped together with the target so that it is not fed to the model
# as a feature.
# NOTE(review): 'ID' is presumed to be a per-sample identifier rather than a
# measurement - confirm against the data dictionary. Accuracy figures quoted
# elsewhere in the notebook were recorded with 'ID' included and should be
# re-checked after re-running.
X = df.drop(columns=['ID', 'Class'])
y = df['Class']

# Import train test and split data
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; random_state fixes the shuffle so the
# split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

In [None]:
# Shapes of the training and test feature sets produced by the split.
X_train.shape, X_test.shape

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, so that no statistics from the
# test split leak into preprocessing.
scaler = StandardScaler().fit(X_train)

# Apply the same fitted transformation to both splits.
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)


In [None]:
# SVC classifier and the accuracy metric used to score it
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC

# Baseline model: SVC with default hyperparameters, fitted on the training set
# (fit() returns the estimator itself).
svc = SVC().fit(X_train, y_train)

# Keep a handle on this baseline model under its own name.
trained_svm = svc

# Predict the held-out test set and report the accuracy.
y_pred = svc.predict(X_test)

print(
    'Model accuracy score with default hyperparameters: {0:0.4f}'.format(
        accuracy_score(y_test, y_pred)
    )
)

In [None]:
# RBF-kernel SVC with C=100.0 ('rbf' is SVC's default kernel, spelled out
# here for clarity), fitted on the training set.
svc = SVC(kernel='rbf', C=100.0).fit(X_train, y_train)

# Predict the test set and report the accuracy.
y_pred = svc.predict(X_test)

print(
    'Model accuracy score with rbf kernel and C=100.0 : {0:0.4f}'.format(
        accuracy_score(y_test, y_pred)
    )
)

In [None]:
# Linear-kernel SVC with C=1.0, fitted on the training set.
linear_svc = SVC(C=1.0, kernel='linear').fit(X_train, y_train)

# Test-set predictions of the linear model.
y_pred_test = linear_svc.predict(X_test)

print(
    'Model accuracy score with linear kernel and C=1.0 : {0:0.4f}'.format(
        accuracy_score(y_test, y_pred_test)
    )
)

In [None]:
# Linear-kernel SVC with C=100.0, fitted on the training set.
linear_svc100 = SVC(C=100.0, kernel='linear').fit(X_train, y_train)

# Predict the test set and report the accuracy.
y_pred = linear_svc100.predict(X_test)

print(
    'Model accuracy score with linear kernel and C=100.0 : {0:0.4f}'.format(
        accuracy_score(y_test, y_pred)
    )
)

In [None]:
# Predict the training set with the linear-kernel (C=1.0) model so that its
# training accuracy can be compared with its test accuracy.

y_pred_train = linear_svc.predict(X_train)
y_pred_train

In [None]:
# Accuracy of the linear-kernel (C=1.0) model on the data it was trained on.
print(
    'Training-set accuracy score: {0:0.4f}'.format(
        accuracy_score(y_train, y_pred_train)
    )
)


Akurasi training-set adalah 0.9762, sedangkan akurasi test-set adalah 0.9652. Kedua nilai tersebut cukup sebanding, sehingga kecil kemungkinan model mengalami overfitting.

In [None]:
# Print the confusion matrix and name its four cells.
#
# scikit-learn lays the matrix out as cm[i, j] = number of samples whose TRUE
# label is the i-th class and whose PREDICTED label is the j-th class, with
# the classes in sorted order. For a two-class problem, cm.ravel() therefore
# yields (tn, fp, fn, tp), where the "positive" class is the second (larger)
# label. The previous version mixed conventions: TP/TN treated the first class
# as positive while FP/FN treated the second class as positive.
# NOTE(review): this assumes 'Class' has exactly two values and that the
# larger one (presumably 4 = malignant) is the class of interest - confirm.

from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_test, y_pred_test)

# Unpacking fails loudly if the matrix is not 2x2.
tn, fp, fn, tp = cm.ravel()

print('Confusion matrix\n\n', cm)

print('\nTrue Positives(TP) = ', tp)

print('\nTrue Negatives(TN) = ', tn)

print('\nFalse Positives(FP) = ', fp)

print('\nFalse Negatives(FN) = ', fn)

In [None]:
# Visualise the confusion matrix as a seaborn heatmap.
#
# scikit-learn's confusion matrix has the ACTUAL classes on the rows and the
# PREDICTED classes on the columns, ordered by sorted label value. The axis
# labels are derived from the labels that occur in y_test / y_pred_test, so
# they match both that orientation and the real class values.
class_labels = np.unique(
    np.concatenate([np.asarray(y_test), np.asarray(y_pred_test)])
)

cm_matrix = pd.DataFrame(
    data=cm,
    index=['Actual: {}'.format(label) for label in class_labels],
    columns=['Predicted: {}'.format(label) for label in class_labels],
)

ax = sns.heatmap(cm_matrix, annot=True, fmt='d', cmap='YlGnBu')
ax.set_title('Confusion matrix: linear SVC (C=1.0) on the test set')
plt.show()

In [None]:
# Text report of the main per-class classification metrics for the
# linear-kernel (C=1.0) predictions on the test set.
from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred_test))

In [None]:
# Plot an SVM decision boundary in a two-feature plane.
#
# A decision surface can only be drawn in 2-D, and a classifier can only
# predict on inputs with the same number of features it was fitted on. The
# earlier `trained_svm` was fitted on the full scaled feature set, so calling
# it on a two-column grid raises a feature-count error. This cell therefore
# fits a separate, illustrative SVC (default hyperparameters) on just two
# standardised features and draws THAT model's boundary. It is not the model
# evaluated above.
# NOTE(review): the two features below are an arbitrary illustrative choice;
# swap in whichever pair is most informative.
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

plot_features = ['Clump', 'UnifSize']

# Standardise the two chosen features, in line with the scaling applied
# before training the models above.
X_plot = StandardScaler().fit_transform(df[plot_features].astype(float))
y_plot = df['Class'].to_numpy()

boundary_svm = SVC().fit(X_plot, y_plot)

# Plot limits: data range padded by one unit on every side.
x_min, x_max = X_plot[:, 0].min() - 1, X_plot[:, 0].max() + 1
y_min, y_max = X_plot[:, 1].min() - 1, X_plot[:, 1].max() + 1

# Dense grid over the plotting area (in standardised units, so the grid
# stays small).
xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.02),
                     np.arange(y_min, y_max, 0.02))

# Predict the class of every grid point and restore the grid shape.
Z = boundary_svm.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)

plt.figure(figsize=(10, 6))

# Filled contours show the predicted class regions.
plt.contourf(xx, yy, Z, alpha=0.4)

# Overlay all samples, coloured by their actual class.
plt.scatter(X_plot[:, 0], X_plot[:, 1], c=y_plot, s=20, edgecolors='k')

plt.title('Decision Boundary of SVM Classifier')
plt.xlabel('{} (standardized)'.format(plot_features[0]))
plt.ylabel('{} (standardized)'.format(plot_features[1]))
plt.grid(True)
plt.show()