In [2]:
# Import the required libraries
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [3]:
# Load the cell-sample records from the CSV file into a DataFrame
data = 'dataset/klasifikasi/cell_samples.csv'
df = pd.read_csv(filepath_or_buffer=data)

In [4]:
# Dimensions of the loaded frame as a (rows, columns) tuple
df.shape

(699, 11)

In [5]:
# Preview the first five rows (head() returns five rows by default)
df.head()

Unnamed: 0,ID,Clump,UnifSize,UnifShape,MargAdh,SingEpiSize,BareNuc,BlandChrom,NormNucl,Mit,Class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


In [6]:
def dataSetAnalysis(df):
    """Print a quick textual overview of a DataFrame.

    Prints five sections, each followed by a 100-character ``=`` rule:
    the first three rows, the column names, the ``DataFrame.info()``
    summary, ``describe()`` of the numerical columns, and ``describe()``
    of the object-dtype (categorical) columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise. It is only read, never modified.

    Returns
    -------
    None
        Everything is written to standard output.
    """
    separator = "=" * 100

    # First rows of the data set
    print("Dataset Head")
    print(df.head(3))
    print(separator)

    # Column names
    print("Dataset Features")
    print(df.columns.values)
    print(separator)

    # Sample count, non-null count and dtype per column.
    # info() writes its report itself and returns None, so it must not be
    # wrapped in print() -- that would emit a stray "None" line.
    print("Dataset Features Details")
    df.info()
    print(separator)

    # Distribution of the numerical features
    print("Dataset Numerical Features")
    print(df.describe())
    print(separator)

    # Distribution of the categorical features.
    # describe(include=['O']) raises ValueError when the frame has no
    # object-dtype column, so only call it when there is one to describe.
    print("Dataset Categorical Features")
    if df.select_dtypes(include=['O']).shape[1] > 0:
        print(df.describe(include=['O']))
    else:
        print("No categorical (object dtype) features")
    print(separator)

In [7]:
# Print the overview (head, columns, info, describe) for the loaded frame
dataSetAnalysis(df)

Dataset Head
        ID  Clump  UnifSize  UnifShape  MargAdh  SingEpiSize BareNuc  \
0  1000025      5         1          1        1            2       1   
1  1002945      5         4          4        5            7      10   
2  1015425      3         1          1        1            2       2   

   BlandChrom  NormNucl  Mit  Class  
0           3         1    1      2  
1           3         2    1      2  
2           3         1    1      2  
Dataset Features
['ID' 'Clump' 'UnifSize' 'UnifShape' 'MargAdh' 'SingEpiSize' 'BareNuc'
 'BlandChrom' 'NormNucl' 'Mit' 'Class']
Dataset Features Details
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 699 entries, 0 to 698
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   ID           699 non-null    int64 
 1   Clump        699 non-null    int64 
 2   UnifSize     699 non-null    int64 
 3   UnifShape    699 non-null    int64 
 4   MargAdh      699 non-null    int64 
 

In [8]:
# Sixteen entries use the placeholder '?' for a missing value; convert them
# to pandas' missing-value marker so they are counted as nulls.
df.replace(to_replace='?', value=pd.NA, inplace=True)

# Share of missing values per column, in percent
df.isna().mean() * 100

ID             0.000000
Clump          0.000000
UnifSize       0.000000
UnifShape      0.000000
MargAdh        0.000000
SingEpiSize    0.000000
BareNuc        2.288984
BlandChrom     0.000000
NormNucl       0.000000
Mit            0.000000
Class          0.000000
dtype: float64

In [9]:
# Replace the missing 'BareNuc' values with 0 and make the column numeric.
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on the df['BareNuc'] selection: that chained
# in-place form triggers a pandas warning and does not update df when
# copy-on-write is active.
# The column holds text values because of the '?' markers, so it is parsed
# to numbers first; errors='coerce' turns any non-numeric entry into NaN,
# which is then filled with 0 as well.
df['BareNuc'] = pd.to_numeric(df['BareNuc'], errors='coerce').fillna(0).astype('int64')

In [10]:
# Confirm that no column has missing values left
df.isna().sum()

ID             0
Clump          0
UnifSize       0
UnifShape      0
MargAdh        0
SingEpiSize    0
BareNuc        0
BlandChrom     0
NormNucl       0
Mit            0
Class          0
dtype: int64

In [11]:
# Fraction of samples in each target class
df['Class'].value_counts().div(float(len(df)))

Class
2    0.655222
4    0.344778
Name: count, dtype: float64

In [12]:
# Recode the target labels: 2 -> 0 and 4 -> 1
df['Class'] = df['Class'].replace(to_replace={2: 0, 4: 1})

Class 0 (label asli 2) berarti sel bersifat jinak (benign).
Class 1 (label asli 4) berarti sel bersifat ganas (malignant).

In [13]:
# Number of samples per class after recoding the labels to 0 and 1
print(df['Class'].value_counts())

Class
0    458
1    241
Name: count, dtype: int64


In [14]:
# Split the frame into the feature matrix X and the target vector y.
#
# X: the eight columns from 'Clump' through 'NormNucl' (positions 1-8).
#    'ID' (position 0) is an identifier and is left out.
# y: the 'Class' column (position 10, the last column).
#
# NOTE(review): 'Mit' (position 9) is NOT part of X, although the original
# notes described the features as "columns 2-10". Including it would give
# nine features and the network input size would have to change to match.

# Features: label-based slice, both end points inclusive
X = df.loc[:, 'Clump':'NormNucl']
# Target
y = df['Class']

In [15]:
# Split the data into a training set (80%) and a test set (20%);
# random_state is fixed so the split is the same on every run.

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [16]:
# Standardise the features. The scaler is fitted on the training split only
# and the same fitted scaler is then applied to both splits.
from sklearn.preprocessing import StandardScaler

sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [28]:
# Import TensorFlow, Keras, Sequential, Dense
import tensorflow as tf

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

In [20]:
# Create an empty Sequential model; layers are added to it afterwards
classifier = Sequential()

In [21]:
# Build the network layer by layer.
# NOTE: add() appends to the existing model, so re-running this cell without
# re-creating `classifier` first stacks the layers a second time.

# Input: the width is taken from the training matrix instead of a hard-coded
# number, so the model always matches the number of selected features.
classifier.add(tf.keras.Input(shape=(X_train.shape[1],)))

# First hidden layer
classifier.add(Dense(units=6, activation='relu'))

# Second hidden layer
classifier.add(Dense(units=6, activation='relu'))

# Third hidden layer
classifier.add(Dense(units=6, activation='relu'))

# Output layer: one sigmoid unit for the binary target
classifier.add(Dense(units=1, activation='sigmoid'))

In [22]:
# Configure training: Adam optimiser, binary cross-entropy loss, accuracy metric
classifier.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Train for 100 epochs in mini-batches of 32 samples
classifier.fit(x=X_train, y=y_train, epochs=100, batch_size=32)

Epoch 1/100
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.3172 - loss: 0.7136
Epoch 2/100
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.4221 - loss: 0.6642 
Epoch 3/100
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7331 - loss: 0.6371 
Epoch 4/100
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.8559 - loss: 0.6093 
Epoch 5/100
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.8988 - loss: 0.5796 
Epoch 6/100
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.9217 - loss: 0.5587 
Epoch 7/100
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.9255 - loss: 0.5229 
Epoch 8/100
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9335 - loss: 0.5186 
Epoch 9/100
[1m18/18[0m [32m━━━━━━━━━━

<keras.src.callbacks.history.History at 0x7ff28c502e10>

Epoch 80/100
18/18 ━━━━━━━━━━━━━━━━━━━━ 0s 1ms/step - accuracy: 0.9854 - loss: 0.0473 

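Berdasarkan hasil di atas, model telah belajar dengan baik: pada data latih, akurasinya mencapai sekitar 98% dengan nilai loss sekitar 0.04, yang menunjukkan bahwa galat (binary cross-entropy) model pada data latih sudah rendah. Angka ini adalah kinerja pada data latih, sehingga kinerja pada data uji perlu dievaluasi secara terpisah.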

In [27]:
from sklearn.metrics import classification_report

# Predict probabilities for the test set and threshold them at 0.5
# to obtain binary labels
y_pred = classifier.predict(X_test) > 0.5

# Print precision, recall and F1 per class
print(classification_report(y_true=y_test, y_pred=y_pred))


[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
              precision    recall  f1-score   support

           0       0.99      0.98      0.98        85
           1       0.96      0.98      0.97        55

    accuracy                           0.98       140
   macro avg       0.98      0.98      0.98       140
weighted avg       0.98      0.98      0.98       140

