# UAS - 2079007 - Laras Ervintyana
M140 - Teknologi Open Source

## Breast Cancer Dataset dari Scikit-Learn

#### Melakukan Import Modul

In [None]:
import pandas as pd
from sklearn.datasets import load_breast_cancer

#### Memuat Dataset dari Scikit-Learn dan Menyimpannya ke File CSV

In [None]:
# Load the breast cancer dataset that ships with scikit-learn and place the
# feature matrix in a DataFrame with one column per feature name.
data = load_breast_cancer()
df = pd.DataFrame(data=data.data, columns=data.feature_names)

# Append the class label twice: once as the numeric code and once as its
# readable name (0 = malignant, 1 = benign).
label_names = pd.Categorical.from_codes(codes=data.target, categories=data.target_names)
df = df.assign(**{'target': data.target, 'target name': label_names})

# Persist the labelled table as a comma-separated file, without the row index.
df.to_csv('breast-cancer-wisconsin-data.csv', sep=',', index=False)

#### Load Dataset

In [None]:
# Read the previously written CSV back into a fresh DataFrame.
bc_df = pd.read_csv(filepath_or_buffer='breast-cancer-wisconsin-data.csv')

#### Menghilangkan Data Duplikat

In [None]:
# Remove duplicated rows, then display the resulting (rows, columns) shape
# as the cell output.
bc_df = bc_df.drop_duplicates()
bc_df.shape

## Klasifikasi Model

#### Melakukan Import Modul

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,confusion_matrix, classification_report

#### Dataset: Features & Class Label

In [None]:
# Build the feature matrix by removing BOTH label columns in a single call.
# Two separate `bc_df.drop(...)` assignments would each start again from
# `bc_df`, so only the last one would take effect and the numeric 'target'
# column would remain among the features, leaking the label into training.
X = bc_df.drop(columns=['target', 'target name'])

In [None]:
# Use the readable class name column as the label vector y.
y = bc_df.loc[:, 'target name']

#### Membagi Dataset menjadi Training Dataset dan Testing Dataset

In [None]:
# Hold out 40% of the rows for testing and train on the remaining 60%.
# The fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=10,
)

# Report the shape of each partition, one value per line, with a blank line
# between the training and testing sections.
print('training dataset', X_train.shape, y_train.shape, sep='\n')
print()
print('testing dataset:', X_test.shape, y_test.shape, sep='\n')

#### Klasifikasi Data Menggunakan K Nearest Neighbors

In [None]:
# Import Modul
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt

In [None]:
# Sweep the number of neighbours from 1 to 25 and record the test-set
# accuracy obtained for each value.
k_range = [*range(1, 26)]
scores = []
for k in k_range:
    # fit() returns the estimator itself, so construction and training chain.
    model_knn = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
    y_pred = model_knn.predict(X_test)
    scores.append(accuracy_score(y_test, y_pred))
# NOTE(review): after the loop `model_knn` is the model for the last k tried
# (25), not necessarily the best-scoring k.

In [None]:
# Line plot of accuracy against the number of neighbours:
# x-axis is k, y-axis is the accuracy score measured for that k.
fig, ax = plt.subplots()
ax.plot(k_range, scores)
ax.set_title('Accuracy Scores for Values of k of k-Nearest-Neighbors')
ax.set_xlabel('Value of k for KNN')
ax.set_ylabel('Accuracy Score')
fig.tight_layout()
plt.show()

#### Klasifikasi Data Menggunakan 'Support Vector Classifier'

In [None]:
# Import Modul
from sklearn.svm import SVC

In [None]:
# Configure the support vector classifier with gamma='scale', train it on the
# training split, then predict labels for the test split.
model_svc = SVC(gamma='scale').fit(X_train, y_train)
y_pred = model_svc.predict(X_test)

#### Klasifikasi Data Menggunakan 'Decision Tree Classifier'

In [None]:
# Import Modul
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Configure the decision tree, train it on the training split, then predict
# labels for the test split.
# random_state is fixed (same seed as the train/test split) so that the fitted
# tree, and therefore the reported accuracy, is reproducible between runs.
model_dt = DecisionTreeClassifier(random_state=10)
model_dt.fit(X_train, y_train)
y_pred = model_dt.predict(X_test)

#### Klasifikasi Data Menggunakan 'Random Forest Classifier'

In [None]:
# Import Modul
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Configure a random forest of 100 trees, train it on the training split, then
# predict labels for the test split.
# random_state is fixed (same seed as the train/test split) so that the
# bootstrap samples and feature choices, and therefore the reported accuracy,
# are reproducible between runs.
model_rf = RandomForestClassifier(n_estimators=100, random_state=10)
model_rf.fit(X_train, y_train)
pred_rf = model_rf.predict(X_test)

#### Membandingkan Performa dari 4 Model yang Digunakan

In [None]:
# Models under comparison, in plotting order: K Nearest Neighbors (model_knn),
# Support Vector Classifier (model_svc), Decision Tree Classifier (model_dt)
# and Random Forest Classifier (model_rf).
# NOTE(review): model_knn is whatever the earlier k sweep left bound, i.e. the
# model for the last k tried rather than the best-scoring one.
models = [model_knn, model_svc, model_dt, model_rf]

# Predict on the test split with every model and collect its accuracy.
accuracy_scores = []
for model in models:
    y_pred = model.predict(X_test)
    accuracy_scores.append(accuracy_score(y_test, y_pred))

print(accuracy_scores)

In [None]:
# Bar chart comparing the test-set accuracy of the four models.
plt.bar(['KNN', 'SVC', 'DT', 'RF'], accuracy_scores)

# Zoom in on the top of the accuracy range so small differences are visible.
# The floor is 0.90 when every score clears it with some margin; otherwise it
# drops just below the weakest score (never below 0) so that no bar is clipped
# out of view by a fixed lower limit.
y_floor = max(0.0, min(0.90, min(accuracy_scores) - 0.02))
plt.ylim(y_floor, 1.01)

plt.title('Performa dari 4 Model', fontsize=15, color='r')
plt.xlabel('Models', fontsize=18, color='g')
plt.ylabel('Accuracy Score', fontsize=18, color='g')
plt.tight_layout()
plt.show()