# Latihan Workshop

deskripsi data ada di:
<code>https://archive.ics.uci.edu/dataset/15/breast+cancer+wisconsin+original</code>

perhatikan mana ***fitur*** dan mana ***label***

ambil data di:
<code>https://raw.githubusercontent.com/dsnalzami/dataset_resources/main/breast-cancer-wisconsin.data</code>

tugas anda:
- tambahkan kode jika ada yang kurang
- jika ada FIX_ME, silakan ganti dengan variabel atau nilai yang tepat


## load library

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import missingno

from sklearn.model_selection import train_test_split

from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

from sklearn.preprocessing import StandardScaler

from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

from sklearn.metrics import confusion_matrix

In [None]:
pd.set_option('display.max_columns', None)

## BAB 1 - (J.62DMI00.004.1) Mengumpulkan data

catatan: perhatikan apakah header ada atau tidak.

In [None]:
dataset = pd.read_csv('FIX_ME', header=FIX_ME)

In [None]:
dataset.head()

catatan: jika diperlukan, silakan masukkan nama columns ke dataset

In [None]:
# buka kode dibawah jika diperlukan
dataset.columns=['FIX_ME']

In [None]:
dataset.head()

## BAB 2 – (J.62DMI00.005.1) Menelaah data



### analisis karakteristik data
menggunakan info

In [None]:
dataset.FIX_ME()

### cek deskripsi data
menggunakan describe

In [None]:
dataset.FIX_ME()

### cek distribusi Class
pastikan anda sudah mengetahui class yang dicari

In [None]:
dataset['FIX_ME'].value_counts()

In [None]:
# Bar chart of the number of rows per value of the label column.
sns.set(font_scale=1.4)
class_counts = dataset['FIX_ME'].value_counts()
class_counts.plot(kind='bar', figsize=(7, 6), rot=0)
plt.xlabel("Status Pasien", labelpad=14)
plt.ylabel("Jumlah", labelpad=14)
# Trailing semicolon keeps the notebook from echoing the Text object.
plt.title("Status Pasien", y=1.02);

## Cek Distribusi fitur
silakan perbanyak kode dibawah ini sesuai fitur

In [None]:
sns.set(font_scale=1.4)
dataset['FIX_ME'].value_counts().plot(kind='bar', figsize=(20, 6), rot=0)
plt.xlabel("FIX_ME", labelpad=14)
plt.ylabel("Jumlah", labelpad=14)
plt.title("Distribusi", y=1.02);

In [None]:
plt.subplots(figsize = (25,5))
sns.countplot(x=dataset['FIX_ME'],order=dataset['FIX_ME'].value_counts().index,hue=dataset['FIX_ME'])
plt.show()

## BAB 3 – (J.62DMI00.006.1) memvalidasi data

**temuan**:
- Penilaian kualitas data sesuai dengan tujuan teknis data science, dengan sajian sebagaimana hasil analisis karakteristik data
- Penilaian tingkat kecukupan data sesuai dengan tujuan teknis data science, dengan sajian sebagaimana hasil analisis karakteristik data
- ada beberapa fitur yang nilainya hilang, harus digunakan tahapan **`memperbaiki nilai yang hilang`**


## BAB 4 – (J.62DMI00.007.1) menentukan object data
silakan generate table menggunakan https://www.tablesgenerator.com/markdown_tables

## BAB 5 – (J.62DMI00.008.1) membersihkan data

### Memperbaiki nilai yang hilang

catatan: nilai yang hilang ditulis dalam bentuk `?`, maka kita ubah menjadi NaN

In [None]:
dataset.replace("FIX_ME", np.nan, inplace= True)

### tahap memperbaiki nilai yang hilang


In [None]:
dataset.isnull().values.any()

In [None]:
dataset.loc[:, dataset.isnull().any()].columns

In [None]:
# Percentage of missing entries, reported only for columns that have any.
columns_with_missing = list(dataset.columns[dataset.isnull().any()])
dataset.loc[:, columns_with_missing].isnull().sum() / len(dataset) * 100

In [None]:
missingdata_df = dataset.columns[dataset.isnull().any()].tolist()
missingno.matrix(dataset[missingdata_df])

In [None]:
# hapus fitur jika fitur yang hilang lebih dari 50%
# dataset.drop(['fitur'], axis="columns", inplace=True)

In [None]:
# Fill in the missing values with the column median.
# NOTE(review): the column presumably still has object dtype here (it was
# parsed as text because of the "?" markers), and pandas >= 2.0 raises
# TypeError when asked for the median of strings. Converting to numeric first
# makes the median well defined; pd.to_numeric leaves an already-numeric
# column unchanged and raises on any unexpected non-numeric token.
numeric_values = pd.to_numeric(dataset['FIX_ME'])
median_value = numeric_values.median()
dataset['FIX_ME'] = numeric_values.fillna(median_value)

In [None]:
dataset.info()

## BAB 6 – (J.62DMI00.009.1) mengkonstruksi data

In [None]:
dataset['FIX_ME'] = dataset['FIX_ME'].astype(str).astype(FIX_ME)
# tambahkan jika perlu

In [None]:
dataset.info()

### menghapus nilai yang bernilai konstan

In [None]:
dataset.info()

In [None]:
# Keep only the columns whose number of distinct values is not exactly one.
distinct_counts = dataset.nunique()
dataset = dataset.loc[:, distinct_counts != 1]

In [None]:
dataset.info()

## BAB 7 – (J.62DMI00.010.1) menentukan label data

In [None]:
y = dataset['FIX_ME']

In [None]:
X = dataset.drop(columns=['FIX_ME'])

### Perlakuan data original

In [None]:
X_train, X_test, Y_train, Y_test = train_test_split(X,y, test_size=FIX_ME, random_state=42)

### Perlakuan data normal

In [None]:
X_norm = StandardScaler().fit_transform(X)

In [None]:
# Split the raw features first, then fit the scaler on the training rows only.
# Fitting StandardScaler on the full X before splitting lets the mean/std of
# the test rows leak into the preprocessing of the training data.
X_train_raw, X_test_raw, Y_train_norm, Y_test_norm = train_test_split(X, y, test_size=FIX_ME, random_state=42)
scaler = StandardScaler().fit(X_train_raw)
X_train_norm = scaler.transform(X_train_raw)
X_test_norm = scaler.transform(X_test_raw)

## BAB 8 – (J.62DMI00.013.1) membangun model

### menggunakan Decision Tree

In [None]:
# Hyper-parameter grid searched by GridSearchCV for the decision tree.
# Every key carries the "model__" prefix because the classifier is the
# pipeline step named 'model'. Each FIX_ME is a workshop placeholder: the
# np.arange upper bounds are exclusive, and the two criterion entries must be
# replaced with criterion names accepted by DecisionTreeClassifier.
parameters_dt = {
                "model__max_depth": np.arange(1,FIX_ME),
                "model__min_samples_leaf": np.arange(1,FIX_ME,2),
                "model__min_samples_split": np.arange(2,FIX_ME),
                "model__criterion": ['FIX_ME','FIX_ME'],
                # A single fixed seed so repeated searches give the same trees.
                "model__random_state": [42]
}

In [None]:
classifier_dt_pipeline = Pipeline([
                          ('model',DecisionTreeClassifier())
                          ])

#### Pemodelan data original

In [None]:
ori_classifier_dt = GridSearchCV(classifier_dt_pipeline, parameters_dt, cv=FIX_ME, n_jobs=-1)

In [None]:
# Y_train is a pandas Series; Series.ravel() is deprecated since pandas 2.2,
# and the Series is already one-dimensional, so hand over the NumPy array directly.
ori_classifier_dt.fit(X_train, Y_train.to_numpy())

In [None]:
ori_classifier_dt.best_estimator_

In [None]:
# Report the winning value of every searched hyper-parameter, in key order.
ori_best_params = ori_classifier_dt.best_params_
for grid_key in sorted(parameters_dt):
    print(f'{grid_key}: {ori_best_params[grid_key]!r}')

#### Pemodelan data normal

In [None]:
norm_classifier_dt = GridSearchCV(classifier_dt_pipeline, parameters_dt, cv=FIX_ME, n_jobs=-1)

In [None]:
# Y_train_norm is a pandas Series; Series.ravel() is deprecated since pandas 2.2,
# and the Series is already one-dimensional, so hand over the NumPy array directly.
norm_classifier_dt.fit(X_train_norm, Y_train_norm.to_numpy())

In [None]:
norm_classifier_dt.best_estimator_

In [None]:
# Report the winning value of every searched hyper-parameter, in key order.
norm_best_params = norm_classifier_dt.best_params_
for grid_key in sorted(parameters_dt):
    print(f'{grid_key}: {norm_best_params[grid_key]!r}')

## BAB 9 – (J.62DMI00.014.1) mengevaluasi hasil pemodelan

### evaluasi data original

In [None]:
# Evaluate the model fitted on the original (unscaled) data.
# The accuracy/precision/recall variables defined here are read again by the
# reporting cells further down, so their names must not change.
# average='FIX_ME' is a workshop placeholder for the averaging mode passed to
# precision_score / recall_score.

# --- training split ---
ori_y_pred_dt_train = ori_classifier_dt.predict(X_train)

ori_accuracy_dt_train = accuracy_score(Y_train,ori_y_pred_dt_train)
print('Akurasi pada training set: ', ori_accuracy_dt_train)

ori_precision_dt_train = precision_score(Y_train,ori_y_pred_dt_train, average='FIX_ME')
print('Precision pada training set: ', ori_precision_dt_train)

ori_recall_dt_train = recall_score(Y_train,ori_y_pred_dt_train, average='FIX_ME')
print('Recall pada training set: ', ori_recall_dt_train)

# --- test split ---
ori_y_pred_dt_test = ori_classifier_dt.predict(X_test)

ori_accuracy_dt_test = accuracy_score(Y_test,ori_y_pred_dt_test)
print('Akurasi pada test set: ', ori_accuracy_dt_test)

ori_precision_dt_test = precision_score(Y_test,ori_y_pred_dt_test, average='FIX_ME')
print('Precision pada test set: ', ori_precision_dt_test)

ori_recall_dt_test = recall_score(Y_test,ori_y_pred_dt_test, average='FIX_ME')
print('Recall pada test set: ', ori_recall_dt_test)

In [None]:
sns.heatmap(confusion_matrix(Y_test,ori_y_pred_dt_test),annot=True,cmap='viridis', fmt='.0f')
plt.xlabel('Predicted Values', fontdict={'size':14}, labelpad=10)
plt.ylabel('Actual Values', fontdict={'size':14}, labelpad=10)
plt.title('Confusion Matrix pada bagian testing untuk data asli')
plt.show()

### Evaluasi data normal

In [None]:
# Evaluate the model fitted on the standardised data.
# The accuracy/precision/recall variables defined here are read again by the
# reporting cells further down, so their names must not change.
# average='FIX_ME' is a workshop placeholder for the averaging mode passed to
# precision_score / recall_score.

# --- training split ---
norm_y_pred_dt_train = norm_classifier_dt.predict(X_train_norm)

norm_accuracy_dt_train = accuracy_score(Y_train_norm,norm_y_pred_dt_train)
print('Akurasi pada training set: ', norm_accuracy_dt_train)

norm_precision_dt_train = precision_score(Y_train_norm,norm_y_pred_dt_train, average='FIX_ME')
print('Precision pada training set: ', norm_precision_dt_train)

norm_recall_dt_train = recall_score(Y_train_norm,norm_y_pred_dt_train, average='FIX_ME')
print('Recall pada training set: ', norm_recall_dt_train)

# --- test split ---
norm_y_pred_dt_test = norm_classifier_dt.predict(X_test_norm)

norm_accuracy_dt_test = accuracy_score(Y_test_norm,norm_y_pred_dt_test)
print('Akurasi pada test set: ', norm_accuracy_dt_test)

norm_precision_dt_test = precision_score(Y_test_norm,norm_y_pred_dt_test, average='FIX_ME')
print('Precision pada test set: ', norm_precision_dt_test)

norm_recall_dt_test = recall_score(Y_test_norm,norm_y_pred_dt_test, average='FIX_ME')
print('Recall pada test set: ', norm_recall_dt_test)

In [None]:
# Confusion matrix of the test-split predictions made on the normalised data.
# The title previously said "data asli" (original data), copied from the cell
# for the unscaled model, which mislabelled this figure.
sns.heatmap(confusion_matrix(Y_test_norm,norm_y_pred_dt_test),annot=True,cmap='viridis', fmt='.0f')
plt.xlabel('Predicted Values', fontdict={'size':14}, labelpad=10)
plt.ylabel('Actual Values', fontdict={'size':14}, labelpad=10)
plt.title('Confusion Matrix pada bagian testing untuk data normalisasi')
plt.show()

### pelaporan evaluasi

In [None]:
models = [
          ('Machine Learning Data Original', ori_accuracy_dt_train, ori_accuracy_dt_test),
          ('Machine Learning Data Normalisasi', norm_accuracy_dt_train, norm_accuracy_dt_test)
         ]

In [None]:
predict = pd.DataFrame(data = models, columns=['Model', 'Training Accuracy', 'Test Accuracy'])
predict

In [None]:
models_comparison = [
                        ('Machine Learning Data Original', ori_accuracy_dt_test, ori_recall_dt_test, ori_precision_dt_test),
                        ('Machine Learning Data Normalisasi', norm_accuracy_dt_test, norm_recall_dt_test, norm_precision_dt_test)
                    ]

In [None]:
comparison = pd.DataFrame(data = models_comparison, columns=['Model', 'Accuracy', 'Recall', 'Precision'])
comparison

In [None]:
import numpy as np

# Two stacked horizontal bar charts: training accuracy on top, test accuracy
# below. `predict` is re-sorted in place before each panel, so it ends up
# ordered by test accuracy.
f, axes = plt.subplots(2, 1, figsize=(14, 10))

panel_specs = (
    ('Training Accuracy', 'Blues_d'),
    ('Test Accuracy', 'Greens_d'),
)
for panel_ax, (metric_column, palette_name) in zip(axes, panel_specs):
    predict.sort_values(by=[metric_column], ascending=False, inplace=True)
    sns.barplot(x=metric_column, y='Model', data=predict, palette=palette_name, ax=panel_ax)
    panel_ax.set_xlabel(metric_column, size=16)
    panel_ax.set_ylabel('Model')
    # Accuracy is a fraction, so pin the axis to [0, 1] with ticks every 0.1.
    panel_ax.set_xlim(0, 1.0)
    panel_ax.set_xticks(np.arange(0, 1.1, 0.1))

plt.show()

### Hasil terbaik akan dilihat Tree nya (Studi Kasus Machine Learning data original)

In [None]:
from sklearn import tree

In [None]:
def Analize_Feature(model,X_test):
    """Print and return the feature importances of a fitted search result.

    Args:
        model: Object exposing
            ``best_estimator_.named_steps["model"].feature_importances_``
            (for example a fitted ``GridSearchCV`` over a pipeline whose
            estimator step is named ``"model"``).
        X_test: DataFrame whose ``columns`` provide the feature names, paired
            with the importances in order.

    Returns:
        DataFrame with columns ``name`` and ``coef``, one row per feature, in
        the original column order. The sorted view is only printed.

    Side effects:
        Sets the global pandas option ``display.max_rows`` to the number of
        columns of ``X_test`` so that the printed table is not truncated.
    """
    feature_names = list(X_test.columns)
    pd.set_option('display.max_rows', len(feature_names))
    importances = model.best_estimator_.named_steps["model"].feature_importances_

    # zip stops at the shorter sequence, as the original pairing loop did.
    pairs = list(zip(feature_names, importances))

    # Build the table once, after collecting all pairs. Building it inside the
    # loop redid the work per feature and left the name unbound (an
    # UnboundLocalError) when there were no columns at all.
    df_features_importance = pd.DataFrame({
        'name': [name for name, _ in pairs],
        'coef': [importance for _, importance in pairs],
    })
    print(df_features_importance.sort_values(by='coef',ascending=False))
    return df_features_importance
Analize_Feature(ori_classifier_dt,X_test)

In [None]:
# Labels used to annotate the tree: fn is passed as feature_names, cn as
# class_names. FIX_ME entries are workshop placeholders.
fn = ['FIX_ME']
cn = ['FIX_ME', 'FIX_ME']

# The fitted decision tree is the 'model' step of the best pipeline found.
best_tree = ori_classifier_dt.best_estimator_.named_steps["model"]

fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(4, 4), dpi=300)
tree.plot_tree(best_tree, feature_names=fn, class_names=cn, filled=True);
fig.savefig('imagename.png')