In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_score, accuracy_score, recall_score, balanced_accuracy_score
from sklearn.metrics import classification_report, confusion_matrix

## Data Exploration & Preprocessing

In [None]:
# Both CSV files are fetched from Google Drive; the two URLs differ only in the file id.
GDRIVE_URL_PREFIX = 'https://drive.google.com/uc?id='
TEST_FILE_ID = '1MVJqOW4RkG796RDLpwDe0zZwY6z13xK7'
TRAIN_FILE_ID = '1uGVFv10T-lgFBM4dieQPxVbg5CRSarYK'

dwn_url_test = f'{GDRIVE_URL_PREFIX}{TEST_FILE_ID}'
dwn_url_train = f'{GDRIVE_URL_PREFIX}{TRAIN_FILE_ID}'

# Read the test frame first, then the training frame
df_test = pd.read_csv(dwn_url_test)
df_train = pd.read_csv(dwn_url_train)


In [None]:
# Preview the first rows of the training frame
df_train.head()

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [None]:
# Summarize the training frame: per-column non-null counts and dtypes
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 463 entries, 0 to 462
Data columns (total 32 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       463 non-null    int64  
 1   diagnosis                463 non-null    object 
 2   radius_mean              463 non-null    float64
 3   texture_mean             463 non-null    float64
 4   perimeter_mean           463 non-null    float64
 5   area_mean                463 non-null    float64
 6   smoothness_mean          463 non-null    float64
 7   compactness_mean         463 non-null    float64
 8   concavity_mean           463 non-null    object 
 9   concave points_mean      463 non-null    object 
 10  symmetry_mean            463 non-null    float64
 11  fractal_dimension_mean   463 non-null    float64
 12  radius_se                463 non-null    float64
 13  texture_se               463 non-null    float64
 14  perimeter_se             4

In [None]:
# Summarize the test frame: per-column non-null counts and dtypes
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 106 entries, 0 to 105
Data columns (total 31 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       106 non-null    int64  
 1   radius_mean              106 non-null    float64
 2   texture_mean             106 non-null    float64
 3   perimeter_mean           106 non-null    float64
 4   area_mean                106 non-null    float64
 5   smoothness_mean          106 non-null    float64
 6   compactness_mean         106 non-null    float64
 7   concavity_mean           106 non-null    float64
 8   concave points_mean      106 non-null    float64
 9   symmetry_mean            106 non-null    float64
 10  fractal_dimension_mean   106 non-null    float64
 11  radius_se                106 non-null    float64
 12  texture_se               106 non-null    float64
 13  perimeter_se             106 non-null    float64
 14  area_se                  1

In [None]:
# Count the missing values in every column of the test frame
print(df_test.isna().sum())

id                         0
radius_mean                0
texture_mean               0
perimeter_mean             0
area_mean                  0
smoothness_mean            0
compactness_mean           0
concavity_mean             0
concave points_mean        0
symmetry_mean              0
fractal_dimension_mean     0
radius_se                  0
texture_se                 0
perimeter_se               0
area_se                    0
smoothness_se              0
compactness_se             0
concavity_se               0
concave points_se          0
symmetry_se                0
fractal_dimension_se       0
radius_worst               0
texture_worst              0
perimeter_worst            0
area_worst                 0
smoothness_worst           0
compactness_worst          0
concavity_worst            0
concave points_worst       0
symmetry_worst             0
fractal_dimension_worst    0
dtype: int64


In [None]:
# Convert the concavity / concave-points columns to a numeric dtype.
# With errors='coerce', any entry that cannot be parsed becomes NaN.
for column_name in (
    "concavity_se",
    "concavity_mean",
    "concavity_worst",
    "concave points_mean",
    "concave points_se",
    "concave points_worst",
):
    df_train[column_name] = pd.to_numeric(df_train[column_name], errors='coerce')


In [None]:
# Count the missing values in every column of the training frame
print(df_train.isna().sum())

id                         0
diagnosis                  0
radius_mean                0
texture_mean               0
perimeter_mean             0
area_mean                  0
smoothness_mean            0
compactness_mean           0
concavity_mean             7
concave points_mean        7
symmetry_mean              0
fractal_dimension_mean     0
radius_se                  0
texture_se                 0
perimeter_se               0
area_se                    0
smoothness_se              0
compactness_se             0
concavity_se               7
concave points_se          7
symmetry_se                0
fractal_dimension_se       0
radius_worst               0
texture_worst              0
perimeter_worst            0
area_worst                 0
smoothness_worst           0
compactness_worst          0
concavity_worst            7
concave points_worst       7
symmetry_worst             0
fractal_dimension_worst    0
dtype: int64


In [None]:
# HANDLING MISSING VALUE
# Replace the NaNs in each affected column with that column's median.
# The filled column is assigned back to the frame rather than calling
# fillna(inplace=True) on a column selection: that chained in-place form
# emits a FutureWarning on recent pandas and does not update df_train at all
# once Copy-on-Write is enabled.
columns_to_impute = [
    "concavity_mean",
    "concave points_mean",
    "concavity_se",
    "concave points_se",
    "concavity_worst",
    "concave points_worst",
]
for column_name in columns_to_impute:
    column_median = df_train[column_name].median()
    df_train[column_name] = df_train[column_name].fillna(column_median)


In [None]:
# Confirm that no missing value is left anywhere in the training frame
df_train.isna().to_numpy().any()

False

In [None]:
# (rows, columns) of the training frame
df_train.shape

(463, 32)

In [None]:
# List the column labels of the training frame
df_train.columns

Index(['id', 'diagnosis', 'radius_mean', 'texture_mean', 'perimeter_mean',
       'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean',
       'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean',
       'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se',
       'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se',
       'fractal_dimension_se', 'radius_worst', 'texture_worst',
       'perimeter_worst', 'area_worst', 'smoothness_worst',
       'compactness_worst', 'concavity_worst', 'concave points_worst',
       'symmetry_worst', 'fractal_dimension_worst'],
      dtype='object')

In [None]:
# CHECK FOR CLASS IMBALANCE
# Number of rows per diagnosis label
df_train["diagnosis"].value_counts()

B    275
M    188
Name: diagnosis, dtype: int64

In [None]:
# Remove the id column from the training frame, then preview the result
df_train = df_train.drop(columns=['id'])
df_train.head()

Unnamed: 0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [None]:
# CONVERT LABEL TARGET
# Encode the diagnosis labels as integers: 'M' -> 1, 'B' -> 0.
diagnosis = {'M': 1, 'B': 0}

# Vectorized lookup; labels missing from the mapping come back as NaN,
# so fail loudly (KeyError, as a plain dict lookup would) instead of
# silently carrying NaN targets into training.
encoded_diagnosis = df_train["diagnosis"].map(diagnosis)
unmapped_rows = encoded_diagnosis.isna()
if unmapped_rows.any():
    unknown_labels = sorted(df_train.loc[unmapped_rows, "diagnosis"].astype(str).unique())
    raise KeyError(f"Unexpected diagnosis label(s): {unknown_labels}")
df_train["diagnosis"] = encoded_diagnosis.astype("int64")

# NOTE(review): df_exp is a second name for the SAME DataFrame object, not a
# snapshot; later in-place changes to df_train are visible through df_exp.
# Use df_train.copy() here if an independent copy was intended.
df_exp = df_train

In [None]:
# Rename the label column from 'diagnosis' to 'class', modifying the existing
# DataFrame object in place. Any other name bound to the same object (df_exp
# was assigned from df_train without a copy) therefore sees the renamed column too.
df_train.rename(columns={'diagnosis': 'class'}, inplace=True)

In [None]:
# SPLITTING DATA

# Separate the target column from the feature columns
y = df_train['class']
X = df_train.drop(columns=['class'])

# Hold out 30% of the rows for testing; the seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

In [None]:
# SCALING DATA
# The scaler's statistics are learned from the training split only and then
# applied to both splits. transform() returns new arrays and leaves its input
# untouched, so the results are stored under their own names instead of being
# discarded.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Display the scaled test split
X_test_scaled

array([[-0.05640078,  0.03947048, -0.9997156 , ..., -0.88660477,
        -0.82170966,  0.32068663],
       [-0.05660232, -0.71182095, -1.3407452 , ..., -0.87273742,
        -0.721201  , -0.33748245],
       [-0.05446348,  0.40590681,  2.0843424 , ...,  2.00024287,
         3.7641853 ,  0.81339421],
       ...,
       [-0.05577171, -0.64202356, -0.20918435, ..., -0.59401886,
        -0.82621005,  0.15890205],
       [-0.05570559, -0.0157858 , -0.07605954, ...,  0.96247722,
         0.37239318,  1.12120513],
       [-0.05574566, -0.38706917, -0.18905128, ..., -1.31694988,
        -1.60777736, -0.84332189]])

## TPOT

In [None]:
# !pip install tpot
# from tpot import TPOTClassifier

In [None]:
# tpot = TPOTClassifier(verbosity=2, population_size=40, n_jobs=-1, generations=5)
# tpot.fit(X_train, y_train)

In [None]:
# tpot.export('tpot_pipeline.py')

In [None]:
# predictions = tpot.predict(X_test)

In [None]:
# print("Confusion Matrix:")
# print(confusion_matrix(y_test, predictions))

In [None]:
# print("Classification Report")
# print(classification_report(y_test, predictions))

## Load Pipeline


In [None]:
# Pipeline produced by TPOT

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split

# NOTE: the outcome column of df_train is labeled 'class'
tpot_data = df_train
features = tpot_data.drop('class', axis=1)

# Fixed seed so the hold-out split, and every metric computed from it, is the
# same on each run (default test size of train_test_split is kept).
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['class'], random_state=0)

# Average CV score on the training set was: 0.9846153846153847
# subsample < 1 and max_features < 1 make the fit stochastic, so the estimator
# is seeded as well to keep the reported results reproducible.
exported_pipeline = GradientBoostingClassifier(learning_rate=0.5, max_depth=6,
                                               max_features=0.4, min_samples_leaf=5,
                                               min_samples_split=15, n_estimators=100,
                                               subsample=0.7000000000000001,
                                               random_state=0)

# Fit on the training split and predict the held-out split
exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)


In [None]:
# Confusion matrix of the baseline predictions on the held-out split
print("Confusion Matrix:", confusion_matrix(testing_target, results), sep="\n")

Confusion Matrix:
[[76  1]
 [ 3 36]]


In [None]:
# Per-class precision / recall / f1 of the baseline predictions
print("Classification Report", classification_report(testing_target, results), sep="\n")

Classification Report
              precision    recall  f1-score   support

           0       0.96      0.99      0.97        77
           1       0.97      0.92      0.95        39

    accuracy                           0.97       116
   macro avg       0.97      0.96      0.96       116
weighted avg       0.97      0.97      0.97       116



In [None]:
# Make predictions on the submission data

# Remove the id column from the test frame, then preview the result
df_test = df_test.drop(columns=['id'])
df_test.head()

Unnamed: 0,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,11.6,18.36,73.88,412.7,0.08508,0.05855,0.03367,0.01777,0.1516,0.05859,...,12.77,24.02,82.68,495.1,0.1342,0.1808,0.186,0.08288,0.321,0.07863
1,13.17,18.22,84.28,537.3,0.07466,0.05994,0.04859,0.0287,0.1454,0.05549,...,14.9,23.89,95.1,687.6,0.1282,0.1965,0.1876,0.1045,0.2235,0.06925
2,13.24,20.13,86.87,542.9,0.08284,0.1223,0.101,0.02833,0.1601,0.06432,...,15.44,25.5,115.0,733.5,0.1201,0.5646,0.6556,0.1357,0.2845,0.1249
3,13.14,20.74,85.98,536.9,0.08675,0.1089,0.1085,0.0351,0.1562,0.0602,...,14.8,25.46,100.9,689.1,0.1351,0.3549,0.4504,0.1181,0.2563,0.08174
4,9.668,18.1,61.06,286.3,0.08311,0.05428,0.01479,0.005769,0.168,0.06412,...,11.15,24.62,71.11,380.2,0.1388,0.1255,0.06409,0.025,0.3057,0.07875


In [None]:
# Generate the predictions
# Pass exactly the columns the model was trained on, in training order.
# Predicting on the whole frame fails when this cell is re-run after the
# 'diagnosis' column has been added to df_test further down.
submission = exported_pipeline.predict(df_test[features.columns])


In [None]:
# Attach the predicted labels to the test frame as a 'diagnosis' column
df_test = df_test.assign(diagnosis=submission)


In [None]:
# Save the test frame (including predictions) as CSV, without the row index
df_test.to_csv(path_or_buf='Predict_Diagnosis.csv', index=False)

## Experiment


In [None]:
#Balancing Class With SMOTE
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline

In [None]:
# define pipeline: SMOTE oversampling followed by a gradient-boosting model
# with the same hyperparameters as the exported TPOT pipeline.
# Both steps are randomized (synthetic sample generation; subsample and
# max_features below 1), so each gets a fixed seed to make the experiment
# reproducible.
steps = [
    ('over', SMOTE(random_state=0)),
    ('model', GradientBoostingClassifier(learning_rate=0.5, max_depth=6,
                                         max_features=0.4, min_samples_leaf=5,
                                         min_samples_split=15, n_estimators=100,
                                         subsample=0.7000000000000001,
                                         random_state=0)),
]
pipeline = Pipeline(steps=steps)

# Fit on the training split and predict the same held-out split as the baseline
pipeline.fit(training_features, training_target)
results_exp = pipeline.predict(testing_features)


In [None]:
# Confusion matrix of the SMOTE-pipeline predictions on the held-out split
print("Confusion Matrix:", confusion_matrix(testing_target, results_exp), sep="\n")

Confusion Matrix:
[[76  1]
 [ 2 37]]


In [None]:
# Report on the SMOTE-pipeline predictions (results_exp), not the baseline
# predictions (results), so it matches the confusion matrix printed for
# this experiment.
print("Classification Report")
print(classification_report(testing_target, results_exp))

Classification Report
              precision    recall  f1-score   support

           0       0.96      0.99      0.97        77
           1       0.97      0.92      0.95        39

    accuracy                           0.97       116
   macro avg       0.97      0.96      0.96       116
weighted avg       0.97      0.97      0.97       116

