# 모델 만들기

In [1]:
import pandas as pd 
import joblib

In [2]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

## 유방암 데이터

In [3]:
df_train = pd.read_csv('../static/data/cancer_train.csv')
df_train.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,25.73,17.46,174.2,2010.0,0.1149,0.2363,0.3368,0.1913,0.1956,0.06121,...,23.58,229.3,3234.0,0.153,0.5937,0.6451,0.2756,0.369,0.08815,0
1,13.5,12.71,85.69,566.2,0.07376,0.03614,0.002758,0.004419,0.1365,0.05335,...,16.94,95.48,698.7,0.09023,0.05836,0.01379,0.0221,0.2267,0.06192,1
2,17.35,23.06,111.0,933.1,0.08662,0.0629,0.02891,0.02837,0.1564,0.05307,...,31.47,128.2,1218.0,0.124,0.1486,0.1211,0.08235,0.2452,0.06515,0
3,11.61,16.02,75.46,408.2,0.1088,0.1168,0.07097,0.04497,0.1886,0.0632,...,19.67,81.93,475.7,0.1415,0.217,0.2302,0.1105,0.2787,0.07427,1
4,11.87,21.54,76.83,432.0,0.06613,0.1064,0.08777,0.02386,0.1349,0.06612,...,28.18,83.51,507.2,0.09457,0.3399,0.3218,0.0875,0.2305,0.09952,1


In [4]:
y_train = df_train.target.values
scaler = MinMaxScaler()
X_train = scaler.fit_transform(df_train.drop(columns='target', axis=1))
X_train.shape, y_train.shape

((426, 30), (426,))

In [5]:
df_test = pd.read_csv('../static/data/cancer_test.csv')
y_test = df_test.target.values
X_test = scaler.fit_transform(df_test.drop(columns='target', axis=1))
X_test.shape, y_test.shape

((143, 30), (143,))

### 1. Logistic Regression

In [6]:
lr_clf = LogisticRegression()
lr_clf.get_params()

{'C': 1.0,
 'class_weight': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'l1_ratio': None,
 'max_iter': 100,
 'multi_class': 'auto',
 'n_jobs': None,
 'penalty': 'l2',
 'random_state': None,
 'solver': 'lbfgs',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}

In [7]:
params = {
    #'C': [0.1, 1, 5, 10]
    'C': [3, 4, 5, 6, 7]
}

In [8]:
grid_cv = GridSearchCV(lr_clf, param_grid=params, scoring='accuracy', cv=5)
grid_cv.fit(X_train, y_train)
print(f'최고 평균 정확도: {grid_cv.best_score_:.4f}')
print('최적 파라미터:', grid_cv.best_params_)

최고 평균 정확도: 0.9741
최적 파라미터: {'C': 5}


In [9]:
best_lr = grid_cv.best_estimator_
pred = best_lr.predict(X_test)
accuracy_score(y_test, pred)

0.972027972027972

In [10]:
joblib.dump(best_lr, '../static/model/cancer_lr.pkl')

['../static/model/cancer_lr.pkl']

### 2. SVM

In [11]:
sv_clf = SVC()
sv_clf.get_params()

{'C': 1.0,
 'break_ties': False,
 'cache_size': 200,
 'class_weight': None,
 'coef0': 0.0,
 'decision_function_shape': 'ovr',
 'degree': 3,
 'gamma': 'scale',
 'kernel': 'rbf',
 'max_iter': -1,
 'probability': False,
 'random_state': None,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}

In [12]:
params = {
    #'C': [0.1, 1, 5, 7, 10]
    'C': [5, 6, 7, 8, 9]
}

In [13]:
grid_cv = GridSearchCV(sv_clf, param_grid=params, scoring='accuracy', cv=5)
grid_cv.fit(X_train, y_train)
print(f'최고 평균 정확도: {grid_cv.best_score_:.4f}')
print('최적 파라미터:', grid_cv.best_params_)

최고 평균 정확도: 0.9789
최적 파라미터: {'C': 7}


In [14]:
best_sv = grid_cv.best_estimator_
pred = best_sv.predict(X_test)
accuracy_score(y_test, pred)

0.9790209790209791

In [15]:
joblib.dump(best_sv, '../static/model/cancer_sv.pkl')

['../static/model/cancer_sv.pkl']

### 3. Decision Tree

In [16]:
dt_clf = DecisionTreeClassifier()
dt_clf.get_params()

{'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'presort': 'deprecated',
 'random_state': None,
 'splitter': 'best'}

- 스케일된 데이터로 훈련/예측/평가

In [17]:
params = {
    'max_depth': [4, 5, 6, 7, 8],
    'min_samples_split': [2, 3, 4]
}

In [18]:
grid_cv = GridSearchCV(dt_clf, param_grid=params, scoring='accuracy', cv=5)
grid_cv.fit(X_train, y_train)
print(f'최고 평균 정확도: {grid_cv.best_score_:.4f}')
print('최적 파라미터:', grid_cv.best_params_)

최고 평균 정확도: 0.9460
최적 파라미터: {'max_depth': 5, 'min_samples_split': 2}


In [19]:
best_dt = grid_cv.best_estimator_
pred = best_dt.predict(X_test)
accuracy_score(y_test, pred)

0.8951048951048951

- 스케일하지 않은 원 데이터로 훈련/예측/평가

In [20]:
X_train = df_train.drop(columns='target', axis=1).values
X_test = df_test.drop(columns='target', axis=1).values

In [21]:
grid_cv = GridSearchCV(dt_clf, param_grid=params, scoring='accuracy', cv=5)
grid_cv.fit(X_train, y_train)
print(f'최고 평균 정확도: {grid_cv.best_score_:.4f}')
print('최적 파라미터:', grid_cv.best_params_)

최고 평균 정확도: 0.9460
최적 파라미터: {'max_depth': 5, 'min_samples_split': 3}


In [22]:
best_dt = grid_cv.best_estimator_
pred = best_dt.predict(X_test)
accuracy_score(y_test, pred)

0.9300699300699301

- 스케일하지 않고 원데이터로 학습한 모델을 저장

In [23]:
joblib.dump(best_dt, '../static/model/cancer_dt.pkl')

['../static/model/cancer_dt.pkl']

### Test
- Logistic Regression, SVM 은 스케일된 데이터로
- Decision Tree는 스케일하지 않은 데이터로

In [24]:
index = 20
scaled_test = scaler.fit_transform(df_test.iloc[:, :-1])
scaled_test.shape

(143, 30)

In [25]:
test_data = scaled_test[index, :].reshape(1,-1)
test_data

array([[0.10411744, 0.83605792, 0.09169563, 0.05119603, 0.38244376,
        0.19430688, 0.14603213, 0.17166144, 0.32941904, 0.25819288,
        0.03824404, 0.52516048, 0.028061  , 0.01376492, 0.5051476 ,
        0.2303048 , 0.12131313, 0.32600871, 0.15494894, 0.13968465,
        0.08021072, 0.65139442, 0.08210995, 0.03681193, 0.39210165,
        0.17416489, 0.24342746, 0.3143989 , 0.15684411, 0.29544724]])

In [26]:
test_data_dt = df_test.iloc[index, :-1].values.reshape(1,-1)
test_data_dt

array([[1.029e+01, 2.761e+01, 6.567e+01, 3.214e+02, 9.030e-02, 7.658e-02,
        5.999e-02, 2.738e-02, 1.593e-01, 6.127e-02, 2.199e-01, 2.239e+00,
        1.437e+00, 1.446e+01, 1.205e-02, 2.736e-02, 4.804e-02, 1.721e-02,
        1.843e-02, 4.938e-03, 1.084e+01, 3.491e+01, 6.957e+01, 3.576e+02,
        1.384e-01, 1.710e-01, 2.000e-01, 9.127e-02, 2.226e-01, 8.283e-02]])

In [27]:
label = df_test.iloc[index, -1]
label

1

In [28]:
pred_lr = best_lr.predict(test_data)
pred_sv = best_sv.predict(test_data)
pred_dt = best_dt.predict(test_data_dt)

In [29]:
label, pred_lr[0], pred_sv[0], pred_dt[0]

(1, 1, 1, 1)

In [30]:
df_test.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,20.26,23.03,132.4,1264.0,0.09078,0.1313,0.1465,0.08683,0.2095,0.05649,...,31.59,156.1,1750.0,0.119,0.3539,0.4098,0.1573,0.3689,0.08368,0
1,15.3,25.27,102.4,732.4,0.1082,0.1697,0.1683,0.08751,0.1926,0.0654,...,36.71,149.3,1269.0,0.1641,0.611,0.6335,0.2024,0.4027,0.09876,0
2,11.95,14.96,77.23,426.7,0.1158,0.1206,0.01171,0.01787,0.2459,0.06581,...,17.72,83.09,496.2,0.1293,0.1885,0.03122,0.04766,0.3124,0.0759,1
3,16.26,21.88,107.5,826.8,0.1165,0.1283,0.1799,0.07981,0.1869,0.06532,...,25.21,113.7,975.2,0.1426,0.2116,0.3344,0.1047,0.2736,0.07953,0
4,13.94,13.17,90.31,594.2,0.1248,0.09755,0.101,0.06615,0.1976,0.06457,...,15.38,94.52,653.3,0.1394,0.1364,0.1559,0.1015,0.216,0.07253,1


In [34]:
a = dict(zip(df_test.columns[:-1], df_test.iloc[index, :-1]))

In [35]:
a

{'mean radius': 10.29,
 'mean texture': 27.61,
 'mean perimeter': 65.67,
 'mean area': 321.4,
 'mean smoothness': 0.0903,
 'mean compactness': 0.07658,
 'mean concavity': 0.059989999999999995,
 'mean concave points': 0.02738,
 'mean symmetry': 0.1593,
 'mean fractal dimension': 0.061270000000000005,
 'radius error': 0.2199,
 'texture error': 2.239,
 'perimeter error': 1.4369999999999998,
 'area error': 14.46,
 'smoothness error': 0.01205,
 'compactness error': 0.027360000000000002,
 'concavity error': 0.04804,
 'concave points error': 0.01721,
 'symmetry error': 0.018430000000000002,
 'fractal dimension error': 0.0049380000000000005,
 'worst radius': 10.84,
 'worst texture': 34.91,
 'worst perimeter': 69.57,
 'worst area': 357.6,
 'worst smoothness': 0.1384,
 'worst compactness': 0.171,
 'worst concavity': 0.2,
 'worst concave points': 0.09127,
 'worst symmetry': 0.2226,
 'worst fractal dimension': 0.08283}