In [258]:
# category 타입으로 바꿔주는 함수

def change_category(data):
    """Cast the categorical columns of ``data`` to pandas ``category`` dtype, in place.

    The columns '품종', '색상', '성별', '중성화여부' and '상태' are converted one
    after another, so the caller's DataFrame itself is modified.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame that contains all five columns listed above.

    Returns
    -------
    The result of ``data.info()``, which prints a summary of the frame
    (pandas documents its return value as None).
    """
    for column in ('품종', '색상', '성별', '중성화여부', '상태'):
        data[column] = data[column].astype('category')

    summary = data.info()
    return summary

In [259]:
# mix 견 / mix 견 아닌거 나눠주는 함수
# [0] --> no_mix
# [1] --> yes_mix


def divide_mix(data):
    """Split ``data`` into non-mix and mix rows based on the '품종' column.

    Rows whose '품종' value equals 3 form the mix group; every other row goes
    to the non-mix group.

    Returns
    -------
    tuple
        ``(no_mix, yes_mix)`` - index 0 holds the non-mix rows, index 1 the
        mix rows.
    """
    breed = data['품종']
    not_mix_rows = data.loc[breed != 3]
    mix_rows = data.loc[breed == 3]

    return not_mix_rows, mix_rows

### Data Imbalanced Solution
---
#### Oversampling
        
    1. OverSampling
    
        1.1 SMOTE
            : Overfitting 이 발생할 수도 있다.
            
        1.2 ADASYN 
            : SMOTE의 개선된 버전 
            
    +A. Cost-sensitive learning
        
            소수의 클래스에 대한 cost 값에 가중치를 더 많이 주어 균형 잡힌 학습이 가능하게 하는 방법
            
    

In [260]:
# 1번째 --> RandomOverSampler

# 2번째 --> SMOTE

# 3번째 --> ADASYN


from imblearn.over_sampling import RandomOverSampler
# from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import SMOTENC
# from imblearn.over_sampling import ADASYN


def over_sampling(data):
    """Oversample the minority classes of ``data`` with two different samplers.

    The features are the columns '품종_mean', '색상', '성별', '체중', '중성화여부'
    and '당시의나이'; the target is the column '상태'.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame that contains the feature columns and the target column.

    Returns
    -------
    tuple of two lists
        ``[X, Y]`` resampled by RandomOverSampler, followed by ``[X, Y]``
        resampled by SMOTENC.
    """
    X = data[['품종_mean', '색상', '성별', '체중', '중성화여부', '당시의나이']]
    Y = data['상태']

    ros = RandomOverSampler(random_state=0)
    X_resampled_ros, Y_resampled_ros = ros.fit_resample(X, Y)

    # Positions (within X) of the columns SMOTENC must treat as categorical:
    # 0 = '품종_mean', 1 = '색상', 2 = '성별', 4 = '중성화여부'.
    # The sampler has to be fitted with fit_resample(); the estimator object
    # itself cannot be unpacked into an (X, Y) pair.
    smotenc = SMOTENC(categorical_features=[0, 1, 2, 4], random_state=101)
    X_resampled_smotenc, Y_resampled_smotenc = smotenc.fit_resample(X, Y)

    return [X_resampled_ros, Y_resampled_ros], [X_resampled_smotenc, Y_resampled_smotenc]

    
    
    


### Data Imbalanced Solution
----

`https://datascienceschool.net/03%20machine%20learning/14.02%20%EB%B9%84%EB%8C%80%EC%B9%AD%20%EB%8D%B0%EC%9D%B4%ED%84%B0%20%EB%AC%B8%EC%A0%9C.html`
#### Under_Sampling
    1. RandomUnderSampler
        --> 샘플 수가 가장 적은 클래스와 동일한 개수가 되도록 나머지 클래스를 Sampling
        
    2. Tomek's link method
        --> 클래스가 다른 두 데이터가 아주 가까이 붙어있으면 토멕링크가 된다. 
        
        
    3. Edited Nearest Neighbours
       --> 다수 클래스 데이터 중 가장 가까운 k 개의 데이터가 모두 or 다수 클래스가 아니면 삭제하는 방법.
       kind_sel = 'all', kind_sel='mode'
       
    4. Neighbourhood Cleaning Rule
       ---> CNN + ENN 
       ---> Neighbor의 개수 Tuning
       
       
    
       
            
    

In [261]:
# Under Sampling



from imblearn.under_sampling import RandomUnderSampler, TomekLinks, EditedNearestNeighbours, NeighbourhoodCleaningRule


def under_sampling(data):
    """Undersample the majority classes of ``data`` with four different samplers.

    The target is the column '상태'; every other column is used as a feature.
    The target is removed by name rather than by position so that it cannot
    end up among the features when it is not the last column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame that contains the target column '상태'.

    Returns
    -------
    tuple of four lists
        ``[X, Y]`` pairs produced by, in order: RandomUnderSampler,
        TomekLinks, EditedNearestNeighbours and NeighbourhoodCleaningRule.
    """
    X = data.drop(columns=['상태'])
    Y = data['상태']

    # Seeded so that repeated runs select the same rows (the oversampling
    # helpers in this notebook seed their samplers as well).
    X_downsampled_rus, Y_downsampled_rus = RandomUnderSampler(random_state=0).fit_resample(X, Y)
    X_downsampled_tomek, Y_downsampled_tomek = TomekLinks().fit_resample(X, Y)
    X_downsampled_ENN, Y_downsampled_ENN = EditedNearestNeighbours().fit_resample(X, Y)
    X_downsampled_NCR, Y_downsampled_NCR = NeighbourhoodCleaningRule().fit_resample(X, Y)

    return (
        [X_downsampled_rus, Y_downsampled_rus],
        [X_downsampled_tomek, Y_downsampled_tomek],
        [X_downsampled_ENN, Y_downsampled_ENN],
        [X_downsampled_NCR, Y_downsampled_NCR],
    )
    
    

In [None]:
def train_test_split(X, Y, test_size=0.2, random_state=0):
    """Split ``X`` and ``Y`` into stratified train and test parts.

    Thin wrapper around scikit-learn's ``train_test_split`` that always
    stratifies on ``Y``.

    Parameters
    ----------
    X : features to split.
    Y : target to split; also used as the stratification labels.
    test_size : passed through to scikit-learn (default 0.2).
    random_state : passed through to scikit-learn (default 0).

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    # This wrapper has the same name as the scikit-learn function, so an
    # unqualified call would resolve to the wrapper itself (and fail on the
    # unexpected ``stratify`` keyword).  Import the real one under an alias.
    from sklearn.model_selection import train_test_split as _sk_train_test_split

    X_train, X_test, y_train, y_test = _sk_train_test_split(
        X, Y, test_size=test_size, random_state=random_state, stratify=Y
    )

    return X_train, X_test, y_train, y_test

In [1]:
# Model import 

from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB




from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix

import numpy as np
import pandas as pd


def model_study_function(X_train, X_test, y_train, y_test):
    """Fit eight base classifiers, print their test accuracy, then score a stacked meta-model.

    Every base classifier is trained on ``(X_train, y_train)`` and its accuracy
    on ``(X_test, y_test)`` is printed.  A LogisticRegression meta-model is then
    stacked on the label predictions of the five strongest base classifiers.

    The meta-model is fitted only on out-of-fold predictions for the training
    rows (3-fold ``cross_val_predict``), and the five base classifiers are
    chosen by their out-of-fold training accuracy.  ``y_test`` is therefore
    used for scoring only; fitting the meta-model on ``y_test`` and scoring it
    on the same rows would report a leaked, optimistically biased result.
    The price is three additional fits per base classifier.

    Parameters
    ----------
    X_train, X_test : feature matrices accepted by the scikit-learn estimators.
    y_train, y_test : class labels.  The predicted labels are fed to
        LogisticRegression as features, so they are assumed to be numeric
        -- TODO confirm against the caller.

    Returns
    -------
    None
        All results (accuracies, F1 score, confusion matrix) are printed.
    """
    n_folds = 3      # folds used to build the out-of-fold meta-features
    n_stacked = 5    # number of base classifiers fed into the meta-model

    # (accuracy report template, estimator) pairs, in reporting order.
    base_models = [
        ('KNN 정확도 : {0:.4f}', KNeighborsClassifier(n_neighbors=7)),
        ('랜덤 포레스트 정확도 : {0:.4f}', RandomForestClassifier(
            bootstrap=True,
            max_depth=90,
            max_features=2,
            min_samples_leaf=5,
            min_samples_split=12,
            n_estimators=1000)),
        ('결정 트리 정확도 : {0:.4f}', DecisionTreeClassifier()),
        ('에이다부스트 정확도 : {0:.4f}', AdaBoostClassifier(n_estimators=100)),
        # hidden_layer_sizes is a candidate for tuning
        ('MLPClassfier 정확도 : {0:.4f}', MLPClassifier(
            solver='lbfgs', random_state=10,
            hidden_layer_sizes=[100, 100], max_iter=300)),
        ('GradientBoostingClassifer : {0:.4f}', GradientBoostingClassifier()),
        # hard to tune
        ('GaussianNB 정확도 : {0:.4f}', GaussianNB()),
        # tuning not really needed
        ('ExtraTreeclassifer 정확도 : {0:.4f}', ExtraTreeClassifier()),
    ]
    lr_final = LogisticRegression(C=0.5)

    # Model training.  cross_val_predict works on clones, so each training row
    # is predicted by a model that never saw it; the estimator itself is then
    # fitted on the full training set for the test-set predictions.
    oof_preds = []
    for _, clf in base_models:
        oof_preds.append(cross_val_predict(clf, X_train, y_train, cv=n_folds))
        clf.fit(X_train, y_train)

    print('Train 완료 \n')

    # Model prediction and per-model accuracy on the held-out test set.
    test_preds = [clf.predict(X_test) for _, clf in base_models]
    for (template, _), pred in zip(base_models, test_preds):
        print(template.format(accuracy_score(y_test, pred)))
    print("=" * 30)

    # Pick the strongest base classifiers without looking at the test labels.
    oof_scores = np.array([accuracy_score(y_train, oof) for oof in oof_preds])
    top_idx = np.argsort(-oof_scores, kind='stable')[:n_stacked]

    # One column per selected base classifier, one row per sample.
    meta_train = np.column_stack([oof_preds[i] for i in top_idx])
    meta_test = np.column_stack([test_preds[i] for i in top_idx])

    lr_final.fit(meta_train, y_train)
    final = lr_final.predict(meta_test)

    print('최종 메타 모델의 예측 정확도 : {0:.4f}'.format(accuracy_score(y_test, final)))
    print("=" * 30)

    # f1_score's default average='binary' raises for more than two classes,
    # so fall back to the macro average in the multiclass case.
    average = 'binary' if len(np.unique(y_train)) <= 2 else 'macro'
    print('최종 모델의 f1_score : ', f1_score(y_test, final, average=average))
    print("=" * 30)

    # Confusion matrix of the stacked model itself on the test set.
    cf = confusion_matrix(y_test, final)
    print('최종 모델의 Confusion Matrix : \n', cf)
    print("=" * 30)

    print("완료")
    return None
    


