In [3]:
import numpy as np
from collections import Counter
from sklearn.datasets import make_classification
from imblearn.over_sampling import SMOTE, ADASYN



In [None]:

X, y = make_classification(n_classes=4, weights=[0.02, 0.05, 0.4, 0.53],n_features=10,
                           n_clusters_per_class=1, n_samples=1000, random_state=10)
print('Original dataset shape %s' % Counter(y))
print('X shape:',X.shape,", y shape:",y.shape)


In [2]:

sm = SMOTE(k_neighbors=5, random_state=1203)
X_res, y_res = sm.fit_resample(X, y)
print('Resampled dataset shape %s' % Counter(y_res))
print('X_res shape:',X_res.shape,", y_res shape:",y_res.shape)


NameError: name 'SMOTE' is not defined

In [4]:

ada=ADASYN(n_neighbors=5, random_state=1203)
X_syn,y_syn=ada.fit_resample(X,y)
print('Resampled dataset shape from ADASYN %s' % Counter(y_syn))
print('X_syn shape:',X_syn.shape,", y_syn shape:",y_syn.shape)


Resampled dataset shape from ADASYN Counter({2: 528, 1: 525, 0: 525, 3: 524})
X_syn shape: (2102, 10) , y_syn shape: (2102,)


In [5]:

# Data Split --> Train:test = 3:1
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test=train_test_split(X_syn,y_syn,test_size=0.25,random_state=1204)


In [6]:
from sklearn.metrics import f1_score #f1 score
from sklearn.metrics import accuracy_score # accuracy
from sklearn.metrics import auc , roc_curve  # auc
from sklearn.model_selection import GridSearchCV

###################################################
#1. Logistic Regression
from sklearn.linear_model import LogisticRegression


In [8]:

param_grid_logistic={'C':[1e-4,1e-3,1e-2,1e-1,1e0,1e1,1e2,1e3,1e4]}
logistic_cv=GridSearchCV(LogisticRegression(penalty = 'l2'), param_grid_logistic, cv=5)
logistic_cv.fit(X_train,y_train)
print(logistic_cv.best_params_)

y_fit_logistic_train=logistic_cv.predict(X_train)
y_fit_logistic_test=logistic_cv.predict(X_test)
fpr_logistic_train, tpr_logtistic_train, _ = roc_curve(y_test, y_fit_logistic_test, pos_label=2)
fpr_logistic_test, tpr_logtistic_test, _ = roc_curve(y_test, y_fit_logistic_test, pos_label=2)




{'C': 0.1}




In [8]:

print("Train:\n","Accuracy / F1 score / AUC  \n",round(accuracy_score(y_train,y_fit_logistic_train),2),'   /' ,
      round(f1_score(y_train,y_fit_logistic_train,average='macro'),2),'    /' ,round(auc(fpr_logistic_train, tpr_logtistic_train),2))

print("Test:\n","Accuracy / F1 score / AUC  \n",round(accuracy_score(y_test,y_fit_logistic_test),2),'   /' ,
      round(f1_score(y_test,y_fit_logistic_test,average='macro'),2),'    /' ,round(auc(fpr_logistic_test, tpr_logtistic_test),2))


Train:
 Accuracy / F1 score / AUC  
 0.79    / 0.79     / 0.62
Test:
 Accuracy / F1 score / AUC  
 0.78    / 0.77     / 0.62


In [9]:

print('Misclassified training samples: %d' %(y_train!=y_fit_logistic_train).sum()) #the number of misclassified train data
print('Misclassified test samples: %d' %(y_test!=y_fit_logistic_test).sum()) #the number of misclassified test data


Misclassified training samples: 334
Misclassified test samples: 118


In [10]:
#2. K-nn
from sklearn.neighbors import KNeighborsClassifier
param_grid_knn={'n_neighbors':[i+1 for i in range(10)] ,
                'p':[i+1 for i in range(10)]}


In [11]:

knn_cv=GridSearchCV(KNeighborsClassifier(), param_grid_knn, cv=5)
knn_cv.fit(X_train,y_train)
print(knn_cv.best_params_)

y_fit_knn_train=knn_cv.predict(X_train)
y_fit_knn_test=knn_cv.predict(X_test)
fpr_knn_train, tpr_logtistic_train, _ = roc_curve(y_test, y_fit_knn_test, pos_label=2)
fpr_knn_test, tpr_logtistic_test, _ = roc_curve(y_test, y_fit_knn_test, pos_label=2)


{'n_neighbors': 1, 'p': 1}


In [12]:

print("Train:\n","Accuracy / F1 score / AUC  \n",round(accuracy_score(y_train,y_fit_knn_train),2),'   /' ,
      round(f1_score(y_train,y_fit_knn_train,average='macro'),2),'    /' ,round(auc(fpr_knn_train, tpr_logtistic_train),2))

print("Test:\n","Accuracy / F1 score / AUC  \n",round(accuracy_score(y_test,y_fit_knn_test),2),'   /' ,
      round(f1_score(y_test,y_fit_knn_test,average='macro'),2),'    /' ,round(auc(fpr_knn_test, tpr_logtistic_test),2))


Train:
 Accuracy / F1 score / AUC  
 1.0    / 1.0     / 0.74
Test:
 Accuracy / F1 score / AUC  
 0.9    / 0.89     / 0.74


In [13]:

print('Misclassified training samples: %d' %(y_train!=y_fit_knn_train).sum())
print('Misclassified test samples: %d' %(y_test!=y_fit_knn_test).sum())


Misclassified training samples: 0
Misclassified test samples: 53


In [14]:
###################################################
#3. LDA
# Iris data에 대한 LDA 적합
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

lda= LinearDiscriminantAnalysis(store_covariance=False)
lda.fit(X_train,y_train)

y_fit_lda_train=lda.predict(X_train)
y_fit_lda_test=lda.predict(X_test)
fpr_lda_train, tpr_logtistic_train, _ = roc_curve(y_test, y_fit_lda_test, pos_label=2)
fpr_lda_test, tpr_logtistic_test, _ = roc_curve(y_test, y_fit_lda_test, pos_label=2)




In [15]:

print("Train:\n","Accuracy / F1 score / AUC  \n",round(accuracy_score(y_train,y_fit_lda_train),2),'   /' ,
      round(f1_score(y_train,y_fit_lda_train,average='macro'),2),'    /' ,round(auc(fpr_lda_train, tpr_logtistic_train),2))

print("Test:\n","Accuracy / F1 score / AUC  \n",round(accuracy_score(y_test,y_fit_lda_test),2),'   /' ,
      round(f1_score(y_test,y_fit_lda_test,average='macro'),2),'    /' ,round(auc(fpr_lda_test, tpr_logtistic_test),2))

print('Misclassified training samples: %d' %(y_train!=y_fit_lda_train).sum())
print('Misclassified test samples: %d' %(y_test!=y_fit_lda_test).sum())


Train:
 Accuracy / F1 score / AUC  
 0.78    / 0.78     / 0.59
Test:
 Accuracy / F1 score / AUC  
 0.78    / 0.78     / 0.59
Misclassified training samples: 344
Misclassified test samples: 117


In [8]:
pip list


Package              Version  
-------------------- ---------
absl-py              0.7.1    
astor                0.8.0    
attrs                19.1.0   
backcall             0.1.0    
bleach               3.1.0    
certifi              2019.6.16
colorama             0.4.1    
cycler               0.10.0   
decorator            4.4.0    
defusedxml           0.6.0    
entrypoints          0.3      
gast                 0.2.2    
google-pasta         0.1.7    
grpcio               1.22.0   
h5py                 2.9.0    
imbalanced-learn     0.5.0    
imblearn             0.0      
ipykernel            5.1.1    
ipython              7.6.0    
ipython-genutils     0.2.0    
ipywidgets           7.5.1    
jedi                 0.13.3   
Jinja2               2.10.1   
joblib               0.13.2   
jsonschema           3.0.1    
jupyter-client       5.2.4    
jupyter-core         4.5.0    
Keras-Applications   1.0.8    
Keras-Preprocessing  1.1.0    
kiwisolver           1.1.0    
Markdown

In [7]:
#4. Tree
from sklearn.tree import DecisionTreeClassifier
param_grid_tree={'criterion':["gini","entropy"] ,
                'ccp_alpha':[3]}
tree_cv=GridSearchCV(DecisionTreeClassifier(random_state=1204), param_grid_tree, cv=5)
#tree_cv.fit(X_train,y_train)
tree_cv.get_params().keys()

dict_keys(['cv', 'error_score', 'estimator__class_weight', 'estimator__criterion', 'estimator__max_depth', 'estimator__max_features', 'estimator__max_leaf_nodes', 'estimator__min_impurity_decrease', 'estimator__min_impurity_split', 'estimator__min_samples_leaf', 'estimator__min_samples_split', 'estimator__min_weight_fraction_leaf', 'estimator__presort', 'estimator__random_state', 'estimator__splitter', 'estimator', 'iid', 'n_jobs', 'param_grid', 'pre_dispatch', 'refit', 'return_train_score', 'scoring', 'verbose'])

In [28]:
###################################################
#4. Tree
from sklearn.tree import DecisionTreeClassifier
param_grid_tree={'criterion':["gini","entropy"] ,
                'ccp_alpha':[2]}

#'ccp_alpha':[10**(i+1) for i in range(-4,3)]
tree_cv=GridSearchCV(DecisionTreeClassifier(random_state=1204), param_grid_tree, cv=5)
tree_cv.fit(X_train,y_train)
print(tree_cv.best_params_)


y_fit_tree_train=tree_cv.predict(X_train)
y_fit_tree_test=tree_cv.predict(X_test)
fpr_tree_train, tpr_logtistic_train, _ = roc_curve(y_test, y_fit_tree_test, pos_label=2)
fpr_tree_test, tpr_logtistic_test, _ = roc_curve(y_test, y_fit_tree_test, pos_label=2)


ValueError: Invalid parameter ccp_alpha for estimator DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=1204, splitter='best'). Check the list of available parameters with `estimator.get_params().keys()`.

In [None]:

print("Train:\n","Accuracy / F1 score / AUC  \n",round(accuracy_score(y_train,y_fit_lda_train),2),'   /' ,
      round(f1_score(y_train,y_fit_lda_train,average='macro'),2),'    /' ,round(auc(fpr_lda_train, tpr_logtistic_train),2))

print("Test:\n","Accuracy / F1 score / AUC  \n",round(accuracy_score(y_test,y_fit_lda_test),2),'   /' ,
      round(f1_score(y_test,y_fit_lda_test,average='macro'),2),'    /' ,round(auc(fpr_lda_test, tpr_logtistic_test),2))

print('Misclassified training samples: %d' %(y_train!=y_fit_lda_train).sum())
print('Misclassified test samples: %d' %(y_test!=y_fit_lda_test).sum())
