In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import copy
import matplotlib.pyplot as plt
from google.colab import drive
drive.mount('/content/drive')
root_path = 'drive/My Drive/Colab Notebooks/Contraceptive/'

Mounted at /content/drive


In [None]:
df = pd.read_csv(root_path+"cmc.csv", names = ['Age', 'W_Education', 'Hus_Education', 'Children','W_Islam','W_Work','Hus_occupation','STD_Living','Media_exposure','Contraceptive'])
df.drop_duplicates(inplace=True)
q1,q3 = np.percentile(df.Children,[25,75]) ## หา q1,q3
iqr = q3-q1
upper = q3 + (1.5*iqr)
lower = q1 - (1.5*iqr)
filter = df.Children>upper
df.drop(df[filter].index,inplace=True)
df.shape

(1418, 10)

<h2> สมมุติฐาน </h2>
<li> การใช้ CV จะดีกว่าการทำ Train อย่างเดียว</li>
<li> ไม่จำเป็นต้อง Normalize เนื่องจาก Feature ส่วนใหญ่เป็นประเภท นามบัญญัติ </li>
<li> การทำ Imbalance จะช่วยประสิทธิภาพมากขึ้น </li>

## Prepare Model and Function

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import preprocessing
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.pipeline import make_pipeline


In [None]:
def modelfit(models,X_train,Y_train):
    models = models.copy()
    modellocal = []
    for name, model in models:
        clf = model.fit(X_train,Y_train)
        modellocal.append((name,clf))
    return modellocal

In [None]:
def TrainModelCV(modelcv,cv,n_jobs,X_train,Y_train):
    modellocal = []
    # modelcv = modelcv.copy()
    for name,param, model in modelcv:
        gsv = RandomizedSearchCV(
        model,
        param,
        verbose = 1,
        cv=cv,
        n_jobs=n_jobs
        )
        # print(gsv.get_params().keys())
        gsv.fit(X_train,Y_train)
        modellocal.append((name,gsv))
    return modellocal


In [None]:
def Report(models,X_train,Y_train,X_test,Y_test):
    models = models.copy()
    for name, model in models:
        print(name)
        print("!!!!Train!!!!")
        print(classification_report(model.predict(X_train),Y_train))
        print("!!!!Test!!!!")
        print(classification_report(model.predict(X_test),Y_test))

In [None]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto','sqrt','log']
criterion = ['gini','entropy']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]

In [None]:
leaf_size = list(range(1,50))
n_neighbors = list(range(1,30))
p=[1,2]
weights = ['uniform','distance']
algorithm = ['auto','ball_tree','kd_tree','brute']

In [None]:
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap,
               'criterion' : criterion}
param_grid = {'leaf_size': leaf_size,
              'n_neighbors': n_neighbors,
              'p': p,
              'weights':weights,
              'algorithm':algorithm,
              }
random_DC = { 'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'criterion' : criterion
               }

## Train, Test split

In [None]:
X = df.drop(["Contraceptive"],axis=1)
Y = df.Contraceptive

In [None]:
X_train,X_test,Y_train,Y_test = train_test_split(X,Y, test_size=0.2, random_state=42)

## NO Cross Validate and No hyper parameter tune

In [None]:
models = []
models.append(('RandomForest',RandomForestClassifier()))
models.append(('KNN',KNeighborsClassifier()))
models.append(('CART',DecisionTreeClassifier()))

In [None]:
model_NOCV = modelfit(models,X_train=X_train,Y_train=Y_train)
Report(model_NOCV,X_train=X_train,Y_train=Y_train,X_test=X_test,Y_test=Y_test)

RandomForest
!!!!Train!!!!
              precision    recall  f1-score   support

           1       0.96      0.99      0.97       471
           2       0.96      0.93      0.94       258
           3       0.96      0.95      0.95       405

    accuracy                           0.96      1134
   macro avg       0.96      0.95      0.96      1134
weighted avg       0.96      0.96      0.96      1134

!!!!Test!!!!
              precision    recall  f1-score   support

           1       0.61      0.62      0.62       122
           2       0.34      0.35      0.35        66
           3       0.49      0.47      0.48        96

    accuracy                           0.51       284
   macro avg       0.48      0.48      0.48       284
weighted avg       0.51      0.51      0.51       284

KNN
!!!!Train!!!!
              precision    recall  f1-score   support

           1       0.82      0.70      0.75       567
           2       0.53      0.60      0.56       221
           3     

<h5> บางAlgorithm มีการ Train ได้ดี แต่ Test แย่ เช่น RandomForest โดยรวมแล้วในส่วนของ Test มี accuracy ที่พอๆกัน </h5>
<h5> ตอนนี้ยังไม่สามารถสรุปผลอะไรได้ เนื่องจากยังไม่ได้ tune hyperparameter และ ทำ Cross validatation </h5>

## Cross validate and hyperparameter

In [None]:
models = []
models.append(('RandomForest',random_grid, RandomForestClassifier()))
models.append(('KNN',param_grid,KNeighborsClassifier()))
models.append(('CART',random_DC,DecisionTreeClassifier()))

In [None]:
model_cv =TrainModelCV(models,cv=5,n_jobs=-1,X_train=X_train,Y_train=Y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


20 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.7/dist-packages/sklearn/ensemble/_forest.py", line 467, in fit
    for i, t in enumerate(trees)
  File "/usr/local/lib/python3.7/dist-packages/joblib/parallel.py", line 1043, in __call__
    if self.dispatch_one_batch(iterator):
  File "/usr/local/lib/python3.7/dist-packages/joblib/parallel.py", line 861, in dispatch_one_batch
    self._dispatch(tasks)
  File "/usr/local/lib/python3.7/d

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Fitting 5 folds for each of 10 candidates, totalling 50 fits


30 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
30 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.7/dist-packages/sklearn/tree/_classes.py", line 942, in fit
    X_idx_sorted=X_idx_sorted,
  File "/usr/local/lib/python3.7/dist-packages/sklearn/tree/_classes.py", line 282, in fit
    "Invalid value for max_features. "
ValueError: Invalid value for max_features. Allowed string values are 'auto', 'sqrt' or 'log2'.

        nan 0.50621028        nan        nan]


In [None]:
Report(model_cv,X_train=X_train,Y_train=Y_train,X_test=X_test,Y_test=Y_test)

RandomForest
!!!!Train!!!!
              precision    recall  f1-score   support

           1       0.81      0.80      0.81       492
           2       0.57      0.72      0.64       196
           3       0.78      0.70      0.73       446

    accuracy                           0.75      1134
   macro avg       0.72      0.74      0.73      1134
weighted avg       0.76      0.75      0.75      1134

!!!!Test!!!!
              precision    recall  f1-score   support

           1       0.63      0.67      0.65       118
           2       0.39      0.43      0.41        60
           3       0.54      0.47      0.51       106

    accuracy                           0.55       284
   macro avg       0.52      0.52      0.52       284
weighted avg       0.55      0.55      0.55       284

KNN
!!!!Train!!!!
              precision    recall  f1-score   support

           1       0.65      0.68      0.66       467
           2       0.39      0.47      0.42       204
           3     

<h5> ถึงแม้ว่า Accuracy ของ Train จะลดลง แต่ใน Test นั้นดีขึ้น</h5>
<h5> ถึงแม้ว่า Test จะดึขั้นแต่การเพิ่มลำดับของ accuracy ก็ยังถือว่าน้อยอยู่ดี </h5>

## Scaling

In [None]:
scaler = preprocessing.MaxAbsScaler().fit(X_train) 
X_train_transformed = scaler.transform(X_train)
X_test_transformed = scaler.transform(X_test)

In [None]:
models = []
models.append(('RandomForest',random_grid, RandomForestClassifier()))
models.append(('KNN',param_grid,KNeighborsClassifier()))
models.append(('CART',random_DC,DecisionTreeClassifier()))
model_cv =TrainModelCV(models,cv=5,n_jobs=-1,X_train=X_train_transformed,Y_train=Y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


25 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
15 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.7/dist-packages/sklearn/ensemble/_forest.py", line 467, in fit
    for i, t in enumerate(trees)
  File "/usr/local/lib/python3.7/dist-packages/joblib/parallel.py", line 1043, in __call__
    if self.dispatch_one_batch(iterator):
  File "/usr/local/lib/python3.7/dist-packages/joblib/parallel.py", line 861, in dispatch_one_batch
    self._dispatch(tasks)
  File "/usr/local/lib/python3.7/d

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Fitting 5 folds for each of 10 candidates, totalling 50 fits


10 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.7/dist-packages/sklearn/tree/_classes.py", line 942, in fit
    X_idx_sorted=X_idx_sorted,
  File "/usr/local/lib/python3.7/dist-packages/sklearn/tree/_classes.py", line 282, in fit
    "Invalid value for max_features. "
ValueError: Invalid value for max_features. Allowed string values are 'auto', 'sqrt' or 'log2'.

 0.47534599 0.47884683        nan        nan]


In [None]:
Report(model_cv,X_train=X_train_transformed,Y_train=Y_train,X_test=X_test_transformed,Y_test=Y_test)

RandomForest
!!!!Train!!!!
              precision    recall  f1-score   support

           1       0.81      0.80      0.80       494
           2       0.56      0.73      0.63       193
           3       0.78      0.70      0.74       447

    accuracy                           0.75      1134
   macro avg       0.72      0.74      0.73      1134
weighted avg       0.76      0.75      0.75      1134

!!!!Test!!!!
              precision    recall  f1-score   support

           1       0.65      0.65      0.65       124
           2       0.39      0.46      0.42        56
           3       0.53      0.47      0.50       104

    accuracy                           0.55       284
   macro avg       0.52      0.53      0.52       284
weighted avg       0.55      0.55      0.55       284

KNN
!!!!Train!!!!
              precision    recall  f1-score   support

           1       0.73      0.65      0.69       542
           2       0.46      0.55      0.50       210
           3     

<h5> โดยรวม การทำ Sclaing แทบจะไม่ส่งผลกับการเพิ่มประสิทธิภาพ model ดังนั้น จะลองใช้วิธี Imbalance ดู

## Imbalance

In [None]:
from imblearn.over_sampling import RandomOverSampler,SMOTE
from imblearn.combine import SMOTEENN,SMOTETomek
from imblearn.under_sampling import TomekLinks

ros = SMOTEENN(random_state=42)
X_resampled,y_resampled = ros.fit_resample(X_train,Y_train)


In [None]:
X_train.shape,X_resampled.shape

((1134, 9), (365, 9))

In [None]:
model_cv =TrainModelCV(models,cv=5,n_jobs=-1,X_train=X_resampled,Y_train=y_resampled)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


15 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
15 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.7/dist-packages/sklearn/ensemble/_forest.py", line 467, in fit
    for i, t in enumerate(trees)
  File "/usr/local/lib/python3.7/dist-packages/joblib/parallel.py", line 1043, in __call__
    if self.dispatch_one_batch(iterator):
  File "/usr/local/lib/python3.7/dist-packages/joblib/parallel.py", line 861, in dispatch_one_batch
    self._dispatch(tasks)
  File "/usr/local/lib/python3.7/d

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Fitting 5 folds for each of 10 candidates, totalling 50 fits


15 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
15 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.7/dist-packages/sklearn/tree/_classes.py", line 942, in fit
    X_idx_sorted=X_idx_sorted,
  File "/usr/local/lib/python3.7/dist-packages/sklearn/tree/_classes.py", line 282, in fit
    "Invalid value for max_features. "
ValueError: Invalid value for max_features. Allowed string values are 'auto', 'sqrt' or 'log2'.

 0.88493151        nan        nan 0.84109589]


In [None]:
Report(model_cv,X_train=X_resampled,Y_train=y_resampled,X_test=X_test,Y_test=Y_test)

RandomForest
!!!!Train!!!!
              precision    recall  f1-score   support

           1       0.99      1.00      1.00       106
           2       0.99      0.95      0.97       188
           3       0.87      0.97      0.92        71

    accuracy                           0.97       365
   macro avg       0.95      0.97      0.96       365
weighted avg       0.97      0.97      0.97       365

!!!!Test!!!!
              precision    recall  f1-score   support

           1       0.47      0.69      0.56        85
           2       0.60      0.36      0.45       110
           3       0.50      0.52      0.51        89

    accuracy                           0.51       284
   macro avg       0.52      0.52      0.51       284
weighted avg       0.53      0.51      0.50       284

KNN
!!!!Train!!!!
              precision    recall  f1-score   support

           1       1.00      1.00      1.00       107
           2       1.00      1.00      1.00       179
           3     

<h5> การ Train นั้นมี accuracy เพิ่มขึ้นอย่างเห็นได้ชัดแต่ว่า ในทาง Test ไม่ได้ไปทางที่ดีขึ้นเหมือน Train </h5>

## Imbalance+Scaling

In [None]:
ros = SMOTEENN(random_state=42)
X_resampled,y_resampled = ros.fit_resample(X_train_transformed,Y_train)

In [None]:
model_cv =TrainModelCV(models,cv=5,n_jobs=-1,X_train=X_resampled,Y_train=y_resampled)
Report(model_cv,X_train=X_resampled,Y_train=y_resampled,X_test=X_test_transformed,Y_test=Y_test)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


20 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
15 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.7/dist-packages/sklearn/ensemble/_forest.py", line 467, in fit
    for i, t in enumerate(trees)
  File "/usr/local/lib/python3.7/dist-packages/joblib/parallel.py", line 1043, in __call__
    if self.dispatch_one_batch(iterator):
  File "/usr/local/lib/python3.7/dist-packages/joblib/parallel.py", line 861, in dispatch_one_batch
    self._dispatch(tasks)
  File "/usr/local/lib/python3.7/d

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Fitting 5 folds for each of 10 candidates, totalling 50 fits
RandomForest
!!!!Train!!!!
              precision    recall  f1-score   support

           1       1.00      1.00      1.00        80
           2       1.00      0.99      1.00       159
           3       0.99      1.00      0.99        78

    accuracy                           1.00       317
   macro avg       1.00      1.00      1.00       317
weighted avg       1.00      1.00      1.00       317

!!!!Test!!!!
              precision    recall  f1-score   support

           1       0.44      0.66      0.53        83
           2       0.64      0.39      0.49       110
           3       0.45      0.45      0.45        91

    accuracy                           0.49       284
   macro avg       0.51      0.50      0.49       284
weighted avg       0.52      0.49      0.49       284

KNN
!!!!Train!!!!
              precision    recall  f1-score   support

   

20 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
20 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.7/dist-packages/sklearn/tree/_classes.py", line 942, in fit
    X_idx_sorted=X_idx_sorted,
  File "/usr/local/lib/python3.7/dist-packages/sklearn/tree/_classes.py", line 282, in fit
    "Invalid value for max_features. "
ValueError: Invalid value for max_features. Allowed string values are 'auto', 'sqrt' or 'log2'.

 0.79503968        nan        nan        nan]


<h5> จะเห็นได้ว่าวิธีนี้ ไม่ได้ส่งผลให้เพิ่มประสิทธิภาพในการใช้งานจริงของ model </h5>

## Feature Selection

### วิธี Chi Square

In [None]:
import scipy.stats as stats
thisdict = {}
def ChiSquare(X,Y):
  for col in X.columns:
      s = stats.chi2_contingency(observed= pd.crosstab(X[col],Y)) #chi_squared_stat, P value, degree of freedom
      critical = stats.chi.ppf(q=0.95, df = s[2]) #0.95 confidence
      if s[0]>=critical:
          print(col," Have relation")
      else:
          print(col," not Have relation")
      thisdict[col] = s[0]
  sort_orders = sorted(thisdict.items(), key=lambda X: X[1])
  print(sort_orders)

In [None]:
ChiSquare(X,Y)

Age  Have relation
W_Education  Have relation
Hus_Education  Have relation
Children  Have relation
W_Islam  Have relation
W_Work  Have relation
Hus_occupation  Have relation
STD_Living  Have relation
Media_exposure  Have relation
[('W_Work', 4.851945500532719), ('W_Islam', 22.2954244651903), ('Media_exposure', 30.77163467819753), ('Hus_occupation', 57.6471636407325), ('STD_Living', 60.03370289654527), ('Hus_Education', 67.00688990932488), ('W_Education', 128.8714637184617), ('Age', 151.9975860174326), ('Children', 194.83905300471164)]


In [None]:
X_train =X_train.drop(['W_Work'],axis=1)
X_test =X_test.drop(['W_Work'],axis=1)

In [None]:
model_cv =TrainModelCV(models,cv=5,n_jobs=-1,X_train=X_train,Y_train=Y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


15 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.7/dist-packages/sklearn/ensemble/_forest.py", line 467, in fit
    for i, t in enumerate(trees)
  File "/usr/local/lib/python3.7/dist-packages/joblib/parallel.py", line 1043, in __call__
    if self.dispatch_one_batch(iterator):
  File "/usr/local/lib/python3.7/dist-packages/joblib/parallel.py", line 861, in dispatch_one_batch
    self._dispatch(tasks)
  File "/usr/local/lib/python3.7/d

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Fitting 5 folds for each of 10 candidates, totalling 50 fits


10 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.7/dist-packages/sklearn/tree/_classes.py", line 942, in fit
    X_idx_sorted=X_idx_sorted,
  File "/usr/local/lib/python3.7/dist-packages/sklearn/tree/_classes.py", line 282, in fit
    "Invalid value for max_features. "
ValueError: Invalid value for max_features. Allowed string values are 'auto', 'sqrt' or 'log2'.

        nan 0.48674905 0.47618027 0.48236326]


In [None]:
Report(model_cv,X_train=X_train,Y_train=Y_train,X_test=X_test,Y_test=Y_test)

RandomForest
!!!!Train!!!!
              precision    recall  f1-score   support

           1       0.79      0.80      0.80       480
           2       0.55      0.73      0.63       187
           3       0.78      0.67      0.72       467

    accuracy                           0.74      1134
   macro avg       0.71      0.73      0.72      1134
weighted avg       0.75      0.74      0.74      1134

!!!!Test!!!!
              precision    recall  f1-score   support

           1       0.62      0.64      0.63       120
           2       0.37      0.45      0.41        55
           3       0.57      0.48      0.52       109

    accuracy                           0.54       284
   macro avg       0.52      0.52      0.52       284
weighted avg       0.55      0.54      0.54       284

KNN
!!!!Train!!!!
              precision    recall  f1-score   support

           1       0.65      0.69      0.67       455
           2       0.41      0.52      0.45       196
           3     

<h5> การลบ Feature ส่งผลให้ Accuracy drop ลง มากกว่าการทำ Crossvalidation อย่างเดียว </h5>

## สรุปการทดลอง

<h5> ไม่ว่าจะวิธี Cross validatation, การทำ Sclaer หรือ Imbalance ก็ไม่สามารถเพิ่ม Accuracy Test model ให้เกิน 80% ได้ </h5>
<h5> เนื่องจากขนาดของชุดข้อมูลมีขนาดเล็กมาก เมื่อเทียบกับขนาดประชากรในประเทศ จึงส่งผลต่อการ Train model อย่างที่ทราบกันดีว่าจำนวนตัวอย่างข้อมูลส่งผลต่อการฝึก </h5>
<h5> สาเหตุในการที่แนะนำว่าต้องเพิ่มจำนวนตัวอย่างข้อมูล เพราะ Accuracy Train นั้นเป็นไปในทางที่ดี แต่ Test ออกมาไม่ดี หรือเรียกว่า overfitting ซึ่งการเพิ่มตัวอย่างข้อมูลเป็นการแก้ไขปัญหาวิธีหนึ่งในการลด overfiting </h5>
<h5> อีกวิธีในการลด overfitting คือการทำ Cross validation ซึ่งไม่สามารถทำให้ค่า Accuracy ของ Test เพิ่มขึ้นได้เท่าที่ควร </h5>