In [1]:
import pandas as pd
from sklearn.datasets import load_wine
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, cross_val_score

In [2]:
def sklearn_to_df(sklearn_dataset):
    """Build a DataFrame from a scikit-learn dataset object.

    The object's ``data`` becomes the feature columns, labelled with its
    ``feature_names``, and its ``target`` is appended as a final column
    named ``'target'``.

    Parameters
    ----------
    sklearn_dataset
        Object exposing ``data``, ``feature_names`` and ``target`` attributes.

    Returns
    -------
    pandas.DataFrame
        Feature columns followed by the ``'target'`` column.
    """
    features = pd.DataFrame(
        sklearn_dataset.data,
        columns=sklearn_dataset.feature_names,
    )
    labels = pd.Series(sklearn_dataset.target)
    return features.assign(target=labels)
# Load the wine dataset shipped with scikit-learn and convert it to a DataFrame
# (feature columns plus a trailing 'target' column).
wine = load_wine()
df_wine = sklearn_to_df(wine)
# Preview the first rows to sanity-check the conversion.
df_wine.head()

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,target
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0,0
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0,0
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0,0
3,14.37,1.95,2.5,16.8,113.0,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480.0,0
4,13.24,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0,0


In [3]:
# List the column labels: the feature names followed by 'target'.
df_wine.columns

Index(['alcohol', 'malic_acid', 'ash', 'alcalinity_of_ash', 'magnesium',
       'total_phenols', 'flavanoids', 'nonflavanoid_phenols',
       'proanthocyanins', 'color_intensity', 'hue',
       'od280/od315_of_diluted_wines', 'proline', 'target'],
      dtype='object')

In [4]:
# (rows, columns) of the frame, including the 'target' column.
df_wine.shape

(178, 14)

In [5]:
# Split the frame positionally: every column except the last is a feature,
# and the last column ('target') is the label vector.
X = df_wine.iloc[:, :-1].to_numpy()
y = df_wine.iloc[:, -1].to_numpy()

In [6]:
# Make sure the class labels are integers for the classifier.
# NOTE(review): 'target' already prints as integers in the preview above, so
# this cast looks like a no-op safeguard — confirm before removing.
y = y.astype('int')

The `KFold` class in scikit-learn splits a dataset into k folds for cross-validation. Its constructor takes the following parameters, which control how the folds are built:

- `n_splits`: The number of folds to create. This parameter is optional; the default value is 5, and it must be at least 2.
- `shuffle`: Whether to shuffle the data before splitting it into folds. The default value is `False`.
- `random_state`: The random seed to use for shuffling the data. It only has an effect when `shuffle` is `True`; recent scikit-learn versions raise a `ValueError` if it is set while `shuffle` is `False`.

In [7]:
# 5-fold splitter; shuffling with a fixed seed keeps the fold assignment
# identical across runs.
kf = KFold(n_splits=5, shuffle=True, random_state=0)
# Confirm how many train/test splits the splitter will produce.
kf.get_n_splits(X)

5

This creates a `KFold` object that will split the data into 5 folds, shuffle the data before splitting it, and use a random seed of 0 for shuffling the data. The resulting `kf` object can be used to iterate over the folds of the data for cross-validation.

In [8]:
# Inspect the folds: kf.split(X) yields one (train indices, test indices)
# pair per fold; the indices are row positions in X.
for i, (train_index, test_index) in enumerate(kf.split(X)):
    print(f"Fold {i+1}:")  # enumerate is 0-based; show folds as 1-based
    print(f"  Train: index={train_index}")
    print(f"  Test:  index={test_index}")

Fold 1:
  Train: index=[  0   1   2   3   6   9  10  11  12  13  14  15  16  17  19  20  21  22
  23  24  25  27  28  29  30  31  32  34  35  36  38  39  40  41  42  43
  46  47  48  49  50  52  53  57  58  59  62  64  65  67  68  69  70  71
  72  73  74  75  76  77  78  79  81  82  83  84  85  87  88  89  91  92
  93  94  95  96  97  99 100 101 102 103 105 107 108 109 110 113 114 115
 116 117 118 119 120 122 124 125 127 128 130 131 132 133 134 135 136 137
 138 139 140 142 143 144 145 147 148 149 150 152 153 154 155 156 157 158
 159 161 162 163 165 166 167 169 170 171 172 173 174 175 176 177]
  Test:  index=[  4   5   7   8  18  26  33  37  44  45  51  54  55  56  60  61  63  66
  80  86  90  98 104 106 111 112 121 123 126 129 141 146 151 160 164 168]
Fold 2:
  Train: index=[  0   1   3   4   5   6   7   8   9  10  11  12  13  14  15  17  18  20
  21  23  25  26  28  29  31  32  33  34  35  36  37  38  39  41  42  43
  44  45  46  47  48  49  50  51  52  53  54  55  56  57  58  60  61 

In [9]:
# Evaluate a 20-tree random forest with the 5-fold splitter defined above,
# recording one accuracy value per fold.
# The forest is seeded as well as the splitter: without random_state its
# bootstrap samples and per-split feature draws change on every run, so the
# scores would not be reproducible even though the folds are fixed.
scores = cross_val_score(
    RandomForestClassifier(n_estimators=20, random_state=0),
    X,
    y,
    scoring='accuracy',
    cv=kf,
)
scores

array([1.        , 0.97222222, 0.97222222, 1.        , 1.        ])

In [10]:
# Summarise the per-fold accuracies as mean and standard deviation.
mean_accuracy = scores.mean()
accuracy_std = scores.std()
print("%f accuracy with a standard deviation of %f" % (mean_accuracy, accuracy_std))

0.988889 accuracy with a standard deviation of 0.013608
