### バージョン確認

In [1]:
import time, sklearn
start = time.time()
print(sklearn.__version__)
print("実行時間:", time.time() - start, "秒")


1.7.2
実行時間: 0.0011355876922607422 秒


### データセットの準備

In [89]:
from sklearn.datasets import load_iris

data = load_iris()
X = data.data  # 説明変数
y = data.target  # 目的変数

print(X.shape, y.shape)  # (150, 4) (150,)


(150, 4) (150,)


In [90]:
import pandas as pd

df = pd.DataFrame(X, columns=data.feature_names)
df['target'] = y
print(df.head())


   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  \
0                5.1               3.5                1.4               0.2   
1                4.9               3.0                1.4               0.2   
2                4.7               3.2                1.3               0.2   
3                4.6               3.1                1.5               0.2   
4                5.0               3.6                1.4               0.2   

   target  
0       0  
1       0  
2       0  
3       0  
4       0  


In [91]:
#目的変数の数の確認
print(df["target"].value_counts())

target
0    50
1    50
2    50
Name: count, dtype: int64


In [92]:
import numpy as np

classes, counts = np.unique(df["target"], return_counts=True)
print(dict(zip(classes, counts)))

{np.int64(0): np.int64(50), np.int64(1): np.int64(50), np.int64(2): np.int64(50)}


In [93]:
from collections import Counter

print(Counter(df["target"]))

Counter({0: 50, 1: 50, 2: 50})


In [94]:
df.shape

(150, 5)

In [95]:
df.dtypes

sepal length (cm)    float64
sepal width (cm)     float64
petal length (cm)    float64
petal width (cm)     float64
target                 int64
dtype: object

In [96]:
df.describe()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
count,150.0,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333,1.0
std,0.828066,0.435866,1.765298,0.762238,0.819232
min,4.3,2.0,1.0,0.1,0.0
25%,5.1,2.8,1.6,0.3,0.0
50%,5.8,3.0,4.35,1.3,1.0
75%,6.4,3.3,5.1,1.8,2.0
max,7.9,4.4,6.9,2.5,2.0


In [97]:
df.isnull().sum()

sepal length (cm)    0
sepal width (cm)     0
petal length (cm)    0
petal width (cm)     0
target               0
dtype: int64

In [98]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   sepal length (cm)  150 non-null    float64
 1   sepal width (cm)   150 non-null    float64
 2   petal length (cm)  150 non-null    float64
 3   petal width (cm)   150 non-null    float64
 4   target             150 non-null    int64  
dtypes: float64(4), int64(1)
memory usage: 6.0 KB


### Scikit-learnの基本的な流れ

In [99]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [100]:
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=41)

In [101]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((105, 4), (45, 4), (105,), (45,))

In [102]:
X_train[:5]

array([[5.7, 3.8, 1.7, 0.3],
       [6.7, 2.5, 5.8, 1.8],
       [5.1, 2.5, 3. , 1.1],
       [4.8, 3. , 1.4, 0.3],
       [6.2, 2.2, 4.5, 1.5]])

In [103]:
model = LogisticRegression(max_iter=200)

In [104]:
model.fit(X_train, y_train)


0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,200


In [105]:
y_pred = model.predict(X_test)


In [106]:
y_pred

array([1, 2, 2, 1, 2, 1, 2, 1, 2, 2, 2, 0, 0, 1, 0, 2, 0, 2, 0, 0, 1, 2,
       0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 2, 0, 2, 1, 2, 2, 1, 1, 0, 2,
       2])

In [107]:
accuracy = accuracy_score(y_test, y_pred)

In [108]:
accuracy

0.9111111111111111

In [109]:
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.91


### 分類問題の例

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, classification_report


Confusion Matrix:
 [[11  0  0]
 [ 0 17  1]
 [ 0  3 13]]
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        11
           1       0.85      0.94      0.89        18
           2       0.93      0.81      0.87        16

    accuracy                           0.91        45
   macro avg       0.93      0.92      0.92        45
weighted avg       0.91      0.91      0.91        45



In [None]:
clf = DecisionTreeClassifier()
# clf = DecisionTreeClassifier(max_depth=3, random_state=42)




In [59]:
clf

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,3
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,42
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [60]:
clf.fit(X_train, y_train)


0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,3
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,42
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [61]:
y_pred = clf.predict(X_test)


In [62]:
y_pred

array([1, 2, 2, 1, 2, 1, 2, 1, 2, 2, 2, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 2,
       0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 2, 0, 2, 1, 2, 2, 1, 1, 0, 2,
       2])

In [65]:
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Confusion Matrix:
 [[11  0  0]
 [ 0 17  1]
 [ 0  3 13]]


In [66]:
print("Classification Report:\n", classification_report(y_test, y_pred))

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        11
           1       0.85      0.94      0.89        18
           2       0.93      0.81      0.87        16

    accuracy                           0.91        45
   macro avg       0.93      0.92      0.92        45
weighted avg       0.91      0.91      0.91        45



### 回帰問題の例

In [67]:
from sklearn.linear_model import LinearRegression
from sklearn.datasets import make_regression
from sklearn.metrics import mean_squared_error, r2_score

In [70]:
# X, y = make_regression()
X, y = make_regression(n_samples=100, n_features=1, noise=20, random_state=42)

In [71]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)


In [72]:
model = LinearRegression()

In [73]:
model

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [74]:
model.fit(X_train, y_train)


0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [75]:
y_pred = model.predict(X_test)

In [76]:
y_pred

array([-52.81092405,  48.34701416,  33.34954608, -28.54609152,
       -63.49008241,  19.77167983, -22.35287265, -79.29759452,
        37.59137009,  71.57072019,  12.84438712,  49.97723618,
         6.24014848,  27.06839418,  66.45833165,  16.92166706,
        17.76869766, -65.30155332, -11.36396817,  19.1411268 ])

In [77]:
mse = mean_squared_error(y_test, y_pred)

In [78]:
mse

199.4381297267683

In [79]:
r2 = r2_score(y_test, y_pred)

In [80]:
r2

0.9120436841436448

In [81]:
print(f"MSE: {mse:.2f}, R²: {r2:.2f}")


MSE: 199.44, R²: 0.91


### モデル評価方法

In [110]:
from sklearn.metrics import f1_score

f1 = f1_score(y_test, y_pred, average='macro')
print(f"F1 Score: {f1:.2f}")


F1 Score: 0.92


### データ前処理・・特徴量エンジニアリング

In [111]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)  # 平均0、分散1に正規化


In [112]:
scaler

0,1,2
,copy,True
,with_mean,True
,with_std,True


In [113]:
X_scaled

array([[-9.00681170e-01,  1.01900435e+00, -1.34022653e+00,
        -1.31544430e+00],
       [-1.14301691e+00, -1.31979479e-01, -1.34022653e+00,
        -1.31544430e+00],
       [-1.38535265e+00,  3.28414053e-01, -1.39706395e+00,
        -1.31544430e+00],
       [-1.50652052e+00,  9.82172869e-02, -1.28338910e+00,
        -1.31544430e+00],
       [-1.02184904e+00,  1.24920112e+00, -1.34022653e+00,
        -1.31544430e+00],
       [-5.37177559e-01,  1.93979142e+00, -1.16971425e+00,
        -1.05217993e+00],
       [-1.50652052e+00,  7.88807586e-01, -1.34022653e+00,
        -1.18381211e+00],
       [-1.02184904e+00,  7.88807586e-01, -1.28338910e+00,
        -1.31544430e+00],
       [-1.74885626e+00, -3.62176246e-01, -1.34022653e+00,
        -1.31544430e+00],
       [-1.14301691e+00,  9.82172869e-02, -1.28338910e+00,
        -1.44707648e+00],
       [-5.37177559e-01,  1.47939788e+00, -1.28338910e+00,
        -1.31544430e+00],
       [-1.26418478e+00,  7.88807586e-01, -1.22655167e+00,
      

### モデル選択とハイパーパラメータ調整

In [114]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

In [115]:
param_grid = {'C':[0.1,1,10], 'kernel':['linear','rbf']}


In [118]:
param_grid

{'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']}

In [117]:
grid = GridSearchCV(SVC(), param_grid, cv=5)


In [119]:
grid

0,1,2
,estimator,SVC()
,param_grid,"{'C': [0.1, 1, ...], 'kernel': ['linear', 'rbf']}"
,scoring,
,n_jobs,
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,C,1.0
,kernel,'rbf'
,degree,3
,gamma,'scale'
,coef0,0.0
,shrinking,True
,probability,False
,tol,0.001
,cache_size,200
,class_weight,


In [120]:
SVC()

0,1,2
,C,1.0
,kernel,'rbf'
,degree,3
,gamma,'scale'
,coef0,0.0
,shrinking,True
,probability,False
,tol,0.001
,cache_size,200
,class_weight,


In [121]:
grid.fit(X_train, y_train)


0,1,2
,estimator,SVC()
,param_grid,"{'C': [0.1, 1, ...], 'kernel': ['linear', 'rbf']}"
,scoring,
,n_jobs,
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,C,1
,kernel,'linear'
,degree,3
,gamma,'scale'
,coef0,0.0
,shrinking,True
,probability,False
,tol,0.001
,cache_size,200
,class_weight,


In [122]:
print("Best Parameters:", grid.best_params_)
print("Best Score:", grid.best_score_)


Best Parameters: {'C': 1, 'kernel': 'linear'}
Best Score: 0.9904761904761905
