# Ансамбли моделей машинного обучения.

### 1. Работа с датасетом

In [None]:
!wget https://archive.ics.uci.edu/static/public/850/raisin.zip

--2024-04-04 10:46:30--  https://archive.ics.uci.edu/static/public/850/raisin.zip
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified
Saving to: ‘raisin.zip’

raisin.zip              [  <=>               ] 111.99K   505KB/s    in 0.2s    

2024-04-04 10:46:31 (505 KB/s) - ‘raisin.zip’ saved [114677]



In [None]:
!unzip raisin.zip
!unzip Raisin_Dataset.zip

Archive:  raisin.zip
  inflating: Raisin_Dataset.zip      
Archive:  Raisin_Dataset.zip
   creating: Raisin_Dataset/
  inflating: Raisin_Dataset/Raisin_Dataset.arff  
  inflating: Raisin_Dataset/Raisin_Dataset.txt  
  inflating: Raisin_Dataset/Raisin_Dataset.xlsx  


In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from scipy.io.arff import loadarff
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC


In [None]:
# Location of the ARFF file produced by the extraction step.
arff_path = 'Raisin_Dataset/Raisin_Dataset.arff'

# loadarff returns a tuple; its first element holds the data records,
# which are wrapped in a DataFrame for further processing.
raw_data = loadarff(arff_path)
arff_records = raw_data[0]
df = pd.DataFrame(arff_records)

In [None]:
# Preview the first five rows to check that the ARFF file was parsed as expected.
df.head(n=5)

Unnamed: 0,Area,MajorAxisLength,MinorAxisLength,Eccentricity,ConvexArea,Extent,Perimeter,Class
0,87524.0,442.246011,253.291155,0.819738,90546.0,0.758651,1184.04,b'Kecimen'
1,75166.0,406.690687,243.032436,0.801805,78789.0,0.68413,1121.786,b'Kecimen'
2,90856.0,442.267048,266.328318,0.798354,93717.0,0.637613,1208.575,b'Kecimen'
3,45928.0,286.540559,208.760042,0.684989,47336.0,0.699599,844.162,b'Kecimen'
4,79408.0,352.19077,290.827533,0.564011,81463.0,0.792772,1073.251,b'Kecimen'


### 2. Кодирование целевого категориального признака

In [None]:
# Replace the label values in the "Class" column with integer codes.
# The fitted encoder is kept in `le` so the codes can be mapped back later.
le = LabelEncoder()
encoded_class = le.fit_transform(df["Class"])
df["Class"] = encoded_class

# Show the first rows again to confirm that "Class" is now numeric.
df.head()

Unnamed: 0,Area,MajorAxisLength,MinorAxisLength,Eccentricity,ConvexArea,Extent,Perimeter,Class
0,87524.0,442.246011,253.291155,0.819738,90546.0,0.758651,1184.04,1
1,75166.0,406.690687,243.032436,0.801805,78789.0,0.68413,1121.786,1
2,90856.0,442.267048,266.328318,0.798354,93717.0,0.637613,1208.575,1
3,45928.0,286.540559,208.760042,0.684989,47336.0,0.699599,844.162,1
4,79408.0,352.19077,290.827533,0.564011,81463.0,0.792772,1073.251,1


### 3. Разделим выборку на обучающую и тестовую

In [None]:
# Separate the feature columns from the target column.
features = df.drop(columns="Class")
target = df["Class"]

# Hold out 33% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.33,
    random_state=42,
)

### 4. Обучим ансамблевые модели

In [None]:
# Random forest with 200 trees, each limited to depth 7.
# random_state is fixed so that the fitted model, and therefore the metrics
# reported for it, are reproducible across kernel restarts; 42 is the seed
# already used for the train/test split.
clf_rf = RandomForestClassifier(n_estimators=200, max_depth=7, random_state=42)
clf_rf.fit(X_train, y_train)

In [None]:
# Gradient boosting with 150 boosting stages.
# random_state is fixed so that the fitted model, and therefore the metrics
# reported for it, are reproducible across kernel restarts; 42 is the seed
# already used for the train/test split.
clf_gb = GradientBoostingClassifier(n_estimators=150, random_state=42)
clf_gb.fit(X_train, y_train)

In [None]:
# Base learners of the stack: a small random forest and an SVC.
# The SVC is wrapped in a pipeline so its inputs are standardized first.
rf_base = RandomForestClassifier(n_estimators=10, random_state=42)
svc_base = make_pipeline(StandardScaler(), SVC(random_state=42))
estimators = [('rf', rf_base), ('svc', svc_base)]

# A logistic regression combines the base learners' outputs into the final prediction.
clf_stk = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression(),
)
clf_stk.fit(X_train, y_train)

### 5. Оценим качество моделей

In [None]:
# Metrics: report accuracy and F1 on the held-out test set for every fitted model.
clfs = [clf_rf,clf_gb,clf_stk]
for clf in clfs:
    print(f"{type(clf).__name__}:")
    # Predict once per model and reuse the result for both metrics instead of
    # running inference on X_test twice.
    y_pred = clf.predict(X_test)
    print('\t',f"accuracy = {accuracy_score(y_test,y_pred)}")
    print('\t',f"f1 = {f1_score(y_test,y_pred)}")

RandomForestClassifier:
	 accuracy = 0.8552188552188552
	 f1 = 0.858085808580858
GradientBoostingClassifier:
	 accuracy = 0.8518518518518519
	 f1 = 0.8533333333333333
StackingClassifier:
	 accuracy = 0.8585858585858586
	 f1 = 0.8627450980392157
