In [1]:
pip install ucimlrepo

Note: you may need to restart the kernel to use updated packages.


In [2]:
!pip install --upgrade scikit-learn




In [3]:
from ucimlrepo import fetch_ucirepo 
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import pickle
import sklearn

In [4]:
# Record the scikit-learn release in use; a pickled estimator is only expected
# to load reliably under the release that produced it.
print("scikit-learn version:", sklearn.__version__)

scikit-learn version: 1.5.1


In [5]:
# Download UCI dataset id 186 through the ucimlrepo client.
wine_quality = fetch_ucirepo(id=186)

# Feature columns and target column, both delivered as pandas DataFrames.
X, y = wine_quality.data.features, wine_quality.data.targets

# Dataset-level metadata first, then the per-variable description table.
print(wine_quality.metadata, wine_quality.variables, sep="\n")


{'uci_id': 186, 'name': 'Wine Quality', 'repository_url': 'https://archive.ics.uci.edu/dataset/186/wine+quality', 'data_url': 'https://archive.ics.uci.edu/static/public/186/data.csv', 'abstract': 'Two datasets are included, related to red and white vinho verde wine samples, from the north of Portugal. The goal is to model wine quality based on physicochemical tests (see [Cortez et al., 2009], http://www3.dsi.uminho.pt/pcortez/wine/).', 'area': 'Business', 'tasks': ['Classification', 'Regression'], 'characteristics': ['Multivariate'], 'num_instances': 4898, 'num_features': 11, 'feature_types': ['Real'], 'demographics': [], 'target_col': ['quality'], 'index_col': None, 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 2009, 'last_updated': 'Wed Nov 15 2023', 'dataset_doi': '10.24432/C56S3T', 'creators': ['Paulo Cortez', 'A. Cerdeira', 'F. Almeida', 'T. Matos', 'J. Reis'], 'intro_paper': {'title': 'Modeling wine preferences by data mining from physicoc

In [6]:
# (rows, feature columns) of the feature frame.
# NOTE(review): the printed metadata reports 'num_instances': 4898 while this
# frame has 6497 rows -- presumably red and white samples combined; confirm.
X.shape

(6497, 11)

In [7]:
# The targets are a single-column, 2-D DataFrame rather than a 1-D Series.
y.shape

(6497, 1)

In [8]:
# Summary statistics of the target column (count, mean, std, quartiles, range).
y.describe()

Unnamed: 0,quality
count,6497.0
mean,5.818378
std,0.873255
min,3.0
25%,5.0
50%,6.0
75%,6.0
max,9.0


In [9]:
# Distinct values in the first (only) target column, in order of first appearance.
print(y.iloc[:, 0].drop_duplicates().to_numpy())

[5 6 7 4 8 3 9]


In [10]:
# Hold out 20% of the samples for evaluation; the fixed seed makes the split
# reproducible across runs.
# NOTE(review): the split is not stratified, so rare quality grades can end up
# with very few held-out samples -- consider stratify=y.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
)

In [11]:
# Random forest of 100 trees with a fixed seed.  class_weight='balanced'
# weights each class inversely to its frequency in the training labels.
model = RandomForestClassifier(
    n_estimators=100,
    class_weight='balanced',
    random_state=42,
)

# fit() wants a 1-D target, so flatten the single-column frame first.
model.fit(X_train, y_train.to_numpy().ravel())

In [12]:
# Predict a quality grade for every held-out sample.
y_pred = model.predict(X_test)

# Overall accuracy, then per-grade precision / recall / F1.  zero_division=0
# makes undefined ratios (e.g. a grade that is never predicted) print as 0.
print('Accuracy: {}'.format(accuracy_score(y_test, y_pred)))
print(classification_report(y_test, y_pred, zero_division=0))

Accuracy: 0.676923076923077
              precision    recall  f1-score   support

           3       0.33      0.17      0.22         6
           4       0.83      0.12      0.20        43
           5       0.66      0.73      0.69       402
           6       0.66      0.76      0.71       597
           7       0.83      0.53      0.64       215
           8       0.78      0.39      0.52        36
           9       0.00      0.00      0.00         1

    accuracy                           0.68      1300
   macro avg       0.58      0.38      0.43      1300
weighted avg       0.69      0.68      0.67      1300



In [13]:
# Rows are true grades, columns are predicted grades.  Passing the model's own
# class list pins the row/column order to model.classes_ and keeps the matrix
# the same size even if a grade is missing from the held-out split.
cm = confusion_matrix(y_test, y_pred, labels=model.classes_)
print(cm)

[[  1   0   2   3   0   0   0]
 [  2   5  23  13   0   0   0]
 [  0   0 292 107   3   0   0]
 [  0   1 122 455  17   2   0]
 [  0   0   5  95 113   2   0]
 [  0   0   0  19   3  14   0]
 [  0   0   0   1   0   0   0]]


In [14]:
# Serialize the fitted forest to disk with pickle.
# NOTE: only unpickle files from a trusted source -- pickle.load can execute
# arbitrary code -- and load with the same scikit-learn release used here.
model_filename = 'random_forest_model.pkl'
with open(model_filename, mode='wb') as model_file:
    pickle.dump(model, model_file)

print(f"Model saved as {model_filename}")

Model saved as random_forest_model.pkl
