# ***Prompt for file and load data***

In [33]:
from google.colab import files
import io

uploaded = files.upload()

Saving winequality-combined.csv to winequality-combined (3).csv


# ***Load data***

In [34]:
import pandas as pd

# Extract filename
filename = list(uploaded.keys())[0]

#    Load CSV from either:
#      - a filename (string path)
#      - an uploaded file from Colab's files.upload() dict
#      - a Flask file object (from request.files)

if isinstance(uploaded[filename], str):
  # Assume it's a file path
  df_wine = pd.read_csv(uploaded[filename], sep="\t")
elif hasattr(uploaded[filename], 'read'):
  # Flask's file object or BytesIO
  df_wine = pd.read_csv(io.BytesIO(uploaded[filename].read()), sep="\t")
elif isinstance(uploaded[filename], bytes):
  # Bytes directly (Colab uploaded dict value)
  df_wine = pd.read_csv(io.BytesIO(uploaded[filename]), sep="\t")
else:
  raise ValueError("Unsupported file source type.")

print(df_wine.head())


   fixed_acidity  volatile_acidity  citric_acid  residual_sugar  chlorides  \
0            7.0              0.27         0.36            20.7      0.045   
1            6.3              0.30         0.34             1.6      0.049   
2            8.1              0.28         0.40             6.9      0.050   
3            7.2              0.23         0.32             8.5      0.058   
4            7.2              0.23         0.32             8.5      0.058   

   free_sulfur_dioxide  total_sulfur_dioxide  density    pH  sulphates  \
0                 45.0                 170.0   1.0010  3.00       0.45   
1                 14.0                 132.0   0.9940  3.30       0.49   
2                 30.0                  97.0   0.9951  3.26       0.44   
3                 47.0                 186.0   0.9956  3.19       0.40   
4                 47.0                 186.0   0.9956  3.19       0.40   

   alcohol  quality  
0      8.8        6  
1      9.5        6  
2     10.1        6 

# ***Split the data for training and testing***

In [35]:
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler, LabelEncoder

train_cols = [
    "fixed_acidity","volatile_acidity","citric_acid","residual_sugar",
    "chlorides","free_sulfur_dioxide","total_sulfur_dioxide","density",
    "pH","sulphates","alcohol"
]

x = df_wine[train_cols] # Use all features to train model
y = df_wine['quality'] # "quality" is our target

smote = SMOTE(k_neighbors = 4,random_state=85)
x_resampled, y_resampled = smote.fit_resample(x, y)

x_train, x_test, y_train, y_test = train_test_split(x_resampled, y_resampled,
                                                    stratify=y_resampled,
                                                    test_size=0.2, random_state=5)

# Initialize scaler
scaler = StandardScaler()

# Fit scaler only on training data, then transform
x_train_scaled = scaler.fit_transform(x_train)

# Use the same scaler to transform the test set
x_test_scaled = scaler.transform(x_test)

# ***Create and fit Random Forest***

In [36]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Create and fit the Random Forest model
rfc = RandomForestClassifier(
    n_estimators=175,
    max_depth=12,
    max_leaf_nodes=512,
    min_samples_leaf=2,
    random_state=4,
    n_jobs=-1,
    class_weight="balanced")
rfc.fit(x_train_scaled, y_train)

# Perform predictions and evaluate results
y_pred = rfc.predict(x_test_scaled)
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print(f"Classification report:\n{classification_report(y_test, y_pred)}")


Accuracy: 0.8257365902795266
Classification report:
              precision    recall  f1-score   support

           3       0.95      0.98      0.97       567
           4       0.82      0.93      0.87       567
           5       0.69      0.66      0.68       567
           6       0.65      0.47      0.55       568
           7       0.74      0.79      0.77       567
           8       0.88      0.94      0.91       568
           9       0.99      1.00      1.00       567

    accuracy                           0.83      3971
   macro avg       0.82      0.83      0.82      3971
weighted avg       0.82      0.83      0.82      3971



# ***Save features and target as Pickle files***

In [37]:
import pickle

# Save the model to a pickle file
with open('rfc.pkl', 'wb') as f:
  pickle.dump(rfc, f)

# Save the scaler to a pickle file
with open('scaler.pkl', 'wb') as f:
  pickle.dump(scaler, f)


# Download all pickle files
from google.colab import files
files.download("rfc.pkl")
files.download("scaler.pkl")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>