In [1]:
import pandas as pd

# Location of the red-wine CSV of the Wine Quality dataset on the UCI archive.
url = (
    "https://archive.ics.uci.edu/ml/machine-learning-databases/"
    "wine-quality/winequality-red.csv"
)

# The file separates fields with semicolons, so the delimiter is given explicitly.
wine_quality_data = pd.read_csv(url, sep=";")

# Quick sanity check: show the leading rows of the parsed frame.
print(wine_quality_data.head())


   fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
0            7.4              0.70         0.00             1.9      0.076   
1            7.8              0.88         0.00             2.6      0.098   
2            7.8              0.76         0.04             2.3      0.092   
3           11.2              0.28         0.56             1.9      0.075   
4            7.4              0.70         0.00             1.9      0.076   

   free sulfur dioxide  total sulfur dioxide  density    pH  sulphates  \
0                 11.0                  34.0   0.9978  3.51       0.56   
1                 25.0                  67.0   0.9968  3.20       0.68   
2                 15.0                  54.0   0.9970  3.26       0.65   
3                 17.0                  60.0   0.9980  3.16       0.58   
4                 11.0                  34.0   0.9978  3.51       0.56   

   alcohol  quality  
0      9.4        5  
1      9.8        5  
2      9.8        5 

In [2]:
import pandas as pd

# Output workbook; a bare filename, so it is written to the current working directory.
excel_filename = "wine_quality_data.xlsx"

# Source CSV for the red-wine Wine Quality data.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv"

# Parse the semicolon-delimited file into a DataFrame.
wine_quality_data = pd.read_csv(url, sep=";")

# index=False keeps the DataFrame's row index out of the spreadsheet.
wine_quality_data.to_excel(excel_filename, index=False)

# Report where the data was written.
print("Excel file created successfully:", excel_filename)


Excel file created successfully: wine_quality_data.xlsx


In [3]:
# Print the DataFrame's plain-text representation (pandas elides the middle rows of a long frame with "...").
print(wine_quality_data)

      fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
0               7.4             0.700         0.00             1.9      0.076   
1               7.8             0.880         0.00             2.6      0.098   
2               7.8             0.760         0.04             2.3      0.092   
3              11.2             0.280         0.56             1.9      0.075   
4               7.4             0.700         0.00             1.9      0.076   
...             ...               ...          ...             ...        ...   
1594            6.2             0.600         0.08             2.0      0.090   
1595            5.9             0.550         0.10             2.2      0.062   
1596            6.3             0.510         0.13             2.3      0.076   
1597            5.9             0.645         0.12             2.0      0.075   
1598            6.0             0.310         0.47             3.6      0.067   

      free sulfur dioxide  

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Fetch the semicolon-delimited red-wine CSV into a DataFrame.
url = (
    "https://archive.ics.uci.edu/ml/machine-learning-databases/"
    "wine-quality/winequality-red.csv"
)
wine_quality_data = pd.read_csv(url, sep=";")

# "quality" is the prediction target; every other column is used as a feature.
y = wine_quality_data["quality"]
X = wine_quality_data.drop(columns="quality")

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Learn the per-feature mean and variance from the training rows only, then
# apply that same transformation to both splits so no test-set statistics
# leak into the scaling.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Report the dimensions of the scaled splits.
print("X_train_scaled shape:", X_train_scaled.shape)
print("X_test_scaled shape:", X_test_scaled.shape)


X_train_scaled shape: (1279, 11)
X_test_scaled shape: (320, 11)


In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# --- Fit the three baseline classifiers on the scaled training split ---
# (estimator.fit returns the estimator itself, so construction and fitting chain.)
logreg_model = LogisticRegression(max_iter=1000).fit(X_train_scaled, y_train)
rf_model = RandomForestClassifier(random_state=42).fit(X_train_scaled, y_train)
svm_model = SVC(kernel='linear').fit(X_train_scaled, y_train)

# --- Predict the held-out test split with each fitted model ---
logreg_pred = logreg_model.predict(X_test_scaled)
rf_pred = rf_model.predict(X_test_scaled)
svm_pred = svm_model.predict(X_test_scaled)

# --- Score each set of predictions against the true test labels ---
logreg_accuracy = accuracy_score(y_test, logreg_pred)
rf_accuracy = accuracy_score(y_test, rf_pred)
svm_accuracy = accuracy_score(y_test, svm_pred)

# --- Report test accuracy per model ---
print("Logistic Regression Accuracy:", logreg_accuracy)
print("Random Forest Accuracy:", rf_accuracy)
print("Support Vector Machine Accuracy:", svm_accuracy)


Logistic Regression Accuracy: 0.575
Random Forest Accuracy: 0.659375
Support Vector Machine Accuracy: 0.559375


In [6]:
from sklearn.model_selection import GridSearchCV

# Search space for the random forest: 3 * 3 * 3 * 3 = 81 parameter combinations.
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Base estimator handed to the search. GridSearchCV works on clones of it,
# so this instance itself is not fitted by the search.
rf_model = RandomForestClassifier(random_state=42)

# Exhaustive search scored by 5-fold cross-validated accuracy, using the
# training split only; n_jobs=-1 runs the candidate fits on all available cores.
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Winning parameter combination and its mean cross-validated accuracy.
best_params = grid_search.best_params_
best_score = grid_search.best_score_

# With the default refit=True, GridSearchCV has already retrained the best
# configuration on the whole training split and exposes it as best_estimator_.
# Reuse that model rather than constructing and fitting an identical forest again.
best_rf_model = grid_search.best_estimator_

# Evaluate the tuned model on the held-out test split.
best_rf_pred = best_rf_model.predict(X_test_scaled)
best_rf_accuracy = accuracy_score(y_test, best_rf_pred)

# Display the best parameters and accuracy
print("Best Parameters:", best_params)
print("Best Cross-Validation Accuracy:", best_score)
print("Random Forest Accuracy after Hyperparameter Tuning:", best_rf_accuracy)


Best Parameters: {'max_depth': 20, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 50}
Best Cross-Validation Accuracy: 0.6919822303921569
Random Forest Accuracy after Hyperparameter Tuning: 0.66875


In [7]:
from sklearn.model_selection import GridSearchCV

# Hyperparameter grid for Logistic Regression
logreg_param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100]}

# Hyperparameter grid for Random Forest (4 * 4 * 3 * 3 = 144 combinations)
rf_param_grid = {'n_estimators': [50, 100, 150, 200],
                 'max_depth': [None, 10, 20, 30],
                 'min_samples_split': [2, 5, 10],
                 'min_samples_leaf': [1, 2, 4]}

# Hyperparameter grid for Support Vector Machine (SVM)
svm_param_grid = {'C': [0.001, 0.01, 0.1, 1, 10],
                  'kernel': ['linear', 'poly', 'rbf', 'sigmoid']}


def _tune_and_score(estimator, param_grid):
    """Grid-search ``estimator`` and score the winning model on the test split.

    Runs a 5-fold cross-validated grid search on the notebook's
    ``X_train_scaled`` / ``y_train``, then measures the accuracy of the
    refitted best estimator on ``X_test_scaled`` / ``y_test``.

    Parameters
    ----------
    estimator : scikit-learn estimator
        Unfitted model used as the template for every candidate.
    param_grid : dict
        Mapping of parameter name to the list of values to try.

    Returns
    -------
    tuple
        ``(search, best_model, test_accuracy)``: the fitted ``GridSearchCV``
        object, its ``best_estimator_``, and that estimator's accuracy on
        the test split.
    """
    # n_jobs=-1 evaluates candidates in parallel on all available cores; it
    # affects only runtime, not which candidate is selected.
    search = GridSearchCV(estimator, param_grid, cv=5, n_jobs=-1)
    search.fit(X_train_scaled, y_train)
    best_model = search.best_estimator_
    test_accuracy = accuracy_score(y_test, best_model.predict(X_test_scaled))
    return search, best_model, test_accuracy


# NOTE: logreg_accuracy, rf_accuracy and svm_accuracy are rebound below, so the
# untuned baseline accuracies stored under these names earlier in the notebook
# are replaced by the tuned-model accuracies.

# Grid search cross-validation for Logistic Regression
logreg_grid_search, logreg_best_model, logreg_accuracy = _tune_and_score(
    LogisticRegression(max_iter=1000), logreg_param_grid
)

# Grid search cross-validation for Random Forest
rf_grid_search, rf_best_model, rf_accuracy = _tune_and_score(
    RandomForestClassifier(random_state=42), rf_param_grid
)

# Grid search cross-validation for Support Vector Machine (SVM)
svm_grid_search, svm_best_model, svm_accuracy = _tune_and_score(
    SVC(), svm_param_grid
)

# Display accuracy of each tuned model
print("Tuned Logistic Regression Accuracy:", logreg_accuracy)
print("Tuned Random Forest Accuracy:", rf_accuracy)
print("Tuned Support Vector Machine Accuracy:", svm_accuracy)


Tuned Logistic Regression Accuracy: 0.565625
Tuned Random Forest Accuracy: 0.66875
Tuned Support Vector Machine Accuracy: 0.609375
