<a href="https://colab.research.google.com/github/deeprajbrahma123/Wine-Quality-Analysis/blob/main/Wine_Quality_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.ensemble import IsolationForest
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Baseline: linear regression predicting the last column of red_wine.csv from
# the 7 highest-scoring features. Every fitted preprocessing step (outlier
# detector, feature selector, scaler) is fitted on training rows only, so the
# reported error is an honest held-out estimate.

# One seed for every stochastic step so a fresh run reproduces the same number.
SEED = 42

# Load the dataset
df = pd.read_csv('red_wine.csv')

# Fill missing values with the column mean. numeric_only=True keeps this
# working on pandas >= 2.0, where DataFrame.mean() raises if any column is
# non-numeric; a new frame is built instead of mutating in place.
df = df.fillna(df.mean(numeric_only=True))

# Separate features (every column but the last) and target (the last column)
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

# Split FIRST. Fitting the selector on all rows would let the test targets
# influence which features are kept (target leakage) and bias the test error.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SEED
)

# Outlier detection: fitted on training rows only, and only training rows are
# dropped, so the test set stays an untouched sample of the data.
outlier_detector = IsolationForest(contamination=0.01, random_state=SEED)  # Adjust the contamination parameter as needed
outlier_mask = outlier_detector.fit_predict(X_train_raw)  # +1 = inlier, -1 = outlier
inlier_rows = outlier_mask == 1
X_train_raw = X_train_raw[inlier_rows]
y_train = y_train[inlier_rows]

# Feature selection: scores are computed from training rows only; the test
# rows are merely projected onto the chosen columns.
selector = SelectKBest(score_func=f_regression, k=7)  # Adjust the k value as needed
X_train = selector.fit_transform(X_train_raw, y_train)
X_test = selector.transform(X_test_raw)

# Scale the input variables (statistics learned from the training rows)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train a linear regression model
model = LinearRegression()
model.fit(X_train_scaled, y_train)

# Evaluate the model on the held-out rows
y_pred = model.predict(X_test_scaled)
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)


Mean Squared Error: 0.46936739343337924




In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.ensemble import IsolationForest
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.ensemble import VotingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.inspection import permutation_importance

# Tuned tree ensembles: grid-search a random forest and a gradient-boosting
# regressor, average them with a VotingRegressor, and report held-out error
# plus permutation feature importance. Every fitted preprocessing step is
# fitted on training rows only so the test metrics are not contaminated.

# One seed for every stochastic step (split, outlier detector, tree ensembles,
# permutation shuffles) so a fresh run reproduces the same numbers.
SEED = 42

# Load the dataset
df = pd.read_csv('red_wine.csv')

# Handle missing values with the column mean. numeric_only=True keeps this
# working on pandas >= 2.0, where DataFrame.mean() raises if any column is
# non-numeric; a new frame is built instead of mutating in place.
df = df.fillna(df.mean(numeric_only=True))

# Separate features (every column but the last) and target (the last column)
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

# Split FIRST. Fitting the selector on all rows would let the test targets
# influence which features are kept (target leakage) and bias the test error.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SEED
)

# Outlier detection: fitted on training rows only, and only training rows are
# dropped, so the test set stays an untouched sample of the data.
outlier_detector = IsolationForest(contamination=0.01, random_state=SEED)  # Adjust the contamination parameter as needed
outlier_mask = outlier_detector.fit_predict(X_train_raw)  # +1 = inlier, -1 = outlier
inlier_rows = outlier_mask == 1
X_train_raw = X_train_raw[inlier_rows]
y_train = y_train[inlier_rows]

# Feature selection: scores are computed from training rows only; the test
# rows are merely projected onto the chosen columns.
selector = SelectKBest(score_func=f_regression, k=7)  # Adjust the k value as needed
X_train = selector.fit_transform(X_train_raw, y_train)
X_test = selector.transform(X_test_raw)

# Scale the input variables (statistics learned from the training rows)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Hyperparameter tuning for Random Forest
rf_param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 4, 8],
    'min_samples_leaf': [1, 2, 4]
}
rf_grid_search = GridSearchCV(RandomForestRegressor(random_state=SEED),
                              rf_param_grid, scoring='neg_mean_squared_error',
                              cv=5, n_jobs=-1)  # n_jobs=-1: run the CV fits in parallel
rf_grid_search.fit(X_train_scaled, y_train)
# GridSearchCV defaults to refit=True, so best_estimator_ is already fitted on
# the full training set with the winning parameters; no manual refit is needed.
best_rf_model = rf_grid_search.best_estimator_
print("Best Random Forest Model:", best_rf_model)

# Hyperparameter tuning for Gradient Boosting
gb_param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, None],
    'learning_rate': [0.1, 0.01, 0.001],
    'subsample': [0.8, 1.0]
}
gb_grid_search = GridSearchCV(GradientBoostingRegressor(random_state=SEED),
                              gb_param_grid, scoring='neg_mean_squared_error',
                              cv=5, n_jobs=-1)
gb_grid_search.fit(X_train_scaled, y_train)
best_gb_model = gb_grid_search.best_estimator_
print("Best Gradient Boosting Model:", best_gb_model)

# Evaluate the best models on the held-out rows
rf_y_pred = best_rf_model.predict(X_test_scaled)
rf_mse = mean_squared_error(y_test, rf_y_pred)
print("Random Forest Mean Squared Error:", rf_mse)

gb_y_pred = best_gb_model.predict(X_test_scaled)
gb_mse = mean_squared_error(y_test, gb_y_pred)
print("Gradient Boosting Mean Squared Error:", gb_mse)

# Ensemble modeling using VotingRegressor (fit() trains its own clones of the
# two tuned estimators and averages their predictions)
ensemble_model = VotingRegressor(estimators=[('Random Forest', best_rf_model), ('Gradient Boosting', best_gb_model)])
ensemble_model.fit(X_train_scaled, y_train)
ensemble_y_pred = ensemble_model.predict(X_test_scaled)
ensemble_mse = mean_squared_error(y_test, ensemble_y_pred)
print("Ensemble Model Mean Squared Error:", ensemble_mse)

# Investigate feature importance. Measured on the held-out split so the scores
# reflect what the ensemble needs to generalise rather than what it memorised
# from the training rows; seeded so the ranking is stable between runs.
importance = permutation_importance(ensemble_model, X_test_scaled, y_test, random_state=SEED)
feature_importance = importance.importances_mean
# get_support() is a boolean mask over X's columns marking the selected ones
feature_names = X.columns[selector.get_support()]
sorted_indices = np.argsort(feature_importance)[::-1]  # most important first

print("Feature Importance:")
for idx in sorted_indices:
    print(f"{feature_names[idx]}: {feature_importance[idx]}")




Best Random Forest Model: RandomForestRegressor(n_estimators=300)
Best Gradient Boosting Model: GradientBoostingRegressor(learning_rate=0.01, max_depth=5, n_estimators=300,
                          subsample=0.8)
Random Forest Mean Squared Error: 0.43284807644489826
Gradient Boosting Mean Squared Error: 0.45196271769707336
Ensemble Model Mean Squared Error: 0.4369984672462357
Feature Importance:
alcohol: 0.5460536173832518
volatile acidity: 0.3338828509143254
sulphates: 0.32037786779422434
total sulfur dioxide: 0.19422951293992735
density: 0.11287316562774778
citric acid: 0.10595319431804695
pH: 0.013840394309136018
