#### Problem statement: 
##### To predict the quality of wine
    

#### Importing the needed libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC

**Loading the dataset**

In [None]:
data = pd.read_csv('dataset.csv')

In [None]:
data.head()

#### Checking the info of the dataset

In [None]:
data.info()

From the above, it is observed that all the features are numeric, no null values are present, and the target feature is `quality`.

#### Checking the descriptive statistics of the dataset

In [None]:
data.describe()

Plotting the pairplot of each pair of features in the data

In [None]:
# sns.pairplot is a figure-level function: it always creates its own figure,
# so a preceding plt.figure(...) call only leaves an extra empty figure behind
# and does not size the pairplot.
sns.pairplot(data)
plt.show()

In [None]:
# Every column except the last one (the target) gets its own box plot.
cols = data.columns[:-1]
for col in cols:
    # Create the figure inside the loop so that every plot, not just the
    # first one, is drawn at the requested size.
    fig, ax = plt.subplots(figsize=(4, 2.5))
    sns.boxplot(x=col, data=data, ax=ax)
    ax.set_xlabel(f'X-axis ---> {col}')
    plt.show()

In [None]:
# Columns that will have their outliers capped: everything except the last two
# columns of the frame. Slicing `data.columns` directly (instead of trimming
# `cols` again) keeps this cell idempotent, so re-running it does not drop one
# more column each time.
# NOTE(review): this leaves the last feature column uncapped as well as the
# target - confirm that excluding it is intended.
cols = data.columns[:-2]
cols

In [None]:
def outlier_replacer(data, col):
    """Cap the outliers of one column at its 1.5 * IQR fences, in place.

    Values greater than ``Q3 + 1.5 * IQR`` are replaced by that upper fence and
    values smaller than ``Q1 - 1.5 * IQR`` are replaced by that lower fence.
    Values inside the fences (and missing values) are left untouched.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the column; it is modified in place.
    col : label
        Name of the numeric column to cap.

    Returns
    -------
    None
    """
    first_quartile = data[col].quantile(.25)
    third_quartile = data[col].quantile(.75)
    # Distance from each quartile to its fence (1.5 times the IQR).
    whisker = (third_quartile - first_quartile) * 1.5
    maxim, minim = third_quartile + whisker, first_quartile - whisker
    # Assign the clipped column back to the frame instead of using chained
    # `data[col].iloc[labels] = ...`: that form mixed index labels with
    # positional indexing and wrote to a temporary Series.
    data[col] = data[col].clip(lower=minim, upper=maxim)
   

In [None]:
for col in cols:
    outlier_replacer(data, col)

In [None]:
# Re-draw the box plots of the capped columns to confirm the outliers are gone.
for col in cols:
    fig, ax = plt.subplots(figsize=(4, 4))
    # Pass the column by keyword, as in the earlier box-plot cell, so the plot
    # orientation does not depend on how seaborn treats a positional argument.
    sns.boxplot(x=col, data=data, ax=ax)
    plt.show()

#### Checking and removal of duplicate values

In [None]:
data.duplicated().sum()

**Removing the duplicate rows**

In [None]:
data = data.drop_duplicates()

In [None]:
data.duplicated().sum()

#### Binning the dependent variable

In [None]:
data.head()

In [None]:
data.quality.unique()

In [None]:
# Below 5     --> Low quality wine (scores 3 & 4 in this dataset).
# 5 & 6       --> Average quality wine.
# 7 and above --> High quality wine (scores 7 & 8 in this dataset).

def replace_quality(val):
    """Map a numeric quality score to a quality band.

    Parameters
    ----------
    val : int or float
        Numeric quality score.

    Returns
    -------
    str or None
        ``'Low'`` for scores below 5, ``'Average'`` for scores from 5 up to
        (but not including) 7, ``'High'`` for scores of 7 or more, and ``None``
        when the score is NaN.
    """
    # The bands are contiguous so that no score (for example one below 3, or a
    # fractional one such as 4.5) silently falls through and becomes a missing
    # target value.
    if val >= 7:
        return 'High'
    if val >= 5:
        return 'Average'
    if val < 5:
        return 'Low'
    # Only NaN reaches this point, because every comparison with NaN is False.
    return None

In [None]:
data.quality =  data.quality.apply(replace_quality)

In [None]:
data.quality.unique()

#### Splitting the dataset into dependent and independent variable

In [None]:
X= data.iloc[:, :-1]
y = data.iloc[:, -1]

#### Importing the model-selection utilities

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV

####  Splitting the dataset into the training and testing datasets

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 10)

**Scaling the dataset using the standard scaler**

In [None]:
scaler = StandardScaler()
# Learn the mean and standard deviation from the training split only ...
X_train = scaler.fit_transform(X_train)
# ... and reuse those statistics for the test split. Refitting on the test
# data would scale the two splits differently and leak test-set information.
X_test = scaler.transform(X_test)

#### Preparing the model for the grid search 

In [None]:
# Candidate estimators for the grid search. Each entry maps a display name to
# the estimator instance ('model') and to the parameter grid that will be
# searched for it ('hyper_parameters').
models = {

    'Logistic Regression': {
        'model': LogisticRegression(),
        'hyper_parameters': {
            'C': list(range(1, 6)),
            'fit_intercept': [True, False],
        }
    },  # End of a model

    "XG Boost": {
        "model": XGBClassifier(),
        "hyper_parameters": {
            "n_estimators": [10, 20, 30, 40, 50, 100]
        }
    },  # End of a model.

    'Extra Tree Classifier': {
        'model': ExtraTreesClassifier(),
        'hyper_parameters': {
            # range(30, 60, 100) produced the single value 30 because the step
            # was larger than the whole interval.
            # NOTE(review): a step of 10 (30, 40, 50) is assumed to be the
            # intended grid - confirm.
            'n_estimators': list(range(30, 60, 10)),
            'criterion': ['gini', 'entropy']
        }
    },  # End of a model

    'SVM': {
        'model': SVC(),
        'hyper_parameters': {
            'C': list(range(1, 6)),
            'kernel': ['rbf', 'poly'],
            'degree': list(range(1, 5))
        }
    },  # End of a model.

    'RandomForest': {
        'model': RandomForestClassifier(),
        'hyper_parameters': {
            'n_estimators': [50, 70, 90, 110],
            # 'criterion' is not a valid split criterion; 'entropy' matches the
            # grid used for the Extra Tree Classifier above.
            'criterion': ['gini', 'entropy']
        }
    }  # End of a model

}  # End of all the models

In [None]:
from sklearn.model_selection import StratifiedKFold
cv =  StratifiedKFold(n_splits = 10)

In [None]:
# NOTE(review): `score` was never defined anywhere in the notebook, so this
# cell failed on a fresh kernel. The loop below reconstructs the missing step:
# one grid search per entry of `models`, keeping the best cross-validated
# accuracy and the parameters that produced it. Confirm it matches the
# search that produced the reported numbers.

# Integer class codes instead of the string labels: accuracy is unaffected by
# the encoding, and XGBClassifier in recent xgboost versions is expected to
# require integer class labels - TODO confirm for the installed version.
y_train_codes = y_train.astype('category').cat.codes

score = []
for model_name, spec in models.items():
    search = GridSearchCV(
        spec['model'],
        spec['hyper_parameters'],
        cv=cv,
        scoring='accuracy',
    )
    search.fit(X_train, y_train_codes)
    score.append({
        'Model': model_name,
        'Best Score': search.best_score_,
        'Best Parameters': search.best_params_,
    })

predict = pd.DataFrame(score)

predict.sort_values(by='Best Score')

#### The best model is the Random Forest classifier, with a best score of 84.33; its best hyperparameters were
#### criterion = gini, n_estimators = 50

In [None]:
# Fix the seed (same value as the train/test split) so that the fitted forest,
# and therefore every metric reported below, is reproducible across runs.
model = RandomForestClassifier(criterion = 'gini', n_estimators = 50, random_state = 10)

In [None]:
model.fit(X_train , y_train)
y_pred = model.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score , confusion_matrix

In [None]:
# Accuracy on the data the model was fitted on, then on the held-out split.
y_train_pred = model.predict(X_train)
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test , y_pred)
print(
    f'Training accuracy ===> {train_accuracy}',
    f'Testing accuracy ====> {test_accuracy}',
    sep='\n',
)

Checking for the best features present in the dataset

In [None]:
best = model.feature_importances_
print(len(best))
# X_train is a plain array after scaling, so the names are taken from the
# unscaled feature frame `X`, whose column order is the one the model was
# fitted on. Sorting puts the most important features first.
pd.Series(best, index=X.columns).sort_values(ascending=False)

In [None]:
confusion_matrix(y_test, y_pred)

In [None]:
from sklearn.metrics import classification_report
report = classification_report(y_test, y_pred)
print(report)