! pip3 install xgboost

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, cross_validate, cross_val_score 

#from sklearn.pipeline import make_pipeline
import pickle
import random
from sklearn.impute import SimpleImputer
from scipy.stats import randint
 
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import make_pipeline

from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBRegressor

In [4]:
wine_data = pd.read_csv('F:/iPRIMED/Python/wine_quality_dataset/winequality-red.csv', delimiter=';')
wine_data

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
1,7.8,0.880,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5
2,7.8,0.760,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5
3,11.2,0.280,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6
4,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
...,...,...,...,...,...,...,...,...,...,...,...,...
1594,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5,5
1595,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2,6
1596,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,6
1597,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2,5


In [5]:
wine_data.quality.value_counts()

5    681
6    638
7    199
4     53
8     18
3     10
Name: quality, dtype: int64

In [6]:
X = wine_data.iloc[:,:-1]
Y = wine_data.iloc[:, -1]
X

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4
1,7.8,0.880,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8
2,7.8,0.760,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8
3,11.2,0.280,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8
4,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4
...,...,...,...,...,...,...,...,...,...,...,...
1594,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5
1595,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2
1596,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0
1597,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2


### SMOTE

In [7]:
X_resampled, Y_resampled = SMOTE().fit_resample(X, Y)

In [19]:
scoring = ['precision_macro', 'recall_macro', 'accuracy']

def evaluate_model(model, X_resampled, Y_resampled):
    
    print("\nCross Validate:\n")
    cv = cross_validate(model, X_resampled, Y_resampled, scoring=scoring, cv=5)
    print("\nCross Validation:",model[2],"\n")
    print("Mean Accuracy:",cv['test_accuracy'].mean())
    print("Mean Precision:",cv['test_precision_macro'].mean())
    print("Mean Recall:",cv['test_recall_macro'].mean())
    print("\n\n")

In [17]:
def get_reports(model, X, Y):
    
    y_pred = model.predict(X)
    print("\nAccuracy:",accuracy_score(Y, y_pred))
    print("Precision Score:",precision_score(Y, y_pred, average = 'macro', zero_division=0))
    print("Recall Score:",recall_score(Y, y_pred, average = 'macro', zero_division=0))
    print("f1 Score:",f1_score(Y, y_pred, average = 'macro', zero_division=0))
    
    print("\n \nClassification Report:\n\n",classification_report(Y, y_pred, zero_division=0))
    print("\n \nConfusion Matrix:\n")
    print(confusion_matrix(Y, y_pred))

In [10]:
# AdaBoostClassifier(base_estimator=RandomForestClassifier()), AdaBoostClassifier(base_estimator=DecisionTreeClassifier()),
pipelines = []
model_list = [XGBClassifier(objective= 'reg:linear'), RandomForestClassifier(), GradientBoostingClassifier()]

for model in model_list:
    pipeline= make_pipeline(SimpleImputer(strategy='median'), RobustScaler(), model, verbose=True) 
    pipelines.append(pipeline)

## Randomized Search

In [14]:
param_dist = [{'adaboostclassifier__base_estimator__n_estimators': random.choice([10,100,200,250]), 'adaboostclassifier__base_estimator__max_depth': random.choice([2,3,4,5,6,7,8])},
              {'adaboostclassifier__base_estimator__max_depth': random.choice([2,3,4,5,6,7,8]), 'adaboostclassifier__base_estimator__min_samples_split':random.choice([2,3,4,5])},
              {'xgblassifier__max_depth':[5,6,7], 'xgblassifier__gamma':[0,0.1]}, 
              {'randomforestclassifier__n_estimators':[10,50,100,200,250], 'randomforestclassifier__max_depth':[2,3,4,5,6,7,8], 'randomforestclassifier__min_samples_split':[2,3,4,5]},
              {'gradientboostingclassifier__n_estimators': random.choice([10,100,200,250]), 'gradientboostingclassifier__max_depth': random.choice([2,3,4,5,6,7,8]), 'gradientboostingclassifier__min_samples_split':random.choice([5,10]), 'gradientboostingclassifier__random_state': 1}]

scoring = ['precision_macro', 'recall_macro', 'accuracy']
n_iter = 20

for pipeline, params in zip(pipelines, param_dist):
    
    random_search = RandomizedSearchCV(pipeline, params, n_iter=n_iter, cv=5, scoring='precision_macro', n_jobs=-1,verbose=3)
    random_search.fit(X_resampled, Y_resampled)
    model = random_search.best_estimator_
    print('\033[1m',"\n\nModel:",model,'\033[0m')

    get_reports(model, X, Y)
    evaluate_model(model, X_resampled, Y_resampled)

TypeError: Parameter value is not iterable or distribution (key='adaboostclassifier__base_estimator__n_estimators', value=100)

## Grid Search

In [16]:
param_list = [# {'adaboostclassifier__base_estimator__n_estimators': [10,100,200,250], 'adaboostclassifier__base_estimator__max_depth': [2,3,4,5,6,7,8]},
              # {'adaboostclassifier__base_estimator__max_depth': [2,3,4,5,6,7,8], 'adaboostclassifier__base_estimator__min_samples_split':[2,3,4,5]},
              {'xgblassifier__max_depth':[5,6,7], 'xgblassifier__gamma':[0,0.1]}, 
              {'randomforestclassifier__n_estimators':[10,50,100,200,250], 'randomforestclassifier__max_depth':[2,3,4,5,6,7,8], 'randomforestclassifier__min_samples_split':[2,3,4,5]},
              {'gradientboostingclassifier__n_estimators': [10,100,200,250], 'gradientboostingclassifier__max_depth': [2,3,4,5,6,7,8], 'gradientboostingclassifier__min_samples_split':[5,10]}]

scoring = ['precision_macro', 'recall_macro', 'accuracy']
model_name = ['Gradientboost_wine']

for pipeline, params, name in zip(pipelines, param_list, model_name):
    
    grid_search = GridSearchCV(pipeline, params, cv=5, scoring='recall_macro', n_jobs=-1,verbose=3)
    grid_search.fit(X_resampled, Y_resampled)
    model = grid_search.best_estimator_
    print("Best Score:", grid_search.best_score_)
    print('\033[1m',"\n\nModel:",model,'\033[0m')
    with open(name+'.pickle', 'wb') as handle:
        pickle.dump(model, handle, protocol=pickle.HIGHEST_PROTOCOL)

    get_reports(model, X, Y)
    evaluate_model(model, X_resampled, Y_resampled)

Fitting 5 folds for each of 28 candidates, totalling 140 fits
[Pipeline] ..... (step 1 of 3) Processing simpleimputer, total=   0.0s
[Pipeline] ...... (step 2 of 3) Processing robustscaler, total=   0.0s
[Pipeline]  (step 3 of 3) Processing adaboostclassifier, total=   2.9s
[1m 

Model: Pipeline(steps=[('simpleimputer', SimpleImputer(strategy='median')),
                ('robustscaler', RobustScaler()),
                ('adaboostclassifier',
                 AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=8,
                                                                          min_samples_split=5)))],
         verbose=True) [0m

RMSE test: 0.00

 
Confusion Matrix:

[[ 10   0   0   0   0   0]
 [  0  53   0   0   0   0]
 [  0   0 681   0   0   0]
 [  0   0   0 638   0   0]
 [  0   0   0   0 199   0]
 [  0   0   0   0   0  18]]

Cross Validate:

[Pipeline] ..... (step 1 of 3) Processing simpleimputer, total=   0.0s
[Pipeline] ...... (step 2 of 3) Processing robus

  _warn_prf(average, modifier, msg_start, len(result))


[Pipeline]  (step 3 of 3) Processing adaboostclassifier, total=   1.1s
[Pipeline] ..... (step 1 of 3) Processing simpleimputer, total=   0.0s
[Pipeline] ...... (step 2 of 3) Processing robustscaler, total=   0.0s


  _warn_prf(average, modifier, msg_start, len(result))


[Pipeline]  (step 3 of 3) Processing adaboostclassifier, total=   1.1s
[Pipeline] ..... (step 1 of 3) Processing simpleimputer, total=   0.0s
[Pipeline] ...... (step 2 of 3) Processing robustscaler, total=   0.1s
[Pipeline]  (step 3 of 3) Processing adaboostclassifier, total=   2.1s


  _warn_prf(average, modifier, msg_start, len(result))


[Pipeline] ..... (step 1 of 3) Processing simpleimputer, total=   0.0s
[Pipeline] ...... (step 2 of 3) Processing robustscaler, total=   0.0s


  _warn_prf(average, modifier, msg_start, len(result))


[Pipeline]  (step 3 of 3) Processing adaboostclassifier, total=   1.9s

Cross Validation: AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=8,
                                                         min_samples_split=5)) 

Mean Accuracy: 0.5490987460815047
Mean Precision: 0.3334282630845018
Mean Recall: 0.268372622471072



Do you want to continue?
1. Yes
2. No1
1
Fitting 5 folds for each of 56 candidates, totalling 280 fits


KeyboardInterrupt: 

## Loading the model

In [20]:
model_name= ['Adaboost_RF_wine', 'Adaboost_DT_wine']
for name in model_name:
    with open('C:/Users/DEVVRAK/'+name+'.pickle', 'rb') as handle:
        model = pickle.load(handle)
    
    print('\033[1m',"\n\nModel:",model,'\033[0m')
    get_reports(model, X, Y)
    evaluate_model(model, X_resampled, Y_resampled)

[1m 

Model: Pipeline(steps=[('simpleimputer', SimpleImputer(strategy='median')),
                ('robustscaler', RobustScaler()),
                ('adaboostclassifier',
                 AdaBoostClassifier(base_estimator=RandomForestClassifier(max_depth=8,
                                                                          n_estimators=250)))],
         verbose=True) [0m

Accuracy: 0.9949968730456535
Precision Score: 0.9638370391058563
Recall Score: 0.9877760286550784
f1 Score: 0.9749244535175542

 
Classification Report:

               precision    recall  f1-score   support

           3       1.00      1.00      1.00        10
           4       0.98      1.00      0.99        53
           5       1.00      1.00      1.00       681
           6       1.00      0.99      1.00       638
           7       0.99      0.99      0.99       199
           8       0.81      0.94      0.87        18

    accuracy                           0.99      1599
   macro avg       0.96    

In [13]:
with open('C:/Users/DEVVRAK/'+'Adaboost_RF_wine.pickle', 'rb') as handle:
    model = pickle.load(handle)
    print(model)

Pipeline(steps=[('simpleimputer', SimpleImputer(strategy='median')),
                ('robustscaler', RobustScaler()),
                ('adaboostclassifier',
                 AdaBoostClassifier(base_estimator=RandomForestClassifier(max_depth=8,
                                                                          n_estimators=250)))],
         verbose=True)


In [14]:
print(classification_report(Y,model.predict(X)))

              precision    recall  f1-score   support

           3       1.00      1.00      1.00        10
           4       0.98      1.00      0.99        53
           5       1.00      1.00      1.00       681
           6       1.00      0.99      1.00       638
           7       0.99      0.99      0.99       199
           8       0.81      0.94      0.87        18

    accuracy                           0.99      1599
   macro avg       0.96      0.99      0.97      1599
weighted avg       1.00      0.99      1.00      1599



### AdaBoostClassifier

In [23]:
ab = AdaBoostClassifier(base_estimator=RandomForestClassifier(max_depth=8, n_estimators=250))
ab.fit(X_resampled, Y_resampled)

AdaBoostClassifier(base_estimator=RandomForestClassifier(max_depth=8,
                                                         n_estimators=250))

In [27]:
y_pred = ab.predict(X)
test_rmse = MSE(Y, y_pred) ** (1 / 2)
print('RMSE test: {:.2f}'.format(test_rmse))

RMSE test set: 0.15


In [24]:
scoring = ['precision_macro', 'recall_macro', 'accuracy']

print(confusion_matrix(Y, y_pred))
print("\nCross Validate:\n")
cv = cross_validate(ab, X, Y, scoring=scoring, cv=5)
scores = cross_val_score(ab, X, Y, cv=5)
print("Cross Validation Score:",scores.mean())
print("Mean Accuracy:",cv['test_accuracy'].mean())
print("Mean Precision:",cv['test_precision_macro'].mean())
print("Mean Recall:",cv['test_recall_macro'].mean())


Cross Validate:



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Cross Validation Score: 0.4546255506607929
Mean Accuracy: 0.45638766519823787
Mean Precision: 0.15560924193700362
Mean Recall: 0.20015083535352293


### GradientBoostingClassifier

In [28]:
gb = GradientBoostingClassifier(n_estimators = 200, max_depth = 7, random_state = 1)
gb.fit(X_resampled, Y_resampled)

GradientBoostingClassifier(max_depth=7, n_estimators=200, random_state=1)

In [30]:
y_pred = gb.predict(X)
test_rmse = MSE(Y, y_pred) ** (1 / 2)
print('RMSE test: {:.2f}'.format(test_rmse))

RMSE test set: 0.00


In [31]:
scoring = ['precision_macro', 'recall_macro', 'accuracy']

print(confusion_matrix(Y, y_pred))
print("\nCross Validate:\n")
cv = cross_validate(gb, X, Y, scoring=scoring, cv=5)
scores = cross_val_score(gb, X, Y, cv=5)
print("Cross Validation Score:",scores.mean())
print("Mean Accuracy:",cv['test_accuracy'].mean())
print("Mean Precision:",cv['test_precision_macro'].mean())
print("Mean Recall:",cv['test_recall_macro'].mean())


Cross Validate:



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Cross Validation Score: 0.5453744493392071
Mean Accuracy: 0.5453744493392071
Mean Precision: 0.29730089374327495
Mean Recall: 0.25691530869102464


In [39]:
rf = RandomForestClassifier(max_depth=8, n_estimators=250)
rf.fit(X_resampled, Y_resampled)

RandomForestClassifier(max_depth=8, n_estimators=250)

In [40]:
y_pred = rf.predict(X)
test_rmse = MSE(Y, y_pred) ** (1 / 2)
print('RMSE test: {:.2f}'.format(test_rmse))

RMSE test set: 0.33


In [41]:
scoring = ['precision_macro', 'recall_macro', 'accuracy']

print(confusion_matrix(Y, y_pred))
print("\nCross Validate:\n")
cv = cross_validate(rf, X, Y, scoring=scoring, cv=5)
scores = cross_val_score(rf, X, Y, cv=5)
print("Cross Validation Score:",scores.mean())
print("Mean Accuracy:",cv['test_accuracy'].mean())
print("Mean Precision:",cv['test_precision_macro'].mean())
print("Mean Recall:",cv['test_recall_macro'].mean())

[[  2   0   0   0   0   0]
 [  0  32   0   0   0   0]
 [  0   1 468  17   4   0]
 [  0   0  59 403   8   2]
 [  0   0   3   0 125   2]
 [  0   0   0   0   0   9]]

Cross Validate:



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Cross Validation Score: 0.5938325991189427
Mean Accuracy: 0.5973568281938326
Mean Precision: 0.316939120008087
Mean Recall: 0.30313429704727096
