# Logistic Regression - Wine Quality - Multiclass Classification

In [121]:
import pandas as pd

### Loading the CSV dataset (fields are separated by semicolons, `sep=';'`)

In [123]:
# Read the semicolon-delimited wine-quality CSV and preview the first rows.
data=pd.read_csv('data/winequality.csv',sep=';')
data.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


In [124]:
# Distinct quality labels present in the target column.
data['quality'].unique()

array([6, 5, 7, 8, 4, 3, 9], dtype=int64)

In [125]:
# Number of samples per quality label (most frequent first).
data['quality'].value_counts()

quality
6    2198
5    1457
7     880
8     175
4     163
3      20
9       5
Name: count, dtype: int64

In [126]:
# The target has 7 distinct classes, so this is a multiclass
# (not binary) classification problem.
data.quality.nunique()

7

In [127]:
# Pairwise correlation between every pair of columns, including the target.
data.corr()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
fixed acidity,1.0,-0.022697,0.289181,0.089021,0.023086,-0.049396,0.09107,0.265331,-0.425858,-0.017143,-0.120881,-0.113663
volatile acidity,-0.022697,1.0,-0.149472,0.064286,0.070512,-0.097012,0.089261,0.027114,-0.031915,-0.035728,0.067718,-0.194723
citric acid,0.289181,-0.149472,1.0,0.094212,0.114364,0.094077,0.121131,0.149503,-0.163748,0.062331,-0.075729,-0.009209
residual sugar,0.089021,0.064286,0.094212,1.0,0.088685,0.299098,0.401439,0.838966,-0.194133,-0.026664,-0.450631,-0.097577
chlorides,0.023086,0.070512,0.114364,0.088685,1.0,0.101392,0.19891,0.257211,-0.090439,0.016763,-0.360189,-0.209934
free sulfur dioxide,-0.049396,-0.097012,0.094077,0.299098,0.101392,1.0,0.615501,0.29421,-0.000618,0.059217,-0.250104,0.008158
total sulfur dioxide,0.09107,0.089261,0.121131,0.401439,0.19891,0.615501,1.0,0.529881,0.002321,0.134562,-0.448892,-0.174737
density,0.265331,0.027114,0.149503,0.838966,0.257211,0.29421,0.529881,1.0,-0.093591,0.074493,-0.780138,-0.307123
pH,-0.425858,-0.031915,-0.163748,-0.194133,-0.090439,-0.000618,0.002321,-0.093591,1.0,0.155951,0.121432,0.099427
sulphates,-0.017143,-0.035728,0.062331,-0.026664,0.016763,0.059217,0.134562,0.074493,0.155951,1.0,-0.017433,0.053678


In [128]:
# Keep only the 'quality' column of the correlation matrix, i.e. the
# correlation of every column with the target.
cormatrix = data.corr().loc[:, 'quality']
cormatrix

fixed acidity          -0.113663
volatile acidity       -0.194723
citric acid            -0.009209
residual sugar         -0.097577
chlorides              -0.209934
free sulfur dioxide     0.008158
total sulfur dioxide   -0.174737
density                -0.307123
pH                      0.099427
sulphates               0.053678
alcohol                 0.435575
quality                 1.000000
Name: quality, dtype: float64

In [129]:
# Names of the features whose absolute correlation with quality is below 0.10.
lowcorr = cormatrix.loc[cormatrix.abs() < 0.10].index
lowcorr

Index(['citric acid', 'residual sugar', 'free sulfur dioxide', 'pH',
       'sulphates'],
      dtype='object')

In [130]:
# Count missing values in each column.
data.isna().sum()

fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
quality                 0
dtype: int64

In [131]:
# Non-null 'pH' entries per quality class (i.e. rows per class, ordered by label).
# Selecting the column before counting avoids counting every other column too.
data.groupby('quality')['pH'].count()

quality
3      20
4     163
5    1457
6    2198
7     880
8     175
9       5
Name: pH, dtype: int64

In [132]:
# Separate the features (every column except the target) from the target labels.
X = data.drop(columns='quality')
y = data['quality']

In [133]:
# Class distribution of the target, re-checked right before splitting.
data['quality'].value_counts()

quality
6    2198
5    1457
7     880
8     175
4     163
3      20
9       5
Name: count, dtype: int64

In [134]:
# train_test_split shuffles and splits X and y randomly, so the split would
# differ on every run; passing a fixed random_state (here 42) seeds the shuffle
# and makes the split reproducible.
from sklearn.model_selection import train_test_split
# test_size=.20 holds out 20% of the rows for testing (80% is used for training);
# stratify=y keeps the class proportions of y the same in both subsets.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.20,random_state=42,stratify=y)


Stratification, in the context of data splitting (such as train-test splits or cross-validation), means that the data is divided so that the distribution of the target variable (the class labels) is preserved in each subset. Each subset (train and test, or each fold in cross-validation) then has approximately the same proportion of each class as the original dataset.

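Stratification maintains the proportion of each class in all subsets (train, test, or cross-validation folds), so the model is trained and evaluated on a representative mix of classes. Note that it preserves the original class proportions; it does not rebalance them. This gives a more reliable estimate of how well the model generalizes, especially in imbalanced classification tasks such as this one, where a purely random split could leave a rare class (e.g. quality 9, with only 5 samples) out of the test set entirely.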

In [137]:
from sklearn.linear_model import LogisticRegression

In [138]:
# Print the full API documentation of LogisticRegression (long output).
help(LogisticRegression)

Help on class LogisticRegression in module sklearn.linear_model._logistic:

class LogisticRegression(sklearn.linear_model._base.LinearClassifierMixin, sklearn.linear_model._base.SparseCoefMixin, sklearn.base.BaseEstimator)
 |  LogisticRegression(penalty='l2', *, dual=False, tol=0.0001, C=1.0, fit_intercept=True, intercept_scaling=1, class_weight=None, random_state=None, solver='lbfgs', max_iter=100, multi_class='auto', verbose=0, warm_start=False, n_jobs=None, l1_ratio=None)
 |
 |  Logistic Regression (aka logit, MaxEnt) classifier.
 |
 |  In the multiclass case, the training algorithm uses the one-vs-rest (OvR)
 |  scheme if the 'multi_class' option is set to 'ovr', and uses the
 |  cross-entropy loss if the 'multi_class' option is set to 'multinomial'.
 |  (Currently the 'multinomial' option is supported only by the 'lbfgs',
 |  'sag', 'saga' and 'newton-cg' solvers.)
 |
 |  This class implements regularized logistic regression using the
 |  'liblinear' library, 'newton-cg', 'sag', '

### One vs Rest

In [139]:
# One-vs-Rest: train the logistic regression in the 'ovr' multiclass scheme.
# NOTE(review): the fit below emits a convergence warning (iteration limit
# reached). Raising max_iter or standardizing the features is the suggested
# remedy, but it would change the predictions and the numbers recorded in the
# cells that follow, so it is left as-is here.
# NOTE(review): multi_class is deprecated from scikit-learn 1.5 onward;
# OneVsRestClassifier(LogisticRegression()) is the documented replacement --
# confirm against the installed version.
classifier=LogisticRegression(multi_class='ovr')
# Train / fit the model on the training split.
classifier.fit(X_train,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [140]:
# Predict a quality class for every row of the held-out test features.
y_pred=classifier.predict(X_test)
y_pred

array([5, 6, 6, 6, 7, 6, 6, 6, 6, 5, 6, 6, 5, 6, 5, 5, 6, 6, 6, 5, 6, 7,
       6, 6, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 6, 6, 5, 5, 6, 5, 6, 6,
       5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 6, 6, 6, 6, 6, 6,
       5, 5, 6, 5, 6, 6, 6, 6, 7, 6, 6, 6, 6, 6, 5, 5, 6, 6, 5, 6, 6, 6,
       5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 6, 6, 7, 6, 6,
       6, 5, 6, 5, 6, 6, 6, 7, 6, 6, 6, 6, 5, 6, 6, 5, 6, 6, 6, 5, 5, 6,
       6, 6, 6, 6, 5, 6, 5, 6, 6, 5, 5, 6, 5, 6, 6, 6, 6, 6, 7, 6, 5, 5,
       6, 6, 5, 6, 5, 5, 6, 6, 6, 6, 5, 6, 7, 6, 6, 5, 6, 6, 6, 7, 7, 6,
       6, 6, 6, 7, 6, 6, 6, 5, 5, 6, 7, 7, 6, 5, 6, 6, 5, 6, 6, 6, 5, 6,
       7, 5, 6, 6, 6, 5, 6, 6, 6, 6, 5, 5, 6, 6, 6, 6, 6, 6, 5, 6, 7, 6,
       6, 6, 6, 6, 6, 6, 6, 6, 5, 6, 6, 6, 5, 6, 6, 6, 5, 6, 6, 6, 6, 6,
       5, 6, 6, 6, 6, 6, 6, 5, 6, 6, 6, 6, 5, 5, 6, 6, 6, 6, 6, 6, 6, 5,
       6, 6, 5, 6, 6, 5, 6, 6, 6, 5, 6, 5, 6, 6, 6, 5, 6, 6, 6, 5, 6, 6,
       6, 6, 6, 6, 6, 6, 5, 5, 5, 6, 6, 6, 6, 6, 5,

In [141]:
# Confusion matrix: by scikit-learn's convention rows are the true classes and
# columns are the predicted classes, so only the diagonal entries are correct
# predictions; every off-diagonal entry is a misclassification.
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test,y_pred)
# Diagonal of the recorded output: 0+0+140+358+18+0+0 = 516 correct predictions.

array([[  0,   0,   1,   3,   0,   0,   0],
       [  0,   0,  21,  12,   0,   0,   0],
       [  0,   0, 140, 150,   1,   0,   0],
       [  0,   0,  69, 358,  13,   0,   0],
       [  0,   0,   6, 152,  18,   0,   0],
       [  0,   0,   2,  28,   5,   0,   0],
       [  0,   0,   0,   1,   0,   0,   0]], dtype=int64)

In [142]:
# Number of samples in the test set.
y_test.shape

(980,)

In [143]:
# Count the correct predictions directly from the labels instead of hand-copying
# the confusion-matrix diagonal, so the value cannot go stale when the model or
# the split changes. This equals the sum of the diagonal of the confusion matrix
# (0+0+140+358+18+0+0 = 516 for the recorded run).
true_pred = int((y_test.to_numpy() == y_pred).sum())
true_pred

516

In [144]:
# Accuracy as a percentage: correct predictions divided by the test-set size.
# len(y_test) replaces the hard-coded 980 so the cell stays correct if the
# dataset or test_size changes.
accuracy = (true_pred / len(y_test)) * 100
accuracy

52.6530612244898

In [145]:
# Cross-check the manual accuracy with scikit-learn. accuracy_score returns the
# fraction of correct predictions (0-1), not a percentage.
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)


0.5265306122448979

In [146]:
#END