In [30]:
from sklearn.datasets import load_wine
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [31]:
data = load_wine()

In [32]:
data.keys()

dict_keys(['target_names', 'data', 'target', 'feature_names', 'DESCR'])

In [33]:
X = data['data']

In [34]:
y = data['target']

In [35]:
wine_df = pd.DataFrame(X,columns = data['feature_names'])

In [36]:
wine_df.head()

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0
3,14.37,1.95,2.5,16.8,113.0,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480.0
4,13.24,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0


In [37]:
from sklearn.model_selection import train_test_split

In [38]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=101)

In [39]:
from sklearn.linear_model import LogisticRegression

In [40]:
model = LogisticRegression()

In [41]:
model.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [42]:
predictions = model.predict(X_test)

In [43]:
print('train score {}' .format(model.score(X_train, y_train)))
print('test score {}' .format(model.score(X_test, y_test)))

train score 0.9905660377358491
test score 0.9444444444444444


In [44]:
from sklearn.metrics import confusion_matrix, classification_report

In [45]:
print(confusion_matrix(predictions, y_test))

[[22  0  0]
 [ 3 26  0]
 [ 0  1 20]]


In [46]:
print(classification_report(predictions, y_test))

             precision    recall  f1-score   support

          0       0.88      1.00      0.94        22
          1       0.96      0.90      0.93        29
          2       1.00      0.95      0.98        21

avg / total       0.95      0.94      0.94        72



In [47]:
Steps = (
        ['scaler' , StandardScaler()],
        ['classifier', LogisticRegression()])

In [48]:
pipe = Pipeline(Steps)

In [49]:
pipe.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ['classifier', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)]])

In [50]:
print("Training set score : {}" .format(pipe.score(X_train, y_train)) )
print("Test set score : {}" .format(pipe.score(X_test, y_test)) )

Training set score : 1.0
Test set score : 0.9861111111111112


In [51]:
pred = pipe.predict(X_test)

In [70]:
from sklearn import metrics
print (metrics.accuracy_score(y_test,pred))
y_pred_prob = pipe.predict_proba(X_test)[:,1]
y_pred_prob 

0.9861111111111112


array([8.94005420e-02, 2.68949568e-01, 3.21323624e-03, 1.82988982e-03,
       9.81842891e-04, 8.10262472e-01, 2.71462771e-03, 8.99400338e-03,
       8.35668062e-01, 6.79835873e-01, 3.39528790e-03, 7.82605232e-01,
       8.72993902e-02, 5.34554216e-02, 1.13999873e-03, 1.85821081e-03,
       9.73965898e-01, 1.28336267e-01, 9.86926713e-01, 9.87302088e-01,
       8.93371327e-01, 7.71209148e-02, 2.76001933e-01, 1.70889772e-04,
       2.05154499e-03, 2.05009967e-02, 9.70481500e-01, 8.65742610e-01,
       1.22433188e-02, 9.16632073e-01, 7.71491434e-04, 5.09702937e-01,
       6.11426771e-01, 4.17796331e-02, 1.69851208e-01, 1.78788471e-03,
       3.82796304e-02, 4.81951140e-01, 9.54785486e-01, 3.84102523e-03,
       9.33479787e-01, 2.10037164e-02, 2.79999039e-02, 6.32734977e-01,
       1.47261657e-01, 9.61641381e-01, 9.92721356e-01, 4.31757623e-02,
       1.28873942e-03, 8.16719597e-01, 1.89521143e-02, 7.56078159e-01,
       9.44156052e-01, 2.08398775e-03, 9.22677631e-01, 9.73238244e-01,
      

In [71]:
#ROC-- important first argument is true values, second argument is predcited probabiliteis
fpr, tpr, thresholds = metrics.roc_curve(y_test,y_pred_prob)

ValueError: multiclass format is not supported