In [8]:
# Libraries you will need
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_wine
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold

In [9]:
# load the data and check out the description
wine = load_wine()
feat_df = pd.DataFrame(wine["data"], columns=wine["feature_names"])
target_series = pd.Series(wine["target"])
print(wine["DESCR"])

.. _wine_dataset:

Wine recognition dataset
------------------------

**Data Set Characteristics:**

    :Number of Instances: 178 (50 in each of three classes)
    :Number of Attributes: 13 numeric, predictive attributes and the class
    :Attribute Information:
 		- Alcohol
 		- Malic acid
 		- Ash
		- Alcalinity of ash  
 		- Magnesium
		- Total phenols
 		- Flavanoids
 		- Nonflavanoid phenols
 		- Proanthocyanins
		- Color intensity
 		- Hue
 		- OD280/OD315 of diluted wines
 		- Proline

    - class:
            - class_0
            - class_1
            - class_2
		
    :Summary Statistics:
    
                                   Min   Max   Mean     SD
    Alcohol:                      11.0  14.8    13.0   0.8
    Malic Acid:                   0.74  5.80    2.34  1.12
    Ash:                          1.36  3.23    2.36  0.27
    Alcalinity of Ash:            10.6  30.0    19.5   3.3
    Magnesium:                    70.0 162.0    99.7  14.3
    Total Phenols:                0

In [10]:
#EDA can go here

feat_df.head()

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0
3,14.37,1.95,2.5,16.8,113.0,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480.0
4,13.24,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0


In [11]:
feat_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 178 entries, 0 to 177
Data columns (total 13 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   alcohol                       178 non-null    float64
 1   malic_acid                    178 non-null    float64
 2   ash                           178 non-null    float64
 3   alcalinity_of_ash             178 non-null    float64
 4   magnesium                     178 non-null    float64
 5   total_phenols                 178 non-null    float64
 6   flavanoids                    178 non-null    float64
 7   nonflavanoid_phenols          178 non-null    float64
 8   proanthocyanins               178 non-null    float64
 9   color_intensity               178 non-null    float64
 10  hue                           178 non-null    float64
 11  od280/od315_of_diluted_wines  178 non-null    float64
 12  proline                       178 non-null    float64
dtypes: fl

In [12]:
target_series.unique()

array([0, 1, 2])

In [15]:
target_series

0      0
1      0
2      0
3      0
4      0
      ..
173    2
174    2
175    2
176    2
177    2
Length: 178, dtype: int64

### Task

We want you to implement the one-vs-rest method for multiclass prediction. This is the main idea to that algorithm:



*   For each class C:
       -         make a class vector that has 1 if the class is class C, 0 otherwise
       -         Train a classifier using that vector as the labels
*   When classifying on test set
       -      Make a prediction with each classifier
       -      Predicted class is max probability predicted by the classifiers
                                                                      
We have provided the main outline of the Cross Validation experiment. You need to fill in the contents of the two inner loops. There are comment in the code that direct you where to put your code and what steps you should take. You will need to use our predefined function (binarize_class) and the LogisticRegression class:

http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html


In [28]:


def binarize_class(class_array, class_int):
    """
    class_array - the original class vector(contains multiple class labels)
    class_int - int, the label of the class you want to transform to one-vs-rest
    """
    return class_array.map(lambda x, y=class_int: 1 if x==y else 0)

total_labels = []
total_preds = []

np.random.seed(1340)
skf = StratifiedKFold(n_splits=5, shuffle=True)
for train_indices, test_indices in skf.split(feat_df, target_series):
    x_train = feat_df.iloc[train_indices, :]
    y_train = target_series.iloc[train_indices]


    
    x_test = feat_df.iloc[test_indices, :]
    y_test = target_series.iloc[test_indices]
    
    scaler = StandardScaler(copy=True, with_mean=True, with_std=True)
    x_train_scaled = scaler.fit_transform(x_train)
    
    regressors = []
    for y_class in y_train.unique():
        # fill in the code for training here
        # Use binarize_class to transform the y_train vector for each class
        # Use LogisticRegression() model and fit it to the binarized vector
        # store resulting model in the regressors list
      y_binarized = binarize_class(y_train,y_class)
        
      model = LogisticRegression()
      model.fit(x_train_scaled, y_binarized)
      regressors = model.coef_
        



        


    preds = []
    for regressor in regressors:
        # fill in the code for predicting here
        # make a probability prediction on the x_test array with 
        # each regressor (classifier) that you trained
        # turn the prediction into a single column (the output is usually two columns)
        # You can use [:, [1]] to do that transformation
        # store the prediction vector in the preds list
      regressor

    # this is the actual argmax computation in numpy
    # after you have populated preds, these lines will select the class with
    # the highest probability prediction
    # preds = np.concatenate(preds, axis=1)
    # preds = np.argmax(preds, axis=1)

    # total_labels.extend(list(y_test))
    # total_preds.extend(list(preds))

# acc = accuracy_score(total_labels, total_preds)
# print("Score:", acc)


In [29]:
regressors

array([[ 0.41694738,  0.42447266,  0.40734991,  0.27726018,  0.17818098,
        -0.5123651 , -1.37282605, -0.02961389, -0.67041037,  1.1772466 ,
        -1.07747022, -1.23566648,  0.16689452]])