In [34]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split

## Get Data

In [2]:
# The raw UCI file ships without a header row, so column names are supplied here.
columns = (
    'buying',
    'maintenance',
    'doors',
    'persons',
    'lug_boot',
    'safety',
    'acceptability',
)
url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/car/car.data'
data = pd.read_csv(filepath_or_buffer=url, names=columns, header=None)

## Investigate Data

In [3]:
# Preview the first rows to confirm the column names were applied as intended.
data.head()

Unnamed: 0,buying,maintenance,doors,persons,lug_boot,safety,acceptability
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc


## Preprocessing

### One-Hot Encode

In [4]:
# Every predictor is a string-valued categorical, so each one is expanded into
# indicator columns; drop_first=True omits one level per feature to avoid
# perfectly collinear columns. The target column is left untouched.
features = [
    'buying',
    'maintenance',
    'doors',
    'persons',
    'lug_boot',
    'safety',
]
data = pd.get_dummies(data=data, columns=features, drop_first=True)
data.head()

Unnamed: 0,acceptability,buying_low,buying_med,buying_vhigh,maintenance_low,maintenance_med,maintenance_vhigh,doors_3,doors_4,doors_5more,persons_4,persons_more,lug_boot_med,lug_boot_small,safety_low,safety_med
0,unacc,0,0,1,0,0,1,0,0,0,0,0,0,1,1,0
1,unacc,0,0,1,0,0,1,0,0,0,0,0,0,1,0,1
2,unacc,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0
3,unacc,0,0,1,0,0,1,0,0,0,0,0,1,0,1,0
4,unacc,0,0,1,0,0,1,0,0,0,0,0,1,0,0,1


### Target --> Categorical

In [5]:
# Store the class labels as a pandas categorical instead of plain strings.
data = data.astype({'acceptability': 'category'})

## Check Data

In [6]:
# Print column dtypes and non-null counts to verify the encoded frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1728 entries, 0 to 1727
Data columns (total 16 columns):
acceptability        1728 non-null category
buying_low           1728 non-null uint8
buying_med           1728 non-null uint8
buying_vhigh         1728 non-null uint8
maintenance_low      1728 non-null uint8
maintenance_med      1728 non-null uint8
maintenance_vhigh    1728 non-null uint8
doors_3              1728 non-null uint8
doors_4              1728 non-null uint8
doors_5more          1728 non-null uint8
persons_4            1728 non-null uint8
persons_more         1728 non-null uint8
lug_boot_med         1728 non-null uint8
lug_boot_small       1728 non-null uint8
safety_low           1728 non-null uint8
safety_med           1728 non-null uint8
dtypes: category(1), uint8(15)
memory usage: 27.3 KB


## Split Target

In [7]:
# Separate the label column from the predictors: `target` keeps the labels and
# `data` is left holding only the encoded feature columns.
target = data['acceptability']
data = data.drop(columns='acceptability')

## Check Class Balance

In [8]:
# Count the rows per class to see how imbalanced the target is.
# Uses the Series method rather than the top-level `pd.value_counts`, which is
# deprecated in recent pandas; sort=False keeps the classes in category order
# instead of ordering them by frequency.
target.value_counts(sort=False)

acc       384
good       69
unacc    1210
vgood      65
dtype: int64

## Run Models

In [23]:
# Only some solvers support the l1 penalty. Name `liblinear` explicitly so these
# models do not depend on the installed scikit-learn's default solver (newer
# releases default to `lbfgs`, which rejects penalty='l1').
lr_unbalanced = LogisticRegression(penalty='l1', solver='liblinear')
# Same model, but with class weights inversely proportional to class frequency
# to compensate for the skewed target.
lr_balanced = LogisticRegression(penalty='l1', solver='liblinear', class_weight='balanced')

## Cross-Val Scores

In [26]:
# Mean 10-fold cross-validated accuracy of the unweighted model.
cross_val_score(
    estimator=lr_unbalanced,
    X=data,
    y=target,
    scoring='accuracy',
    cv=10,
    n_jobs=-1,
).mean()

0.8156833250018074

In [27]:
# Mean 10-fold cross-validated accuracy of the class-weighted model.
cross_val_score(
    estimator=lr_balanced,
    X=data,
    y=target,
    scoring='accuracy',
    cv=10,
    n_jobs=-1,
).mean()

0.82364207512490994

## Grid Search

In [60]:
# The grid compares l1 against l2 regularisation, so the base estimator must use
# a solver that accepts both. `liblinear` does; the default solver in newer
# scikit-learn releases (`lbfgs`) would fail on the 'l1' candidate.
model = LogisticRegression(solver='liblinear')
params = {'penalty': ('l1', 'l2')}
# return_train_score=True also records training-fold scores, which makes the
# train/test gap visible in the results.
gs = GridSearchCV(model, params, cv=10, return_train_score=True)

In [61]:
# Run the 10-fold search over the penalty grid on the full feature matrix.
gs.fit(X=data, y=target)

GridSearchCV(cv=10, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'penalty': ('l1', 'l2')}, pre_dispatch='2*n_jobs',
       refit=True, return_train_score=True, scoring=None, verbose=0)

In [62]:
# Inspect timings plus per-fold and mean train/test scores for each penalty setting.
gs.cv_results_

{'mean_fit_time': array([ 0.03457668,  0.01051238]),
 'mean_score_time': array([ 0.0006649 ,  0.00082512]),
 'mean_test_score': array([ 0.81539352,  0.79398148]),
 'mean_train_score': array([ 0.90033775,  0.87899007]),
 'param_penalty': masked_array(data = ['l1' 'l2'],
              mask = [False False],
        fill_value = ?),
 'params': [{'penalty': 'l1'}, {'penalty': 'l2'}],
 'rank_test_score': array([1, 2], dtype=int32),
 'split0_test_score': array([ 0.75862069,  0.73563218]),
 'split0_train_score': array([ 0.9028314 ,  0.88288288]),
 'split1_test_score': array([ 0.6954023 ,  0.67816092]),
 'split1_train_score': array([ 0.91312741,  0.89253539]),
 'split2_test_score': array([ 0.77586207,  0.81034483]),
 'split2_train_score': array([ 0.91634492,  0.88931789]),
 'split3_test_score': array([ 0.79310345,  0.74137931]),
 'split3_train_score': array([ 0.8970399 ,  0.88352638]),
 'split4_test_score': array([ 0.82080925,  0.80346821]),
 'split4_train_score': array([ 0.89903537,  0.8745980

## OVR vs Multinomial

In [65]:
# Hold out 20% of the rows for the OVR-vs-multinomial comparison; the fixed
# random_state keeps the split reproducible. `train_test_split` is imported with
# the other dependencies in the notebook's top import cell.
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2, random_state=42)

In [70]:
# Two estimators that differ only in how the multiclass problem is posed; the
# solver, penalty and seed are held constant so the comparison is like-for-like.
# NOTE(review): newer scikit-learn releases deprecate `multi_class` - confirm
# against the installed version before upgrading.
ovr = LogisticRegression(
    multi_class='ovr',
    solver='saga',
    penalty='l1',
    random_state=12,
)
multi = LogisticRegression(
    multi_class='multinomial',
    solver='saga',
    penalty='l1',
    random_state=12,
)

In [71]:
# Train both variants on the same training split so their outputs are comparable.
ovr.fit(X=X_train, y=y_train)
multi.fit(X=X_train, y=y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='multinomial',
          n_jobs=1, penalty='l1', random_state=12, solver='saga',
          tol=0.0001, verbose=0, warm_start=False)

### Normalized Probabilities

In [79]:
# Predicted class probabilities from the one-vs-rest model for the first test row.
ovr.predict_proba(X_test)[0]

array([  1.46338352e-02,   8.85751013e-06,   9.85145618e-01,
         2.11689294e-04])

### True Probabilities

In [80]:
# Predicted class probabilities from the multinomial model for the same first test row.
multi.predict_proba(X_test)[0]

array([  4.44258064e-05,   1.10018255e-08,   9.99955435e-01,
         1.27718727e-07])

## Summary: OVR vs Multinomial

OVR
* fast
* great for classifying multiple classes
* probabilities are not jointly estimated: each class gets its own binary (one-vs-rest) model, and the per-class scores are then normalized to sum to 1

Multinomial
* slower
* great for classifying multiple classes
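* probabilities come from a single softmax model fitted over all classes at once, so they form a proper joint distribution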