In [1]:
import numpy as np
import pandas
import scipy.stats
import statsmodels.formula.api as smf
import statsmodels.stats.multicomp as multi 
%matplotlib inline
import matplotlib.pyplot as plt

# Car Evaluation Database 

The Car Evaluation Database is part of the UC Irvine Machine Learning Repository. The data can be found at this [link](http://archive.ics.uci.edu/ml/datasets/Car+Evaluation). The dataset contains 7 attributes:
    1. The class, how clients accepted the car. This attribute has 4 values: unacc, acc, good, vgood
    2. buying: vhigh, high, med, low. 
    3. maint: vhigh, high, med, low.
    4. doors: 2, 3, 4, 5more 
    5. persons: 2, 4, more. 
    6. lug_boot: small, med, big. 
    7. safety: low, med, high.


In [2]:
# Column labels in file order; passing them via `names=` makes pandas
# treat the first line of the file as data rather than as a header row.
header = np.array(
    ['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety', 'class']
)
url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/car/car.data'
data = pandas.read_csv(url, names=header, low_memory=False)

In [3]:
# Collapse the four-level class into a binary recommendation:
# 'unacc' and 'acc' -> 'dontBuy'; every other class value -> 'buy'.
data['recommend'] = data['class'].map(
    lambda label: 'dontBuy' if label in ('unacc', 'acc') else 'buy'
)

To simplify the decision making, we use only 2 groups: if the class is either 'unacc' or 'acc' we recommend not buying ('dontBuy'); otherwise we recommend buying ('buy').
The research question is whether there is a relationship between the safety level and the recommendation. The null hypothesis (H0) is that there is no relationship, i.e. the proportion of 'buy' recommendations is the same across all safety levels.

In [4]:
# Observed counts: recommendation (rows) by safety level (columns).
ct = pandas.crosstab(index=data['recommend'], columns=data['safety'])
print(ct)

safety     high  low  med
recommend                
buy          95    0   39
dontBuy     481  576  537


In [5]:
# Column totals (one per safety level), then each cell as a share of its
# column, so every column of `colpct` sums to 1.
colsum = ct.sum(axis=0)
colpct = ct.div(colsum, axis='columns')
print(colpct)

safety         high  low       med
recommend                         
buy        0.164931    0  0.067708
dontBuy    0.835069    1  0.932292


In [6]:
# Omnibus chi-square test of independence on the full 2x3 table.
# The result holds (chi2 statistic, p-value, degrees of freedom, expected counts).
cs = scipy.stats.chi2_contingency(observed=ct)
print(cs)

(110.68791550403567, 9.2134979089091716e-25, 2, array([[  44.66666667,   44.66666667,   44.66666667],
       [ 531.33333333,  531.33333333,  531.33333333]]))


Since the p-value is very small (9.21e-25), we can reject H0 and conclude that there is a relationship between the safety level and the recommendation (whether to buy). However, the omnibus test does not tell us which safety levels differ from each other, so we follow up with pairwise post hoc tests.

In [7]:

def chi2ByCat(recodeObj, df=None):
    """Print a chi-square test of recommendation vs. a subset of safety levels.

    Used for pairwise post hoc comparisons: the ``safety`` column is recoded
    through ``recodeObj`` and cross-tabulated against ``recommend``.

    Parameters
    ----------
    recodeObj : dict
        Mapping passed to ``Series.map`` on the ``safety`` column. Safety
        levels that are not keys of the mapping become NaN and therefore do
        not appear in the cross-tabulation.
    df : pandas.DataFrame, optional
        Frame holding the ``safety`` and ``recommend`` columns. When omitted,
        the notebook-level ``data`` frame is used.

    Returns
    -------
    None
        The counts table, the column proportions and the result of
        ``scipy.stats.chi2_contingency`` are printed, in that order.
    """
    if df is None:
        df = data
    # Recode into a standalone Series instead of writing a scratch column
    # into the frame: the input is left untouched, so repeated calls cannot
    # leak state into each other. The Series is named 'COMP_rec' so the
    # printed table keeps the same column header.
    comp_rec = df['safety'].map(recodeObj).rename('COMP_rec')
    ct = pandas.crosstab(df['recommend'], comp_rec)
    print (ct)
    # Each cell as a share of its column total.
    colsum = ct.sum(axis=0)
    colpct = ct / colsum
    print(colpct)
    cs = scipy.stats.chi2_contingency(ct)
    print (cs)


In [8]:
# Post hoc comparison 1 of 3: high vs. low safety.
recode1 = dict(high="high", low="low")
chi2ByCat(recode1)

COMP_rec   high  low
recommend           
buy          95    0
dontBuy     481  576
COMP_rec       high  low
recommend               
buy        0.164931    0
dontBuy    0.835069    1
(101.37003435741671, 7.6309496731851569e-24, 1, array([[  47.5,   47.5],
       [ 528.5,  528.5]]))


In [9]:
# Post hoc comparison 2 of 3: high vs. med safety.
recode1 = dict(high="high", med="med")
chi2ByCat(recode1)

COMP_rec   high  med
recommend           
buy          95   39
dontBuy     481  537
COMP_rec       high       med
recommend                    
buy        0.164931  0.067708
dontBuy    0.835069  0.932292
(25.546139635809165, 4.319293625897001e-07, 1, array([[  67.,   67.],
       [ 509.,  509.]]))


In [10]:
# Post hoc comparison 3 of 3: low vs. med safety.
recode1 = dict(low="low", med="med")
chi2ByCat(recode1)

COMP_rec   low  med
recommend          
buy          0   39
dontBuy    576  537
COMP_rec   low       med
recommend               
buy          0  0.067708
dontBuy      1  0.932292
(38.323035455110933, 5.9950769745059618e-10, 1, array([[  19.5,   19.5],
       [ 556.5,  556.5]]))


In [11]:
# Bonferroni-style adjustment: the 0.05 significance level is split across
# the 3 pairwise comparisons run above.
print("the adjusted ratio: ", 0.05 / 3)

the adjusted ratio:  0.016666666666666666


To conclude, as we might expect, safety level is a major factor: it is strongly associated with the decision of whether to buy a car or not. Since all three pairwise p-values are smaller than the Bonferroni-adjusted significance level (0.05 / 3 ≈ 0.0167), the proportion of 'buy' recommendations differs significantly between every pair of safety levels.