In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder,OrdinalEncoder,OneHotEncoder
from itertools import product 
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold

In [3]:
SEED = 20

# For RepeatedStratifiedKFold
N_REPEATS=3

# For GridSearchCV 
N_JOBS=-1
SCORING='accuracy'
ERROR_SCORE=0

# Importing Data

In [4]:
col_names = ["buying", "maint", "doors", "persons", "lug_boot", "safety", "class"]
cars = pd.read_csv("car.data", header=None, names=col_names)

# Exploring Data

In [5]:
cars.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc


In [6]:
cars.describe()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class
count,1728,1728,1728,1728,1728,1728,1728
unique,4,4,4,3,3,3,4
top,vhigh,vhigh,2,2,small,low,unacc
freq,432,432,432,576,576,576,1210


In [7]:
# to check for any null values
cars.isnull().sum(axis=0)

buying      0
maint       0
doors       0
persons     0
lug_boot    0
safety      0
class       0
dtype: int64

From the results above, there is no null value in any of the columns

In [8]:
for col in cars:
    print(f"Unique values and count for column {col} :\n{cars[col].value_counts()}\n")

Unique values and count for column buying :
vhigh    432
high     432
med      432
low      432
Name: buying, dtype: int64

Unique values and count for column maint :
vhigh    432
high     432
med      432
low      432
Name: maint, dtype: int64

Unique values and count for column doors :
2        432
3        432
4        432
5more    432
Name: doors, dtype: int64

Unique values and count for column persons :
2       576
4       576
more    576
Name: persons, dtype: int64

Unique values and count for column lug_boot :
small    576
med      576
big      576
Name: lug_boot, dtype: int64

Unique values and count for column safety :
low     576
med     576
high    576
Name: safety, dtype: int64

Unique values and count for column class :
unacc    1210
acc       384
good       69
vgood      65
Name: class, dtype: int64



From the results above, it shows that most categorical variables have uniformly distributed values except for class.

In [9]:
# persons column is dropped as it is not provided in the test data
cars = cars.drop("persons", axis=1)

Other than dropping the column, the alternative is to impute the data, such as using mode or nearest neighbors. Using mode will not be useful as the unique values for persons column is uniformly distributed.

In [10]:
y = cars.pop("buying")
X = cars

In [11]:
def grid_search_model(X, y, model, grid):
    # X : inputs variables
    # Y : target variable
    # model : sklearn model
    # grid : grid that contains different parameters' values for the sklearn model
    
    # The function will run the models using different parameters and cross validations based on N_JOBS
    # Once the output of cross validation is available, the maximum value and parameters that yield that maximum value are stored
    
    cv = RepeatedStratifiedKFold(n_repeats=N_REPEATS, random_state=SEED)
    grid_search = GridSearchCV(estimator=model,
                               param_grid=grid,
                               n_jobs=N_JOBS,
                               cv=cv, 
                               scoring=SCORING)
                               #error_score=ERROR_SCORE)
    grid_result = grid_search.fit(X, y)
    max_means_score = np.nanmax( grid_result.cv_results_['mean_test_score'] )
    avg_means_score = np.nanmean( grid_result.cv_results_['mean_test_score'] )
    has_max_score =  grid_result.cv_results_['mean_test_score'] == max_means_score
    params_for_max_means_score =  np.array(grid_result.cv_results_['params']) [has_max_score]
    return max_means_score, avg_means_score, params_for_max_means_score

The function above will be used to run grid search on multiple models and models' parameters (i.e. grid). Some of the models that are included to predict the buying price are Random Forest, Logistic Regression, KNeighbours, Support Vector, and Decision Tree.  

# Random Forest Variations

In [12]:
rf_model = RandomForestClassifier()
max_depth_list = [10, 30, 40]
min_samples_split_list = [2, 20, 200]
min_samples_leaf_list = [1, 10, 100]
n_stimators_list = [10, 100, 200]
rf_grid = dict(max_depth=max_depth_list,
               min_samples_split=min_samples_split_list,
               min_samples_leaf=min_samples_leaf_list,
               n_estimators=n_stimators_list)

# Logistic Regression Variations

In [13]:
lr_model = LogisticRegression()
solver_list = ["newton-cg", "lbfgs", "liblinear"]
penalty_list = [ "none", "l2"]
C_list = [10, 1, 0.1]
lr_grid = dict(solver=solver_list,
               penalty=penalty_list,
               C=C_list)

# KNeighbours Variations

In [14]:
kn_model = KNeighborsClassifier()
n_neighbors_list = [5,10, 20]
weights_list = ["uniform", "distance"]
metric_list = ['euclidean', 'manhattan']
kn_grid = dict(n_neighbors=n_neighbors_list,
               weights=weights_list,
               metric=metric_list)

# SVC Variations

In [15]:
svc_model = SVC()
kernel_list = ['poly', 'rbf', 'sigmoid']
C_list = [50, 10, 1.0, 0.1, 0.01]
gamma_list = ['scale']
svc_grid = dict(kernel=kernel_list,
                C=C_list,
                gamma=gamma_list)

# DecisionTreeClassifier Variations

In [16]:
dt_model = DecisionTreeClassifier()
criterion_list = ["gini", "entropy"]
max_depth = [10, 30, 40]
min_samples_split_list = [2, 20, 200]
min_samples_leaf_list = [1, 10, 100]
dt_grid = dict(criterion=criterion_list,
               max_depth=max_depth_list,
               min_samples_split=min_samples_split_list,
               min_samples_leaf=min_samples_leaf_list)

In [17]:
models = [
    {"name" : "rf",
     "instance" : rf_model,
     "grid" : rf_grid},
    {"name" : "lr",
     "instance" : lr_model,
     "grid" : lr_grid},
    {"name" : "svc",
     "instance" : svc_model,
     "grid" : svc_grid},
    {"name" : "dt",
     "instance" : dt_model,
     "grid" : dt_grid}
]

X_encoders = [
    {"name" : "ordinal" ,
     "instance" : OrdinalEncoder()},
    {"name" : "onehot",
     "instance" : OneHotEncoder(drop="first", sparse=False)}
]

y_encoders = [
    {"name" : "label",
     "instance" : LabelEncoder()}
]

Ordinal encoder is useful if there is relationship between the categories. An example will be vhigh > high > med > low for maintenance price. On the other hand, one hot encoder is useful when there is no known relationship.

In [18]:
results = []

# to generate all possible combinations of these parameters
for params in product(*[X_encoders, y_encoders, models]):
    X_encoder, y_encoder, model = params
    print(f"Processing {model['name']}")
    curr_X_train = X_encoder["instance"].fit_transform(X)
    curr_y_train = y_encoder["instance"].fit_transform(y)
    max_means_score, avg_means_score, params_for_max_means_score = grid_search_model(curr_X_train,
                                                                                     curr_y_train,
                                                                                     model["instance"],
                                                                                     model["grid"])
    results += [{
        "x_encoder_name" : X_encoder["name"],
        "y_encoder_name" : y_encoder["name"],
        "model_name" : model["name"],
        "max_means_score" : max_means_score,
        "avg_means_score" : avg_means_score,
        "params_for_max_means_score" : params_for_max_means_score
    }]

Processing rf
Processing lr


45 fits failed out of a total of 270.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
45 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\tanid\Miniconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\tanid\Miniconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1461, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\tanid\Miniconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 464, in _check_solver
    raise ValueError("penalty='none' is not supported for the liblinear solver")
ValueError: penalty='none' is not supported for

Processing svc
Processing dt
Processing rf
Processing lr


45 fits failed out of a total of 270.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
45 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\tanid\Miniconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\tanid\Miniconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1461, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\tanid\Miniconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 464, in _check_solver
    raise ValueError("penalty='none' is not supported for the liblinear solver")
ValueError: penalty='none' is not supported for

Processing svc
Processing dt


In [19]:
results = pd.DataFrame.from_records(results)

In [20]:
results.head()

Unnamed: 0,x_encoder_name,y_encoder_name,model_name,max_means_score,avg_means_score,params_for_max_means_score
0,ordinal,label,rf,0.288582,0.227067,"[{'max_depth': 10, 'min_samples_leaf': 1, 'min..."
1,ordinal,label,lr,0.2367,0.233742,"[{'C': 0.1, 'penalty': 'l2', 'solver': 'liblin..."
2,ordinal,label,svc,0.280278,0.247326,"[{'C': 0.1, 'gamma': 'scale', 'kernel': 'rbf'}]"
3,ordinal,label,dt,0.278164,0.218024,"[{'criterion': 'entropy', 'max_depth': 10, 'mi..."
4,onehot,label,rf,0.293592,0.237386,"[{'max_depth': 10, 'min_samples_leaf': 1, 'min..."


In [21]:
best_result = results.iloc[results["max_means_score"].idxmax(), :]

In [25]:
best_result.max_means_score, best_result.x_encoder_name, best_result.model_name, best_result.params_for_max_means_score

(0.30902404289184887,
 'onehot',
 'lr',
 array([{'C': 10, 'penalty': 'none', 'solver': 'newton-cg'},
        {'C': 10, 'penalty': 'none', 'solver': 'lbfgs'},
        {'C': 1, 'penalty': 'none', 'solver': 'newton-cg'},
        {'C': 1, 'penalty': 'none', 'solver': 'lbfgs'},
        {'C': 0.1, 'penalty': 'none', 'solver': 'newton-cg'},
        {'C': 0.1, 'penalty': 'none', 'solver': 'lbfgs'}], dtype=object))

The result above is the encoder, model, and model's parameters of the config that yield the best means_score in terms of accuracy.

In [23]:
X_test = [{
    "maint" : "high",
    "doors" : "4",
    "lug_boot" : "big",
    "safety" : "high",
    "class" : "good"
}]
X_test = pd.DataFrame.from_records(X_test)
X_test

Unnamed: 0,maint,doors,lug_boot,safety,class
0,high,4,big,high,good


In [24]:
X_train = X_encoders[1]["instance"].fit_transform(X.to_numpy())
X_test = X_encoders[1]["instance"].transform(X_test.to_numpy())

y_train = y_encoders[0]["instance"].fit_transform(y.to_numpy())

svc_model = SVC(kernel='sigmoid', C=1, gamma='scale')
svc_model.fit(X_train, y_train)

prediction = svc_model.predict(X_test)
y_encoders[0]["instance"].inverse_transform(prediction)

array(['low'], dtype=object)

The baseline model has accuracy of 25%, which is by simply predicting only one class. As the best model's accuracy is 30.9%, it is slightly better than the baseline. However, more can be done 