In [1]:
import numpy as np
import pandas as pd

carData = pd.read_csv('motor_vehicles.csv')
print(carData.shape)  # gives a tuple with the shape of data frame

(30, 5)


In [2]:
print(carData.head())

    price maintenance  trunk safety RATING
0     low   high_cost    med   good   HIGH
1  medium    low_cost    big   poor    LOW
2    high    low_cost    big   good   HIGH
3  luxury   high_cost  small   poor    LOW
4  medium    med_cost  small    ave    LOW


In [3]:
splitRatio = .85

In [9]:
price_labels = ['low', 'medium', 'high', 'luxury']
maint_labels = ['low_cost', 'med_cost', 'high_cost', 'very_high_cost']
trunk_labels = ['small', 'med', 'big']
safety_labels = ['poor', 'ave', 'good']

In [11]:
# Do the shuffle!
reCarData = carData.reindex(np.random.permutation(carData.index))
shuffCarData = reCarData.reset_index()

In [13]:
shuffCarData

Unnamed: 0,index,price,maintenance,trunk,safety,RATING
0,5,high,very_high_cost,med,poor,LOW
1,18,luxury,high_cost,small,ave,LOW
2,2,high,low_cost,big,good,HIGH
3,1,medium,low_cost,big,poor,LOW
4,26,luxury,med_cost,small,ave,LOW
5,28,low,high_cost,small,ave,LOW
6,12,high,high_cost,big,good,LOW
7,7,high,very_high_cost,small,poor,LOW
8,9,high,med_cost,big,good,HIGH
9,15,low,low_cost,big,poor,HIGH


In [15]:
shuffCarData.pop('index')

0      5
1     18
2      2
3      1
4     26
5     28
6     12
7      7
8      9
9     15
10     0
11    25
12    16
13    22
14    10
15    20
16     4
17    17
18     8
19    11
20    27
21    14
22    21
23     6
24    23
25    13
26    24
27     3
28    29
29    19
Name: index, dtype: int64

In [17]:
shuffCarData

Unnamed: 0,price,maintenance,trunk,safety,RATING
0,high,very_high_cost,med,poor,LOW
1,luxury,high_cost,small,ave,LOW
2,high,low_cost,big,good,HIGH
3,medium,low_cost,big,poor,LOW
4,luxury,med_cost,small,ave,LOW
5,low,high_cost,small,ave,LOW
6,high,high_cost,big,good,LOW
7,high,very_high_cost,small,poor,LOW
8,high,med_cost,big,good,HIGH
9,low,low_cost,big,poor,HIGH


In [19]:
# function definition: getting the counts (frequencies of occurrence)
def count(data,colname,label,target):    # for whichever data set, pick a column and one value in that column
                                         # and an output label (target) to compare against
    condition = (data[colname] == label) & (data['RATING'] == target)
    print(data[condition])
    return len(data[condition])

In [21]:
# list of predicted values
predicted = []

# dictionary to store probabilities
probabilities = {0:{},1:{}}

#length of current training set
train_len = int(splitRatio * len(shuffCarData))
print(train_len)

#splitting the data into training and test sets
training = shuffCarData.iloc[:train_len,:] # the first train_len rows, all columns

25


In [23]:
# debugging step: looking at the training set
training

Unnamed: 0,price,maintenance,trunk,safety,RATING
0,high,very_high_cost,med,poor,LOW
1,luxury,high_cost,small,ave,LOW
2,high,low_cost,big,good,HIGH
3,medium,low_cost,big,poor,LOW
4,luxury,med_cost,small,ave,LOW
5,low,high_cost,small,ave,LOW
6,high,high_cost,big,good,LOW
7,high,very_high_cost,small,poor,LOW
8,high,med_cost,big,good,HIGH
9,low,low_cost,big,poor,HIGH


In [25]:
test_X = shuffCarData.iloc[train_len:,:-1] # features in the test set
test_y = shuffCarData.iloc[train_len:,-1]  # targets (the last entry in each row; i.e., the last column)

In [27]:
# debugging
print(test_X)

     price maintenance  trunk safety
25     low    low_cost    med    ave
26    high   high_cost    big   good
27  luxury   high_cost  small   poor
28     low   high_cost    big   good
29  luxury    med_cost    med   good


In [29]:
# debugging
print(test_y)

25    HIGH
26    HIGH
27     LOW
28    HIGH
29    HIGH
Name: RATING, dtype: object


In [31]:
# checking for dimension match
print(test_X.shape)
print(test_y.shape)

(5, 4)
(5,)


In [33]:
# counts of prior 'LOW' and 'HIGH' labels (targets)
count_LOW = count(training, 'RATING', 'LOW', 'LOW')
count_HIGH = count(training, 'RATING', 'HIGH', 'HIGH')

prior_LOW = count_LOW / len(training)
prior_HIGH = count_HIGH / len(training)

print('The prior \'LOW\' count is', count_LOW)
print('and \'HIGH\',',count_HIGH, '.')
print('The prior probability of \'LOW\' is', prior_LOW)
print('and of \'HIGH\',', prior_HIGH, '.')

     price     maintenance  trunk safety RATING
0     high  very_high_cost    med   poor    LOW
1   luxury       high_cost  small    ave    LOW
3   medium        low_cost    big   poor    LOW
4   luxury        med_cost  small    ave    LOW
5      low       high_cost  small    ave    LOW
6     high       high_cost    big   good    LOW
7     high  very_high_cost  small   poor    LOW
12  medium        med_cost    med   good    LOW
13  medium        med_cost    med    ave    LOW
16  medium        med_cost  small    ave    LOW
18  medium        med_cost  small   poor    LOW
19  medium  very_high_cost  small   poor    LOW
24  luxury       high_cost    big    ave    LOW
     price maintenance trunk safety RATING
2     high    low_cost   big   good   HIGH
8     high    med_cost   big   good   HIGH
9      low    low_cost   big   poor   HIGH
10     low   high_cost   med   good   HIGH
11  medium    med_cost   med    ave   HIGH
14    high    low_cost   med   good   HIGH
15     low    low_cost   bi

In [35]:
# TRAINING: learning the priors and class-conditionals from the training set
for col in training.columns[:-1]:
    probabilities[0][col] = {}
    probabilities[1][col] = {}

    if col == "price":
        for k in price_labels:
            count_k_LOW = count(training, col, k, 'LOW')
            count_k_HIGH = count(training, col, k, 'HIGH')
            probabilities[0][col][k] = count_k_LOW / count_LOW
            probabilities[1][col][k] = count_k_HIGH / count_HIGH
            probabilities[0][col][k] += 0.001 # throwing in a quick-and-dirty approximation of the
            probabilities[1][col][k] += 0.001 # "m estimate" for avoiding zeros in the product    
    else:
        if col == "maint_cost":
            for k in maint_labels:
                count_k_LOW = count(training, col, k, 'LOW')
                count_k_HIGH = count(training, col, k, 'HIGH')
                probabilities[0][col][k] = count_k_LOW / count_LOW
                probabilities[1][col][k] = count_k_HIGH / count_HIGH
                probabilities[0][col][k] += 0.001 # throwing in a quick-and-dirty approximation of the
                probabilities[1][col][k] += 0.001 # "m estimate" for avoiding zeros in the product
        else:
            if col == "trunk":
                for k in trunk_labels:
                    count_k_LOW = count(training, col, k, 'LOW')
                    count_k_HIGH = count(training, col, k, 'HIGH')
                    probabilities[0][col][k] = count_k_LOW / count_LOW
                    probabilities[1][col][k] = count_k_HIGH / count_HIGH
                    probabilities[0][col][k] += 0.001 # throwing in a quick-and-dirty approximation of the
                    probabilities[1][col][k] += 0.001 # "m estimate" for avoiding zeros in the product
            else:
                for k in safety_labels:
                    count_k_LOW = count(training, col, k, 'LOW')
                    count_k_HIGH = count(training, col, k, 'HIGH')
                    probabilities[0][col][k] = count_k_LOW / count_LOW
                    probabilities[1][col][k] = count_k_HIGH / count_HIGH
                    probabilities[0][col][k] += 0.001 # throwing in a quick-and-dirty approximation of the
                    probabilities[1][col][k] += 0.001 # "m estimate" for avoiding zeros in the product

  price maintenance  trunk safety RATING
5   low   high_cost  small    ave    LOW
   price maintenance trunk safety RATING
9    low    low_cost   big   poor   HIGH
10   low   high_cost   med   good   HIGH
15   low    low_cost   big   good   HIGH
22   low    med_cost   med    ave   HIGH
     price     maintenance  trunk safety RATING
3   medium        low_cost    big   poor    LOW
12  medium        med_cost    med   good    LOW
13  medium        med_cost    med    ave    LOW
16  medium        med_cost  small    ave    LOW
18  medium        med_cost  small   poor    LOW
19  medium  very_high_cost  small   poor    LOW
     price maintenance trunk safety RATING
11  medium    med_cost   med    ave   HIGH
17  medium    med_cost   med   good   HIGH
  price     maintenance  trunk safety RATING
0  high  very_high_cost    med   poor    LOW
6  high       high_cost    big   good    LOW
7  high  very_high_cost  small   poor    LOW
   price maintenance trunk safety RATING
2   high    low_cost   big 

In [37]:
import pprint

# Sample dictionary for demonstration
pp = pprint.PrettyPrinter(indent=4)

# Format each value in the nested dictionaries and print
formatted_probabilities = {
    key: {inner_key: {sub_key: "{:.3f}".format(sub_value) for sub_key, sub_value in inner_value.items()}
           for inner_key, inner_value in value.items()}
    for key, value in probabilities.items()
}

pp.pprint(formatted_probabilities)

{   0: {   'maintenance': {'ave': '0.001', 'good': '0.001', 'poor': '0.001'},
           'price': {   'high': '0.232',
                        'low': '0.078',
                        'luxury': '0.232',
                        'medium': '0.463'},
           'safety': {'ave': '0.463', 'good': '0.155', 'poor': '0.386'},
           'trunk': {'big': '0.232', 'med': '0.232', 'small': '0.539'}},
    1: {   'maintenance': {'ave': '0.001', 'good': '0.001', 'poor': '0.001'},
           'price': {   'high': '0.418',
                        'low': '0.334',
                        'luxury': '0.084',
                        'medium': '0.168'},
           'safety': {'ave': '0.334', 'good': '0.584', 'poor': '0.084'},
           'trunk': {'big': '0.418', 'med': '0.584', 'small': '0.001'}}}


In [39]:
# TESTING: reading rows from the test set, checking the value of each feature and comparing with the target outcome
for row in range(len(test_X)):
    prod_LOW = prior_LOW
    prod_HIGH = prior_HIGH
        
    for feature in test_X.columns:
        # Check if the feature value exists in the probabilities dictionary:
        if test_X[feature].iloc[row] in probabilities[0][feature]:
            prod_LOW *= probabilities[0][feature][test_X[feature].iloc[row]]
        else:
            prod_LOW *= 0  # or some default value if the feature value is not found

        if test_X[feature].iloc[row] in probabilities[1][feature]:
            prod_HIGH *= probabilities[1][feature][test_X[feature].iloc[row]]
        else:
            prod_HIGH *= 0  # or some default value if the feature value is not found
            
    # predicting the outcome
    if prod_LOW > prod_HIGH:
        predicted.append('LOW')
    else:
        predicted.append('HIGH')

# Initialize correct and incorrect counts outside the loop
correct = 0
incorrect = 0

for j in range(len(test_y)):
    if predicted[j] == 'LOW':
        if test_y.iloc[j] == 'LOW':
            correct += 1
        else:
            incorrect += 1
    else:
        if test_y.iloc[j] == 'LOW':
            incorrect += 1  # This should be incorrect since predicted is 'HIGH'
        else:
            correct += 1  # This should be correct since predicted is 'HIGH'

print('The accuracy is', correct / (correct + incorrect))

The accuracy is 0.8
