# Setup

In [2]:
!wget https://raw.githubusercontent.com/coll-j/Tugas-KK/master/Minggu-5/dataset-car-40-row.csv

--2020-11-03 02:41:23--  https://raw.githubusercontent.com/coll-j/Tugas-KK/master/Minggu-5/dataset-car-40-row.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4628 (4.5K) [text/plain]
Saving to: ‘dataset-car-40-row.csv’


2020-11-03 02:41:23 (32.6 MB/s) - ‘dataset-car-40-row.csv’ saved [4628/4628]



## Import Libraries

In [3]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_classif
from functools import partial
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
import random

## Read Dataset

Dataset cars 40 baris

In [4]:
df = pd.read_csv('dataset-car-40-row.csv', sep=';')
df.head()

Unnamed: 0,Make,Year,Engine Fuel Type,Engine HP,Engine Cylinders,Transmission Type,Driven_Wheels,Number of Doors,Market Category,Vehicle Size,Vehicle Style,highway MPG,city mpg,Popularity,MSRP
0,Audi,Old,regular unleaded,Low,6,MANUAL,front wheel drive,4,Luxury,Midsize,Sedan,Low,Low,High,Cheap
1,Audi,Old,regular unleaded,Low,6,MANUAL,front wheel drive,4,Luxury,Midsize,Sedan,Low,Low,High,Cheap
2,Audi,Old,regular unleaded,Low,6,AUTOMATIC,all wheel drive,4,Luxury,Midsize,Wagon,Low,Low,High,Cheap
3,Audi,Old,regular unleaded,Low,6,AUTOMATIC,front wheel drive,4,Luxury,Midsize,Sedan,Low,Low,High,Cheap
4,Audi,New,diesel,Low,4,AUTOMATIC,front wheel drive,4,Luxury,Compact,Sedan,High,High,High,Cheap


# Function Definitions

## Calculate Entropy

In [5]:
# Define the calculate entropy function
def calculate_entropy(df_label):
    classes,class_counts = np.unique(df_label,return_counts = True)
    entropy_value = np.sum([(-class_counts[i]/np.sum(class_counts))*np.log2(class_counts[i]/np.sum(class_counts)) 
                        for i in range(len(classes))])
    return entropy_value

## Calculate Information Gain

In [6]:
# Define the calculate information gain function
def calculate_information_gain(dataset,feature,label): 
    # Calculate the dataset entropy
    dataset_entropy = calculate_entropy(dataset[label])   
    values,feat_counts= np.unique(dataset[feature],return_counts=True)
    
    # Calculate the weighted feature entropy                                # Call the calculate_entropy function
    weighted_feature_entropy = np.sum([(feat_counts[i]/np.sum(feat_counts))*calculate_entropy(dataset.where(dataset[feature]
                              ==values[i]).dropna()[label]) for i in range(len(values))])    
    feature_info_gain = dataset_entropy - weighted_feature_entropy
    return feature_info_gain

## Create Decision Tree

In [7]:
# Define the create decision tree function
def create_decision_tree(dataset, df, features, label, parent):
    
    datum = np.unique(df[label], return_counts = True)
    unique_data = np.unique(dataset[label])
    # print('len features', len(features))
    if len(unique_data) <= 1:
        return unique_data[0]
    
    elif len(dataset) == 0:
        return unique_data[np.argmax(datum[1])]
    
    elif len(features) == 0:
        return parent
    
    else:
        parent = unique_data[np.argmax(datum[1])]
        
        item_values = [calculate_information_gain(dataset, feature, label) for feature in features]
        
        optimum_feature_index = np.argmax(item_values)
        optimum_feature = features[optimum_feature_index]
        decision_tree = {optimum_feature:{}}
        features = [i for i in features if i != optimum_feature]
        
        for value in np.unique(dataset[optimum_feature]):
            min_data = dataset.where(dataset[optimum_feature] == value).dropna()
            
            min_tree = create_decision_tree(min_data, df, features, label, parent)
            
            decision_tree[optimum_feature][value] = min_tree
        
        return (decision_tree)

## Prediction functions

In [8]:
# Define the predict function
def predict(test_data, decision_tree):
    for nodes in decision_tree.keys():
        value = test_data[nodes]
        if value in decision_tree[nodes].keys():
          decision_tree = decision_tree[nodes][value]
        else:
          decision_tree = decision_tree[nodes][random.choice(list(decision_tree[nodes].keys()))]
        
        prediction = 0
        if type(decision_tree) is dict:
            prediction = predict(test_data, decision_tree)
        else:
            prediction = decision_tree
            break
            
    return prediction

In [9]:
def predict_multiple(input, decision_tree, print_each=False):
  predictions = []
  for i in range(len(input)):

    prediction = predict(input.iloc[i, :], decision_tree)
    predictions.append(prediction)

    if print_each:
      print('Input Data: ')
      print(input.iloc[i, :])
      
      print('Prediction: ')
      print(prediction)
      print()

  return predictions

# Feature Selection

In [10]:
oe = OrdinalEncoder()
X_enc = df.drop('MSRP', axis=1)
X_enc = oe.fit_transform(X_enc)
y = df.iloc[:, -1]

In [11]:
discrete_feat_idx = [i for i in range(14)]
print(discrete_feat_idx)
bestfeatures = SelectKBest(partial(mutual_info_classif, discrete_features=discrete_feat_idx), k='all')
fit = bestfeatures.fit(X_enc, y)
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(df.columns[:-1])
#concat two dataframes for better visualization 
featureScores = pd.concat([dfcolumns,dfscores],axis=1)
featureScores.columns = ['Specs','Score']  #naming the dataframe columns
print(featureScores.nlargest(4,'Score'))  #print 10 best features

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]
              Specs     Score
0              Make  0.392889
3         Engine HP  0.392769
1              Year  0.239931
2  Engine Fuel Type  0.214667


Selected features based on scores and p-value
- Make
- Engine HP
- Year
- Engine Fuel Type

In [12]:
# Set the features and label
features = ['Make', 'Engine HP', 'Year', 'Engine Fuel Type']
label = 'MSRP'
parent=None
df_train, df_test = train_test_split(df, test_size=0.25, random_state=42)
len(df_train)

30

In [13]:
df_train.head(n=10)

Unnamed: 0,Make,Year,Engine Fuel Type,Engine HP,Engine Cylinders,Transmission Type,Driven_Wheels,Number of Doors,Market Category,Vehicle Size,Vehicle Style,highway MPG,city mpg,Popularity,MSRP
25,Mercedes-Benz,Old,regular unleaded,Low,4,MANUAL,rear wheel drive,4,Luxury,Compact,Sedan,Medium,Low,Low,Cheap
9,BMW,New,premium unleaded,High,6,MANUAL,rear wheel drive,2,Luxury,Compact,Coupe,Medium,Medium,High,Expensive
13,BMW,New,premium unleaded,Medium,4,AUTOMATIC,all wheel drive,2,Luxury,Compact,Convertible,Medium,Medium,High,Expensive
31,Mercedes-Benz,Old,regular unleaded,Medium,6,AUTOMATIC,rear wheel drive,4,Luxury,Large,Sedan,Low,Low,Low,Cheap
34,Nissan,New,regular unleaded,High,6,AUTOMATIC,rear wheel drive,2,High-Performance,Compact,Convertible,Low,Low,Medium,Expensive
8,BMW,New,premium unleaded,High,6,MANUAL,rear wheel drive,2,Luxury,Compact,Convertible,Low,Low,High,Expensive
17,Chrysler,New,regular unleaded,High,6,AUTOMATIC,rear wheel drive,4,High-Performance,Large,Sedan,Medium,Low,Low,Expensive
24,Mercedes-Benz,Old,diesel,Low,5,AUTOMATIC,rear wheel drive,4,Luxury,Midsize,Sedan,Medium,Medium,Low,Cheap
0,Audi,Old,regular unleaded,Low,6,MANUAL,front wheel drive,4,Luxury,Midsize,Sedan,Low,Low,High,Cheap
33,Nissan,Old,regular unleaded,Low,4,MANUAL,rear wheel drive,2,Performance,Compact,Coupe,Low,Medium,Medium,Cheap


# Train/Create Decision Tree

In [14]:
# Train the decision tree model
decision_tree = create_decision_tree(df_train,df_train,features,label,parent)

# Prediction on Test Data

In [15]:
df_test_sliced = df_test[features]

## Result

In [18]:
prediction = predict_multiple(df_test_sliced, decision_tree)

df_test_sliced['prediction'] = prediction
df_test_sliced['Real value'] = df_test.iloc[:, -1]
df_test_sliced

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


Unnamed: 0,Make,Engine HP,Year,Engine Fuel Type,prediction,Real value
19,Chrysler,High,New,regular unleaded,Expensive,Cheap
16,Chrysler,High,New,regular unleaded,Expensive,Expensive
15,BMW,Low,New,premium unleaded,Cheap,Cheap
26,Mercedes-Benz,Low,Old,regular unleaded,Cheap,Cheap
4,Audi,Low,New,diesel,Cheap,Cheap
12,BMW,Medium,New,premium unleaded,Expensive,Expensive
37,Nissan,High,New,regular unleaded,Expensive,Cheap
27,Mercedes-Benz,Low,Old,regular unleaded,Cheap,Cheap
39,Volvo,Low,Old,regular unleaded,Cheap,Cheap
6,BMW,High,New,premium unleaded,Expensive,Expensive


In [23]:
tot_correct = np.sum(np.where(df_test_sliced['prediction'] == df_test_sliced['Real value'], 1, 0))
tot_data = len(df_test_sliced)
acc = tot_correct / tot_data
acc

0.8

# Side Note

In [17]:
decision_tree

{'Engine HP': {'High': 'Expensive',
  'Low': 'Cheap',
  'Medium': {'Make': {'Audi': 'Cheap',
    'BMW': 'Expensive',
    'Chrysler': 'Cheap',
    'Mercedes-Benz': 'Cheap'}}}}

In [21]:
sample_data = {'Make':'Mazda', 'Engine HP':'Medium', 'Year':'New', 'Engine Fuel Type':'regular unleaded'}
test_data = pd.Series(sample_data)
predict(test_data, decision_tree)

'Expensive'