In [12]:
import pandas as pd
import math
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import OneHotEncoder

In [13]:
# Load the training and test sets from their CSV files into pandas DataFrames
TRAIN_CSV = 'ML_DATA.csv'
TEST_CSV = 'ML_DATA2.csv'
train_data = pd.read_csv(TRAIN_CSV)
test_data = pd.read_csv(TEST_CSV)

In [22]:
# Stack the training rows followed by the test rows into one frame so both
# go through the same preprocessing; the index is renumbered from 0.
frames_in_order = [train_data, test_data]
combined_data = pd.concat(frames_in_order, ignore_index=True)
combined_data.head()

Unnamed: 0,Age,Has_Job,Own_House,Credit_Rating,Class
0,Young,False,False,fair,No
1,Young,False,False,Good,No
2,Young,True,False,Good,Yes
3,Young,True,True,fair,Yes
4,Young,False,False,fair,No


In [25]:
# Perform one-hot encoding for categorical variables
categorical_features = ['Age', 'Has_Job', 'Own_House', 'Credit_Rating']

# Learn the category vocabulary from the training rows only, so nothing about
# the test set leaks into preprocessing (and encoder.categories_ reflects the
# training data alone). handle_unknown="ignore" makes any category that appears
# only in the test rows encode as all zeros instead of raising.
encoder = OneHotEncoder(handle_unknown="ignore")
encoder.fit(train_data[categorical_features])

# Encode the combined frame in one pass so the row order (training rows first,
# then test rows) is preserved for the later split.
encoded_data = encoder.transform(combined_data[categorical_features])



In [16]:
# Undo the concatenation: the first len(train_data) encoded rows belong to the
# training set, everything after that belongs to the test set.
n_train_rows = len(train_data)
train_encoded_data = encoded_data[:n_train_rows]
test_encoded_data = encoded_data[n_train_rows:]


In [17]:
# Assemble the model inputs: encoded feature matrices (X) for both sets and the
# 'Class' column of the training frame as the target (y).
X_train, X_test = train_encoded_data, test_encoded_data
y_train = train_data['Class']


In [31]:
# Entropy of the target variable (Class): H(S) = -sum(p_c * log2(p_c)) over classes c
class_counts = y_train.value_counts()  # number of occurrences of each class label
total_examples = len(y_train)  # total number of training examples

# Each class contributes -p * log2(p), where p is its share of the examples
entropy_s = sum(
    -(count / total_examples) * math.log2(count / total_examples)
    for count in class_counts
)

print("Entropy(S):", entropy_s)

Entropy(S): 0.9709505944546686


In [19]:
# Information gain of each attribute: Gain(A) = Entropy(S) - sum_v (|S_v| / |S|) * Entropy(S_v)
information_gains = {}

# encoder.categories_ holds one array of category values per encoded column,
# in the same order as categorical_features, so the two can be walked together.
for attribute, attribute_values in zip(categorical_features, encoder.categories_):
    weighted_entropy = 0  # running weighted sum of the per-value entropies

    for value in attribute_values:
        # Class labels of the training rows where this attribute takes this value
        subset_labels = train_data.loc[train_data[attribute] == value, 'Class']
        subset_size = len(subset_labels)

        # Entropy of the class labels inside this subset
        subset_entropy = 0
        for class_count in subset_labels.value_counts():
            fraction = class_count / subset_size
            subset_entropy -= fraction * math.log2(fraction)

        # Weight the subset's entropy by the share of training examples it covers
        weighted_entropy += (subset_size / total_examples) * subset_entropy

    # Gain = entropy of the target minus the attribute's weighted entropy
    information_gain = entropy_s - weighted_entropy
    information_gains[attribute] = information_gain

    print("Information Gain({}): {}".format(attribute, information_gain))

Information Gain(Age): 0.08300749985576883
Information Gain(Has_Job): 0.32365019815155627
Information Gain(Own_House): 0.4199730940219749
Information Gain(Credit_Rating): 0.36298956253708536


In [40]:
# Preview the first 15 rows of the combined (training + test) frame
combined_data.head(n=15)

Unnamed: 0,Age,Has_Job,Own_House,Credit_Rating,Class
0,Young,False,False,fair,No
1,Young,False,False,Good,No
2,Young,True,False,Good,Yes
3,Young,True,True,fair,Yes
4,Young,False,False,fair,No
5,Middle,False,False,fair,No
6,Middle,False,False,Good,No
7,Middle,True,True,Good,Yes
8,Middle,False,True,excellent,Yes
9,Middle,False,True,excellent,Yes


In [26]:
# Create the decision tree classifier and fit it on the encoded training data.
# random_state is pinned so the fitted tree is reproducible: scikit-learn
# permutes the candidate features at every split, so ties between equally good
# splits can otherwise be resolved differently from one run to the next.
# NOTE(review): the default split criterion is Gini impurity, while the analysis
# above ranks attributes by entropy-based information gain — consider
# criterion="entropy" if the tree is meant to mirror that analysis.
classifier = DecisionTreeClassifier(random_state=0)
classifier.fit(X_train, y_train)



In [41]:
# Predict the class for a single example taken from the encoded test set
example_row = 12  # position of the example within X_test
new_example = X_test[example_row].reshape(1, -1)  # shape (1, n_features), as predict expects
predicted_class = classifier.predict(new_example)

print("Predicted Class:", predicted_class)

Predicted Class: ['Yes']
