In [2]:
import pandas as pd
import numpy as np
from collections import Counter
from random import sample

# Sample dataset: 14 observations, written one row per observation.
# Every value is a string, including the 'True'/'False' entries of Windy.
_columns = ('Outlook', 'Temperature', 'Humidity', 'Windy', 'PlayTennis')
_rows = [
    ('Sunny', 'Hot', 'High', 'False', 'No'),
    ('Sunny', 'Hot', 'High', 'True', 'No'),
    ('Overcast', 'Hot', 'High', 'False', 'Yes'),
    ('Rain', 'Mild', 'High', 'False', 'Yes'),
    ('Rain', 'Cool', 'Normal', 'False', 'Yes'),
    ('Rain', 'Cool', 'Normal', 'True', 'No'),
    ('Overcast', 'Cool', 'Normal', 'True', 'Yes'),
    ('Sunny', 'Mild', 'High', 'False', 'No'),
    ('Sunny', 'Cool', 'Normal', 'False', 'Yes'),
    ('Rain', 'Mild', 'Normal', 'False', 'Yes'),
    ('Sunny', 'Mild', 'Normal', 'True', 'Yes'),
    ('Overcast', 'Mild', 'High', 'True', 'Yes'),
    ('Overcast', 'Hot', 'Normal', 'False', 'Yes'),
    ('Rain', 'Mild', 'High', 'True', 'No'),
]

# Pivot the rows into the column-name -> list-of-values mapping used below.
data = {
    name: [row[position] for row in _rows]
    for position, name in enumerate(_columns)
}

df = pd.DataFrame(data)

# Calculate entropy
def entropy(target_col):
    """Return the Shannon entropy, in bits, of the labels in ``target_col``.

    Args:
        target_col: Array-like of class labels, as accepted by ``np.unique``.

    Returns:
        The sum of ``-p * log2(p)`` over the relative frequency ``p`` of each
        distinct label (``0.0`` when there are no labels).
    """
    _, label_counts = np.unique(target_col, return_counts=True)
    probabilities = label_counts / np.sum(label_counts)
    return np.sum(-probabilities * np.log2(probabilities))

# Calculate information gain
def info_gain(data, split_attribute_name, target_name="PlayTennis"):
    """Return the information gain of splitting ``data`` on one attribute.

    The gain is the entropy of the target column minus the weighted average
    of the target entropies of the subsets produced by each distinct value of
    the split attribute.

    Args:
        data: DataFrame containing both the split attribute and the target.
        split_attribute_name: Column whose values define the partition.
        target_name: Column holding the class labels.

    Returns:
        ``entropy(target) - sum(weight_v * entropy(target | attribute == v))``.
    """
    total_entropy = entropy(data[target_name])

    split_column = data[split_attribute_name]
    values, counts = np.unique(split_column, return_counts=True)
    n_rows = np.sum(counts)

    # Select each subset with a boolean mask. The previous
    # ``data.where(mask).dropna()`` also discarded rows that had a missing
    # value in any *other* column, so the subset entropy could be computed on
    # fewer rows than the weight ``count / n_rows`` accounted for.
    weighted_entropy = np.sum([
        (count / n_rows) * entropy(data.loc[split_column == value, target_name])
        for value, count in zip(values, counts)
    ])
    return total_entropy - weighted_entropy

def _majority_class(labels):
    """Return the most frequent value in ``labels`` (first in sorted order on ties)."""
    classes, counts = np.unique(labels, return_counts=True)
    return classes[np.argmax(counts)]


# Build the decision tree using ID3 algorithm
def id3(data, original_data, features, target_attribute_name="PlayTennis", parent_node_class=None):
    """Recursively build an ID3 decision tree.

    Args:
        data: DataFrame of the examples reaching the current node.
        original_data: DataFrame used for the fallback label when ``data`` is
            empty.
        features: List of column names still available for splitting.
        target_attribute_name: Column holding the class labels.
        parent_node_class: Majority class of the parent node; returned when no
            features are left to split on.

    Returns:
        A class label for a leaf, otherwise a nested dict of the form
        ``{feature: {feature_value: subtree_or_label, ...}}``.
    """
    # The empty check must come first: an empty frame has zero distinct
    # classes, so the purity test below would index into an empty array.
    if len(data) == 0:
        return _majority_class(original_data[target_attribute_name])

    classes = np.unique(data[target_attribute_name])
    if len(classes) <= 1:
        return classes[0]

    if len(features) == 0:
        # A top-level call has no parent; fall back to this node's own
        # majority class rather than returning None as a label.
        if parent_node_class is None:
            return _majority_class(data[target_attribute_name])
        return parent_node_class

    parent_node_class = _majority_class(data[target_attribute_name])

    gains = [info_gain(data, feature, target_attribute_name) for feature in features]
    best_feature = features[np.argmax(gains)]
    remaining_features = [feature for feature in features if feature != best_feature]

    branches = {}
    for value in np.unique(data[best_feature]):
        # Boolean masking keeps every matching row; ``where(...).dropna()``
        # would also drop rows with missing values in unrelated columns.
        sub_data = data[data[best_feature] == value]
        branches[value] = id3(
            sub_data,
            original_data,
            remaining_features,
            target_attribute_name,
            parent_node_class,
        )
    return {best_feature: branches}

# Function to predict using a decision tree
def predict(query, tree, default=None):
    """Classify ``query`` by walking a decision tree built by ``id3``.

    Args:
        query: Mapping of feature name to feature value.
        tree: Nested dict ``{feature: {value: subtree_or_label}}``, or a bare
            label when the tree consists of a single leaf.
        default: Value returned when the query cannot be routed to a leaf
            (the split feature is absent from ``query`` or its value has no
            branch in the tree).

    Returns:
        The label of the leaf reached, or ``default``.
    """
    # A tree can degenerate to a single label; that label is the prediction.
    if not isinstance(tree, dict):
        return tree

    for key in query:
        if key not in tree:
            continue
        try:
            result = tree[key][query[key]]
        except (KeyError, TypeError):
            # No branch for this feature value.
            return default
        if isinstance(result, dict):
            # Propagate ``default`` so a miss deeper in the tree reports the
            # caller's fallback instead of None.
            return predict(query, result, default)
        return result

    return default

# Function to create a random forest
def random_forest(data, n_trees, n_features, target_attribute_name="PlayTennis"):
    """Train a list of ID3 trees on bootstrap samples of ``data``.

    Each tree is built from a bootstrap sample (rows drawn with replacement,
    same size as ``data``) and from ``n_features`` feature columns drawn
    without replacement once per tree.

    Args:
        data: DataFrame holding the feature columns and the target column.
        n_trees: Number of trees to build.
        n_features: Number of feature columns given to each tree.
        target_attribute_name: Name of the class-label column. Every other
            column is treated as a candidate feature.

    Returns:
        A list of ``n_trees`` trees as returned by ``id3``.

    Raises:
        ValueError: If ``n_features`` exceeds the number of feature columns
            (raised by ``random.sample``).
    """
    # Identify features by name rather than by position, so the target does
    # not have to be the last column of the frame.
    candidate_features = [
        column for column in data.columns if column != target_attribute_name
    ]

    trees = []
    for _ in range(n_trees):
        sample_indices = np.random.choice(len(data), size=len(data), replace=True)
        bootstrap_sample = data.iloc[sample_indices]
        selected_features = sample(candidate_features, n_features)
        trees.append(
            id3(bootstrap_sample, bootstrap_sample, selected_features, target_attribute_name)
        )

    return trees

# Function to predict using a random forest
def predict_forest(query, forest):
    """Return the majority-vote prediction of ``forest`` for ``query``.

    Args:
        query: Mapping of feature name to feature value.
        forest: Iterable of decision trees accepted by ``predict``.

    Returns:
        The label predicted by the most trees. Returns ``None`` when the
        forest is empty or no tree could classify the query.
    """
    # A tree that cannot route the query yields None. Treat that as an
    # abstention so it cannot outvote trees that did produce a label.
    votes = Counter(
        prediction
        for prediction in (predict(query, tree) for tree in forest)
        if prediction is not None
    )
    if not votes:
        return None
    return votes.most_common(1)[0][0]

# Train the random forest
n_trees = 5  # Number of trees in the forest
n_features = 2  # Number of feature columns randomly drawn for each tree
forest = random_forest(data=df, n_trees=n_trees, n_features=n_features)

# Predict using the random forest; query values are strings, like the training data
query = dict(Outlook='Sunny', Temperature='Cool', Humidity='High', Windy='True')
prediction = predict_forest(query=query, forest=forest)

print(f"Random Forest Prediction: {prediction}")


Random Forest Prediction: No
