In [11]:
#**** Question 3- Decision Tree ****
#Solution a)
# Step 1- Importing the libraries
import pandas as pd
import numpy as np

# Step 2- Loading Dataset
#wine_data_set is the URL from which we are loading dataset
wine_data_set = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv"
# Dataset are separated by ;, hence it is delimiter
wine_red_quality = pd.read_csv(wine_data_set, delimiter=';')

# Step 3- Separating the labels and the features
X = wine_red_quality.iloc[:, :-1].values  # ELeven first columns
y = wine_red_quality.iloc[:, -1].values   # lst column loaded in y

# Binary classification (if quality >= 7 then 1, else 0)
y = (y >= 7).astype(int)

# Step 4- Function information gain and entropy
def entropy(y):
    # Number of ccurances of each class 
    counts_class = np.bincount(y)
    # Converting counts to probabilities
    prob_class = counts_class/ len(y)
    return -np.sum([p * np.log2(p) for p in prob_class if p > 0])

def inform_gain(y, left_split_y, right_split_y):
    # Finding sample which fall into left split, p is its proportion
    p = len(left_split_y) / len(y)
    # Inform_gain is diffrence of parents entropy to weighted sum of child entropies
    return entropy(y) - p * entropy(left_split_y) - (1 - p) * entropy(right_split_y)

# Step 5- Decision Tree (DT) Class implementation
class DecisionTree:
    def __init__(self, max_depth=None):
        self.max_depth = max_depth
        self.tree = None
    

    #Building recursively DT
    def fit(self, X, y, depth=0):
        number_of_samples, number_of_features = X.shape
        if number_of_samples <= 1 or (self.max_depth is not None and depth >= self.max_depth):
            return np.bincount(y).argmax()
        #Find the best threshold and best feature
        best_feature, best_thres = self.best_split(X, y)
        if best_feature is None:
            return np.bincount(y).argmax()
        #determine which go to left child and right child
        indices_l = X[:, best_feature] < best_thres
        indices_r = X[:, best_feature] >= best_thres

        left_sub_tree = self.fit(X[indices_l], y[indices_l], depth + 1)
        right_sub_tree = self.fit(X[indices_r], y[indices_r], depth + 1)

        return (best_feature, best_thres, left_sub_tree, right_sub_tree)

    def best_split(self, X, y):
        #Based on inform gain best feature, thres and gain tracked
        gain = -1
        feature, thres_best = None, None
        num_samples, num_features = X.shape

        for current_feature in range(num_features):#iterate over feature
            thresholds = np.unique(X[:, feature])
            for threshold in thresholds:
                indices_l = X[:, current_feature] < threshold
                indices_r = X[:, current_feature] >= threshold
                if len(y[indices_l]) == 0 or len(y[indices_r]) == 0:
                    continue

                current_gain = inform_gain(y, y[indices_l], y[indices_r])
                if current_gain > gain:
                    gain = current_gain
                    feature = current_feature
                    thres_best = threshold

        return feature, thres_best # Reutrning best split means best thresthold and feature
    # Identify class for each sample in X (done by Predict)
    def predict(self, X):
        return [self._predict(inputs, self.tree) for inputs in X]
    #Recursively traverse tree
    def _predict(self, inputs, tree):
        if isinstance(tree, (int, np.int64)):
            return tree
        feature_predict, thres_predict, sub_tree_left, sub_tree_right = tree
        if inputs[feature_predict] < thres_predict:
            return self._predict(inputs, sub_tree_left)
        else:
            return self._predict(inputs, sub_tree_right)

# Step 6- DT(decision tree) training
deci_tree = DecisionTree(max_depth=5) #Initialize D tree with max depth of 5
deci_tree.tree = deci_tree.fit(X, y)

# Step 7- Using training data predict
predicts = deci_tree.predict(X)

# Finding the accuracy
accuracy_final = np.mean(predicts == y)
print(f"Training Accuracy: {accuracy_final:.4f}")


Training Accuracy: 0.8868


In [9]:
#**** Question 3- Decision Tree + cross valid ****
#Solution b)
# Step 1- Importing the libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold

# Step 2- Loading Dataset
#wine_data_set is the URL from which we are loading dataset
wine_data_set = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv"
# Dataset are separated by ;, hence it is delimiter
wine_red_quality = pd.read_csv(wine_data_set, delimiter=';')

# Step 3- Separating the labels and the features
X = wine_red_quality.iloc[:, :-1].values  # ELeven first columns
y = wine_red_quality.iloc[:, -1].values   # lst column loaded in y

# Binary classification (if quality >= 7 then 1, else 0)
y = (y >= 7).astype(int)

# Step 4- Function information gain and entropy
def entropy(y):
    # Number of ccurances of each class 
    counts_class = np.bincount(y)
    # Converting counts to probabilities
    prob_class = counts_class/ len(y)
    return -np.sum([p * np.log2(p) for p in prob_class if p > 0])

def inform_gain(y, left_split_y, right_split_y):
    # Finding sample which fall into left split, p is its proportion
    p = len(left_split_y) / len(y)
    # Inform_gain is diffrence of parents entropy to weighted sum of child entropies
    return entropy(y) - p * entropy(left_split_y) - (1 - p) * entropy(right_split_y)

# Step 5- Decision Tree (DT) Class implementation
class DecisionTree:
    def __init__(self, max_depth=None):
        self.max_depth = max_depth
        self.tree = None
    

    #Building recursively DT
    def fit(self, X, y, depth=0):
        number_of_samples, number_of_features = X.shape
        if number_of_samples <= 1 or (self.max_depth is not None and depth >= self.max_depth):
            return np.bincount(y).argmax()
        #Find the best threshold and best feature
        best_feature, best_thres = self.best_split(X, y)
        if best_feature is None:
            return np.bincount(y).argmax()
        #determine which go to left child and right child
        indices_l = X[:, best_feature] < best_thres
        indices_r = X[:, best_feature] >= best_thres

        left_sub_tree = self.fit(X[indices_l], y[indices_l], depth + 1)
        right_sub_tree = self.fit(X[indices_r], y[indices_r], depth + 1)

        return (best_feature, best_thres, left_sub_tree, right_sub_tree)

    def best_split(self, X, y):
        #Based on inform gain best feature, thres and gain tracked
        gain = -1
        feature, thres_best = None, None
        num_samples, num_features = X.shape

        for current_feature in range(num_features):#iterate over feature
            thresholds = np.unique(X[:, feature])
            for threshold in thresholds:
                indices_l = X[:, current_feature] < threshold
                indices_r = X[:, current_feature] >= threshold
                if len(y[indices_l]) == 0 or len(y[indices_r]) == 0:
                    continue

                current_gain = inform_gain(y, y[indices_l], y[indices_r])
                if current_gain > gain:
                    gain = current_gain
                    feature = current_feature
                    thres_best = threshold

        return feature, thres_best # Reutrning best split means best thresthold and feature
    # Identify class for each sample in X (done by Predict)
    def predict(self, X):
        return [self._predict(inputs, self.tree) for inputs in X]
    #Recursively traverse tree
    def _predict(self, inputs, tree):
        if isinstance(tree, (int, np.int64)):
            return tree
        feature_predict, thres_predict, sub_tree_left, sub_tree_right = tree
        if inputs[feature_predict] < thres_predict:
            return self._predict(inputs, sub_tree_left)
        else:
            return self._predict(inputs, sub_tree_right)

# Step 6- DT(decision tree) training
deci_tree = DecisionTree(max_depth=5) #Initialize D tree with max depth of 5
deci_tree.tree = deci_tree.fit(X, y)

# Step 7- Using training data predict
predicts = deci_tree.predict(X)

# Finding the accuracy
accuracy_final = np.mean(predicts == y)
print(f"Training Accuracy: {accuracy_final:.4f}")

# Step 8- Funcntion for trainig tree
def tree_train(X_train, y_train):
    tree = DecisionTree(max_depth=5)
    tree.tree = tree.fit(X_train, y_train)
    return tree

# Step 7: Define cross validation function
def cross_valid_accu(X, y, folds=10):
    ten_folds = KFold(n_splits=folds, shuffle=True, random_state=42)
    accuracies = []

    #For each fold generate indices for testing and training set
    for ind_train, ind_test in ten_folds.split(X):
        #for features
        X_train, X_test = X[ind_train], X[ind_test]
        #for labels
        y_train, y_test = y[ind_train], y[ind_test]
        
        model = tree_train(X_train, y_train)
        # Now make prediction on test set
        predict_cross_valid = model.predict(X_test)
        # Calculating accuracy by comparing labels
        accu_cross_valid= np.mean(predict_cross_valid == y_test) #Accuracy for each fold is putted here
        accuracies.append(accu_cross_valid)
    
    return np.mean(accuracies)

# Perform cross-validation
cross_validation_accuracy = cross_valid_accu(X, y)
print(f'Cross-Validation Accuracy: {cross_validation_accuracy:.4f}')


Training Accuracy: 0.8868
Cross-Validation Accuracy: 0.8762


In [10]:
#**** Question 3- Decision Tree ****
#Solution c)
# Step 1- Importing the libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, GridSearchCV


# Step 2- Loading Dataset
#wine_data_set is the URL from which we are loading dataset
wine_data_set = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv"
# Dataset are separated by ;, hence it is delimiter
wine_red_quality = pd.read_csv(wine_data_set, delimiter=';')

# Step 3- Separating the labels and the features
X = wine_red_quality.iloc[:, :-1].values  # ELeven first columns
y = wine_red_quality.iloc[:, -1].values   # lst column loaded in y

# Binary classification (if quality >= 7 then 1, else 0)
y = (y >= 7).astype(int)

# Step 4- Function information gain and entropy
def entropy(y):
    # Number of ccurances of each class 
    counts_class = np.bincount(y)
    # Converting counts to probabilities
    prob_class = counts_class/ len(y)
    return -np.sum([p * np.log2(p) for p in prob_class if p > 0])

def inform_gain(y, left_split_y, right_split_y):
    # Finding sample which fall into left split, p is its proportion
    p = len(left_split_y) / len(y)
    # Inform_gain is diffrence of parents entropy to weighted sum of child entropies
    return entropy(y) - p * entropy(left_split_y) - (1 - p) * entropy(right_split_y)

# Step 5- Decision Tree (DT) Class implementation
class DecisionTree:
    def __init__(self, max_depth=None):
        self.max_depth = max_depth
        self.tree = None
    

    #Building recursively DT
    def fit(self, X, y, depth=0):
        number_of_samples, number_of_features = X.shape
        if number_of_samples <= 1 or (self.max_depth is not None and depth >= self.max_depth):
            return np.bincount(y).argmax()
        #Find the best threshold and best feature
        best_feature, best_thres = self.best_split(X, y)
        if best_feature is None:
            return np.bincount(y).argmax()
        #determine which go to left child and right child
        indices_l = X[:, best_feature] < best_thres
        indices_r = X[:, best_feature] >= best_thres

        left_sub_tree = self.fit(X[indices_l], y[indices_l], depth + 1)
        right_sub_tree = self.fit(X[indices_r], y[indices_r], depth + 1)

        return (best_feature, best_thres, left_sub_tree, right_sub_tree)

    def best_split(self, X, y):
        #Based on inform gain best feature, thres and gain tracked
        gain = -1
        feature, thres_best = None, None
        num_samples, num_features = X.shape

        for current_feature in range(num_features):#iterate over feature
            thresholds = np.unique(X[:, feature])
            for threshold in thresholds:
                indices_l = X[:, current_feature] < threshold
                indices_r = X[:, current_feature] >= threshold
                if len(y[indices_l]) == 0 or len(y[indices_r]) == 0:
                    continue

                current_gain = inform_gain(y, y[indices_l], y[indices_r])
                if current_gain > gain:
                    gain = current_gain
                    feature = current_feature
                    thres_best = threshold

        return feature, thres_best # Reutrning best split means best thresthold and feature
    # Identify class for each sample in X (done by Predict)
    def predict(self, X):
        return [self._predict(inputs, self.tree) for inputs in X]
    #Recursively traverse tree
    def _predict(self, inputs, tree):
        if isinstance(tree, (int, np.int64)):
            return tree
        feature_predict, thres_predict, sub_tree_left, sub_tree_right = tree
        if inputs[feature_predict] < thres_predict:
            return self._predict(inputs, sub_tree_left)
        else:
            return self._predict(inputs, sub_tree_right)

# Step 6- DT(decision tree) training
deci_tree = DecisionTree(max_depth=5) #Initialize D tree with max depth of 5
deci_tree.tree = deci_tree.fit(X, y)

# Step 7- Using training data predict
predicts = deci_tree.predict(X)

# Finding the accuracy without impovement
accuracy_final = np.mean(predicts == y)
print(f"Training Accuracy without improvement is {accuracy_final:.4f}")

##For improvements with grid search (hyperparameter tuning), ensemble method(random forest), feature scaling
# Step 8- Scaling the features
scal_improve = StandardScaler() #Standardize the features
#Compute SD and mean on training data and then scaling
X_improve = scal_improve.fit_transform(X)

# Step 9- Cross valid function with tuning
def r_f_cross_validation_improve(improve_X, improve_y, num_of_fold=10):
    # dictionary
    param_grid = {
        'n_estimators': [50, 100, 200], #no of trees
        'max_depth': [None, 10, 20, 30], # maximum tree depth
        'min_samples_split': [2, 5, 10], #min number of samples for spliting
        'min_samples_leaf': [1, 2, 4],
        'criterion': ['gini', 'entropy'] #To measure quality of split
    }

    state_r_f = RandomForestClassifier(random_state=42)
    #Using Random Forest for tuning
    search_in_grid = GridSearchCV(estimator=state_r_f, param_grid=param_grid, cv=num_of_fold, n_jobs=-1, scoring='accuracy')
    search_in_grid.fit(improve_X, improve_y)
   
    return search_in_grid.best_estimator_

best_model_store = r_f_cross_validation_improve(X, y) #It stores the model which is obtained as best model
best_model_store.fit(X, y) #On complete dataset, train best model
improved_accu_final = best_model_store.score(X, y) #Finding accuracy of model on dataset
print(f'Accuracy with improvement is {improved_accu_final:.4f}')


Training Accuracy without improvement is 0.8868
Accuracy with improvement is 1.0000
