## Mounting drive

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# %cd /content/drive/My\ Drive/Data_mining/Lab2/Notebooks
# !ls

# Installing Packages

In [None]:
# !pip install ipynb

## Importing Libraries

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import copy
import math
import csv
import time
import re
import tracemalloc
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_recall_fscore_support
# from ipynb.fs.full.Load_Dataset import load_dataset, get_dataset_names

# Use a tick-label font size of 15 on both axes for all subsequent plots.
plt.rc('xtick',labelsize=15)
plt.rc('ytick',labelsize=15)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.datasets import load_iris, load_digits, load_wine


# Node class

In [None]:
class Node:
  """A single node of the decision tree.

  Attributes:
    val: key under which insertChild/deleteChild file this node in its
      parent's ``children`` dict (None when not supplied).
    splitPoint: threshold of a continuous split, or None.
    splitAttrInd: column index of the attribute this node splits on, or None.
    children: dict mapping a branch key to the child Node.
    classLabel: class predicted by this node when it is a leaf.
    maxClass: most frequent class among the tuples that reached this node.
  """

  def __init__(self, val=None):
    # `val` defaults to None so existing `Node()` calls keep working; it must
    # exist as an attribute because insertChild/deleteChild read `node.val`.
    self.val = val
    self.splitPoint = None
    self.splitAttrInd = None
    self.children = {}
    self.classLabel = None
    self.maxClass = None

  def insertChild(self, node):
    """Register ``node`` as a child, keyed by ``node.val``."""
    self.children[node.val] = node

  def deleteChild(self, node):
    """Remove the child keyed by ``node.val``; raises KeyError if absent."""
    del self.children[node.val]
  

# Decision Tree Implementation


In [None]:
# Fixed seed value; presumably meant for reproducible random operations, but
# it is not referenced anywhere in the visible part of this notebook.
randomState = 123

## Measure (attribute selection method)

### Information Gain

In [None]:
def info(Y):
  """Return the entropy, in bits, of the class labels in Y.

  Args:
    Y: numpy array of class labels.

  Returns:
    The sum over distinct labels of -p*log2(p), where p is the fraction of
    entries carrying that label. Returns 0 when Y has no entries.
  """
  total = Y.shape[0]
  _, counts = np.unique(Y, return_counts=True)

  entropy = 0
  for count in counts:
    fraction = count / total
    entropy -= fraction * math.log2(fraction)
  return entropy

def selectAttributeByGain(node, X, Y, marker, cache):
  """Pick the still-available attribute with the highest information gain.

  Args:
    node: not used by this function; kept in the signature for callers.
    X: 2-D array of attribute values, one row per tuple.
    Y: array of class labels aligned with the rows of X.
    marker: array in which entry i == 1 means attribute i may be used.
    cache: dict whose 'attrInfo' entry is a per-attribute list of dicts with
      a 'type' key and, for 'discrete' attributes, a 'values' key.

  Returns:
    (splitAttrInd, splitPoint): the chosen column index and, when that
    attribute's type is 'continuous', the chosen threshold (otherwise None).
    Both are None when no attribute beats the initial -inf score.
  """
  attributeMeta = cache['attrInfo']
  totalRows = Y.shape[0]
  baseEntropy = info(Y)

  bestGain = -np.inf
  bestAttr = None
  bestThreshold = None
  # Intentionally not reset per attribute: it keeps the last threshold that
  # improved on any continuous attribute examined so far.
  pendingThreshold = None

  for attrIdx in range(marker.shape[0]):
    if marker[attrIdx] != 1:
      continue
    meta = attributeMeta[attrIdx]
    column = X[:, attrIdx]

    if meta['type'] == 'discrete':
      # Weighted entropy over one partition per listed attribute value.
      condEntropy = 0
      for value in meta['values']:
        subsetY = Y[column == value]
        weight = subsetY.shape[0] / totalRows
        condEntropy += weight * info(subsetY)
    else:
      # Try the midpoint of every adjacent pair in sorted order and keep the
      # binary split with the strictly lowest weighted entropy.
      condEntropy = np.inf
      ordered = np.sort(column)
      for lower, upper in zip(ordered[:-1], ordered[1:]):
        threshold = (lower + upper) / 2.0

        belowY = Y[column <= threshold]
        aboveY = Y[column > threshold]
        belowPart = (belowY.shape[0] / totalRows) * info(belowY)
        abovePart = (aboveY.shape[0] / totalRows) * info(aboveY)
        splitEntropy = belowPart + abovePart

        if splitEntropy < condEntropy:
          condEntropy = splitEntropy
          pendingThreshold = threshold

    gain = baseEntropy - condEntropy
    if bestGain < gain:
      bestGain = gain
      bestAttr = attrIdx
      bestThreshold = pendingThreshold if meta['type'] == 'continuous' else None

  return bestAttr, bestThreshold

In [None]:
def selectAttributeByGainRatio(node, X, Y, marker, cache):
  """Pick the still-available attribute with the highest gain ratio.

  The gain ratio of an attribute is its information gain divided by its
  split information (the entropy of the partition sizes).

  Args:
    node: not used by this function; kept in the signature for callers.
    X: 2-D array of attribute values, one row per tuple.
    Y: array of class labels aligned with the rows of X.
    marker: array in which entry i == 1 means attribute i may be used.
    cache: dict whose 'attrInfo' entry is a per-attribute list of dicts with
      a 'type' key and, for 'discrete' attributes, a 'values' key.

  Returns:
    (splitAttrInd, splitPoint): the chosen column index and, when that
    attribute's type is 'continuous', the chosen threshold (otherwise None).
    Both are None when no attribute offers a usable split.
  """
  attrInfo = cache['attrInfo']
  totalCnt = Y.shape[0]
  infoForData = info(Y)

  maxRatio = -np.inf
  splitAttrInd = None
  splitPoint = None

  for i in range(marker.shape[0]):
    if marker[i] != 1:
      continue
    attrInfoDict = attrInfo[i]
    column = X[:, i]
    isDiscrete = attrInfoDict['type'] == 'discrete'
    # Reset per attribute so a threshold found for an earlier attribute can
    # never be reported for this one.
    candidateSplitPoint = None
    splitInfo = 0.0

    if isDiscrete:
      infoForProjData = 0.0
      for val in attrInfoDict['values']:
        projectedY = Y[column == val]
        prob = projectedY.shape[0] / totalCnt
        if prob == 0:
          # An empty partition contributes nothing, and log2(0) is undefined.
          continue
        infoForProjData += prob * info(projectedY)
        splitInfo -= prob * np.log2(prob)
    else:
      # Try the midpoint of every adjacent pair in sorted order and keep the
      # binary split with the strictly lowest weighted entropy.
      infoForProjData = np.inf
      sortedValues = np.sort(column)
      for j in range(sortedValues.shape[0] - 1):
        midPoint = (sortedValues[j] + sortedValues[j + 1]) / 2.0
        tempInfoForProjData = 0.0
        tempSplitInfo = 0.0
        for indices in (column <= midPoint, column > midPoint):
          projectedY = Y[indices]
          prob = projectedY.shape[0] / totalCnt
          if prob == 0:
            continue
          tempInfoForProjData += prob * info(projectedY)
          tempSplitInfo -= prob * np.log2(prob)

        if tempInfoForProjData < infoForProjData:
          infoForProjData = tempInfoForProjData
          candidateSplitPoint = midPoint
          splitInfo = tempSplitInfo

      if candidateSplitPoint is None:
        # Fewer than two rows: there is no threshold to split on.
        continue

    gain = infoForData - infoForProjData
    # Zero split information means the attribute does not divide the data at
    # all; score it as 0 instead of dividing by zero (which yields nan/inf).
    gainRatio = gain / splitInfo if splitInfo > 0 else 0.0

    if maxRatio < gainRatio:
      maxRatio = gainRatio
      splitAttrInd = i
      if attrInfoDict['type'] == 'continuous':
        splitPoint = candidateSplitPoint
      else:
        splitPoint = None

  return splitAttrInd, splitPoint

### Make Decision Tree Function

In [None]:
def _makeLeaf(label):
  """Build a childless node that predicts ``label``."""
  leaf = Node()
  leaf.classLabel = label
  leaf.maxClass = label
  return leaf


def makeTree(X, Y, marker, cache, selectionMeasure = 'gain'):
  """Recursively build a decision tree for the tuples (X, Y).

  Args:
    X: 2-D array of attribute values, one row per tuple.
    Y: array of class labels aligned with the rows of X.
    marker: array in which entry i == 1 means attribute i may still be used.
      Entries are cleared temporarily during recursion and restored before
      this function returns.
    cache: dict whose 'attrInfo' entry is a per-attribute list of dicts with
      a 'type' key and, for 'discrete' attributes, a 'values' key.
    selectionMeasure: 'gain' selects by information gain; any other value
      selects by gain ratio.

  Returns:
    The root Node of the (sub)tree.

  Raises:
    ValueError: if Y contains no tuples.
  """
  if Y.shape[0] == 0:
    raise ValueError('makeTree requires at least one training tuple')

  attrInfo = cache['attrInfo']
  classes, classCnt = np.unique(Y, return_counts=True)
  majorityClass = classes[np.argmax(classCnt)]

  # Stop when all tuples share one class or no attribute is left to split on.
  if classes.shape[0] == 1 or not np.any(marker == 1):
    return _makeLeaf(majorityClass)

  node = Node()
  node.maxClass = majorityClass

  # starting attribute selection
  if selectionMeasure == 'gain':
    splitAttrInd, splitPoint = selectAttributeByGain(node, X, Y, marker, cache)
  else:
    splitAttrInd, splitPoint = selectAttributeByGainRatio(node, X, Y, marker, cache)

  if splitAttrInd is None:
    # The selector found no usable attribute: fall back to the majority class.
    return _makeLeaf(majorityClass)

  node.splitAttrInd, node.splitPoint = splitAttrInd, splitPoint
  column = X[:, splitAttrInd]

  if attrInfo[splitAttrInd]['type'] == 'discrete':
    # A discrete attribute is consumed for the whole subtree below this node.
    marker[splitAttrInd] = 0
    try:
      for val in attrInfo[splitAttrInd]['values']:
        indices = column == val
        projectedY = Y[indices]
        if projectedY.shape[0] == 0:
          # No tuple has this value: predict the majority class of this node.
          node.children[val] = _makeLeaf(majorityClass)
        else:
          node.children[val] = makeTree(X[indices], projectedY, marker, cache, selectionMeasure)
    finally:
      marker[splitAttrInd] = 1
    return node

  lessIndices = column <= splitPoint
  greaterIndices = column > splitPoint
  if not lessIndices.any() or not greaterIndices.any():
    # The threshold does not separate the data, so recursing on a branch would
    # see the same tuples again and never terminate. Retire this attribute for
    # the current subtree and choose among the remaining ones instead.
    marker[splitAttrInd] = 0
    try:
      return makeTree(X, Y, marker, cache, selectionMeasure)
    finally:
      marker[splitAttrInd] = 1

  node.children['less'] = makeTree(X[lessIndices], Y[lessIndices], marker, cache, selectionMeasure)
  node.children['greater'] = makeTree(X[greaterIndices], Y[greaterIndices], marker, cache, selectionMeasure)
  return node



In [None]:
def predict(node, X):
  """Return the class predicted for a single tuple X by the tree at node.

  Args:
    node: root of the (sub)tree; nodes expose children, splitPoint,
      splitAttrInd, classLabel and maxClass.
    X: one tuple, indexable by attribute position.

  Returns:
    The classLabel of the leaf that X reaches. If X carries a discrete value
    for which the current node has no branch, that node's maxClass is
    returned instead.
  """
  current = node
  while current.children:
    attrValue = X[current.splitAttrInd]
    if current.splitPoint is not None:
      # split attribute is continuous
      branch = 'less' if attrValue <= current.splitPoint else 'greater'
    elif attrValue in current.children:
      branch = attrValue
    else:
      # Value has no branch under this node.
      return current.maxClass
    current = current.children[branch]
  return current.classLabel

def predictDataDecisionTree(node, X):
  """Predict a class for every tuple in X using the tree rooted at node.

  Args:
    node: root Node of a decision tree.
    X: iterable of tuples, each indexable by attribute position.

  Returns:
    1-D numpy array with one prediction per tuple, in input order. The
    predictions are appended to an empty float array, so numeric labels come
    back as floats and an empty X yields an empty float array.
  """
  predictions = [predict(node, x) for x in X]
  # A single append replaces the per-sample np.append, which copied the whole
  # array on every iteration.
  return np.append(np.array([]), predictions)

### Loading Dataset

In [None]:
# datasetNames =['iris'  ,'wine' ,'australian' ,'breastTissue', 'dermatology','glass', 'parkinsons', 'pima' ,'sonar', 'yeast', 'heart' ,'segmentation', 'ionosphere', 'ecoli' ]
# # datasetName = 'sampleDataset2'
# datasetName = 'iris'
# X, Y =   load_dataset(datasetName)
# # print(X)
# # print(Y)
# threshold = 15
# attrMeans = {}
# attrStds = {}
# classes= np.unique(Y)
# attrInfo = []

# for i in range(X.shape[1]):
#   attrInfoDict = {}
#   values = np.unique(X[:,i])
#   if values.shape[0]>threshold:
#     attrInfoDict['type']= 'continuous'
#     attrInfoDict['values']= None
#   else:
#     attrInfoDict['type']= 'discrete'
#     attrInfoDict['values']= values
#   attrInfo.append(attrInfoDict) 

# for i in range(classes.shape[0]):
#   cls = classes[i]
#   projX, projY = X[Y==cls], Y[Y==cls]
#   attrMeans[cls], attrStds[cls] = [], []
#   for j in range(X.shape[1]):
#     attrInfoDict = attrInfo[j]
#     if attrInfoDict['type']=='continuous': 
#       attrMeans[cls].append(np.mean(projX[:,j]))
#       attrStds[cls].append(np.std(projX[:,j]))
#     else:
#       attrMeans[cls].append(None)
#       attrStds[cls].append(None)
# cache={
#     'attrInfo': attrInfo,
#     'attrMeans': attrMeans,
#     'attrStds': attrStds
#   }


# print(X, Y)
# # print(trainSize, testSize)
# # print(attrInfo)


### Creating Decision Tree

In [None]:
# marker = np.ones((X.shape[1]))
# root = makeTree( X, Y, marker, cache, selectionMeasure = 'gain')

### Performance

In [None]:
# # print(XTest)
# YPred = predictDataDecisionTree(root, X)
# precision, recall, f1score, _ = precision_recall_fscore_support(Y, YPred)
# accuracy = accuracy_score(Y, YPred)
# print(np.sum(Y==YPred))
# print([i for i in range(Y.shape[0]) if Y[i]!=YPred[i]])
# print('precision = {}, recall= {}, f1score={}, accuracy = {}'.format(precision, recall, f1score, accuracy))

In [None]:
# XTest = [['youth', 'high', 'no', 'fair']]
# YPred = predictDataDecisionTree(root, XTest)
# print(YPred)