In [1]:
import numpy as np
import pandas as pd
import random
from sklearn.metrics import f1_score,accuracy_score
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold

In [2]:
# Mount Google Drive at /content/drive; the CSV files read and written further
# down live under that mount point. Requires the google.colab package, i.e. a
# Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Split a dataset based on an attribute and an attribute value
def test_split(index, value, dataset):
    left, right = list(), list()
    for row in dataset:
        if row[index] < value:
            left.append(row)
        else:
            right.append(row)
    return left, right

In [4]:
# Weighted Gini impurity of a candidate split
def gini_index(groups, classes):
    """Return the size-weighted Gini impurity of ``groups``.

    Args:
        groups: Sequence of row lists; the class label is the last element
            of each row.
        classes: Class labels to score.

    Returns:
        Sum over the non-empty groups of ``(1 - sum(p_c ** 2)) * weight``,
        where ``p_c`` is the fraction of rows in the group with label ``c``
        and ``weight`` is the group's share of all rows. Empty groups
        contribute nothing.
    """
    total_rows = float(sum(len(part) for part in groups))
    weighted_impurity = 0.0
    for part in groups:
        part_size = float(len(part))
        if part_size == 0:
            # An empty side has no impurity and would divide by zero below.
            continue
        labels = [row[-1] for row in part]
        purity = 0.0
        for label in classes:
            fraction = labels.count(label) / part_size
            purity += fraction * fraction
        weighted_impurity += (1.0 - purity) * (part_size / total_rows)
    return weighted_impurity
# Choose the feature whose mean-threshold split has the lowest Gini impurity
def get_split_gini(dataset):
    """Pick the best mean-threshold split of ``dataset`` by Gini impurity.

    Every column except the last (the class label) is tried once, using the
    column mean as the threshold. The candidate with the smallest Gini index
    wins; on ties the earliest column is kept.

    Returns:
        dict with keys ``'index'`` (column), ``'value'`` (threshold) and
        ``'groups'`` (the ``(left, right)`` pair from ``test_split``).
    """
    labels = list(set(row[-1] for row in dataset))
    feature_count = len(dataset[0]) - 1

    def evaluate(column):
        # Score a single feature split at its mean value.
        threshold = np.mean([row[column] for row in dataset])
        parts = test_split(column, threshold, dataset)
        return [gini_index(parts, labels), column, threshold, parts]

    candidates = [evaluate(column) for column in range(feature_count)]
    _, best_column, best_threshold, best_parts = min(candidates, key=lambda entry: entry[0])
    return {'index': best_column, 'value': best_threshold, 'groups': best_parts}

In [5]:
def info_gain(groups, classes,dataset):
    """Return the information gain (entropy reduction) of a split.

    Args:
        groups: Sequence of row lists produced by the split; the class label
            is the last element of each row.
        classes: Class labels to consider.
        dataset: The rows before splitting (used for the parent entropy).

    Returns:
        Parent entropy minus the size-weighted entropy of each non-empty
        group, using base-2 logarithms.
    """
    total_rows = float(sum(len(part) for part in groups))

    def entropy(labels, denominator):
        # Shannon entropy over `classes`; zero-probability classes are skipped
        # because log2(0) is undefined.
        acc = 0.0
        for label in classes:
            prob = labels.count(label) / denominator
            if prob != 0.0:
                acc += - prob * np.log2(prob)
        return acc

    gain = entropy([row[-1] for row in dataset], total_rows)
    for part in groups:
        part_size = float(len(part))
        if part_size == 0:
            # Nothing to subtract for an empty side.
            continue
        part_entropy = entropy([row[-1] for row in part], part_size)
        gain -= (part_entropy) * (part_size / total_rows)
    return gain
# Choose the feature whose mean-threshold split has the highest information gain
def get_split_info_gain(dataset):
    """Pick the best mean-threshold split of ``dataset`` by information gain.

    Every column except the last (the class label) is split at its mean and
    scored with ``info_gain``. The highest-scoring candidate wins; on ties
    the earliest column is kept.

    Returns:
        dict with keys ``'index'`` (column), ``'value'`` (threshold) and
        ``'groups'`` (the ``(left, right)`` pair from ``test_split``).
    """
    labels = list(set(row[-1] for row in dataset))
    scored = []
    for column in range(len(dataset[0]) - 1):
        column_values = [row[column] for row in dataset]
        threshold = np.mean(column_values)
        parts = test_split(column, threshold, dataset)
        scored.append([info_gain(parts, labels, dataset), column, threshold, parts])
    winner = max(scored, key=lambda entry: entry[0])
    return {'index': winner[1], 'value': winner[2], 'groups': winner[3]}

In [6]:
def gain_ratio(groups, classes,dataset):
    """Return the gain ratio (information gain / split information) of a split.

    Args:
        groups: Sequence of row lists produced by the split; the class label
            is the last element of each row.
        classes: Class labels to consider.
        dataset: The rows before splitting (used for the parent entropy).

    Returns:
        ``information gain / split information``, or ``None`` when the split
        information is zero (e.g. every row landed in one group), in which
        case the ratio is undefined.
    """
    total_rows = float(sum(len(part) for part in groups))

    # Entropy of the unsplit data.
    gain = 0.0
    parent_labels = [row[-1] for row in dataset]
    for label in classes:
        prob = parent_labels.count(label) / total_rows
        if prob != 0.0:
            gain += - prob * np.log2(prob)

    split_information = 0.0
    for part in groups:
        part_size = float(len(part))
        if part_size == 0:
            # Empty sides add neither entropy nor split information.
            continue
        weight = part_size / total_rows
        split_information -= weight * np.log2(weight)
        part_labels = [row[-1] for row in part]
        part_entropy = 0.0
        for label in classes:
            prob = part_labels.count(label) / part_size
            if prob != 0.0:
                part_entropy += - prob * np.log2(prob)
        gain -= (part_entropy) * (part_size / total_rows)

    if split_information == 0.0:
        return None
    return gain / split_information
# Select the best split point for a dataset
def get_split_gain_ratio(dataset):
    """Pick the best mean-threshold split of ``dataset`` by gain ratio.

    Every column except the last (the class label) is split at its mean and
    scored with ``gain_ratio``. Candidates whose gain ratio is undefined
    (``gain_ratio`` returned ``None`` because one side of the split is empty)
    are skipped. The highest-scoring remaining candidate wins; on ties the
    earliest column is kept.

    If *every* candidate is undefined -- e.g. all rows share identical feature
    values -- the first candidate is returned anyway. One of its groups is
    empty, which is how ``split`` recognises "no split" and creates a leaf.
    Previously this situation crashed with ``max()`` on an empty list.

    Returns:
        dict with keys ``'index'`` (column), ``'value'`` (threshold) and
        ``'groups'`` (the ``(left, right)`` pair from ``test_split``).

    Raises:
        ValueError: If the rows have no feature columns at all.
    """
    class_values = list(set(row[-1] for row in dataset))
    best = None      # (ratio, index, value, groups) of the best valid split
    fallback = None  # first candidate, used when no split has a defined ratio
    for index in range(len(dataset[0]) - 1):
        value = np.mean([row[index] for row in dataset])
        groups = test_split(index, value, dataset)
        if fallback is None:
            fallback = (index, value, groups)
        gr = gain_ratio(groups, class_values, dataset)
        if gr is None:
            continue
        # Strict '>' keeps the earliest candidate on ties, like max() did.
        if best is None or gr > best[0]:
            best = (gr, index, value, groups)

    if best is not None:
        _, s_index, s_value, s_groups = best
    elif fallback is not None:
        s_index, s_value, s_groups = fallback
    else:
        raise ValueError('dataset rows contain no feature columns to split on')
    return {'index':s_index, 'value':s_value, 'groups':s_groups}

In [7]:
def mc_error(groups, classes):
    """Return the size-weighted misclassification error of a split.

    Args:
        groups: Sequence of row lists; the class label is the last element
            of each row.
        classes: Class labels to consider.

    Returns:
        Sum over the non-empty groups of ``(1 - max_c p_c) * weight``, where
        ``p_c`` is the fraction of the group's rows with label ``c`` and
        ``weight`` is the group's share of all rows.
    """
    total_rows = float(sum(len(part) for part in groups))
    weighted_error = 0.0
    for part in groups:
        part_size = float(len(part))
        if part_size == 0:
            # Skip empty sides; they would divide by zero below.
            continue
        labels = [row[-1] for row in part]
        fractions = [labels.count(label) / part_size for label in classes]
        # Error of predicting the majority class for this group.
        weighted_error += (1.0 - np.amax(fractions)) * (part_size / total_rows)
    return weighted_error
# Choose the feature whose mean-threshold split has the lowest misclassification error
def get_split_mc_error(dataset):
    """Pick the best mean-threshold split of ``dataset`` by misclassification error.

    Every column except the last (the class label) is split at its mean and
    scored with ``mc_error``. The lowest-error candidate wins; on ties the
    earliest column is kept.

    Returns:
        dict with keys ``'index'`` (column), ``'value'`` (threshold) and
        ``'groups'`` (the ``(left, right)`` pair from ``test_split``).
    """
    labels = list(set(row[-1] for row in dataset))
    candidates = []
    for column in range(len(dataset[0]) - 1):
        threshold = np.mean([row[column] for row in dataset])
        parts = test_split(column, threshold, dataset)
        candidates.append([mc_error(parts, labels), column, threshold, parts])
    chosen = min(candidates, key=lambda entry: entry[0])
    _, chosen_column, chosen_threshold, chosen_parts = chosen
    return {'index': chosen_column, 'value': chosen_threshold, 'groups': chosen_parts}

In [8]:
def chi_square(groups, classes,dataset):
    """Return the Pearson chi-square statistic of a split.

    The observed count of each class in each group is compared with the count
    expected if group membership and class were independent:
    ``expected = class_total * group_size / n_instances``.

    Args:
        groups: Sequence of row lists produced by the split; the class label
            is the last element of each row.
        classes: Class labels to consider (any number of classes).
        dataset: The rows before splitting (used for the class totals).

    Returns:
        Sum of ``(observed - expected) ** 2 / expected`` over all
        (group, class) cells. Empty groups and cells whose expected count is
        zero are skipped, so the result is ``0.0`` when nothing can be scored.
    """
    classes = list(classes)
    group_sizes = [len(group) for group in groups]
    n_instances = float(sum(group_sizes))
    dataset_labels = [row[-1] for row in dataset]
    class_totals = [dataset_labels.count(class_val) for class_val in classes]

    chi = 0.0
    # Pair each group/class with its own total instead of recomputing a flat
    # index: the old `groups.index(group)*2 + classes.index(class_val)` was
    # only right for exactly two classes and mis-resolved equal-valued groups.
    for group, group_size in zip(groups, group_sizes):
        if group_size == 0:
            continue
        group_labels = [row[-1] for row in group]
        for class_val, class_total in zip(classes, class_totals):
            expected = class_total * group_size / n_instances
            if expected == 0:
                # Class absent from the dataset: the cell carries no
                # information and would otherwise divide by zero.
                continue
            observed = group_labels.count(class_val)
            chi += (observed - expected) ** 2 / expected
    return chi
# Choose the feature whose mean-threshold split has the highest chi-square statistic
def get_split_chi_square(dataset):
    """Pick the best mean-threshold split of ``dataset`` by chi-square statistic.

    Every column except the last (the class label) is split at its mean and
    scored with ``chi_square``. The highest-scoring candidate wins; on ties
    the earliest column is kept.

    Returns:
        dict with keys ``'index'`` (column), ``'value'`` (threshold) and
        ``'groups'`` (the ``(left, right)`` pair from ``test_split``).
    """
    labels = list(set(row[-1] for row in dataset))
    feature_columns = range(len(dataset[0]) - 1)

    def score(column):
        # Evaluate one feature split at its mean value.
        threshold = np.mean([row[column] for row in dataset])
        parts = test_split(column, threshold, dataset)
        return [chi_square(parts, labels, dataset), column, threshold, parts]

    ranked = [score(column) for column in feature_columns]
    top = max(ranked, key=lambda entry: entry[0])
    return {'index': top[1], 'value': top[2], 'groups': top[3]}

In [9]:
def to_terminal(group):
    """Return the most frequent class label among the rows of ``group``.

    The label is read from the last element of each row. ``group`` must be
    non-empty, otherwise ``max`` raises ``ValueError``.
    """
    labels = []
    for row in group:
        labels.append(row[-1])
    distinct_labels = set(labels)
    # Rank each distinct label by how often it appears.
    return max(distinct_labels, key=labels.count)

In [10]:
# Grow the tree below a node, or turn its sides into leaves
def split(node, max_depth, min_leafsize, depth,get_split_type):
    """Recursively expand ``node`` in place.

    ``node`` must carry a ``'groups'`` entry holding its ``(left, right)`` row
    lists. That entry is removed and ``node['left']`` / ``node['right']`` are
    set either to a class label (leaf) or to a child node dict.

    Args:
        node: Split dict as returned by a ``get_split_*`` function.
        max_depth: Depth at which both sides are forced to become leaves.
        min_leafsize: A side with at most this many rows becomes a leaf.
        depth: Depth of ``node`` (the root is built with depth 1).
        get_split_type: Callable that maps a row list to a new split dict.
    """
    lower, upper = node['groups']
    del node['groups']

    # No real split: one side is empty, so both branches share one label.
    if not (lower and upper):
        node['left'] = node['right'] = to_terminal(lower + upper)
        return

    # Depth limit reached: close both sides as leaves.
    if depth >= max_depth:
        node['left'], node['right'] = to_terminal(lower), to_terminal(upper)
        return

    # Left is handled (including its whole subtree) before right.
    for side, rows in (('left', lower), ('right', upper)):
        if len(rows) <= min_leafsize or len(set([row[-1] for row in rows])) == 1:
            # Too small to split further, or already pure.
            node[side] = to_terminal(rows)
            continue
        node[side] = get_split_type(rows)
        split(node[side], max_depth, min_leafsize, depth + 1, get_split_type)

In [11]:
# Construct a full decision tree from training rows
def build_tree(train, max_depth, min_leafsize,get_split_type):
    """Build and return a decision tree for ``train``.

    The root split is chosen with ``get_split_type`` and then expanded in
    place by ``split``, starting at depth 1.

    Returns:
        The root node dict.
    """
    tree = get_split_type(train)
    split(tree, max_depth, min_leafsize, depth=1, get_split_type=get_split_type)
    return tree


In [12]:
# Walk a decision tree to classify one row
def predict(node, row):
    """Return the label the tree rooted at ``node`` assigns to ``row``.

    At each node the row goes left when ``row[node['index']]`` is strictly
    less than ``node['value']`` and right otherwise. A child that is a dict
    is another node; anything else is the predicted label.
    """
    current = node
    while True:
        branch = 'left' if row[current['index']] < current['value'] else 'right'
        child = current[branch]
        if not isinstance(child, dict):
            return child
        current = child


In [13]:
def decision_tree(train, test, max_depth, min_leafsize,get_split_type):
    """Fit a tree on ``train`` and return its predictions for ``test``.

    Returns:
        A list with one predicted label per row of ``test``, in order.
    """
    fitted = build_tree(train, max_depth, min_leafsize, get_split_type)
    labels = []
    for sample in test:
        labels.append(predict(fitted, sample))
    return labels

In [14]:
def evaluate_DT_withKfold(dataset, n_folds, max_depth, min_leafsize,criterion):
  """Cross-validate the decision tree with one split criterion.

  Args:
      dataset: List of rows; the class label is the last element of each row.
      n_folds: Number of shuffled K-fold partitions (fixed ``random_state=0``).
      max_depth: Maximum tree depth passed to ``decision_tree``.
      min_leafsize: Leaf-size threshold passed to ``decision_tree``.
      criterion: One of ``'gini'``, ``'infogain'``, ``'gainratio'``,
          ``'mc_error'``, ``'chi_square'``.

  Returns:
      ``(accuracy, f1)`` computed over the pooled out-of-fold predictions.

  Raises:
      KeyError: If ``criterion`` is not one of the supported names.
  """
  criteria={'gini': get_split_gini,'infogain': get_split_info_gain,'gainratio': get_split_gain_ratio,
             'mc_error': get_split_mc_error,'chi_square': get_split_chi_square}
  get_split_type = criteria[criterion]
  # Convert once: the array is loop-invariant, so rebuilding it twice per
  # fold only wasted time and memory.
  data = np.array(dataset)
  kf = KFold(n_splits=n_folds,shuffle=True,random_state=0)
  actual=[]
  pred=[]
  for train_index, test_index in kf.split(data):
      # .tolist() yields fresh row lists, so masking labels below never
      # touches the caller's dataset.
      train = data[train_index,:].tolist()
      test = data[test_index,:].tolist()
      actual.extend([row[-1] for row in test])
      # Hide the true labels from the model before predicting.
      for row in test:
        row[-1] =None
      predictions=decision_tree(train, test, max_depth, min_leafsize,get_split_type)
      pred.extend(predictions)
  return accuracy_score(actual,pred),f1_score(actual,pred)

      

In [15]:
# Experiment driver: cross-validate every split criterion on each of the 56
# numbered CSV datasets and collect the scores.
# Result tables: one row per dataset (1..56), one column per criterion.
fval=np.zeros((56,5))
acv=np.zeros((56,5))
criteria=['gini','infogain','gainratio','mc_error','chi_square']
n_folds=10
# Per-criterion hyper-parameters, positionally aligned with `criteria`.
max_depth=[6,6,6,6,6]
min_leafsize=[5,5,5,5,5]
# NOTE(review): the tree code visible in this notebook is plain Python/NumPy,
# so this TensorFlow device scope presumably has no effect -- confirm before
# relying on (or removing) it.
with tf.device('/device:GPU:0'):
  for i in range(1,57):#[12,15,21]:
    print(i)
    df=pd.read_csv('/content/drive/My Drive/DM_assignment/'+str(i)+'.csv',header= None)
    dfnew=df.copy()
    # Binarize the last column (the label): 1 if the value is > 0, else 0.
    dfnew[len(df.columns)-1]=df[len(df.columns)-1].map(lambda x : int(x>0))
    dflist=dfnew.values.tolist()
    for j in range(0,5):
      # evaluate_DT_withKfold returns (accuracy, f1) in that order.
      acv[i-1,j],fval[i-1,j]=evaluate_DT_withKfold(dflist, n_folds, max_depth[j], min_leafsize[j],criteria[j])


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56


In [17]:
# Persist the score tables as CSV: rows are dataset numbers 1..56, columns are
# the split criteria.
# NOTE(review): results go to 'DM_assignment2' while the inputs were read from
# 'DM_assignment' -- confirm the target folder exists and is the intended one.
df_acv=pd.DataFrame(acv,index=range(1,57),columns=criteria)
df_acv.to_csv('/content/drive/My Drive/DM_assignment2/accuracy.csv')

df_fval=pd.DataFrame(fval,index=range(1,57),columns=criteria)
df_fval.to_csv('/content/drive/My Drive/DM_assignment2/Fmeasure.csv')