In [4]:
!pip freeze
!pip3 install numpy
!pip3 install pandas
!pip3 install sklearn
!pip3 install matplotlib
!pip3 install pydotplus
!pip3 install six
!pip install -Uqq ipdb

absl-py==1.2.0
aeppl==0.0.33
aesara==2.7.9
aiohttp==3.8.3
aiosignal==1.2.0
alabaster==0.7.12
albumentations==1.2.1
altair==4.2.0
appdirs==1.4.4
arviz==0.12.1
astor==0.8.1
astropy==4.3.1
astunparse==1.6.3
async-timeout==4.0.2
asynctest==0.13.0
atari-py==0.2.9
atomicwrites==1.4.1
attrs==22.1.0
audioread==3.0.0
autograd==1.5
Babel==2.10.3
backcall==0.2.0
beautifulsoup4==4.6.3
bleach==5.0.1
blis==0.7.8
bokeh==2.3.3
branca==0.5.0
bs4==0.0.1
CacheControl==0.12.11
cached-property==1.5.2
cachetools==4.2.4
catalogue==2.0.8
certifi==2022.9.24
cffi==1.15.1
cftime==1.6.2
chardet==3.0.4
charset-normalizer==2.1.1
click==7.1.2
clikit==0.6.2
cloudpickle==1.5.0
cmake==3.22.6
cmdstanpy==1.0.7
colorcet==3.0.1
colorlover==0.3.0
community==1.0.0b1
confection==0.0.2
cons==0.4.5
contextlib2==0.5.5
convertdate==2.4.0
crashtest==0.3.1
crcmod==1.7
cufflinks==0.17.3
cvxopt==1.3.0
cvxpy==1.2.1
cycler==0.11.0
cymem==2.0.6
Cython==0.29.32
daft==0.0.4
dask==2022.2.0
datascience==0.17.5
debugpy==1.0.0
decorator==4.4.

# Problem 6: Cross-Regression

# Part 1: Regress

Tree code is copied & modified from my Problem 2 work.

In [5]:
import pandas as pd
import numpy as np
# import ipdb

# preparing data for use
url = 'https://raw.githubusercontent.com/f1nn3g4n/SENG474Test/ad161e4c364f92adc9730fe6c06edd2769b9f355/elections_clean.csv'
df = pd.read_csv(url, sep='\t', index_col=0)
# help from https://datascienceparichay.com/article/randomly-shuffle-pandas-dataframe-rows/#:~:text=There%20are%20a%20number%20of%20ways%20to%20shuffle,sklearn.utils%20to%20shuffle%20your%20dataframe.%20Here%E2%80%99s%20the%20syntax%3A
df = df.sample(frac=1)
feature_repeats = {}

In [6]:
# altering per_capita_inc to be binned by quartiles
# help from https://stackoverflow.com/questions/45273731/binning-a-column-with-pandas
bins = []
inc_s = df.per_capita_inc
bins.append(round(inc_s.min()-0.1,2))
bins.append(round(inc_s.quantile(0.25),2))
bins.append(round(inc_s.quantile(0.5),2))
bins.append(round(inc_s.quantile(0.75),2))
bins.append(round(inc_s.max()+0.1,2))
df['per_capita_inc_binned'] = pd.cut(df.per_capita_inc, bins=bins)

In [7]:
total = df.shape[0]
train_size = int(round(total*0.7))

df_train = df.iloc[:train_size,:]
df_valid = df.iloc[train_size:,:]

category_features = {'ethnic_male', 'ethnic_female', 'education', 'religion', 
                     'per_capita_inc_binned',}
label_vector = 'deep_pov_all'

In [8]:
from statistics import mean

class TreeNode:
  def __init__(self, data):
    self.name = data.name # where data is a pandas Series
    self.children = {} # dictionary of str:node/list pairs
    for value in data.unique():
      self.children[value] = None

  def __str__(self):
    print('Node: '+self.name)

  def findMaxDepth(self, d):
    max_d = d+1
    for child in self.children:
      if type(self.children[child]) is not list:
        child_d = self.children[child].findMaxDepth(d+1)
        if max_d < child_d:
          max_d = child_d
    return max_d

def printTree(node, tab_count):
  ''' Creates a visualization of the ID3 tree '''
  tabs = tab_count*'  '
  print(tabs+node.name)
  tabs += '  '
  for child in node.children:
    print(tabs+str(child))
    if type(node.children[child]) is list:
      print(tabs+'  ',round(mean(node.children[child]),4))
    else:
      printTree(node.children[child],tab_count+2)

In [9]:
def assignFeatRe(feature):
  ''' Increases count of feature repeat '''
  if feature_repeats.get(feature.name) != None:
    feature_repeats[feature.name]+=1
  else:
    feature_repeats[feature.name]=1

def findInfoGain(feature, train_data):
  ''' Takes a feature and training data and finds information gain, 
  for feature splitting. 
  '''
  feature_vals = feature.unique()
  n = train_data.shape[0]
  feat_var = 0
  feature_val_counts = feature.value_counts(sort=False)
  tree_var = 0
  i = 0
  values = np.array(train_data[label_vector].tolist())
  tree_var = np.var(values)

  # feature variance
  for feat_val, count in feature_val_counts.iteritems():
    f_data = train_data[train_data[feature.name]==feat_val]
    feat_vals = np.array(f_data[label_vector].tolist())
    feat_var += count/n * np.var(feat_vals)

  return tree_var - feat_var

def findSplitFeature(train_data, categories):
  max_ig = -1
  max_info_feature = None

  for category in categories:
    category_ig = findInfoGain(train_data[category], train_data)
    if max_ig < category_ig:
      max_ig = category_ig
      max_info_feature = train_data[category]

  # ipdb.set_trace(context=6)

  return max_info_feature

def buildID3Tree(train_data, cur_node, root, categories, B):
  ''' Takes in training data, returns a tree using the TreeNode class above. '''
  if root == None:
    categories = category_features.copy()
    root_feature = findSplitFeature(train_data, categories)
    root = TreeNode(root_feature)
    cur_node = root
    assignFeatRe(root_feature)

  for child in cur_node.children:
    child_data = train_data[train_data[cur_node.name]==child]
    child_count = child_data.shape[0]

    if child_count <= B or len(categories) == 1:
      # reached a leaf
      cur_node.children[child] = child_data[label_vector].tolist()
      train_data = train_data[train_data[cur_node.name]!=child]
    elif child_count != 0 and len(categories) != 1:
      new_cat = categories.difference({cur_node.name})
      # need to add node
      child_feature = findSplitFeature(child_data, new_cat)
      child_node = TreeNode(child_feature)
      # track repeat features
      assignFeatRe(child_feature)
      cur_node.children[child] = child_node
      buildID3Tree(child_data, child_node, root, new_cat,B)

  return root

In [10]:
feature_repeats = {}
df_train_copy = df_train.copy()
# %pdb on
tree = buildID3Tree(df_train_copy,None,None,None,10)

In [11]:
# %pdb off
printTree(tree,0)

per_capita_inc_binned
  (-0.12, 0.48]
    education
      Some College
        ethnic_female
          WHITE FEMALE
            religion
              MAINLINE CHRISTIAN
                ethnic_male
                  WHITE MALE
                     0.0773
              CATHOLIC
                 0.0613
              NON-CATHOLIC CHRISTIAN
                 0.0689
              CHRISTIAN GENERIC
                 0.0626
          BLACK FEMALE
             0.2238
      GTE Bachelor's
        religion
          OTHER MISC
            ethnic_female
              WHITE FEMALE
                ethnic_male
                  WHITE MALE
                     0.0565
          CATHOLIC
            ethnic_female
              WHITE FEMALE
                ethnic_male
                  WHITE MALE
                     0.0521
              BLACK FEMALE
                 0.0945
          MAINLINE CHRISTIAN
            ethnic_female
              WHITE FEMALE
                ethnic_male
                  WHITE

In [12]:
# help from https://www.studytonight.com/post/what-is-mean-squared-error-mean-absolute-error-root-mean-squared-error-and-r-squared

def calcMSError(dataset, tree):
  ''' Calculates and returns the mean squared error of a tree from a dataset '''
  total = dataset.shape[0]
  squared_error = 0

  for index, row in dataset.iterrows():
    cur_node = tree
    in_tree = True
    row_val = row[label_vector]
    while in_tree:
      row_node_value = row[cur_node.name]
      child = cur_node.children.get(row_node_value)

      if child == None:
        in_tree = False
      elif type(child) == list:
        in_tree = False
        squared_error += (row_val - mean(child))**2
      else:
        cur_node = child
  
  return squared_error/total

In [13]:
train_error = calcMSError(df_train, tree)
valid_error = calcMSError(df_valid, tree)
print('train error: ', train_error)
print('valid error: ', valid_error)
print("\ntree repeats and depth:")
print(feature_repeats)
print(tree.findMaxDepth(0))

train error:  0.0004712330633142677
valid error:  0.0007113653160556875

tree repeats and depth:
{'per_capita_inc_binned': 1, 'education': 9, 'ethnic_female': 17, 'religion': 15, 'ethnic_male': 36}
5


# Part 2: Cross-validation

In [14]:
from math import floor

def kFoldTreeSelection(k, train_data):
  ''' Builds k trees using k-fold cross validation and returns best one '''
  trees = []
  tree_scores = []
  k_size = int(floor(train_data.shape[0]/k))

  for i in range(k):
    # partition data
    k_test = train_data.iloc[i*k_size:(i+1)*k_size]
    k_t1 = train_data.iloc[:i*k_size]
    k_t2 = train_data.iloc[(i+1)*k_size:]
    k_train = pd.concat([k_t1, k_t2])
    k_train_copy = k_train.copy()
    # make tree
    k_tree = buildID3Tree(k_train_copy,None,None,None,10)
    trees.append(k_tree)
    # record validation score
    tree_scores.append(calcMSError(k_test, k_tree))

  minMSE = min(tree_scores)
  return trees[tree_scores.index(minMSE)]

In [15]:
k_fold_tree = kFoldTreeSelection(5, df_train)

In [16]:
print('Best DT from k fold validation:')
printTree(k_fold_tree, 0)

Best DT from k fold validation:
per_capita_inc_binned
  (-2.88, -0.66]
    ethnic_female
      WHITE FEMALE
        education
          Some College
            religion
              CATHOLIC
                 0.0908
              MAINLINE CHRISTIAN
                ethnic_male
                  WHITE MALE
                     0.1078
              NON-CATHOLIC CHRISTIAN
                ethnic_male
                  WHITE MALE
                     0.1064
              PENTECOSTAL / CHARISMATIC
                 0.1144
              OTHER CHRISTIAN
                 0.0947
              OTHER
                 0.1448
              OTHER MISC
                 0.0971
              CHRISTIAN GENERIC
                 0.0705
              MORMON
                 0.0865
          LT High School
            religion
              MAINLINE CHRISTIAN
                ethnic_male
                  WHITE MALE
                     0.0917
              PENTECOSTAL / CHARISMATIC
                 0.0869
   