In [1]:
# Optional one-time setup: uncomment to install pandas into the kernel's environment.
# The IPython magic is `%pip`; `%pip3` is not a magic.
#%pip install pandas

In [2]:
import numpy as np
import pandas as pd
import cart
import categorical_gini
import continuous_gini
import minimal_cart
import tree

In [3]:
# Load the example dataset. The path is relative to the user's home directory (`~`),
# so ExampleData.csv must be present there for this cell to run.
df = pd.read_csv('~/ExampleData.csv')

|Variable name|Data Type|Description|
|-------------|---------|-----------|
|Treatment<br>(output category)|Categorical|A: Branded drug<br>B: Generic drug|
|Age|Integer|Age in years after birth|
|Sex|Categorical|M: Male<br>F: Female|
|OverallHealthIdx|Integer|Score from a self-reported questionnaire|
|Time|Integer|Number of hours since emergency room admission|
|Censored <br>(poorly named variable)|Categorical|The identifier of the physician type:<br>0: Internal Medicine doctor, <br>1: Family Medicine doctor|

In [4]:
# Preview the loaded frame (the rendered table elides the middle rows).
df

Unnamed: 0,Treatment,Age,Sex,OverallHealthIdx,Time,Censored
0,A,95,M,56,39,1
1,A,85,M,66,37,1
2,A,73,F,67,39,1
3,A,62,M,39,23,0
4,A,62,M,92,29,0
...,...,...,...,...,...,...
195,B,88,M,76,35,0
196,B,70,F,48,37,0
197,B,79,F,66,37,0
198,B,91,F,88,41,1


In [5]:
# Convert the frame to a NumPy array. The columns mix strings and integers,
# so the resulting array has dtype=object.
arr = df.to_numpy()

In [6]:
# Inspect the raw array: one row per sample, Treatment label in the first column.
arr

array([['A', 95, 'M', 56, 39, 1],
       ['A', 85, 'M', 66, 37, 1],
       ['A', 73, 'F', 67, 39, 1],
       ...,
       ['B', 79, 'F', 66, 37, 0],
       ['B', 91, 'F', 88, 41, 1],
       ['B', 50, 'M', 73, 22, 0]], dtype=object)

In [7]:
# correct type for Censored variable
# Censored (last column) is a categorical physician-type code, not a quantity,
# so its values are replaced in place with their string form (1 -> '1', 0 -> '0').
arr[:,-1] = arr[:,-1].astype(str)

In [8]:
# Confirm the last column now holds strings.
arr

array([['A', 95, 'M', 56, 39, '1'],
       ['A', 85, 'M', 66, 37, '1'],
       ['A', 73, 'F', 67, 39, '1'],
       ...,
       ['B', 79, 'F', 66, 37, '0'],
       ['B', 91, 'F', 88, 41, '1'],
       ['B', 50, 'M', 73, 22, '0']], dtype=object)

In [9]:
# predictors
# Every column except the first (Treatment): Age, Sex, OverallHealthIdx, Time, Censored.
# Basic slicing returns a view, so X shares its data with arr.
X = arr[:,1:]
X

array([[95, 'M', 56, 39, '1'],
       [85, 'M', 66, 37, '1'],
       [73, 'F', 67, 39, '1'],
       [62, 'M', 39, 23, '0'],
       [62, 'M', 92, 29, '0'],
       [80, 'F', 35, 31, '0'],
       [84, 'F', 55, 28, '0'],
       [79, 'F', 51, 35, '0'],
       [70, 'M', 68, 40, '0'],
       [72, 'M', 44, 27, '1'],
       [82, 'M', 61, 29, '1'],
       [73, 'F', 115, 47, '1'],
       [87, 'F', 61, 38, '0'],
       [67, 'M', 91, 24, '0'],
       [86, 'F', 67, 22, '0'],
       [60, 'M', 63, 31, '1'],
       [61, 'F', 70, 40, '1'],
       [65, 'F', 70, 52, '1'],
       [70, 'M', 76, 28, '1'],
       [81, 'M', 63, 52, '1'],
       [74, 'F', 85, 40, '1'],
       [86, 'M', 93, 56, '1'],
       [78, 'F', 72, 21, '0'],
       [60, 'F', 60, 39, '0'],
       [79, 'F', 78, 59, '1'],
       [106, 'M', 97, 48, '1'],
       [76, 'M', 48, 34, '0'],
       [88, 'M', 70, 49, '1'],
       [79, 'F', 47, 36, '1'],
       [69, 'M', 88, 20, '0'],
       [73, 'F', 74, 21, '0'],
       [57, 'F', 59, 22, '1'],
      

In [10]:
# outcome
# First column (Treatment): one 'A' / 'B' label per sample. Like X, this is a view of arr.
y = arr[:,0]
y

array(['A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A',
       'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A',
       'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A',
       'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A',
       'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A',
       'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A',
       'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A',
       'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'B', 'B', 'B', 'B',
       'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B',
       'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B',
       'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B',
       'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B',
       'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B',
       'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B

In [11]:
# Build the classifier from the project's `cart` module.
# NOTE(review): max_depth=1000000 is presumably meant as "effectively unbounded";
# confirm how cart.DecisionTreeClassifier interprets max_depth.
clf = cart.DecisionTreeClassifier(max_depth=1000000)

In [12]:
# Number of outcome labels (one per sample).
y.size

200

In [13]:
# NOTE(review): in the recorded run this call failed with
# "IndexError: list index out of range". The scratch cells that follow re-encode
# y as integer codes, so cart may expect integer class labels rather than the
# 'A' / 'B' strings -- TODO confirm against cart's implementation.
clf.fit(X, y)

IndexError: list index out of range

In [None]:
# Scratch: pick feature column 1 of X (Sex, given X's column order), sort the
# (feature value, label) pairs, then unzip them into two parallel tuples.
idx=1
thresholds, classes = zip(*sorted(zip(X[:, idx], y)))
y

In [None]:
# Distinct outcome labels; np.unique returns them sorted.
distinct_classes = np.unique(y)
distinct_classes

In [None]:
# Encode the class labels as integer codes 0..k-1 in one vectorised step.
# np.unique's inverse index gives, for each sample, the position of its label
# among the sorted unique labels -- the same order as `distinct_classes`, as long
# as y has not changed since that was computed.
# Encoding in one shot (rather than rewriting y label by label) avoids clobbering:
# with sequential replacement, a code written for one label can be mistaken for a
# later label that happens to equal that code (labels -1 and 0 would both end up
# as 1). The result is a true integer array instead of an object array, and
# re-running the cell on already-encoded y leaves it unchanged.
y = np.unique(y, return_inverse=True)[1]

y

In [None]:
# Count the samples carrying each distinct value of y, in np.unique's sorted order.
num_samples_per_class = [np.sum(y == i) for i in np.unique(y)]

In [None]:
# Show the per-class sample counts.
num_samples_per_class

In [None]:
# Repeat the sort-and-unzip of (feature value, label) pairs with the current y.
# Relies on idx having been set by an earlier cell.
thresholds, classes = zip(*sorted(zip(X[:, idx], y)))

In [None]:
# Feature column of X to inspect.
# NOTE(review): this assignment comes after the cell above that reads idx; a
# top-to-bottom run only works because idx is also set in an earlier cell.
idx=1

In [None]:
# Distinct values in the last predictor column (Censored, stored as strings after the earlier cast).
np.unique(X[:,-1])