In [1]:
### Imports
import pandas as pd
from sklearn import tree
from sklearn import metrics

In [2]:
### Load File
fullData = pd.read_csv('De-identified ARR Dataset - new.csv')

In [32]:
### Remove columns which either have all/mostly blank values
### or are the values we are attempting to predict
headers = list(fullData.columns.values)
badHeaders = [
    'id', 'BMI', 'Number of Future Relapses 1monthto3years', 
    'Number of Future Relapses 1yrto3yrs', 
    'NfLValue', 'HighNfL Binary'
]
for header in badHeaders:
    headers.remove(header)

trimmedData = fullData[headers]

In [33]:
### Find Blank Values
print(trimmedData.isna().sum()[trimmedData.isna().sum() > 0])

DiseasedurationatFV         1
EDSS_FV                   109
PYRAMIDAL_FUNCTION         98
CEREBELLAR_FUNCTION        99
BRAINSTEM_FUNCTION        100
SENSORY_FUNCTION           99
BOWEL_BLADDER_FUNCTION    102
VISUAL_FUNCTION            99
MENTAL_FUNCTION            99
dtype: int64


In [None]:
### Remove single blank row from DiseasedurationatFV
trimmedData = trimmedData.dropna(subset=['DiseasedurationatFV'])
print(trimmedData.isna().sum()[trimmedData.isna().sum() > 0])
print(trimmedData.shape)

EDSS_FV                   109
PYRAMIDAL_FUNCTION         98
CEREBELLAR_FUNCTION        99
BRAINSTEM_FUNCTION        100
SENSORY_FUNCTION           99
BOWEL_BLADDER_FUNCTION    102
VISUAL_FUNCTION            99
MENTAL_FUNCTION            99
dtype: int64
(2191, 27)


In [None]:
### Investigate pattern of blank rows
print(trimmedData.iloc[::, 7:16].isna().value_counts())

DISEASE_CATEGORY_DESC_FV  EDSS_FV  PYRAMIDAL_FUNCTION  CEREBELLAR_FUNCTION  BRAINSTEM_FUNCTION  SENSORY_FUNCTION  BOWEL_BLADDER_FUNCTION  VISUAL_FUNCTION  MENTAL_FUNCTION
False                     False    False               False                False               False             False                   False            False              1993
                          True     False               False                False               False             False                   False            False                93
                          False    True                True                 True                True              True                    True             True                 80
                          True     True                True                 True                True              True                    True             True                 16
                          False    False               False                False               False            

In [None]:
### Begin with simplest: remove rows with blanks
blanksRemoved = trimmedData.dropna()
print(blanksRemoved.shape)
categoricalColumns = blanksRemoved.select_dtypes(include=['object']).columns.to_list()
remainingColumns = blanksRemoved.drop(columns=categoricalColumns)
oneHots = pd.get_dummies(blanksRemoved[categoricalColumns])
print(remainingColumns.shape)
print(oneHots.shape)
oneHotPlusNumerical = pd.concat([remainingColumns, oneHots],axis=1)
print(oneHotPlusNumerical.shape)

(1993, 27)
(1993, 11)
(1993, 79)
(1993, 90)


In [None]:
### Reset indices since we deleted rows without reindexing
print(oneHotPlusNumerical.index)
oneHotPlusNumerical.reset_index(drop = True, inplace = True)
print(oneHotPlusNumerical.index)

Index([   0,    1,    2,    3,    4,    5,    6,    7,    8,    9,
       ...
       2182, 2183, 2184, 2185, 2186, 2187, 2188, 2189, 2190, 2191],
      dtype='int64', length=1993)
RangeIndex(start=0, stop=1993, step=1)


In [None]:
### Perform Decision Tree classification and check accuracy
import warnings
warnings.filterwarnings('ignore')
treeClassifier = tree.DecisionTreeClassifier(max_depth=5)
indicator = oneHotPlusNumerical['Future Relapse Binary']
xValues = oneHotPlusNumerical.drop(['Future Relapse Binary'], axis=1)
treeModel = treeClassifier.fit(xValues, indicator)

classificationResults = pd.DataFrame(columns=['Real Label', 'Predicted Label', 'Difference'])
classificationResultsArray = []
count = 0
for _, row in oneHotPlusNumerical.iterrows():
    classificationResultsArray.append(treeModel.predict([row[1:]])[0])

classificationResults['Predicted Label'] = pd.Series(classificationResultsArray)
classificationResults['Real Label'] = oneHotPlusNumerical['Future Relapse Binary']
classificationResults['Difference'] = abs(classificationResults['Real Label'] - classificationResults['Predicted Label'])
diffValueCounts = classificationResults['Difference'].value_counts()
print(diffValueCounts)
print(f'Accuracy: {(diffValueCounts[0]/(diffValueCounts[0] + diffValueCounts[1]))*100:.2f}%')

Difference
0    1113
1     880
Name: count, dtype: int64
Accuracy: 55.85%


In [None]:
mtrc = metrics.accuracy_score
## prefer class 1 to be high (ensure we detect those who are likely to relapse) even at the cost of low class 0 accuracy
## aim is class 1 accuracy 80%