In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [2]:
# Load the generated access records and preview the first five rows.
access = pd.read_csv(filepath_or_buffer='generated_access_data.csv')
access.head(n=5)

Unnamed: 0,p_id,d_id,location_of_access,time_of_access,Specialization,diagnostic_category,emergency,access_granted
0,0f5dd9c2-4444-445c-9c4d-dd458450a6ac,1085ca4a-d2e9-41a7-adeb-ead185f9547b,00:16:3e:59:66:25,14:37,GENERAL PRACTICE,INJURIES / POSIONING AND TOXIC EFFECTS OF DRUGS,no,yes
1,6301e923-b258-4d3c-bf5e-50fdaedd8ffd,1085ca4a-d2e9-41a7-adeb-ead185f9547b,00:16:3e:59:66:25,19:58,GENERAL PRACTICE,FACTORS INFLUENCING HEALTH STATUS AND OTHER C...,no,yes
2,0806f223-e089-48b1-add0-4281b6983745,1085ca4a-d2e9-41a7-adeb-ead185f9547b,00:16:3e:59:66:25,12:23,GENERAL PRACTICE,SKIN + SUBCUTANEOUS TISSUE AND BREAST,no,yes
3,d05ebf41-ad86-4671-a6ba-2921334279b2,1085ca4a-d2e9-41a7-adeb-ead185f9547b,00:16:3e:59:66:25,16:48,GENERAL PRACTICE,RESPIRATORY SYSTEM,no,yes
4,e85a08b1-ec4e-479d-900c-ee09583736da,1085ca4a-d2e9-41a7-adeb-ead185f9547b,00:16:3e:59:66:25,17:29,GENERAL PRACTICE,BURNS,no,yes


In [3]:
# Number of rows (one per access record) in the raw frame.
len(access.index)

858400

In [4]:
# Per-column count of missing values; all zeros means nothing needs imputing.
access.isnull().sum(axis=0)

p_id                   0
d_id                   0
location_of_access     0
time_of_access         0
Specialization         0
diagnostic_category    0
emergency              0
access_granted         0
dtype: int64

In [5]:
# Load the fuzzified (numerically encoded) version of the access data
# and preview the first five rows.
fuzzified = pd.read_csv(filepath_or_buffer='generated_fuzzified_access_data.csv')
fuzzified.head(n=5)

Unnamed: 0,diagnostic_category,location_of_access,time_of_access,emergency,previous_history,access_granted
0,1.0,1.0,1.0,0,0.95,1
1,1.0,1.0,1.0,0,0.95,1
2,1.0,1.0,1.0,0,0.95,1
3,1.0,1.0,1.0,0,0.95,1
4,1.0,1.0,1.0,0,0.95,1


In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) for every numeric column.
fuzzified.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,diagnostic_category,location_of_access,time_of_access,emergency,previous_history,access_granted
count,858400.0,858400.0,858400.0,858400.0,858400.0,858400.0
mean,0.713793,0.74141,0.865731,0.024395,0.749247,0.799449
std,0.384819,0.38553,0.286185,0.154273,0.266177,0.400413
min,0.1,0.0,0.0,0.0,0.0,0.0
25%,0.5,0.5,1.0,0.0,0.66,1.0
50%,1.0,1.0,1.0,0.0,0.86,1.0
75%,1.0,1.0,1.0,0.0,0.93,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0


In [7]:
# Separate the model inputs (features) from the label to predict (target).
features = fuzzified[
    [
        'diagnostic_category',
        'location_of_access',
        'time_of_access',
        'emergency',
        'previous_history',
    ]
]
target = fuzzified['access_granted']

In [8]:
# Hold out 30% of the rows for testing and train on the remaining 70%.
# A fixed random_state makes the shuffle, and therefore the split, repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.3,
    random_state=0,
)

In [9]:
# Fit a random forest classifier on the training split.
# random_state is fixed so that bootstrap sampling and feature selection are
# repeatable; without it the fitted model, and every metric reported below,
# changes from one run to the next even though the split itself is seeded.
# TODO: try tuning the hyper-parameters (all others are library defaults).
clf = RandomForestClassifier(random_state=0).fit(X_train, y_train)

In [10]:
# Predict the access decision for every row of the held-out test split.
predictions = clf.predict(X=X_test)

In [11]:
# Fraction of test rows whose predicted label matches the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=predictions)
accuracy

0.94187247592420009

In [12]:
# Confusion-matrix counts for the binary target (0 / 1).
# labels=[0, 1] pins the matrix to 2x2 even if one class is absent from both
# y_test and predictions; otherwise the matrix can shrink to 1x1 and the
# four-way unpacking below raises ValueError.
# Flattened row by row, the entries are: true negatives, false positives,
# false negatives, true positives.
tn, fp, fn, tp = confusion_matrix(y_test, predictions, labels=[0, 1]).ravel()

In [13]:
# Display the true-negative count, i.e. the first entry of the flattened
# confusion matrix (test rows with true label 0 that were also predicted 0).
tn

45003

In [14]:
# Per-class precision, recall, F1 score and support on the test split.
print(
    classification_report(
        y_true=y_test,
        y_pred=predictions,
    )
)

             precision    recall  f1-score   support

          0       0.84      0.87      0.86     51520
          1       0.97      0.96      0.96    206000

avg / total       0.94      0.94      0.94    257520

