In [1]:
import pandas as pd

fetal_health = pd.read_csv('fetal_health.csv')

fetal_health['histogram_tendency'] = fetal_health['histogram_tendency'].astype(str)
fetal_health = pd.get_dummies(fetal_health)
fetal_health.fetal_health = fetal_health.fetal_health.astype(int).astype(str) #make outcome categorical


In [2]:
from sklearn.model_selection import train_test_split
fetal_health.shape


fetal_health, fetal_health_test = train_test_split(fetal_health, 
                                    test_size = .2) ## withold our test set

fetal_health_train, fetal_health_validate = train_test_split(fetal_health, 
                                    test_size = .2) 


#We have 2126 observations, so .3 should give us some of each class in the validation set.

In [3]:
## Evaluate class imbalance

fetal_health_train_1 = fetal_health_train.loc[fetal_health_train.fetal_health == '1']
fetal_health_train_2 = fetal_health_train.loc[fetal_health_train.fetal_health == '2']
fetal_health_train_3 = fetal_health_train.loc[fetal_health_train.fetal_health == '3']

print(fetal_health_train_1.shape)
print(fetal_health_train_2.shape)
print(fetal_health_train_3.shape)


(1034, 24)
(201, 24)
(104, 24)


In [5]:
## rebalance classes in the train set

from sklearn.utils import resample

def rebalance_classes(input_df):
    class_1 = input_df.loc[input_df.fetal_health == '1']
    class_2 = input_df.loc[input_df.fetal_health == '2']
    class_3 = input_df.loc[input_df.fetal_health == '3']
    
    class_2 = resample(class_2, n_samples = class_1.shape[0])
    class_3 = resample(class_3, n_samples = class_1.shape[0])
    
    output_df = pd.concat([class_1, class_2, class_3], ignore_index = True)
    return(output_df)
    
fetal_health_train = rebalance_classes(fetal_health_train)


In [6]:
from sklearn.ensemble import RandomForestClassifier 

RandomForest_100 = RandomForestClassifier(
n_estimators = 100, #I want to experiment with different values of this
criterion = "gini" #I don't think this should perform too differently from entropy, but I want to try 
)

In [7]:
rf_100_fit = RandomForest_100.fit(X = fetal_health_train.drop('fetal_health', axis = 1), 
                  y = fetal_health_train.fetal_health)
#In the future I might do cross validation here


rf_100_preds = rf_100_fit.predict(X= fetal_health_validate.drop('fetal_health', axis = 1))

In [8]:
## Overall accuracy
import numpy as np

agreement = (rf_100_preds == fetal_health_validate.fetal_health)

print("Overall Accuracy:")

print(np.mean(agreement))

for i in range(3):
    my_class = str(i + 1)
    print('')
    print("Accuracy in class " + my_class + ":")
    print(np.mean(agreement[fetal_health_validate.fetal_health == my_class]))


Overall Accuracy:
0.9442508710801394

Accuracy in class 1:
0.9805194805194806

Accuracy in class 2:
0.7647058823529411

Accuracy in class 3:
0.8409090909090909


In [9]:
from sklearn.metrics import confusion_matrix

confusion_matrix(fetal_health_validate.fetal_health, rf_100_preds)


array([[422,  12,   1],
       [ 19,  64,   3],
       [  1,   2,  50]])

In [10]:
## Function for training and accuracy without rebalancing.
# I want to do a simulation study to see if rebalancing is worth it.

def no_rebalancing():
    fetal_health_train, fetal_health_validate = train_test_split(fetal_health, 
                                    test_size = .3) 
    RandomForest_100 = RandomForestClassifier(n_estimators = 100)
    rf_100_fit = RandomForest_100.fit(X = fetal_health_train.drop('fetal_health', axis = 1), y = fetal_health_train.fetal_health)
    rf_100_preds = rf_100_fit.predict(X= fetal_health_validate.drop('fetal_health', axis = 1))
    agreement = (rf_100_preds == fetal_health_validate.fetal_health)
    return([np.mean(agreement), 
           np.mean(agreement[fetal_health_validate.fetal_health == '1']),
           np.mean(agreement[fetal_health_validate.fetal_health == '2']),
           np.mean(agreement[fetal_health_validate.fetal_health == '3'])])
    
                                              

In [11]:
def with_rebalancing():
    fetal_health_train, fetal_health_validate = train_test_split(fetal_health, 
                                    test_size = .3) 
    fetal_health_train = rebalance_classes(fetal_health_train)
    
    
    RandomForest_100 = RandomForestClassifier(n_estimators = 100)
    rf_100_fit = RandomForest_100.fit(X = fetal_health_train.drop('fetal_health', axis = 1), y = fetal_health_train.fetal_health)
    rf_100_preds = rf_100_fit.predict(X= fetal_health_validate.drop('fetal_health', axis = 1))
    agreement = (rf_100_preds == fetal_health_validate.fetal_health)
    return([np.mean(agreement), 
           np.mean(agreement[fetal_health_validate.fetal_health == '1']),
           np.mean(agreement[fetal_health_validate.fetal_health == '2']),
           np.mean(agreement[fetal_health_validate.fetal_health == '3'])])

In [12]:
with_rebalancing_mat = np.matrix(["overall", "1", "2", '3'])

no_rebalancing_mat = np.matrix(["overall", "1", "2", '3'])

for i in range(100): #Do 100 replications to see what happens
    with_rebalancing_mat = np.vstack([with_rebalancing_mat, with_rebalancing()])
    no_rebalancing_mat = np.vstack([no_rebalancing_mat, no_rebalancing()])



In [13]:
with_rebalancing_df = pd.DataFrame(with_rebalancing_mat[1:,:])
no_rebalancing_df = pd.DataFrame(no_rebalancing_mat[1:,:])

mat_cols = ["overall", "1", "2", '3']

with_rebalancing_df.columns = mat_cols
no_rebalancing_df.columns = mat_cols

for col in mat_cols:
    with_rebalancing_df[col] = with_rebalancing_df[col].astype('float')
    no_rebalancing_df[col] = no_rebalancing_df[col].astype('float')

In [14]:
print("Accracy without rebalancing")
no_rebalancing_df.mean()

Accracy without rebalancing


overall    0.935314
1          0.979382
2          0.740907
3          0.848591
dtype: float64

In [15]:
print("Accuracy after rebalancing")
with_rebalancing_df.mean()

Accuracy after rebalancing


overall    0.926394
1          0.977576
2          0.699492
3          0.828892
dtype: float64

So, it looks like oversampling was a good idea. We had a bit better accuracy on the minority classes, and a slight increase in overall accuracy. 

Now, let's train our final model. 



In [16]:
##Using the full training set from above
RandomForest_100 = RandomForestClassifier(n_estimators = 100)


fetal_health = rebalance_classes(fetal_health)
rf_100_fit = RandomForest_100.fit(X = fetal_health.drop('fetal_health', axis = 1), y = fetal_health.fetal_health)
rf_100_preds = rf_100_fit.predict(X= fetal_health_test.drop('fetal_health', axis = 1))


importances = rf_100_fit.feature_importances_

In [17]:

important_labelled = pd.DataFrame(zip(fetal_health.columns[:-1], importances))

In [18]:
important_labelled.sort_values(1, ascending = False)

Unnamed: 0,0,1
17,histogram_mean,0.116926
8,mean_value_of_short_term_variability,0.103222
9,percentage_of_time_with_abnormal_long_term_var...,0.09815
7,abnormal_short_term_variability,0.086904
16,histogram_mode,0.081335
18,histogram_median,0.070752
3,uterine_contractions,0.057534
1,accelerations,0.052281
0,baseline value,0.045603
11,histogram_width,0.0442


In [19]:
agreement = (rf_100_preds == fetal_health_test.fetal_health)

print("Overall Accuracy:")

print(np.mean(agreement))

for i in range(3):
    my_class = str(i + 1)
    print('')
    print("Accuracy in class " + my_class + ":")
    print(np.mean(agreement[fetal_health_test.fetal_health == my_class]))

Overall Accuracy:
0.9389671361502347

Accuracy in class 1:
0.9882352941176471

Accuracy in class 2:
0.6896551724137931

Accuracy in class 3:
0.8571428571428571
