In [77]:
import pandas as pd

fetal_health = pd.read_csv('fetal_health.csv')

fetal_health['histogram_tendency'] = fetal_health['histogram_tendency'].astype(str)
fetal_health = pd.get_dummies(fetal_health)
fetal_health.fetal_health = fetal_health.fetal_health.astype(int).astype(str) #make outcome categorical


In [78]:
from sklearn.model_selection import train_test_split
fetal_health.shape

fetal_health_train, fetal_health_test = train_test_split(fetal_health, 
                                    test_size = .3) 


#We have 2126 observations, so .3 should give us some of each class in the test set.

In [79]:
## Evaluate class imbalance

fetal_health_train_1 = fetal_health_train.loc[fetal_health_train.fetal_health == '1']
fetal_health_train_2 = fetal_health_train.loc[fetal_health_train.fetal_health == '2']
fetal_health_train_3 = fetal_health_train.loc[fetal_health_train.fetal_health == '3']

print(fetal_health_train_1.shape)
print(fetal_health_train_2.shape)
print(fetal_health_train_3.shape)


(1164, 24)
(202, 24)
(122, 24)


In [80]:
## rebalance classes in the train set

from sklearn.utils import resample

fetal_health_train_3 = resample(fetal_health_train_3, 
                               n_samples = fetal_health_train_1.shape[0])

fetal_health_train_2 = resample(fetal_health_train_2, 
                               n_samples = fetal_health_train_1.shape[0])

fetal_health_train = pd.concat([fetal_health_train_1,fetal_health_train_2,fetal_health_train_3], 
                               ignore_index=True)

In [81]:
from sklearn.ensemble import RandomForestClassifier 

RandomForest_100 = RandomForestClassifier(
n_estimators = 100, #I want to experiment with different values of this
criterion = "gini" #I don't think this should perform too differently from entropy, but I want to try 
)

In [82]:
rf_100_fit = RandomForest_100.fit(X = fetal_health_train.drop('fetal_health', axis = 1), 
                  y = fetal_health_train.fetal_health)
#In the future I might do cross validation here


rf_100_preds = rf_100_fit.predict(X= fetal_health_test.drop('fetal_health', axis = 1))

In [83]:
## Overall accuracy
import numpy as np

agreement = (rf_100_preds == fetal_health_test.fetal_health)

print("Overall Accuracy:")

print(np.mean(agreement))

for i in range(3):
    my_class = str(i + 1)
    print('')
    print("Accuracy in class " + my_class + ":")
    print(np.mean(agreement[fetal_health_test.fetal_health == my_class]))


Overall Accuracy:
0.9373040752351097

Accuracy in class 1:
0.9755600814663951

Accuracy in class 2:
0.7419354838709677

Accuracy in class 3:
0.9259259259259259


In [84]:
from sklearn.metrics import confusion_matrix

confusion_matrix(fetal_health_test.fetal_health, rf_100_preds)


array([[479,  11,   1],
       [ 21,  69,   3],
       [  1,   3,  50]])

In [95]:
## Function for training and accuracy without rebalancing.
# I want to do a simulation study to see if rebalancing is worth it.

def no_rebalancing():
    fetal_health_train, fetal_health_test = train_test_split(fetal_health, 
                                    test_size = .3) 
    RandomForest_100 = RandomForestClassifier(n_estimators = 100)
    rf_100_fit = RandomForest_100.fit(X = fetal_health_train.drop('fetal_health', axis = 1), y = fetal_health_train.fetal_health)
    rf_100_preds = rf_100_fit.predict(X= fetal_health_test.drop('fetal_health', axis = 1))
    agreement = (rf_100_preds == fetal_health_test.fetal_health)
    return([np.mean(agreement), 
           np.mean(agreement[fetal_health_test.fetal_health == '1']),
           np.mean(agreement[fetal_health_test.fetal_health == '2']),
           np.mean(agreement[fetal_health_test.fetal_health == '3'])])
    
                                              

In [96]:
def with_rebalancing():
    fetal_health_train, fetal_health_test = train_test_split(fetal_health, 
                                    test_size = .3) 
    fetal_health_train_1 = fetal_health_train.loc[fetal_health_train.fetal_health == '1']
    fetal_health_train_2 = fetal_health_train.loc[fetal_health_train.fetal_health == '2']
    fetal_health_train_3 = fetal_health_train.loc[fetal_health_train.fetal_health == '3']
    
    fetal_health_train_3 = resample(fetal_health_train_3, 
                               n_samples = fetal_health_train_1.shape[0])

    fetal_health_train_2 = resample(fetal_health_train_2, 
                               n_samples = fetal_health_train_1.shape[0])
    fetal_health_train = pd.concat([fetal_health_train_1,fetal_health_train_2,fetal_health_train_3], 
                               ignore_index=True)
    
    
    RandomForest_100 = RandomForestClassifier(n_estimators = 100)
    rf_100_fit = RandomForest_100.fit(X = fetal_health_train.drop('fetal_health', axis = 1), y = fetal_health_train.fetal_health)
    rf_100_preds = rf_100_fit.predict(X= fetal_health_test.drop('fetal_health', axis = 1))
    agreement = (rf_100_preds == fetal_health_test.fetal_health)
    return([np.mean(agreement), 
           np.mean(agreement[fetal_health_test.fetal_health == '1']),
           np.mean(agreement[fetal_health_test.fetal_health == '2']),
           np.mean(agreement[fetal_health_test.fetal_health == '3'])])

In [97]:
with_rebalancing_mat = np.matrix(["overall", "1", "2", '3'])

no_rebalancing_mat = np.matrix(["overall", "1", "2", '3'])

for i in range(100):
    with_rebalancing_mat = np.vstack([with_rebalancing_mat, with_rebalancing()])
    no_rebalancing_mat = np.vstack([no_rebalancing_mat, no_rebalancing()])



In [118]:
with_rebalancing_df = pd.DataFrame(with_rebalancing_mat[1:,:])
no_rebalancing_df = pd.DataFrame(no_rebalancing_mat[1:,:])

mat_cols = ["overall", "1", "2", '3']

with_rebalancing_df.columns = mat_cols
no_rebalancing_df.columns = mat_cols

for col in mat_cols:
    with_rebalancing_df[col] = with_rebalancing_df[col].astype('float')
    no_rebalancing_df[col] = no_rebalancing_df[col].astype('float')

In [121]:
print("Accracy without rebalancing")
no_rebalancing_df.mean()

Accracy without rebalancing


overall    0.939248
1          0.984403
2          0.728308
3          0.867041
dtype: float64

In [122]:
print("Accuracy after rebalancing")
with_rebalancing_df.mean()

Accuracy after rebalancing


overall    0.940282
1          0.971081
2          0.790440
3          0.896337
dtype: float64

So, it looks like oversampling was a good idea. We had a bit better accuracy on the minority class, and a slight increase in overall accuracy. 

