In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report
%matplotlib inline

  from numpy.core.umath_tests import inner1d


In [2]:
df = pd.read_csv('insurance.csv')

In [3]:
df.shape

(1338, 7)

In [4]:
sex_cleanup = {'sex': {'male': 1, 'female': 0}}
df.replace(sex_cleanup, inplace=True)

smoker_cleanup = {'smoker': {'yes': 1, 'no': 0}}
df.replace(smoker_cleanup, inplace = True)

In [5]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.9,0,1,southwest,16884.924
1,18,1,33.77,1,0,southeast,1725.5523
2,28,1,33.0,3,0,southeast,4449.462
3,33,1,22.705,0,0,northwest,21984.47061
4,32,1,28.88,0,0,northwest,3866.8552


In [6]:
df =pd.get_dummies(df)

In [7]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,charges,region_northeast,region_northwest,region_southeast,region_southwest
0,19,0,27.9,0,1,16884.924,0,0,0,1
1,18,1,33.77,1,0,1725.5523,0,0,1,0
2,28,1,33.0,3,0,4449.462,0,0,1,0
3,33,1,22.705,0,0,21984.47061,0,1,0,0
4,32,1,28.88,0,0,3866.8552,0,1,0,0


In [8]:
for column in df.columns:
    df[column] = df[column].fillna(df[column].median())

In [9]:
df.isnull().sum()

age                 0
sex                 0
bmi                 0
children            0
smoker              0
charges             0
region_northeast    0
region_northwest    0
region_southeast    0
region_southwest    0
dtype: int64

In [10]:
df['sex'].value_counts()

1    676
0    662
Name: sex, dtype: int64

In [11]:
X = df.drop('sex',axis =1)
y = df['sex']

## MLP Classifier

In [12]:
mlp = MLPClassifier(hidden_layer_sizes=(20,))

In [13]:
mlp.fit(X,y)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(20,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [31]:
scores_mlp = cross_val_score(mlp, X, y, cv =5)
print('MLP classfier score: {:.2%} +/- {:.2%}'.format (scores_mlp.mean(), scores_mlp.std()))
y_pred_mlp = mlp.predict(X)
print('Classifcation Report:',classification_report(y, y_pred_mlp))

MLP classfier score: 49.70% +/- 0.44%
Classifcation Report:              precision    recall  f1-score   support

          0       0.50      0.79      0.61       662
          1       0.53      0.23      0.32       676

avg / total       0.52      0.51      0.47      1338



## Changing Activation Function

In [15]:
mlp2 = MLPClassifier(activation='logistic')

In [16]:
mlp2.fit(X,y)

MLPClassifier(activation='logistic', alpha=0.0001, batch_size='auto',
       beta_1=0.9, beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [32]:
scores_mlp2 = cross_val_score(mlp2, X, y, cv =5)
print('MLP classfier score: {:.2%} +/- {:.2%}'.format (scores_mlp2.mean(), scores_mlp2.std()))
y_pred_mlp2 = mlp2.predict(X)
print('Classifcation Report:',classification_report(y, y_pred_mlp2))

MLP classfier score: 50.52% +/- 0.08%
Classifcation Report:              precision    recall  f1-score   support

          0       0.00      0.00      0.00       662
          1       0.51      1.00      0.67       676

avg / total       0.26      0.51      0.34      1338



  'precision', 'predicted', average, warn_for)


## Changing Hidden Layer Size

In [21]:
mlp3 = MLPClassifier(hidden_layer_sizes=(9,8,7,6))

In [22]:
mlp3.fit(X,y)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(9, 8, 7, 6), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [33]:
scores_mlp3 = cross_val_score(mlp3, X, y, cv =5)
print('MLP classfier score: {:.2%} +/- {:.2%}'.format (scores_mlp3.mean(), scores_mlp3.std()))
y_pred_mlp3 = mlp3.predict(X)
print('Classifcation Report:',classification_report(y, y_pred_mlp3))

MLP classfier score: 50.52% +/- 0.71%
Classifcation Report:              precision    recall  f1-score   support

          0       0.50      0.70      0.58       662
          1       0.51      0.30      0.38       676

avg / total       0.50      0.50      0.48      1338



## Random Forest Classifer

In [18]:
rfc = RandomForestClassifier(max_depth=20)
rfc.fit(X,y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=20, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [34]:
scores_rfc = cross_val_score(rfc, X, y, cv = 5)
print('Random Forest Classifier Score: {:.2%} +/- {:.2%}'.format(scores_rfc.mean(),scores_rfc.std()))
y_pred_rfc = rfc.predict(X)
print('Classifcation Report:',classification_report(y, y_pred_rfc))

Random Forest Classifier Score: 54.78% +/- 1.65%
Classifcation Report:              precision    recall  f1-score   support

          0       0.97      0.99      0.98       662
          1       0.99      0.97      0.98       676

avg / total       0.98      0.98      0.98      1338



## Conclusion

I tried a few different models using an MLP Classifier. The initial model I tried with 20 layers got a score of 50%. I tried using a model where I changed the activation to logistic and it was worse. I also tried a model where I modified the zise of the hidden layers which was similar to the initial model. I think the Random Forest model that I made is overfitting the data in comparison to my first MLP model.