In [49]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px

# !conda install -y -c conda-forge scikit-learn 
# !conda install -y plotly

# -y = answer "yes" to all prompts; -c conda-forge = install from the conda-forge channel, which carries the latest library versions

usage: conda-script.py [-h] [-V] command ...
conda-script.py: error: unrecognized arguments: yes channel, conda-forge = channel with the latest libraries


Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

## Package Plan ##

  environment location: C:\Users\kfatur\anaconda3

  added / updated specs:
    - plotly


The following packages will be UPDATED:

  ca-certificates    conda-forge::ca-certificates-2020.12.~ --> pkgs/main::ca-certificates-2021.1.19-haa95532_1

The following packages will be SUPERSEDED by a higher-priority channel:

  certifi            conda-forge::certifi-2020.12.5-py38ha~ --> pkgs/main::certifi-2020.12.5-py38haa95532_0
  conda              conda-forge::conda-4.10.0-py38haa244f~ --> pkgs/main::conda-4.10.0-py38haa95532_0
  openssl            conda-forge::openssl-1.1.1k-h8ffe710_0 --> pkgs/main::openssl-1.1.1k-h2bbff1b_0


Preparing transaction: ...working... done
Verifying transaction: ...working... done
Executing transaction: ...working... done


Loading the data:

In [4]:
# Load the raw insurance table. Explicit column names are supplied, so the
# file's own header line arrives as the first data row and is dropped below.
data = pd.read_csv('data_insurance.csv', sep=";",
                   names=['age', 'sex', 'bmi', 'children', 'smoker', 'canton', "pbf", 'charges'])
data = data.iloc[1:]

# Convert the numeric columns in one vectorized call per column.
# 'charges' contains the character ’ (presumably a thousands separator),
# which is stripped as a literal string before the conversion.
data['charges'] = pd.to_numeric(data['charges'].str.replace("’", "", regex=False))
for numeric_column in ['pbf', 'bmi', 'age', 'children']:
    data[numeric_column] = pd.to_numeric(data[numeric_column])

# Encode categorical variables as boolean indicator columns.
data["female"] = data["sex"] == "w"
data["male"] = data["sex"] == "m"
data["smoker"] = data["smoker"] == "ja"

# One indicator column per canton, named after the lower-cased canton value.
for canton_name in data["canton"].unique():
    data[canton_name.lower()] = data["canton"] == canton_name

# Remove the original categorical columns now that they are encoded.
data = data.drop(columns=['sex', 'canton'])

# Keep only records with a positive percentage of body fat
# (1 record has a negative value). The explicit copy makes the filtered
# frame independent, so adding 'charges_level' below does not trigger
# pandas' SettingWithCopyWarning.
data = data[data["pbf"] > 0].copy()

# Divide the charges into categories. Is there a more objective way to determine the categories? K-clustering?
bins = [0, 5000, 10000, 20000, 100000]
bin_labels = [0, 1, 2, 3]
data["charges_level"] = pd.cut(data["charges"], bins=bins, labels=bin_labels, include_lowest=True)

data.head()

Unnamed: 0,age,bmi,children,smoker,pbf,charges,female,male,so,ag,bs,bl,charges_level
1,19,24.72,0,True,35.94,4253,True,False,True,False,False,False,0
2,18,29.416,1,False,26.86,2494,False,True,False,True,False,False,0
3,28,28.8,3,False,26.92,3138,False,True,False,True,False,False,0
4,33,20.564,0,False,7.98,1553,False,True,False,False,True,False,0
5,32,25.504,0,False,21.84,2768,False,True,False,False,True,False,0


In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) of the cleaned data.
data.describe()

Unnamed: 0,age,bmi,children,pbf,charges
count,1337.0,1337.0,1337.0,1337.0,1337.0
mean,39.207928,26.937517,1.095737,31.700337,6694.910995
std,14.055179,4.874029,1.205571,11.988762,5973.323665
min,18.0,15.168,0.0,3.7,1171.0
25%,27.0,23.452,0.0,22.92,3292.0
50%,39.0,26.72,1.0,30.99,4847.0
75%,51.0,30.16,2.0,40.02,7864.0
max,64.0,44.904,5.0,66.27,59703.0


In [10]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go

# Order the clients by ascending charges; reset_index() renumbers the rows
# 0..n-1 so the new index can serve as a rank.
graph_data = data.sort_values(by=['charges']).reset_index()
# 'progression' is the rank scaled by the largest rank, used to estimate
# what share of clients incurs at most a given amount of charges.
graph_data['progression'] = graph_data.index / max(graph_data.index)

fig = make_subplots(rows=2, cols=2)

# Top row: charges against progression, and the raw charges histogram.
fig.add_trace(
    go.Scatter(x=graph_data['progression'], y=graph_data['charges'], name='Charges progression'),
    row=1, col=1,
)
fig.add_trace(
    go.Histogram(x=graph_data['charges'], name='Charges distribution'),
    row=1, col=2,
)
# Bottom right: number of clients in each charges category.
fig.add_trace(
    go.Histogram(x=graph_data['charges_level'], name='Clients per charges category'),
    row=2, col=2,
)

# Bottom left: one box plot per charges category.
# Charges categories determined manually with bins above, should be done later with k-clustering
for level_code, level_name in enumerate(['low', 'mid', 'high', 'extreme']):
    in_level = graph_data['charges_level'] == level_code
    fig.add_trace(
        go.Box(y=graph_data.loc[in_level, 'charges'], name=level_name),
        row=2, col=1,
    )

# Horizontal legend placed just above the plotting area, right-aligned.
legend_style = {
    'orientation': "h",
    'yanchor': "bottom",
    'y': 1.02,
    'xanchor': "right",
    'x': 1,
}
fig.update_layout(height=800, width=1000,
                  title_text='Charges categorization',
                  legend=legend_style)
fig.show()


In [27]:
from sklearn.model_selection import train_test_split

# Target: the continuous charges.
labels = np.array(data['charges'])
# The data we want to split = 'input': every column except the target and
# the category derived from it.
# NOTE(review): the name `input` shadows Python's builtin input(); it is kept
# so that anything else relying on this variable keeps working.
input = data.drop(columns=['charges', 'charges_level'])

# Remember the feature names before the frame is turned into a plain array.
input_list = list(input.columns)
# The frame mixes boolean and numeric columns, which np.array() would turn
# into an object-dtype array; request a float array explicitly instead.
input = input.to_numpy(dtype=float)

# Split the data into training and test sets
train_input, test_input, train_labels, test_labels = \
    train_test_split(input, labels, test_size=0.25, random_state=42)
# to make your tests reproducible, random_state gives the same output for each call

print('Training input shape:', train_input.shape)
print('Training labels shape:', train_labels.shape)
print('Testing input shape:', test_input.shape)
print('Testing labels Shape:', test_labels.shape)

Training input shape: (1002, 11)
Training labels shape: (1002,)
Testing input shape: (335, 11)
Testing labels Shape: (335,)


In [35]:
# Build a random forest regression model.
# The model predicts continuous charges, which we categorize manually afterwards - this improves the model's predictive power / accuracy.

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

def _charges_level(charges):
    """Bucket charge amounts into the manual charge levels (uses `bins` / `bin_labels` from before)."""
    return pd.cut(charges, bins=bins, labels=bin_labels, include_lowest=True)


# Seed the forest so that re-running the notebook reproduces the same model
# and the same numbers below (same seed value as the train/test split).
model = RandomForestRegressor(random_state=42)
model.fit(train_input, train_labels)

# Convert the predictions to integers; astype(int) drops the fractional part
# (it truncates, it does not round up).
prediction_labels = model.predict(test_input).astype(int)

# One row per test record: predicted and actual charges with their levels.
result = pd.DataFrame()
result['prediction'] = prediction_labels
result['prediction_charges_level'] = _charges_level(result['prediction'])

result['actual'] = test_labels
result['actual_charges_level'] = _charges_level(result['actual'])

# Find out the difference between the predicted and the actual value for each record
result['charges_diff'] = (result['prediction'] - result['actual']).abs()
# The difference relative to the mean of the predicted and the actual charge
result['charges_perc_diff'] = result['charges_diff'] / ((result['prediction'] + result['actual']) / 2)
# True where the predicted charge falls into the same level as the actual one
result['correct_charge_level'] = result['prediction_charges_level'] == result['actual_charges_level']

result.head()

Unnamed: 0,prediction,prediction_charges_level,actual,actual_charges_level,charges_diff,charges_perc_diff,correct_charge_level
0,2674,0,2566,0,108,0.041221,True
1,3626,0,3939,0,313,0.08275,True
2,9986,1,10723,2,737,0.071177,False
3,5147,1,4811,0,336,0.067483,False
4,5446,1,5847,1,401,0.071017,True


In [38]:
# Find the difference (in %) per level of charges
result.merge(
    result.groupby('actual_charges_level').mean()['charges_perc_diff'].rename('diff_perc_level').reset_index(),
    left_on='actual_charges_level', right_on='actual_charges_level') # on what to join

Unnamed: 0,prediction,prediction_charges_level,actual,actual_charges_level,charges_diff,charges_perc_diff,correct_charge_level,diff_perc_level
0,2674,0,2566,0,108,0.041221,True,0.075154
1,3626,0,3939,0,313,0.082750,True,0.075154
2,5147,1,4811,0,336,0.067483,False,0.075154
3,2573,0,2344,0,229,0.093146,True,0.075154
4,3605,0,3840,0,235,0.063130,True,0.075154
...,...,...,...,...,...,...,...,...
330,32639,3,33041,3,402,0.012241,True,0.231371
331,36290,3,34928,3,1362,0.038249,True,0.231371
332,37641,3,40939,3,3298,0.083940,True,0.231371
333,38304,3,39746,3,1442,0.036951,True,0.231371


In [43]:
# Model evaluation
from sklearn.metrics import accuracy_score, confusion_matrix
import plotly.figure_factory as ff

# Share of test records whose predicted charges level equals the actual one.
accuracy = accuracy_score(result["actual_charges_level"], result["prediction_charges_level"])


# The conf. matrix compares the actual values against the predicted values.
# With confusion_matrix(y_true, y_pred) the rows are the actual levels and the
# columns are the predicted levels. Passing `labels` keeps the matrix 4x4 even
# if a level does not occur, so it always lines up with the axis names below.
cf_matrix = confusion_matrix(result["actual_charges_level"],
                             result["prediction_charges_level"],
                             labels=bin_labels)
fig = ff.create_annotated_heatmap(cf_matrix,
                                  x=['p_low', 'p_mid', 'p_high', 'p_extreme'],  # columns: predicted categories
                                  y=['low', 'mid', 'high', 'extreme'],  # rows: actual categories
                                  annotation_text=cf_matrix)
fig.update_layout(title_text="Confusion Matrix")
# Accuracy caption below the heatmap, positioned in paper coordinates.
fig.add_annotation(dict(font=dict(color="black", size=14),
                        x=0.5,
                        y=-0.15,
                        showarrow=False,
                        text="Accuracy score: %.3f" % accuracy,
                        xref="paper",
                        yref="paper"))
fig.show()

# Setting xref/yref to "paper" causes the x and y attributes to be interpreted
# in paper (normalized) coordinates, so the object is placed absolutely
# (in our case, to avoid overlapping with the matrix).

#### Interpretation of the confusion matrix
The diagonal values stand for the correctly classified cases.

| Charges level | Correctly classified | Accuracy |
| --- | --- | --- |
| low | 168/(168+13) | .928 |
| mid | 87/100 | .870 |
| high | 30/38 | .789 |
| extreme | 14/16 | .875 |


In [46]:
# Overview of the results
level_analysis = data.groupby("charges_level").mean().reset_index()
level_analysis["level_name"] = ["low", "mid", "high", "extreme"]
level_analysis.head()

Unnamed: 0,charges_level,age,bmi,children,smoker,pbf,charges,female,male,so,ag,bs,bl,level_name
0,0,33.146802,24.566105,1.039244,0.050872,26.831977,3296.171512,0.465116,0.534884,0.273256,0.255814,0.215116,0.255814,low
1,1,43.503401,28.206821,1.151927,0.260771,35.054943,7005.811791,0.555556,0.444444,0.219955,0.251701,0.287982,0.240363,mid
2,2,49.248447,31.712447,1.124224,0.490683,40.205839,13484.198758,0.478261,0.521739,0.198758,0.354037,0.242236,0.204969,high
3,3,53.234043,33.384426,1.297872,0.93617,42.352766,30272.574468,0.425532,0.574468,0.148936,0.425532,0.234043,0.191489,extreme


In [47]:
# Share of smokers per charges level. 'smoker' in level_analysis is the
# per-level mean of a boolean column, i.e. a fraction between 0 and 1.
fig = px.bar(level_analysis, x='charges_level', y='smoker',
             text="smoker",
             title="Percentage of smokers per charges category")

fig.update_layout(uniformtext_minsize=8, uniformtext_mode='hide')
# '.1%' is the d3 percent format: it multiplies the fraction by 100 and
# appends '%'. The previous '.2f%' is not a valid d3 format specifier.
fig.update_traces(texttemplate='%{text:.1%}', textposition='outside')
# Show the axis ticks as percentages as well, matching the axis title.
fig.update_yaxes(range=[0, 1], tickformat='.0%', title="% of smokers")
# Replace the numeric level codes on the x axis with the level names.
fig.update_xaxes(title="Charges level", tickmode="array",
                 tickvals=level_analysis["charges_level"],
                 ticktext=level_analysis["level_name"])

fig.show()