In [1]:
import pandas as pd
import numpy as np
from pgmpy.models import BayesianModel
import pgmpy.inference

In [2]:
def normalizeKindAge(age):
    """Map a child's age to a coarse age-bucket label.

    Args:
        age: Either the literal string "n.a." or a value accepted by ``int()``.

    Returns:
        "n.a." unchanged, otherwise one of "<10", "<18", "<25" or ">25"
        (the last one also covers an age of exactly 25).

    Raises:
        ValueError/TypeError: Propagated from ``int(age)`` for any other input.
    """
    # The placeholder is passed through before any numeric conversion happens.
    if age == "n.a.":
        return "n.a."

    years = int(age)
    # Upper bounds are exclusive and checked in ascending order.
    buckets = ((10, "<10"), (18, "<18"), (25, "<25"))
    for upper_bound, label in buckets:
        if years < upper_bound:
            return label
    return ">25"

def normalizeIncome(income):
    """Map a household income to a coarse income-bracket label.

    Args:
        income: Any value accepted by ``int()``.

    Returns:
        One of "<20000", "<40000", "<80000", "<100000" or ">100000"
        (the last one also covers an income of exactly 100000).

    Raises:
        ValueError/TypeError: Propagated from ``int(income)``.
    """
    amount = int(income)
    # Upper bounds are exclusive and checked in ascending order.
    brackets = (
        (20000, "<20000"),
        (40000, "<40000"),
        (80000, "<80000"),
        (100000, "<100000"),
    )
    for upper_bound, label in brackets:
        if amount < upper_bound:
            return label
    return ">100000"

In [3]:
# Fixed seed so the shuffle -- and therefore every positional train/test
# slice taken from `data` later on -- is identical on each run.
SEED = 42

data = pd.read_csv('versicherung_x.csv', delimiter=';')
data = data.sample(frac=1, random_state=SEED).reset_index(drop=True)  # Shuffle data

# Replace the raw child ages and the household income with bucket labels.
data['aeltestesKind'] = data['aeltestesKind'].apply(normalizeKindAge)
data['juengstesKind'] = data['juengstesKind'].apply(normalizeKindAge)
data['Familieneinkommen'] = data['Familieneinkommen'].apply(normalizeIncome)

In [4]:
# For every column, collect the distinct values that occur in it
# (in order of first appearance, as returned by Series.unique()).
state_names = {
    column: list(data[column].unique())
    for column in data.columns
}
print(state_names)

{'Altersgruppe': ['18-25', '26-35', '36-49', '50-65', '66-'], 'Geschlecht': ['w', 'm'], 'Verheiratet': ['nein', 'ja'], 'Kinderzahl': [1, 2, 3, 0, 4], 'aeltestesKind': ['<10', 'n.a.', '<18', '<25', '>25'], 'juengstesKind': ['<10', 'n.a.', '<18', '>25', '<25'], 'Bildungsstand': ['Hauptschule', 'Studium', 'Lehre', 'Gymnasium'], 'Beruf': ['Handwerk', 'Handel', 'Industrie', 'Oeffentlicher Dienst'], 'Familieneinkommen': ['<40000', '<100000', '<80000', '<20000', '>100000'], 'Wohnverhaeltnis': ['Miete', 'Eigentum'], 'Ort': ['urban', 'laendlich'], 'Land': ['Bayern', 'Hessen', 'Baden-Wuerttemberg'], 'Aktienbesitz': ['nein', 'ja'], 'Tarif': ['B', 'A']}


In [5]:
def train(model, df, split=0.75):
    """Fit ``model`` on the leading part of ``df``.

    Args:
        model: Object exposing ``fit(frame, state_names=...)``.
        df: DataFrame whose first ``int(len(df) * split)`` rows are used
            for fitting.
        split: Fraction of rows (taken from the top) used as training data.

    The module-level ``state_names`` mapping is forwarded to ``fit``.
    """
    cutoff = int(len(df) * split)
    model.fit(df[:cutoff], state_names=state_names)

def evaluate(model, df, split=0.75):
    """Print and return how often ``model`` predicts the held-out "Tarif".

    The test set is every row from position ``int(len(df) * split)`` onward,
    i.e. exactly the rows that follow the leading slice ``train`` fits on
    for the same ``split``.

    Args:
        model: Object exposing ``predict(frame)`` that returns a frame with a
            "Tarif" column holding one prediction per input row.
        df: DataFrame containing a "Tarif" column with the true labels.
        split: Fraction of leading rows reserved for training.

    Returns:
        The accuracy as a float in [0, 1], or NaN when no test rows remain.

    Raises:
        ValueError: If the number of predictions differs from the number of
            test rows.
    """
    # Use the same cutoff expression as train() so the two slices can never
    # overlap; a negative start index collapses to "whole frame" once it
    # truncates to 0.
    cutoff = int(len(df) * split)
    test = df[cutoff:]
    tests = len(test)
    if tests == 0:
        print("No test rows left for split={}; accuracy is undefined".format(split))
        return float("nan")

    result = model.predict(test.drop(columns="Tarif"))

    # Compare by position rather than by index label, so the outcome does not
    # depend on whether predict() keeps the index of the test rows.
    expected = np.asarray(test["Tarif"])
    predicted = np.asarray(result["Tarif"])
    if len(predicted) != tests:
        raise ValueError(
            "predict() returned {} rows for {} test rows".format(len(predicted), tests)
        )

    hits = int((expected == predicted).sum())
    accuracy = hits / tests
    print("Predicted Tarif with an accuracy of: {}".format(accuracy))
    return accuracy
    #print(result)

In [6]:
# Custom network: hand-picked directed edges (parent -> child).
# Each entry lists one parent followed by its children; the flattened edge
# list keeps exactly this order.
_custom_net_spec = [
    ("Altersgruppe", ["Bildungsstand", "aeltestesKind", "juengstesKind",
                      "Wohnverhaeltnis", "Tarif"]),
    ("Geschlecht", ["Tarif"]),
    ("Verheiratet", ["Beruf", "Kinderzahl"]),
    ("Kinderzahl", ["Wohnverhaeltnis", "Tarif"]),
    ("aeltestesKind", ["Wohnverhaeltnis"]),
    ("juengstesKind", ["Wohnverhaeltnis"]),
    ("Bildungsstand", ["Beruf", "Aktienbesitz"]),
    ("Beruf", ["Familieneinkommen"]),
    ("Familieneinkommen", ["Tarif", "Wohnverhaeltnis"]),
    ("Wohnverhaeltnis", ["Tarif"]),
    ("Ort", ["Land", "Wohnverhaeltnis"]),
    ("Land", ["Tarif"]),
    ("Aktienbesitz", ["Familieneinkommen"]),
]
custom_net = [
    (parent, child)
    for parent, children in _custom_net_spec
    for child in children
]

In [7]:
# Network structure produced with OpenMarkov, as directed edges (parent -> child).
# Each entry lists one parent followed by its children; the flattened edge
# list keeps exactly this order.
_markov_net_spec = [
    ("Altersgruppe", ["Kinderzahl"]),
    ("Verheiratet", ["Geschlecht", "Ort"]),
    ("Kinderzahl", ["juengstesKind", "Verheiratet"]),
    ("juengstesKind", ["aeltestesKind", "Ort"]),
    ("Bildungsstand", ["Beruf", "Aktienbesitz", "Wohnverhaeltnis",
                       "Familieneinkommen", "Altersgruppe"]),
    ("Familieneinkommen", ["Altersgruppe"]),
    ("Wohnverhaeltnis", ["Tarif", "Ort", "Land"]),
    ("Tarif", ["Land"]),
]
markov_net = [
    (parent, child)
    for parent, children in _markov_net_spec
    for child in children
]

In [8]:
# Star-shaped network: every selected variable points directly at Tarif.
# Not all variables are used because training would take too long; Geschlecht,
# aeltestesKind, juengstesKind, Bildungsstand, Ort and Land are left out.
simple_net = [
    (feature, "Tarif")
    for feature in (
        "Altersgruppe",
        "Verheiratet",
        "Kinderzahl",
        "Beruf",
        "Familieneinkommen",
        "Wohnverhaeltnis",
        "Aktienbesitz",
    )
]
# New frame without the columns that simple_net has no node for;
# drop() already returns a fresh DataFrame, so `data` itself is untouched.
less_data = data.drop(
    columns=['Geschlecht', 'aeltestesKind', 'juengstesKind', 'Bildungsstand', 'Ort', 'Land']
)

In [9]:
# Fit and score each network structure at several train/test ratios.
for split in (0.25, 0.5, 0.75):
    print('Split: {}%'.format(split * 100))
    for net in (custom_net, simple_net, markov_net):
        if net == simple_net:
            # simple_net has no node for these columns, so they are removed
            # from the frame handed to this model.
            df = data.drop(
                ['Geschlecht', 'aeltestesKind', 'juengstesKind', 'Bildungsstand', 'Ort', 'Land'],
                axis=1,
            )
        else:
            df = data.copy()
        # A fresh model per (split, network) pair so no fit carries over.
        model = BayesianModel(net)
        train(model, df, split=split)
        evaluate(model, df, split=split)
    print('\n')

Split: 25.0%
Predicted Tarif with an accuracy of: 0.28
Predicted Tarif with an accuracy of: 0.3
Predicted Tarif with an accuracy of: 0.8


Split: 50.0%
Predicted Tarif with an accuracy of: 0.44
Predicted Tarif with an accuracy of: 0.37
Predicted Tarif with an accuracy of: 0.87


Split: 75.0%
Predicted Tarif with an accuracy of: 0.42
Predicted Tarif with an accuracy of: 0.36
Predicted Tarif with an accuracy of: 0.92


