In [1]:
import pandas as pd
import numpy as np
from pgmpy.models import BayesianModel
import pgmpy.inference

In [2]:
def normalizeKindAge(age):
    if age == "n.a.":
        return "n.a.";
    elif int(age) < 10:
        return "<10"
    elif int(age) < 18:
        return "<18"
    elif int(age) < 25:
        return "<25"
    else:
        return ">25"

def normalizeIncome(income):
    income = int(income)
    if income < 20000:
        return "<20000"
    elif income < 40000:
        return "<40000"
    elif income < 60000:
        return "<60000"
    elif income < 80000:
        return "<80000"
    elif income < 100000:
        return "<100000"
    else:
        return ">100000"

In [3]:
data = pd.read_csv('versicherung_x.csv', delimiter=';')
data = data.sample(frac=1).reset_index(drop=True)  # Shuffle data
#raw_data.loc[raw_data['aeltestesKind'] != "n.a.", 'aeltestesKind'].apply(int).plot.kde()
data['aeltestesKind'] = data['aeltestesKind'].apply(normalizeKindAge)
data['juengstesKind'] = data['juengstesKind'].apply(normalizeKindAge)
data['Familieneinkommen'] = data['Familieneinkommen'].apply(normalizeIncome)

In [4]:
def train(model, df, split=0.75):
    train = df[:int(len(df) * split)]
    model.fit(train, state_names={"Kinderzahl": [0, 1, 2, 3, 4]})

def evaluate(model, df, split=0.75):
    test = df[int(len(df) * split):]
    result = model.predict(test.drop("Tarif", 1))
    result["Expected"] = test["Tarif"]
    tests = len(result)
    hits = np.where(result["Expected"] == result["Tarif"], [1 for x in range(tests)], [0 for x in range(tests)])
    print("Predicted Tarif with an accuracy of: {}".format(sum(hits) / tests))
    #print(result)

In [5]:
# Custom Network
net = [
("Altersgruppe", "Bildungsstand"),
("Altersgruppe", "aeltestesKind"),
("Altersgruppe", "juengstesKind"),
("Altersgruppe", "Wohnverhaeltnis"),
("Altersgruppe", "Tarif"),

("Geschlecht", "Tarif"),

("Verheiratet", "Beruf"),
("Verheiratet", "Kinderzahl"),

("Kinderzahl", "Wohnverhaeltnis"),
("Kinderzahl", "Tarif"),

("aeltestesKind", "Wohnverhaeltnis"),

("juengstesKind", "Wohnverhaeltnis"),

("Bildungsstand", "Beruf"),
("Bildungsstand", "Aktienbesitz"),

("Beruf", "Familieneinkommen"),

("Familieneinkommen", "Tarif"),
("Familieneinkommen", "Wohnverhaeltnis"),

("Wohnverhaeltnis", "Tarif"),

("Ort", "Land"),
("Ort", "Wohnverhaeltnis"),

("Land", "Tarif"),

("Aktienbesitz", "Familieneinkommen")
]
model = BayesianModel(net)
train(model, data)
evaluate(model, data)

Predicted Tarif with an accuracy of: 0.5


In [6]:
# Network built with OpenMarkov
net = [
("Altersgruppe", "Kinderzahl"),

("Verheiratet", "Geschlecht"),
("Verheiratet", "Ort"),

("Kinderzahl", "juengstesKind"),
("Kinderzahl", "Verheiratet"),

("juengstesKind", "aeltestesKind"),
("juengstesKind", "Ort"),

("Bildungsstand", "Beruf"),
("Bildungsstand", "Aktienbesitz"),
("Bildungsstand", "Wohnverhaeltnis"),
("Bildungsstand", "Familieneinkommen"),
("Bildungsstand", "Altersgruppe"),

("Familieneinkommen", "Altersgruppe"),

("Wohnverhaeltnis", "Tarif"),
("Wohnverhaeltnis", "Ort"),
("Wohnverhaeltnis", "Land"),

("Tarif", "Land"),
]
model = BayesianModel(net)
train(model, data)
evaluate(model, data)

Predicted Tarif with an accuracy of: 0.88
