In [1]:
# "Standalone" Version

In [44]:
import pandas as pd
import numpy as np
from pgmpy.models import BayesianModel
import pgmpy.inference

from pgmpy.readwrite.BIF import BIFWriter, BIFReader

In [45]:
def normalizeKindAge(age):
    """Map a child's age to one of the coarse age-bucket labels.

    Args:
        age: Either the literal string "n.a." or a value accepted by ``int()``.

    Returns:
        "n.a." unchanged, otherwise "<10", "<18", "<25" or ">25" according to
        the first upper bound that the integer age lies strictly below.
        Ages of 25 and above all receive the ">25" label.
    """
    if age == "n.a.":
        return "n.a."

    years = int(age)
    # Buckets are checked in ascending order; the first matching bound wins.
    for upper_bound, label in ((10, "<10"), (18, "<18"), (25, "<25")):
        if years < upper_bound:
            return label
    return ">25"

def normalizeIncome(income):
    """Map a family income to one of the coarse income-bracket labels.

    Args:
        income: Any value accepted by ``int()``.

    Returns:
        "<20000", "<40000", "<80000" or "<100000" for the first upper bound the
        integer income lies strictly below; ">100000" for everything else
        (including exactly 100000).
    """
    amount = int(income)
    # Brackets are checked in ascending order; the first matching bound wins.
    brackets = (
        (20000, "<20000"),
        (40000, "<40000"),
        (80000, "<80000"),
        (100000, "<100000"),
    )
    for upper_bound, label in brackets:
        if amount < upper_bound:
            return label
    return ">100000"

In [46]:
# Fixed seed: the row order decides which rows train() and evaluate() see, so an
# unseeded shuffle makes the reported accuracy change from run to run.
SHUFFLE_SEED = 42

data = pd.read_csv('../versicherung_x.csv', delimiter=';')
data = data.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)  # Shuffle data
#raw_data.loc[raw_data['aeltestesKind'] != "n.a.", 'aeltestesKind'].apply(int).plot.kde()
# Replace the child ages and the family income by their bucket labels.
data['aeltestesKind'] = data['aeltestesKind'].apply(normalizeKindAge)
data['juengstesKind'] = data['juengstesKind'].apply(normalizeKindAge)
data['Familieneinkommen'] = data['Familieneinkommen'].apply(normalizeIncome)

In [47]:
def train(model, df, split=0.75):
    """Fit ``model`` on the leading ``split`` fraction of ``df``.

    Args:
        model: Object exposing ``fit(data, state_names=...)``.
        df: DataFrame holding all rows; only the first
            ``int(len(df) * split)`` rows are used for fitting.
        split: Fraction of rows (from the top) used as training data.
    """
    cutoff = int(len(df) * split)
    training_rows = df[:cutoff]
    # The states 0-4 of "Kinderzahl" are declared explicitly rather than
    # being left to whatever values occur in the training rows.
    kinderzahl_states = {"Kinderzahl": [0, 1, 2, 3, 4]}
    model.fit(training_rows, state_names=kinderzahl_states)

def evaluate(model, df, split=0.75):
    """Print the accuracy of ``model``'s "Tarif" predictions on the hold-out rows of ``df``.

    The hold-out set is every row from position ``int(len(df) * split)``
    onwards, i.e. exactly the rows that ``train`` skips when it is called with
    the same ``split``.

    Args:
        model: Fitted object exposing ``predict(features)`` that returns a
            DataFrame with a "Tarif" column, one row per input row and in the
            same row order.
        df: DataFrame with the feature columns and the true "Tarif" column.
        split: Fraction of leading rows that was reserved for training.

    Raises:
        ValueError: If the hold-out slice contains no rows.
    """
    # Use the same cut-off expression as train(). A negative offset computed
    # from (split - 1) truncates toward zero, which can skip a row or, when it
    # truncates to 0, select the whole frame including the training rows.
    cutoff = int(len(df) * split)
    test = df.iloc[cutoff:]
    if test.empty:
        raise ValueError("evaluate: no hold-out rows for split={}".format(split))

    # Keyword form: pandas 2.x no longer accepts the axis as a positional argument.
    predicted = model.predict(test.drop(columns="Tarif"))["Tarif"].to_numpy()
    # Compare by position, so the result does not depend on whether predict()
    # keeps the index labels of its input.
    expected = test["Tarif"].to_numpy()

    accuracy = float((predicted == expected).mean())
    print("Predicted Tarif with an accuracy of: {}".format(accuracy))

In [48]:
# Network built with OpenMarkov, written as parent -> list of children.
# Flattening the mapping yields the directed edges grouped by parent, in the
# order the parents and their children are listed here.
children_of = {
    "Altersgruppe": ["Kinderzahl"],
    "Verheiratet": ["Geschlecht", "Ort"],
    "Kinderzahl": ["juengstesKind", "Verheiratet"],
    "juengstesKind": ["aeltestesKind", "Ort"],
    "Bildungsstand": [
        "Beruf",
        "Aktienbesitz",
        "Wohnverhaeltnis",
        "Familieneinkommen",
        "Altersgruppe",
    ],
    "Familieneinkommen": ["Altersgruppe"],
    "Wohnverhaeltnis": ["Tarif", "Ort", "Land"],
    "Tarif": ["Land"],
}
net = [
    (parent, child)
    for parent, children in children_of.items()
    for child in children
]

# Build the model from the edges, fit it on the training part of `data`
# and report the accuracy on the remaining rows.
model = BayesianModel(net)
train(model, data)
evaluate(model, data)
type(model)

Predicted Tarif with an accuracy of: 0.9


pgmpy.models.BayesianModel.BayesianModel

In [46]:
# Disabled alternative: obtain the model from a BIF file via BIFReader.
#model = BIFReader("test").get_model()
# Display the class of the object currently bound to `model`.
type(model)

# Disabled: predict from `data` without its "Tarif" column.
#model.predict(data.drop("Tarif", 1))

# Disabled: accuracy report for `model`.
#evaluate(model, data)


pgmpy.models.BayesianModel.BayesianModel

In [51]:
# Pickle round-trip test
import pickle
# Writing the model is disabled, so the load in the following cell needs an
# already existing "bayesian_model.p" in the working directory.
#pickle.dump(model, open("bayesian_model.p", "wb") )

In [52]:
# NOTE: pickle.load can execute arbitrary code while unpickling; only load
# files that come from a trusted source.
# The context manager closes the file handle as soon as the model is loaded.
with open("bayesian_model.p", "rb") as model_file:
    model_recover = pickle.load(model_file)
type(model_recover)

pgmpy.models.BayesianModel.BayesianModel

In [53]:
# NOTE(review): model_recover is loaded from a pickle file rather than fitted by
# train() in this session, so this notebook does not show which rows it was
# trained on. The hold-out rows of `data` may overlap with them — confirm before
# relying on this accuracy figure.
evaluate(model_recover, data)

Predicted Tarif with an accuracy of: 0.96


In [57]:
# Load the validation records, discard their "Tarif" column and replace the
# child ages and the family income by their bucket labels.
validation_data = (
    pd.read_csv('versicherung_validation.csv', delimiter=';')
    .drop(columns=["Tarif"])
    .assign(
        aeltestesKind=lambda frame: frame['aeltestesKind'].apply(normalizeKindAge),
        juengstesKind=lambda frame: frame['juengstesKind'].apply(normalizeKindAge),
        Familieneinkommen=lambda frame: frame['Familieneinkommen'].apply(normalizeIncome),
    )
)

In [58]:
# Predict from the feature columns only. Dropping a "predicted_tarif" column
# left by an earlier run keeps this cell re-runnable; otherwise that column
# would be handed back to predict() as if it were a feature.
validation_features = validation_data.drop(columns=["predicted_tarif"], errors="ignore")
# Take the "Tarif" predictions by position so the assignment does not depend
# on the index labels of the frame returned by predict().
validation_data["predicted_tarif"] = model_recover.predict(validation_features)["Tarif"].to_numpy()
print(validation_data)

  Altersgruppe Geschlecht Verheiratet  Kinderzahl aeltestesKind juengstesKind  \
0        50-65          m          ja           2           <25           <25   
1        36-49          m          ja           1           <18           <18   

  Bildungsstand      Beruf Familieneinkommen Wohnverhaeltnis        Ort  \
0         Lehre  Industrie            <40000           Miete  laendlich   
1       Studium  Industrie           >100000        Eigentum  laendlich   

     Land Aktienbesitz predicted_tarif  
0  Hessen         nein               B  
1  Hessen           ja               B  


In [60]:
# Run the script programmentwurf.py in a shell. According to the output recorded
# below, it loads the pickled model, predicts "Tarif" for the validation data
# and writes the CPDs to cpd_exp.txt.
!python programmentwurf.py

Searching for bayesan_model.p
 Loading Bayesan Model


Success

Opening valuation data...

Predicting Tarif & appending to new 'prediction tarif' column:


  Altersgruppe Geschlecht Verheiratet  ...    Land Aktienbesitz Predicted Tarif
0        50-65          m          ja  ...  Hessen         nein               B
1        36-49          m          ja  ...  Hessen           ja               B

[2 rows x 14 columns]


Printing out CPDs to cpd_exp.txt
