In [2]:
import pandas as pd
import numpy as np
from pgmpy.models import BayesianModel
import pgmpy.inference

In [3]:
def normalizeKindAge(age):
    """Bucket a child's age into a coarse category label.

    The sentinel string "n.a." is passed through unchanged. Every other value
    is converted with int() and mapped to ">18" when it is strictly greater
    than 18, otherwise to "<18" (so an age of exactly 18 yields "<18").
    """
    not_available = "n.a."
    if age == not_available:
        return not_available
    return ">18" if int(age) > 18 else "<18"

def normalizeIncome(income):
    """Map an income value to the label of the first bracket it falls below.

    The value is converted with int() and compared against ascending upper
    bounds; the label of the first bound it is strictly below is returned.
    Values of 100000 or more are labelled ">100000".
    """
    amount = int(income)
    brackets = (
        (20000, "<20000"),
        (40000, "<40000"),
        (60000, "<60000"),
        (80000, "<80000"),
        (100000, "<100000"),
    )
    for upper_bound, label in brackets:
        if amount < upper_bound:
            return label
    return ">100000"

In [4]:
# Load the semicolon-separated data set, then replace the two child-age
# columns and the family-income column with their coarse category labels.
data = pd.read_csv('versicherung_x.csv', sep=';')
data = data.assign(
    aeltestesKind=data['aeltestesKind'].apply(normalizeKindAge),
    juengstesKind=data['juengstesKind'].apply(normalizeKindAge),
    Familieneinkommen=data['Familieneinkommen'].apply(normalizeIncome),
)

In [5]:
# Directed edges of the network as (parent, child) pairs. They are written as
# a parent -> children mapping and flattened in insertion order, so the
# resulting list has the same edges in the same order as an explicit listing.
net = [
    (parent, child)
    for parent, children in {
        "Altersgruppe": [
            "Bildungsstand",
            "aeltestesKind",
            "juengstesKind",
            "Wohnverhaeltnis",
            "Tarif",
        ],
        "Geschlecht": ["Tarif"],
        "Verheiratet": ["Beruf", "Kinderzahl"],
        "Kinderzahl": ["Wohnverhaeltnis", "Tarif"],
        "aeltestesKind": ["Wohnverhaeltnis"],
        "juengstesKind": ["Wohnverhaeltnis"],
        "Bildungsstand": ["Beruf", "Aktienbesitz"],
        "Beruf": ["Familieneinkommen"],
        "Familieneinkommen": ["Tarif", "Wohnverhaeltnis"],
        "Wohnverhaeltnis": ["Tarif"],
        "Ort": ["Land", "Wohnverhaeltnis"],
        "Land": ["Tarif"],
        "Aktienbesitz": ["Familieneinkommen"],
    }.items()
    for child in children
]
# Construct the Bayesian network from the (parent, child) edge list in `net`.
model = BayesianModel(net)

In [14]:
# Positional hold-out split: the first `separation` rows are used for
# training, every row after that for testing. Rows are taken in their
# existing order; nothing is shuffled.
separation = 191
train, test = data.iloc[:separation], data.iloc[separation:]
model.fit(train)

In [15]:
# Predict the tariff for the held-out rows. The label column is removed first
# so the model has to infer it. The column is dropped via the `columns=`
# keyword: passing the axis positionally (`drop("Tarif", 1)`) was deprecated
# in pandas 1.3 and is rejected by pandas 2.x.
result = model.predict(test.drop(columns=["Tarif"]))

In [16]:
# Put the true tariff of the held-out rows next to the predicted one and
# display the comparison table.
result["Expected"] = data["Tarif"].iloc[separation:]
result

Unnamed: 0,Tarif,Expected
191,B,B
192,A,B
193,B,B
194,A,A
195,A,B
196,A,A
197,B,B
198,A,B
199,A,B
200,B,B


In [17]:
# Accuracy of the network: 1 for each row whose prediction equals the true
# tariff, 0 otherwise, averaged over the number of test rows.
tests = len(result)
is_correct = result["Expected"] == result["Tarif"]
hits = np.where(is_correct, 1, 0)
sum(hits) / tests

0.6

In [54]:
from sklearn import datasets
from sklearn import metrics
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB, ComplementNB
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [25]:
# Preview the first rows of `data` with the normalized age and income columns.
data.head()

Unnamed: 0,Altersgruppe,Geschlecht,Verheiratet,Kinderzahl,aeltestesKind,juengstesKind,Bildungsstand,Beruf,Familieneinkommen,Wohnverhaeltnis,Ort,Land,Aktienbesitz,Tarif
0,36-49,w,ja,4,<18,<18,Studium,Oeffentlicher Dienst,<100000,Eigentum,laendlich,Bayern,ja,A
1,50-65,m,ja,2,>18,>18,Lehre,Industrie,<40000,Miete,laendlich,Hessen,nein,B
2,36-49,m,ja,1,<18,<18,Studium,Industrie,>100000,Eigentum,laendlich,Bayern,nein,A
3,26-35,w,nein,0,n.a.,n.a.,Studium,Industrie,<100000,Miete,urban,Baden-Wuerttemberg,ja,B
4,66-,m,nein,0,n.a.,n.a.,Studium,Handel,<40000,Miete,urban,Baden-Wuerttemberg,ja,B


In [36]:
# Integer-encode every column (the features and the Tarif label alike) by
# running a LabelEncoder over each column of a copy of `data`; `data` itself
# is left untouched. Then preview the encoded frame.
new_df = data.copy().apply(preprocessing.LabelEncoder().fit_transform)
new_df.head()

Unnamed: 0,Altersgruppe,Geschlecht,Verheiratet,Kinderzahl,aeltestesKind,juengstesKind,Bildungsstand,Beruf,Familieneinkommen,Wohnverhaeltnis,Ort,Land,Aktienbesitz,Tarif
0,2,1,0,4,0,0,3,3,0,0,0,1,0,0
1,3,0,0,2,1,1,2,2,2,1,0,2,1,1
2,2,0,0,1,0,0,3,2,4,0,0,1,1,0
3,1,1,1,0,2,2,3,2,0,1,1,0,0,1
4,4,0,1,0,2,2,3,0,2,1,1,0,0,1


In [48]:
# Split the encoded frame into features (everything except Tarif) and the
# Tarif label, holding out 20% of the rows for testing. The split is random,
# so a fixed `random_state` is passed to make Restart & Run All reproduce the
# same partition (and therefore the same accuracy) every time. Scores that
# were recorded from an unseeded run will not match this split exactly.
X_train, X_test, y_train, y_test = train_test_split(
    new_df.drop(columns=['Tarif']),
    new_df['Tarif'],
    test_size=0.2,
    random_state=42,
)

In [57]:
# Fit a Bernoulli naive Bayes classifier on the label-encoded training split.
# The trailing comment lists the other imported naive Bayes variants that can
# be swapped in here.
# NOTE(review): this rebinds `model`, which earlier in the notebook held the
# BayesianModel network — re-running the earlier cells afterwards would use
# the wrong object.
model = BernoulliNB() #ComplementNB() #MultinomialNB() #GaussianNB()
# NOTE(review): scikit-learn's fit() presumably returns the estimator itself,
# so `predicted` would alias `model` rather than hold predictions — confirm.
predicted = model.fit(X_train, y_train)

In [58]:
# Predict the encoded Tarif label for the held-out feature rows and score the
# predictions against the true labels; the score is the cell's output.
target_pred = model.predict(X_test)
accuracy_score(y_test, target_pred)

0.9024390243902439