In [None]:
from sklearn import tree
from matplotlib import pyplot as plt

# Toy training set: ten samples, each described by four binary features.
# Note that the last two rows are identical but carry different labels in Y.
X = [
    [0, 0, 1, 0],
    [0, 1, 0, 1],
    [1, 1, 0, 0],
    [1, 0, 1, 1],
    [0, 0, 0, 1],
    [1, 0, 1, 0],
    [1, 0, 0, 0],
    [0, 1, 0, 0],
    [1, 1, 1, 0],
    [1, 1, 1, 0],
]

# Class label of every training sample, in the same order as the rows of X
# (samples 1 and 2 belong to class 1, sample 3 to class 0, and so on).
Y = [1, 1, 0, 0, 1, 1, 1, 0, 0, 1]

# Create a decision tree with default settings and train it in one chained call
# (fit returns the estimator itself).
clf = tree.DecisionTreeClassifier().fit(X, Y)

# Predict the class of three feature vectors; being the last expression of the
# cell, the resulting array is what the notebook displays.
clf.predict([
    [1, 1, 1, 1],
    [0, 1, 0, 0],
    [1, 1, 0, 1],
])

In [None]:
# Render the rules of the fitted tree `clf` as indented plain text and print them.
text_representation = tree.export_text(decision_tree=clf)
print(text_representation)

In [None]:
# Draw the fitted tree `clf` on a 10x7 inch figure, colouring each node by its
# majority class (filled=True).
fig = plt.figure(figsize=(10, 7))
# Names are passed as lists rather than tuples: lists are accepted by every
# scikit-learn release, while some releases validate these parameters strictly.
# class_names follows the ascending order of the class values (0, then 1).
# The return value is bound to `_` so the notebook does not echo it.
_ = tree.plot_tree(
    clf,
    feature_names=["f1", "f2", "f3", "f4"],
    class_names=["false (0)", "true (1)"],
    filled=True,
)


# Présentation du dataset

Les features sont:
- Race (race_African-American, race_Caucasian)
- Genre (gender_Female, gender_Male)
- Age (age_18-20, age_21-22, age_23-25, age_26-45, age_>45)
- Infractions commises en tant que mineur (juvenile-felonies_=0, juvenile-felonies_>0, juvenile-misdemeanors_=0, juvenile-misdemeanors_>0, juvenile-crimes_=0, juvenile-crimes_>0)
- Antécédents (priors_0, priors_1, priors_2-3, priors_>3)
- Type de crime (charge_degree_Misdemeanor, charge_degree_Felony)
- Récidive

# Réductions

On peut réduire le dataset en supprimant toutes les colonnes redondantes, c'est-à-dire celles dont le nom contient `not` ou `__AND__`.


In [None]:
import csv
import numpy as np
from utils import load_from_csv

# Load the dataset from "./dataset.csv" through the project helper `load_from_csv`.
# NOTE(review): the helper is defined in utils.py, which is not visible here; the
# four returned values are presumably the example matrix, the labels, the feature
# names and a prediction column -- confirm against utils.load_from_csv.
train_examples, original_train_label, features, prediction = load_from_csv("./dataset.csv")

def clean_dataset(examples, features, drop_redundant=False):
    """Optionally remove the redundant feature columns from a dataset.

    A feature is considered redundant when its name contains "__AND__" or
    "not" (plain substring match, so any name containing the letters "not"
    is affected).

    Args:
        examples: 2-D array with one row per example and one column per feature.
        features: 1-D sequence of feature names, aligned with the columns of
            ``examples``.
        drop_redundant: when True, drop the redundant columns. Defaults to
            False, which keeps every column (the behaviour the previously
            hard-coded ``False`` guard produced).

    Returns:
        A tuple ``(examples, features)`` of new arrays without the dropped
        columns / names.
    """
    drop = []
    if drop_redundant:
        drop = [
            i
            for i, feature in enumerate(features)
            if "__AND__" in feature or "not" in feature
        ]
    # Axis 1 removes columns of the example matrix; axis 0 removes the
    # matching entries of the feature-name vector.
    return np.delete(examples, drop, 1), np.delete(features, drop, 0)


def extract_train_data(data, labels, count=0.8, rng=None):
    """Randomly split a dataset into a training part and a held-out part.

    Args:
        data: 2-D numpy array with one row per example.
        labels: 1-D numpy array with one label per row of ``data``.
        count: fraction of the rows assigned to the training part; the number
            of training rows is ``int(n_rows * count)`` (truncated).
        rng: optional seed or ``numpy.random.Generator`` used for the shuffle,
            for reproducible splits. When None (default) the global numpy
            random state is used, exactly as before.

    Returns:
        A tuple ``(train_data, train_labels, other_data, other_labels)``.

    Raises:
        ValueError: if ``data`` and ``labels`` do not have the same length.
    """
    n_samples = data.shape[0]
    if len(labels) != n_samples:
        raise ValueError(
            "data and labels must have the same length, got %d and %d"
            % (n_samples, len(labels))
        )

    n_train = int(n_samples * count)

    # Shuffle row indices rather than the data itself so that examples and
    # labels stay aligned.
    order = np.arange(n_samples)
    if rng is None:
        np.random.shuffle(order)
    else:
        np.random.default_rng(rng).shuffle(order)

    train_idx, other_idx = order[:n_train], order[n_train:]
    return data[train_idx, :], labels[train_idx], data[other_idx, :], labels[other_idx]

# Run the loaded examples and feature names through clean_dataset.
cleaned_examples, cleaned_features = clean_dataset(train_examples, features)

# Random split into a training part and a held-out part.
# NOTE: this rebinds `train_examples`, which until here held the full loaded
# dataset; from now on it only contains the training rows.
train_examples, train_labels, other_examples, other_labels = extract_train_data(cleaned_examples, original_train_label)

# Train one tree per maximum depth and record the misclassification rate
# (a fraction in [0, 1]) on the held-out rows and on the training rows.
depths = range(1, 30)
other_results = []
train_results = []
for d in depths:
    clf = tree.DecisionTreeClassifier(max_depth=d)
    clf = clf.fit(train_examples, train_labels)

    result = clf.predict(other_examples)
    other_results.append(np.mean(result != other_labels))
    result = clf.predict(train_examples)
    train_results.append(np.mean(result != train_labels))

# The title announces a percentage, so the stored fractions are scaled by 100
# for display only; the result lists themselves keep the raw fractions.
plt.plot(depths, 100 * np.asarray(other_results), label="new data error")
plt.plot(depths, 100 * np.asarray(train_results), label="train data error")
plt.xlabel("Profondeur maximale (max_depth)")
plt.ylabel("Erreur (%)")
plt.legend()
plt.title("Pourcentage d'erreur en fonction de la profondeur de l'analyse")
plt.show()

In [None]:
# Train one tree per value of min_samples_leaf and record the
# misclassification rate (a fraction in [0, 1]) on the held-out rows and on
# the training rows.
leaf_sizes = range(1, 100)
other_results = []
train_results = []
for d in leaf_sizes:
    clf = tree.DecisionTreeClassifier(min_samples_leaf=d)
    clf = clf.fit(train_examples, train_labels)

    result = clf.predict(other_examples)
    other_results.append(np.mean(result != other_labels))
    result = clf.predict(train_examples)
    train_results.append(np.mean(result != train_labels))

# The title announces a percentage, so the stored fractions are scaled by 100
# for display only; the result lists themselves keep the raw fractions.
plt.plot(leaf_sizes, 100 * np.asarray(other_results), label="new data error")
plt.plot(leaf_sizes, 100 * np.asarray(train_results), label="train data error")
plt.xlabel("Minimum d'exemples par feuille (min_samples_leaf)")
plt.ylabel("Erreur (%)")
plt.legend()
plt.title("Pourcentage d'erreur en fonction du minimum par feuille")
plt.show()


In [None]:
# Compare the "best" and "random" split strategies. Each error rate is averaged
# over `n_runs` independent random train/held-out splits.
splitters = ["best", "random"]
n_runs = 100
other_results = []
train_results = []
for d in splitters:
    s_o = 0
    s_t = 0
    for i in range(n_runs):
        # NOTE: this rebinds the notebook-level split variables on every
        # repetition; after the cell they hold the last random split drawn.
        train_examples, train_labels, other_examples, other_labels = extract_train_data(cleaned_examples, original_train_label)
        clf = tree.DecisionTreeClassifier(splitter=d)
        clf = clf.fit(train_examples, train_labels)

        result = clf.predict(other_examples)
        s_o += np.mean(result != other_labels)
        result = clf.predict(train_examples)
        s_t += np.mean(result != train_labels)
    other_results.append(s_o / n_runs)
    train_results.append(s_t / n_runs)

# The title announces a percentage, so the stored fractions are scaled by 100
# for display only; the result lists themselves keep the raw fractions.
plt.plot(splitters, 100 * np.asarray(other_results), label="new data error")
plt.plot(splitters, 100 * np.asarray(train_results), label="train data error")
plt.xlabel("splitter")
plt.ylabel("Erreur moyenne (%)")
plt.legend()
plt.title("Pourcentage d'erreur en fonction du choix du split")
plt.show()

In [None]:
# 5-fold cross-validation
# KFold was never imported anywhere in the notebook, so it is imported here to
# keep the cell runnable on a fresh kernel.
from sklearn.model_selection import KFold

# Fold generator with five splits, matching the "5-fold" heading of this cell
# (the previous value of 2 contradicted it).
kf = KFold(n_splits=5)

In [None]:
true_positive = 