# Overfitting

In [1]:
import pandas as pd 
from sklearn.datasets import make_moons

# Generate synthetic data: two noisy, interleaving half-moons that are
# rescaled so the two coordinates read like a mileage and a price.
X_array, y_array = make_moons(n_samples=2000, noise=0.3, random_state=0)

daten = pd.DataFrame(
    {
        'Kilometerstand [km]': (X_array[:, 0] + 2) * 10000,
        'Preis [EUR]': (X_array[:, 1] + 2) * 5000,
        'verkauft': y_array,  # binary class label from make_moons
    }
)

# Feature matrix (two columns) and target vector as plain NumPy arrays.
X = daten[['Kilometerstand [km]', 'Preis [EUR]']].to_numpy()
y = daten['verkauft'].to_numpy()

In [2]:
from sklearn.model_selection import train_test_split

# Hold out half of the samples for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.5,
    random_state=0,
)


In [3]:
import xgboost as xgb

# Fit a boosted-tree classifier with 200 estimators on the training half.
modell = xgb.XGBClassifier(n_estimators=200)
modell.fit(X_train, y_train)

# Score the fitted model on the data it was trained on and on the held-out
# data; a large gap between the two numbers indicates overfitting.
score_train = modell.score(X_train, y_train)
score_test = modell.score(X_test, y_test)

print(f'Score bezogen auf Trainingsdaten: {score_train:.2f}')
print(f'Score bezogen auf Testdaten: {score_test:.2f}')

Score bezogen auf Trainingsdaten: 1.00
Score bezogen auf Testdaten: 0.91


In [4]:
# Display XGBoost's current global configuration (e.g. the verbosity level).
xgb.get_config()

{'use_rmm': False, 'verbosity': 1}

In [5]:
# Datasets to evaluate after every boosting round. Order matters: the first
# entry is reported as 'validation_0' (training data), the second as
# 'validation_1' (test data).
auswertungsdaten = [(X_train, y_train), (X_test, y_test)]

# Same model as before, but additionally tracking the classification error
# and the log-loss on both datasets during training.
modell = xgb.XGBClassifier(
    n_estimators=200,
    eval_metric=['error', 'logloss'],
)
modell.fit(X_train, y_train, eval_set=auswertungsdaten)



[0]	validation_0-error:0.07100	validation_0-logloss:0.50184	validation_1-error:0.09000	validation_1-logloss:0.50700
[1]	validation_0-error:0.07000	validation_0-logloss:0.39395	validation_1-error:0.09200	validation_1-logloss:0.40302
[2]	validation_0-error:0.06700	validation_0-logloss:0.32546	validation_1-error:0.09000	validation_1-logloss:0.33853
[3]	validation_0-error:0.06600	validation_0-logloss:0.27612	validation_1-error:0.08400	validation_1-logloss:0.29657
[4]	validation_0-error:0.06500	validation_0-logloss:0.24326	validation_1-error:0.08300	validation_1-logloss:0.26750
[5]	validation_0-error:0.06300	validation_0-logloss:0.21908	validation_1-error:0.08800	validation_1-logloss:0.24820
[6]	validation_0-error:0.06700	validation_0-logloss:0.19809	validation_1-error:0.08200	validation_1-logloss:0.23518
[7]	validation_0-error:0.06300	validation_0-logloss:0.18358	validation_1-error:0.08100	validation_1-logloss:0.22703
[8]	validation_0-error:0.06500	validation_0-logloss:0.17340	validation_1

In [6]:
# Metrics recorded per boosting round during fit, keyed first by dataset
# ('validation_0' = training data, 'validation_1' = test data, following the
# order of eval_set) and then by metric name.
results = modell.evals_result()

# Classification error rate per iteration.
fehler = pd.DataFrame({
    'Fehler Trainingsdaten': results['validation_0']['error'],
    'Fehler Testdaten': results['validation_1']['error'],
})

# Log-loss (the cost function) per iteration. The columns are named after the
# cost function rather than 'Fehler' so that the plot legend does not label
# log-loss values as error rates.
kostenfunktion = pd.DataFrame({
    'Kostenfunktion Trainingsdaten': results['validation_0']['logloss'],
    'Kostenfunktion Testdaten': results['validation_1']['logloss'],
})

In [7]:
import plotly.express as px 

# Error rate over the boosting iterations, one series per column of `fehler`.
fig = px.scatter(
    fehler,
    labels={'index': 'Iteration', 'value': 'Fehler', 'variable': 'Legende'},
    title='Fehler',
)
fig.show()

In [8]:
# Cost function (log-loss) over the boosting iterations, one series per
# column of `kostenfunktion`.
fig = px.scatter(
    kostenfunktion,
    labels={'index': 'Iteration', 'value': 'Kostenfunktion', 'variable': 'Legende'},
    title='Kostenfunktion',
)
fig.show()

In [9]:
from pycaret.classification import *
# Initialise the PyCaret experiment (functional API) on the full data frame;
# the fixed session_id makes the run reproducible.
s = setup(
    data=daten,
    target='verkauft',
    session_id=123,
)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,verkauft
2,Target type,Binary
3,Original data shape,"(2000, 3)"
4,Transformed data shape,"(2000, 3)"
5,Transformed train set shape,"(1400, 3)"
6,Transformed test set shape,"(600, 3)"
7,Numeric features,2
8,Preprocess,True
9,Imputation type,simple


In [10]:
# Object-oriented PyCaret API: import ClassificationExperiment and create an
# experiment instance, as an alternative to the module-level functions.
from pycaret.classification import ClassificationExperiment
exp = ClassificationExperiment()

In [11]:
# Configure the experiment object with the same data, target and seed as the
# functional setup() call.
exp.setup(
    data=daten,
    target='verkauft',
    session_id=123,
)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,verkauft
2,Target type,Binary
3,Original data shape,"(2000, 3)"
4,Transformed data shape,"(2000, 3)"
5,Transformed train set shape,"(1400, 3)"
6,Transformed test set shape,"(600, 3)"
7,Numeric features,2
8,Preprocess,True
9,Imputation type,simple


<pycaret.classification.oop.ClassificationExperiment at 0x178d9cad0>

In [12]:
# Train and cross-validate the available classifiers and show them in a
# ranked comparison table; the returned model is kept in `best`.
best = compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
gbc,Gradient Boosting Classifier,0.9129,0.9713,0.9171,0.9104,0.9135,0.8257,0.8262,0.028
rf,Random Forest Classifier,0.9086,0.9608,0.9071,0.9107,0.9085,0.8171,0.8179,0.037
knn,K Neighbors Classifier,0.9057,0.9554,0.9129,0.9008,0.9064,0.8114,0.8123,0.01
ada,Ada Boost Classifier,0.9029,0.9681,0.9043,0.9029,0.9031,0.8057,0.8066,0.015
xgboost,Extreme Gradient Boosting,0.9021,0.9626,0.9,0.9055,0.9021,0.8043,0.8055,0.011
et,Extra Trees Classifier,0.8957,0.9586,0.8886,0.9035,0.8951,0.7914,0.793,0.026
lightgbm,Light Gradient Boosting Machine,0.8957,0.9649,0.8943,0.8987,0.8956,0.7914,0.793,0.194
dt,Decision Tree Classifier,0.8843,0.8843,0.8771,0.8923,0.8833,0.7686,0.7709,0.004
lr,Logistic Regression,0.8493,0.9338,0.8514,0.85,0.8497,0.6986,0.7002,0.344
qda,Quadratic Discriminant Analysis,0.8464,0.9334,0.8443,0.8502,0.8462,0.6929,0.6946,0.004


In [13]:
# Train an XGBoost classifier through PyCaret; the per-fold cross-validation
# metrics are displayed as a table.
# NOTE(review): this rebinds `xgb`, which earlier refers to the xgboost module
# (`import xgboost as xgb`). After this cell, re-running the cells that call
# `xgb.XGBClassifier` / `xgb.get_config` fails until the import is re-run.
# Consider a distinct name such as `xgb_modell` once all later uses are checked.
xgb = create_model('xgboost')

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8929,0.9443,0.8571,0.9231,0.8889,0.7857,0.7877
1,0.9071,0.9661,0.9571,0.8701,0.9116,0.8143,0.8184
2,0.9,0.9745,0.9143,0.8889,0.9014,0.8,0.8003
3,0.9143,0.9602,0.8714,0.9531,0.9104,0.8286,0.8316
4,0.9286,0.9678,0.9429,0.9167,0.9296,0.8571,0.8575
5,0.9357,0.9873,0.9,0.9692,0.9333,0.8714,0.8737
6,0.9214,0.9838,0.9143,0.9275,0.9209,0.8429,0.8429
7,0.8429,0.9169,0.8571,0.8333,0.8451,0.6857,0.686
8,0.8643,0.9436,0.8714,0.8592,0.8652,0.7286,0.7286
9,0.9143,0.9816,0.9143,0.9143,0.9143,0.8286,0.8286
