# Beispiel Code für ML

In [1]:
#Common Imports
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Entscheidungsbäume

In [None]:
# Generate a random, reproducible 2-D toy data set.
np.random.seed(42)

num_points = 200

mean1 = np.array([5, 5])
# A covariance matrix must be symmetric positive semi-definite.
# [[2, 4], [4, 2]] is not (its eigenvalues are 6 and -2), which makes
# np.random.multivariate_normal emit a RuntimeWarning. Variances of 4 with a
# covariance of 2 give eigenvalues 6 and 2, i.e. a valid covariance matrix.
cov1 = np.array([
    [4, 2],
    [2, 4]
])
points = np.random.multivariate_normal(mean1, cov1, num_points)

# Clean labels: class 1 only where both coordinates are >= 5, class 0 elsewhere.
labels = np.where((points[:, 0] >= 5.0) & (points[:, 1] >= 5.0), 1, 0)

# Label noise: each label is flipped independently with probability 0.2.
noise = np.random.choice([0, 1], size=len(labels), p=[0.8, 0.2])
labels = np.where(noise == 1, 1 - labels, labels)


Nachdem wir den Datensatz (points, labels) haben, müssen wir ihn in einen Trainings- und einen Validierungsteil aufteilen und dann das Modell trainieren:


In [None]:
#Imports:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier, plot_tree

In [None]:
# Split: hold out half of the samples for validation; the fixed seed keeps
# the partition identical between runs.
(train_points, val_points,
 train_labels, val_labels) = train_test_split(points, labels, random_state=42, test_size=0.5)

In [None]:
# Training:
# To limit the depth explicitly: clf = DecisionTreeClassifier(max_depth=6, random_state=42)
# random_state is fixed because scikit-learn permutes the features at every
# split; without a seed, ties between equally good splits can be resolved
# differently from run to run, giving a different tree for the same data.
clf = DecisionTreeClassifier(random_state=42)
clf.fit(train_points, train_labels)


In [2]:
# Validation: score the fitted tree on the held-out points and report the
# depth it grew to.
accuracy = clf.score(val_points, val_labels)
for caption, value in (
    ("Validation Accuracy:", accuracy),
    ("Best Tree Height:", clf.get_depth()),
):
    print(caption, value)

NameError: name 'clf' is not defined

In [None]:
# Show the model's predictions as shaded decision regions behind the data.
def plot_decision_regions(model, title, step=0.01):
    """Scatter the data set and shade the area ``model`` assigns to each class.

    Parameters
    ----------
    model : fitted classifier
        Must provide ``predict`` for an array with two columns.
    title : str
        Title of the figure.
    step : float, optional
        Spacing of the grid on which the predictions are evaluated.

    Returns
    -------
    matplotlib.axes.Axes
        The axes the figure was drawn on.
    """
    fig, ax = plt.subplots(figsize=(8, 8))
    ax.scatter(points[:, 0], points[:, 1], c=labels, cmap="coolwarm", alpha=0.8)
    ax.set_xlabel("size")
    ax.set_ylabel("brightness")
    ax.set_title(title)
    ax.set_xlim([0, 10])
    ax.set_ylim([0, 10])

    # Mesh slightly larger than the visible area so the shading reaches the edges.
    x_min, x_max = -0.1, 10.1
    y_min, y_max = -0.1, 10.1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, step), np.arange(y_min, y_max, step))

    # Predict a label for every mesh point and fold the result back into grid shape.
    Z = model.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)

    # Shade the predicted regions on top of the scatter plot.
    ax.contourf(xx, yy, Z, cmap="coolwarm", alpha=0.3)
    return ax


plot_decision_regions(clf, "Decision tree trained without cross-validation.");

# A figure titled "Decision tree trained with cross-validation." cannot be
# drawn here: the cross-validated model (best_clf) is only created by the grid
# search further down. Once that cell has run, call
#     plot_decision_regions(best_clf, "Decision tree trained with cross-validation.")

In [None]:
# Visualise the structure of the fitted tree itself.
tree_fig, tree_ax = plt.subplots(figsize=(10, 8))
plot_tree(
    clf,
    feature_names=["x", "y"],
    class_names=["0", "1"],
    filled=True,
    ax=tree_ax,
)


### Cross-Validation

In [None]:
# Parameter grid for GridSearchCV.
# max_depth is a constructor parameter of DecisionTreeClassifier; it is the
# setting we want to tune with grid-search cross-validation.
param_grid = {"max_depth": [2, 3, 4, 5, 6, 7, 8, 9, 10]}

# Unfitted template estimator for the search. It gets its own name so that the
# already fitted `clf` from the earlier cells is not replaced by an unfitted
# object (which would break re-running the validation/plot cells above).
# random_state makes the trees built during the search reproducible.
base_tree = DecisionTreeClassifier(random_state=42)

# Grid search with 5-fold cross-validation on the training split only.
grid_search = GridSearchCV(base_tree, param_grid=param_grid, cv=5)
grid_search.fit(train_points, train_labels)

# Best estimator found by the search (a decision tree classifier).
best_clf = grid_search.best_estimator_

# Evaluate the best classifier on the validation set.
accuracy = best_clf.score(val_points, val_labels)
print("Validation Accuracy:", accuracy)
print("Best Tree Height:", best_clf.get_depth())

## Lineare Regression

In [None]:
# Load the superconductors table and separate the target from the features.
df = pd.read_csv("all_superconductors.csv")

y = df['critical_temp']  # target: the column the regression below predicts
X = df.drop(columns=['critical_temp'])  # features: every column except the target

Der Datensatz ist jetzt (X,y). Wir gehen wieder die gleichen Schritte durch:

In [None]:
# Imports:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score


In [None]:
# Split: reserve 20 % of the rows as a test set; the fixed seed keeps the
# partition identical between runs.
(X_train, X_test,
 y_train, y_test) = train_test_split(X, y, random_state=42, test_size=0.2)


In [None]:
# Training: fit a linear regression model on the training split.
# fit() returns the estimator itself, so construction and fitting can be chained.
linreg = LinearRegression().fit(X_train, y_train)
linreg


In [None]:
# Validation: evaluate the fitted model on the held-out test split.
y_pred = linreg.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
# Report both metrics; the MSE is computed above and would otherwise go unused.
print("MSE on test data: {:.3f}".format(mse))
print("R2 on test data: {:.3f}".format(r2))

## Logistische Regression

LogReg funktioniert genauso wie das LinReg-Beispiel, deshalb wird es hier nicht näher erklärt.

In [None]:

# Imports needed by this cell, grouped at the top so its dependencies are visible.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

df = pd.read_csv("magic04.csv")

# Features are all columns except 'class'; 'class' is the label to predict.
X = df.drop(['class'], axis=1)
y = df['class']

# Split: 20 % test data. random_state is fixed, as in the other splits of this
# notebook, so the reported accuracy is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Training:
# For the constructor, it is sometimes good to specify
# the number of iterations used by the training
# algorithm.
logreg = LogisticRegression(max_iter=1000)
logreg.fit(X_train, y_train)

# Validation:
y_pred = logreg.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy of estimator on test data: {:.3f}".format(accuracy))