In [1]:
import pandas as pd

# Read the abalone measurements from the CSV file; the first three lines
# of the file are skipped (skiprows=3).
data = pd.read_csv(
    'abalone_DE.csv',
    skiprows=3,
)

In [2]:
# Keep only the rows whose height value is at most 0.5; all other rows are
# dropped (presumably outliers -- TODO confirm the intent of the threshold).
height_ok = data['Höhe [mm]'].le(0.5)
data = data.loc[height_ok]

# Print a summary of the remaining rows, the columns and their dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4175 entries, 0 to 4176
Data columns (total 9 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Geschlecht                  4175 non-null   object 
 1   Länge [mm]                  4175 non-null   float64
 2   Durchmesser [mm]            4175 non-null   float64
 3   Höhe [mm]                   4175 non-null   float64
 4   Gesamtgewicht [g]           4175 non-null   float64
 5   Gewicht geschält [g]        4175 non-null   float64
 6   Gewicht der Eingeweide [g]  4175 non-null   float64
 7   Gewicht der Schale [g]      4175 non-null   float64
 8   Ringe                       4175 non-null   int64  
dtypes: float64(7), int64(1), object(1)
memory usage: 326.2+ KB


In [3]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Split the data into training data and test data (random_state fixed to 42)
data_train, data_test = train_test_split(data, random_state=42)

# Prepare the data for the linear regression model:
# inputs are all columns from 'Länge [mm]' through 'Gewicht der Schale [g]',
# the target is the 'Ringe' column
feature_slice = slice('Länge [mm]', 'Gewicht der Schale [g]')
X_train = data_train.loc[:, feature_slice]
X_test = data_test.loc[:, feature_slice]
y_train, y_test = data_train['Ringe'], data_test['Ringe']

# Choose the linear regression model and train it on the training data
model = LinearRegression().fit(X_train, y_train)

# Validation: R2 score on the training data and on the held-out test data
r2_score_train = model.score(X_train, y_train)
r2_score_test = model.score(X_test, y_test)

print(f'R2-Score Trainingsdaten: {r2_score_train:.2f}')
print(f'R2-Score Testdaten: {r2_score_test:.2f}')

R2-Score Trainingsdaten: 0.53
R2-Score Testdaten: 0.56


In [4]:

from sklearn.preprocessing import PolynomialFeatures

# The raw feature frames and the targets do not depend on the polynomial
# degree, so they are built once here instead of on every loop pass.
features_train = data_train.loc[:, 'Länge [mm]' : 'Gewicht der Schale [g]']
features_test = data_test.loc[:, 'Länge [mm]' : 'Gewicht der Schale [g]']
y_train = data_train['Ringe']
y_test = data_test['Ringe']

# Fit and score one polynomial regression per degree to compare the
# training score with the test score.
for d in [2, 3, 4, 5]:
    polynom_transformator = PolynomialFeatures(degree = d)

    # Prepare the data for the linear regression model: the transformer is
    # fitted on the training features only and then reused, unchanged, for
    # the test features
    X_train = polynom_transformator.fit_transform(features_train)
    X_test = polynom_transformator.transform(features_test)

    # Training (refits the `model` object created in the earlier cell)
    model.fit(X_train, y_train)

    # Validation
    r2_score_train = model.score(X_train, y_train)
    r2_score_test = model.score(X_test, y_test)

    print(f'Grad: {d} ==> R2-Score Trainingsdaten: {r2_score_train:.2f} | R2-Score Testdaten: {r2_score_test:.2f}')

Grad: 2 ==> R2-Score Trainingsdaten: 0.57 | R2-Score Testdaten: 0.59
Grad: 3 ==> R2-Score Trainingsdaten: 0.61 | R2-Score Testdaten: 0.25
Grad: 4 ==> R2-Score Trainingsdaten: 0.67 | R2-Score Testdaten: -3.30
Grad: 5 ==> R2-Score Trainingsdaten: 0.75 | R2-Score Testdaten: -3196.66


In [5]:
# One-hot encode the 'Geschlecht' (sex) column
data_kodiert = pd.get_dummies(data, columns=['Geschlecht'])

# Split again, this time on the encoded frame
data_train, data_test = train_test_split(data_kodiert, random_state=42)

# Every column except 'Ringe' is an input; 'Ringe' is the target
X_train, y_train = data_train.drop(columns='Ringe'), data_train['Ringe']
X_test, y_test = data_test.drop(columns='Ringe'), data_test['Ringe']

# Training including the encoded sex columns
model.fit(X_train, y_train)

# Validation on the test data
r2_score_test = model.score(X_test, y_test)
print(f'R2-Score mit Geschlecht: {r2_score_test:.2f}')

R2-Score mit Geschlecht: 0.57
