In [1]:
import random
import pickle

import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [2]:
# Notebook-wide switches.
NO_DUPLICATE = False  # True -> exact duplicate rows are dropped before modelling
SAVE_MODEL = False    # True -> the fitted model is pickled in the last cell

# One seed for Python's `random` module; the same value is passed to the train/test split.
SEED = 11_485_672
random.seed(a=SEED)

In [3]:
# Load the satisfaction data and discard the `id` column in one step.
# All zufriedenheit_* columns are kept as features.
df = pd.read_csv('../resources/satisfaction_data.csv').drop(columns=['id'])

In [4]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,verein,ist_angekommen,zufriedenheit_1,zufriedenheit_2,zufriedenheit_3,zufriedenheit_4,zufriedenheit_5,action,y
0,Club B,False,-1,-1,-1,100,91,DRIVING,91
1,Club A,False,-1,-1,-1,100,97,DRIVING,97
2,Club B,False,-1,-1,-1,100,91,DRIVING,91
3,Club B,False,-1,-1,-1,100,91,DRIVING,91
4,Neutral,False,-1,-1,-1,100,94,DRIVING,94


In [5]:
# Report how many rows were loaded and which columns are present (target `y` included).
row_count = len(df.index)
column_names = df.columns.tolist()
print(f'Datenpunkte: {row_count}')
print(f'Features: {column_names}')

Datenpunkte: 25600
Features: ['verein', 'ist_angekommen', 'zufriedenheit_1', 'zufriedenheit_2', 'zufriedenheit_3', 'zufriedenheit_4', 'zufriedenheit_5', 'action', 'y']


In [6]:
# Optional de-duplication: keep the first occurrence of every identical row
# (the pandas default) and report how many rows remain.
if NO_DUPLICATE:
    df = df.drop_duplicates()
    print(f'Datenpunkte: {len(df.index)}')

In [7]:
# Encode the boolean arrival flag as an integer: True -> 1, False -> 0.
# Values that are not keys of the mapping become NaN (standard `Series.map` behaviour).
arrival_codes = {False: 0, True: 1}
df['ist_angekommen'] = df['ist_angekommen'].map(arrival_codes)

In [8]:
# Separate the regression target `y` from the remaining feature columns.
# NOTE(review): in the rows shown by df.head(), zufriedenheit_5 equals y —
# confirm this is intended and not target leakage.
target_name = 'y'
y = df[target_name]                  # target variable
X = df.drop(columns=[target_name])   # features

In [9]:
# One-hot encode the categorical columns; every category becomes its own 0/1 integer column.
categorical_columns = ['verein', 'action']
X = pd.get_dummies(data=X, columns=categorical_columns, dtype=int)

In [10]:
# Hold out 20 % of the rows for evaluation; passing SEED makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=SEED,
)

# Show the training features that resulted from the split.
print(X_train)

       ist_angekommen  zufriedenheit_1  zufriedenheit_2  zufriedenheit_3  \
3945                0                0                0                0   
6973                0                0                0                0   
5422                0               -1               -1               -1   
17787               0                0                0                0   
13734               0                0                0                0   
...               ...              ...              ...              ...   
16510               0               -1               -1              100   
5074                0                0                0                0   
12465               1                7                4                1   
12931               0                0                0                0   
12121               0               -1               -1              100   

       zufriedenheit_4  zufriedenheit_5  verein_Club A  verein_Club B  \
3945          

In [11]:
# Fit a linear regression on the training split (`fit` returns the estimator itself),
# then predict the target for the held-out rows.
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

print(y_pred)

[94.92101268 90.42424412  1.34172819 ... -0.36973019  1.34172819
 92.50210727]


In [12]:
# Score the test-set predictions with two error measures:
# mean squared error (MSE) and mean absolute error (MAE).
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)

print(f"Mean Squared Error (MSE): {mse}")
print(f"Mean Absolute Error (MAE): {mae}")

Mean Squared Error (MSE): 1.157002683759315
Mean Absolute Error (MAE): 0.7783838871708418


In [13]:
# Persist the fitted model to disk, but only when SAVE_MODEL is switched on.
if SAVE_MODEL:
    model_path = '../resources/satisfaction_model.pkl'
    with open(model_path, mode='wb') as model_file:
        pickle.dump(model, model_file)