In [1]:
import pandas as pd
import plotly.express as px
import numpy as np
import matplotlib.pyplot as plt
import plotly.io as pio

from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import Ridge, Lasso
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

In [2]:
wea_geo = pd.read_csv('../Data/Wetterdaten_Custom/wea_geo.csv')
sales = pd.read_csv('../Data/Umsatzdaten/processed_umsatz.csv')

wea_geo['MESS_DATUM'] = pd.to_datetime(wea_geo['MESS_DATUM'])

sales['Datum'] = pd.to_datetime(sales['Datum']).dt.normalize()

# Weather data from 1994 onwards
wea_geo = wea_geo[wea_geo['MESS_DATUM'] >= '1994']

# Sales only from camping pitches
sales_camping = sales[sales['Gastgewerbe'] == 'WZ08-553'] 
sales_camping['year_month'] = sales_camping['Datum'].dt.to_period('M')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sales_camping['year_month'] = sales_camping['Datum'].dt.to_period('M')


In [3]:
# Modell evaluation
def print_evaluation(X_train, X_test, y_train, y_test, y_train_pred, y_test_pred):
    """ Ausgabe von R2-Wert, MSE, MAE und Accuracy für Trainings- und Testset """
    r2_train = r2_score(y_train, y_train_pred)
    mse_train = mean_squared_error(y_train, y_train_pred)
    mae_train = mean_absolute_error(y_train, y_train_pred)

    r2_test = r2_score(y_test, y_test_pred)
    mse_test = mean_squared_error(y_test, y_test_pred)
    mae_test = mean_absolute_error(y_test, y_test_pred)

    print(
        f"{'':6} {'R²':>10} | {'MSE':>14} | {'MAE':>10} |  {'rows':>8} | {'columns':>8}\n"
        f"{'Train':6} {r2_train:10.5f} | {mse_train:14.2f} | {mae_train:10.2f} | {X_train.shape[0]:8}  | {X_train.shape[1]:8}\n"
        f"{'Test':6} {r2_test:10.5f} | {mse_test:14.2f} | {mae_test:10.2f} | {X_test.shape[0]:8}  | {X_test.shape[1]:8}\n"
    )

In [4]:
wea_geo = wea_geo.drop(columns=['QN_3', 'QN_4', 'STATIONS_ID', 'Stations_id', 'Stationshoehe', 'Geogr.Breite', 'Geogr.Laenge', 'von_datum', 'bis_datum', 'Stationsname', 'geometry' ])

# the weather of each day, regardless of the station
wea_days = wea_geo.groupby('MESS_DATUM').mean().reset_index()

# Add shifted by weeks
wea_days['TMK-1-week-before'] = wea_days['TMK'].shift(7)
wea_days['TMK-2-weeks-before'] = wea_days['TMK'].shift(14)
wea_days['TMK-3-weeks-before'] = wea_days['TMK'].shift(21)

wea_days['year_month'] = wea_days['MESS_DATUM'].dt.to_period('M')
wea_month = wea_days.groupby('year_month').mean().reset_index()
wea_month = wea_month.drop(columns=['MESS_DATUM'])
wea_month.head(3)

Unnamed: 0,year_month,FX,FM,RSK,RSKF,SDK,SHK_TAG,NM,VPM,PM,TMK,UPM,TXK,TNK,TGK,TMK-1-week-before,TMK-2-weeks-before,TMK-3-weeks-before
0,1994-01,15.732258,5.570565,3.223573,5.426799,1.549853,14.89847,6.425806,6.558313,954.895392,2.615881,84.695409,4.898759,0.347146,0.730645,2.447115,3.00362,3.104615
1,1994-02,11.806696,4.1125,1.403571,4.125,2.711688,22.767857,5.633442,5.087202,960.312755,-0.246429,77.967262,2.71369,-3.108631,-3.246753,-0.022505,1.285623,2.346886
2,1994-03,16.812097,5.874597,3.261559,5.204301,3.475953,23.40176,5.996188,7.550269,958.937327,5.966398,78.451613,9.390054,2.853763,2.84868,4.760753,2.633871,1.164247


In [5]:
# Merge weather data with sales data
sales_camping_wea = pd.merge(sales_camping, wea_month, left_on='year_month', right_on='year_month', how='left')
sales_camping_wea = sales_camping_wea[sales_camping_wea['year_month'] <= '2023-06']

sales_camping_wea['TMK-previous-month'] = sales_camping_wea['TMK'].shift(1)

sales_camping_wea.head(3)

Unnamed: 0,Gastgewerbe,Umsatz,Datum,year_month,FX,FM,RSK,RSKF,SDK,SHK_TAG,...,PM,TMK,UPM,TXK,TNK,TGK,TMK-1-week-before,TMK-2-weeks-before,TMK-3-weeks-before,TMK-previous-month
0,WZ08-553,68.4,1994-01-01,1994-01,15.732258,5.570565,3.223573,5.426799,1.549853,14.89847,...,954.895392,2.615881,84.695409,4.898759,0.347146,0.730645,2.447115,3.00362,3.104615,
1,WZ08-553,52.2,1994-02-01,1994-02,11.806696,4.1125,1.403571,4.125,2.711688,22.767857,...,960.312755,-0.246429,77.967262,2.71369,-3.108631,-3.246753,-0.022505,1.285623,2.346886,2.615881
2,WZ08-553,81.8,1994-03-01,1994-03,16.812097,5.874597,3.261559,5.204301,3.475953,23.40176,...,958.937327,5.966398,78.451613,9.390054,2.853763,2.84868,4.760753,2.633871,1.164247,-0.246429


In [6]:
# Umsatz normieren
durchschnitt_pro_jahr = sales_camping_wea.groupby(sales_camping_wea["Datum"].dt.year)['Umsatz'].mean().reset_index()

durchschnitt_pro_jahr.columns = ['Jahr', 'UmsatzDurchschnitt'] 

sales_camping_wea = sales_camping_wea.merge(durchschnitt_pro_jahr, left_on=sales_camping_wea["Datum"].dt.year, right_on='Jahr', suffixes=('', 'Durchschnitt'))

sales_camping_wea["UmsatzNorm"] = (sales_camping_wea["Umsatz"] / sales_camping_wea["UmsatzDurchschnitt"]) - 1

sales_camping_wea = sales_camping_wea.drop(columns=['UmsatzDurchschnitt', 'Umsatz'])

sales_camping_wea.head(3)

Unnamed: 0,Gastgewerbe,Datum,year_month,FX,FM,RSK,RSKF,SDK,SHK_TAG,NM,...,UPM,TXK,TNK,TGK,TMK-1-week-before,TMK-2-weeks-before,TMK-3-weeks-before,TMK-previous-month,Jahr,UmsatzNorm
0,WZ08-553,1994-01-01,1994-01,15.732258,5.570565,3.223573,5.426799,1.549853,14.89847,6.425806,...,84.695409,4.898759,0.347146,0.730645,2.447115,3.00362,3.104615,,1994,-0.144465
1,WZ08-553,1994-02-01,1994-02,11.806696,4.1125,1.403571,4.125,2.711688,22.767857,5.633442,...,77.967262,2.71369,-3.108631,-3.246753,-0.022505,1.285623,2.346886,2.615881,1994,-0.347092
2,WZ08-553,1994-03-01,1994-03,16.812097,5.874597,3.261559,5.204301,3.475953,23.40176,5.996188,...,78.451613,9.390054,2.853763,2.84868,4.760753,2.633871,1.164247,-0.246429,1994,0.023139


In [7]:
# Jahr und Monat jeweils als integer
sales_camping_wea['Monat'] = sales_camping_wea['Datum'].dt.month.astype(int)

sales_camping_wea['Jahr'] = sales_camping_wea['Jahr'].astype(int)

sales_camping_wea = sales_camping_wea.drop(columns=['Gastgewerbe', 'year_month', 'Datum'])

sales_camping_wea.fillna(sales_camping_wea.mean(), inplace=True)

## Modeltraining

In [8]:
X = sales_camping_wea.copy()

y = X.pop('UmsatzNorm')

features_to_scale = ['FX', 'FM', 'RSK', 'RSKF', 'SDK', 'SHK_TAG', 'NM', 'VPM', 'PM', 'TMK', 'UPM', 'TXK', 'TNK', 'TGK', 'TMK-1-week-before', 'TMK-2-weeks-before', 'TMK-3-weeks-before', 'TMK-previous-month']

scaler = StandardScaler()

# Normalize features
X[features_to_scale] = scaler.fit_transform(X[features_to_scale])

split_index = int(len(X) * 0.8)

X_train = X.iloc[:split_index]
X_test = X.iloc[split_index:]

y_train = y.iloc[:split_index]
y_test = y.iloc[split_index:]

In [9]:
ridge_pipe = Pipeline([
    ("ridge", Ridge())
])

param_grid_ridge = {
    "ridge__fit_intercept": [False, True],  # Set fit_intercept to False for sparse data
    "ridge__alpha": [0.1, 1.0, 10.0],
}

gs_ridge = GridSearchCV(ridge_pipe, param_grid_ridge, cv=3, error_score='raise', verbose=True)
gs_ridge.fit(X_train, y_train)

Fitting 3 folds for each of 6 candidates, totalling 18 fits


In [10]:
# Beste Parameter
best_params = gs_ridge.best_params_
print("Beste Parameter:", best_params)

Beste Parameter: {'ridge__alpha': 0.1, 'ridge__fit_intercept': False}


In [11]:
# Vorhersagen für Train- und Testdaten 
y_test_pred_ridge = gs_ridge.predict(X_test)
y_train_pred_ridge = gs_ridge.predict(X_train)

# Evaluation
print("Evaluation Ridge:")
print_evaluation(X_train, X_test, y_train, y_test, y_train_pred_ridge, y_test_pred_ridge)

Evaluation Ridge:
               R² |            MSE |        MAE |      rows |  columns
Train     0.75372 |           0.04 |       0.16 |      283  |       20
Test      0.76455 |           0.05 |       0.18 |       71  |       20



In [12]:
# Beste Ridge-Regression-Instanz aus der GridSearchCV
best_ridge_model = gs_ridge.best_estimator_['ridge']

# Extrahiere die Koeffizienten
ridge_coefficients = best_ridge_model.coef_

# Füge das Datum zur Liste der Features hinzu
features = ['Jahr'] + ['Monat'] + features_to_scale

# DataFrame für die Koeffizienten erstellen
ridge_coefficients_df = pd.DataFrame({
    'Feature': features,
    'Coefficient': ridge_coefficients
})

# Sortiere das DataFrame nach Koeffizientenbetrag absteigend
ridge_coefficients_df = ridge_coefficients_df.reindex(ridge_coefficients_df['Coefficient'].abs().sort_values(ascending=False).index)

# Visualisierung der Koeffizienten
fig = px.histogram(ridge_coefficients_df, x='Feature', y='Coefficient', labels={'Coefficient': 'Feature Importance'}, title='Feature Importance (Values > 0.02)')
fig.show()