In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score


In [None]:
# Load the raw order lines, restricted to the five columns the clustering needs.
# Passing `columns=` lets the parquet reader skip all other columns instead of
# loading the whole file and discarding the rest afterwards.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
df = pd.read_parquet(
    'C:/Users/fabik/OneDrive/Projekte/gaussian_copula_bachelor_thesis/data/raw/orderlines_dezember.parquet',
    columns=['Datum', 'Marktnummer', 'Artikelnummer', 'MengeInKolli', 'Markierung'],
)

## Silhouette Score für Marktcluster

In [None]:
# Build one feature row per market (Marktnummer) as input for the clustering.
# An order is identified by its (Marktnummer, Datum) pair.
df_markt = df.assign(
    order_id=df.groupby(['Marktnummer', 'Datum']).ngroup()
)

df_markt = (
    df_markt
    .groupby('Marktnummer')
    .agg(
        diff_article=('Artikelnummer', 'nunique'),  # number of distinct articles
        avg_kolli=('MengeInKolli', 'mean'),         # mean MengeInKolli over the market's rows
        orders=('order_id', 'nunique'),             # number of distinct orders
    )
    .reset_index()
)

In [None]:
# Show the per-market feature table (Marktnummer, diff_article, avg_kolli, orders).
df_markt

In [None]:
# Standardise the market features and evaluate candidate k values for KMeans
# via the silhouette score.
# The scaling does not depend on k, so it is computed once before the loop.
df_markt_copy = df_markt[['diff_article', 'avg_kolli', 'orders']].copy()

scalar = StandardScaler()
x_scaled = scalar.fit_transform(df_markt_copy)

for k in range(2, 10):

    model = KMeans(n_clusters=k, random_state=42)

    # Fit AND score on the standardised features. Using the raw frame here
    # would let the feature with the largest numeric range dominate the
    # Euclidean distances and would not match the k = 3 analysis, which
    # clusters on scaled data.
    predicted_labels = model.fit_predict(x_scaled)

    silhouette_val = silhouette_score(x_scaled, predicted_labels)

    print(f"Für Cluster: {k}, ergibt sich folgender Score {silhouette_val}")


In [None]:
# Analysis for k = 3: cluster the standardised market features and profile
# the resulting clusters.
labels = df_markt.loc[:, ['diff_article', 'avg_kolli', 'orders']].copy()

scalar = StandardScaler()
x_scaled_k3 = scalar.fit_transform(labels)

# Fitting and then reading `labels_` yields the same assignments as fit_predict.
model = KMeans(n_clusters=3, random_state=42).fit(x_scaled_k3)
predicted_labels_k3 = model.labels_

# Attach the cluster id to every market row.
df_markt['cluster'] = predicted_labels_k3

# Per-cluster profile: number of markets, mean article variety,
# total number of orders and mean of avg_kolli.
df_markt.groupby('cluster').agg(
    count_customer=('Marktnummer', 'count'),
    avg_diff_article=('diff_article', 'mean'),
    sum_orders=('orders', 'sum'),
    avg_kolli=('avg_kolli', 'mean'),
)

In [None]:
# Manual interpretation of the three market clusters (the string below is the
# cell's output and is kept verbatim):
#   cluster_0 = third-party customers (low article variety, high Kolli quantity)
#   cluster_1 = REWE Center (extremely high article variety, small quantity per article)
#   cluster_2 = standard REWE (high article variety, many orders, small Kolli quantities)
# NOTE(review): cluster ids come from the KMeans fit; re-verify this mapping
# whenever the clustering inputs or parameters change.
'''
Kategorisierung:
cluster_0 = Drittkunden (geringe Artikelvielfalt, hohe Kolli Menge)
cluster_1 = REWE Center (extrem hohe Artikelvielfalt, kleine Menge je Artikel)
cluster_2 = Standard REWE (hohe Artikelvielfalt, viele Bestellungen, kleine Kolli Mengen)
'''

## Silhouette Score für Orderscluster

In [None]:
# Build one feature row per order as input for the clustering.
# An order is identified by its (Marktnummer, Datum) pair.
df_order = df.assign(
    order_id=df.groupby(['Marktnummer', 'Datum']).ngroup(),
    # Characters 3-4 of the article number (as string); presumably the
    # assortment code — TODO confirm against the article number schema.
    Sortiment=df['Artikelnummer'].astype(str).str[2:4],
    # Day of week as integer (pandas convention: Monday=0 ... Sunday=6).
    Wochentag=pd.to_datetime(df['Datum']).dt.weekday,
)

df_order = (
    df_order
    .groupby('order_id')
    .agg(
        Wochentag=('Wochentag', 'first'),         # weekday of the order
        MengeInKolli=('MengeInKolli', 'sum'),     # total MengeInKolli of the order
        diff_sortiment=('Sortiment', 'nunique'),  # number of distinct Sortiment codes
    )
    .reset_index()
)

In [None]:
# Show the per-order feature table (order_id, Wochentag, MengeInKolli, diff_sortiment).
df_order

In [None]:
# Standardise the order features and evaluate candidate k values for KMeans
# via the silhouette score.
# The scaling does not depend on k, so it is computed once before the loop.
df_order_copy = df_order[['Wochentag', 'MengeInKolli', 'diff_sortiment']].copy()

scalar = StandardScaler()
x_scaled = scalar.fit_transform(df_order_copy)

for k in range(2, 10):

    model = KMeans(n_clusters=k, random_state=42)

    # Fit AND score on the standardised features. Using the raw frame here
    # would let the feature with the largest numeric range dominate the
    # Euclidean distances, so the scores would not reflect the scaled
    # feature space the analysis is meant to use.
    predicted_labels = model.fit_predict(x_scaled)

    # NOTE(review): silhouette_score is quadratic in the number of rows; if
    # this becomes too slow, its `sample_size` parameter can be used.
    silhouette_val = silhouette_score(x_scaled, predicted_labels)

    print(f"Für Cluster: {k}, ergibt sich folgender Score {silhouette_val}")

In [None]:
# Analysis for k = 2: cluster the standardised order features and profile
# the resulting clusters.
labels = df_order[['Wochentag', 'MengeInKolli', 'diff_sortiment']].copy()

scalar = StandardScaler()
x_scaled_k2 = scalar.fit_transform(labels)

model = KMeans(n_clusters=2, random_state=42)
# Cluster on the standardised features (previously the unscaled frame was
# passed here and x_scaled_k2 was never used), consistent with the k = 3
# market analysis.
# NOTE(review): cluster assignments can differ from the unscaled run, so the
# manual cluster categorisation that follows this cell must be re-checked.
predicted_labels_k2 = model.fit_predict(x_scaled_k2)

# Attach the cluster id to every order row.
df_order['cluster'] = predicted_labels_k2

# Per-cluster profile: number of orders, total MengeInKolli and mean number
# of distinct Sortiment codes per order.
df_order.groupby('cluster').agg({
    'order_id': 'nunique',
    'MengeInKolli': 'sum',
    'diff_sortiment': 'mean'
}).rename(columns={
    'order_id': 'count_orders',
    'diff_sortiment': 'avg_diff_sortiment',
})

In [None]:
# Manual interpretation of the two order clusters (the string below is the
# cell's output and is kept verbatim):
#   cluster_0 = large orders
#   cluster_1 = regular orders
# NOTE(review): cluster ids come from the KMeans fit; re-verify this mapping
# whenever the clustering inputs or parameters change.
'''
Kategorisierung:
cluster_0 = Großbestellungen
cluster_1 = Regelbestellungen
'''