In [None]:
# Mount Google Drive into the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install mlxtend

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from itertools import combinations
from sklearn.cluster import *
from mlxtend.frequent_patterns import apriori, association_rules
from sklearn.preprocessing import StandardScaler

In [None]:
# Load the cleaned dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('dataset_clean.csv')
# Disabled experiment: derive a `year` column from 'Order Date' and drop the original column.
# df['Order Date'] = pd.to_datetime(df['Order Date'])
# df['year'] = df['Order Date'].dt.year
# df.drop('Order Date', axis=1, inplace=True)  # Drop the original datetime column after extraction
# df

In [None]:
# Discard the zipcode and shipping-date columns and report how many columns were removed.
columns_to_remove = ['Order Zipcode', 'Shipping Date', 'Customer Zipcode']
num_cols_before = len(df.columns)
df = df.drop(columns=columns_to_remove)
num_cols_after = len(df.columns)
num_cols_dropped = num_cols_before - num_cols_after
print("Number of columns dropped:", num_cols_dropped)

In [None]:
# Show the column labels that remain in `df`.
column_names = df.columns
print(column_names)

In [None]:
from pandas.api.types import is_numeric_dtype

# Names of every numeric column, skipping any column whose label contains 'non_numeric'.
excluded_columns = df.filter(like='non_numeric').columns
numeric_columns = [
    name
    for name in df.select_dtypes(include=np.number).columns
    if name not in excluded_columns
]
numeric_columns

  and should_run_async(code)


['Days for shipping (real)',
 'Days for shipment (scheduled)',
 'Sales per customer',
 'Late_delivery_risk',
 'Category Id',
 'Customer Id',
 'Customer Zipcode',
 'Department Id',
 'Latitude',
 'Longitude',
 'Order Customer Id',
 'Order Id',
 'Order Item Cardprod Id',
 'Order Item Discount',
 'Order Item Discount Rate',
 'Order Item Id',
 'Order Item Profit Ratio',
 'Order Item Quantity',
 'Sales',
 'Order Profit Per Order',
 'Product Card Id',
 'Product Category Id',
 'Product Price',
 'Product Status']

In [None]:
def get_column_dtypes(df):
  """Map each column of ``df`` to a dtype name usable as ``pd.read_csv(dtype=...)``.

  Non-numeric columns are mapped to ``'category'``; numeric columns keep the
  name of their current dtype (for example ``'int64'`` or ``'float64'``).

  Args:
    df: DataFrame whose column dtypes are inspected.

  Returns:
    dict mapping each column label to a dtype name string.
  """
  dtype_dict = {}
  for col, dtype in df.dtypes.items():
    if is_numeric_dtype(dtype):
      # str() of a dtype is already its bare name. The earlier
      # .strip("dtype(')") removed a *set of characters* from both ends, which
      # could only damage names beginning or ending with one of those letters.
      dtype_dict[col] = str(dtype)
    else:
      dtype_dict[col] = 'category'
  return dtype_dict


# Column -> dtype-name mapping for `df`; passed to read_csv as `dtype=` in a later cell.
data_types = get_column_dtypes(df)
print(data_types)

In [None]:
# Re-read the CSV with explicit dtypes (non-numeric columns become 'category').
# `usecols` limits the load to the columns kept in `df`: without it the columns
# dropped from `df` come back and take part in the dropna() further down.
data = pd.read_csv(
    'dataset_clean.csv',
    delimiter=',',
    usecols=list(data_types),
    dtype=data_types,
)

numerical_features = numeric_columns

In [None]:
# Preview the first rows of the re-loaded frame.
data.head()

In [None]:
def assign_urgency(shipping_mode):
    """Translate a shipping mode into an urgency label.

    'Same Day' is 'High', 'First Class' and 'Second Class' are 'Medium',
    and every other value is 'Low'.
    """
    high_modes = ('Same Day',)
    medium_modes = ('First Class', 'Second Class')

    if shipping_mode in high_modes:
        return 'High'
    if shipping_mode in medium_modes:
        return 'Medium'
    return 'Low'

# Derive an urgency label for every row from its 'Shipping Mode' value.
data['URGENCY'] = data['Shipping Mode'].apply(assign_urgency)

In [None]:
# Remove rows with missing values.
# dropna() with default arguments removes rows, so the row count (shape[0]) is
# what changes; counting columns here always reported 0.
num_rows_before = data.shape[0]
data = data.dropna()
num_rows_after = data.shape[0]
num_rows_dropped = num_rows_before - num_rows_after
print("Number of rows dropped:", num_rows_dropped)

## Generate combinations of numerical features

In [None]:
# Every unordered pair of numeric feature names, as a list of 2-tuples.
feature_comb = list(combinations(numerical_features, 2))

In [None]:
import os
def make_cluster(algorithm, data, features, figsize, **kwargs):
    """Plot ground truth and cluster labels in two stacked panels and save the figure.

    The upper panel colours the points by 'Category Id'. The lower panel colours
    them by the labels from ``algorithm(**kwargs).fit_predict(data[features])``,
    which are also stored in ``data['Cluster']``. The image is written to
    ``./figures/<AlgorithmName>_<features joined by '_'>.png``.

    NOTE(review): a later cell defines ``make_cluster`` again; once that cell
    runs, it replaces this definition.
    """
    figure, (truth_ax, cluster_ax) = plt.subplots(2, figsize=figsize)
    figure.suptitle('Supply Chain Clustering')

    x_col = features[0]
    y_col = features[1]

    # Upper panel: the points coloured by their category.
    sns.scatterplot(ax=truth_ax, data=data, x=x_col, y=y_col, hue='Category Id', palette='viridis')
    truth_ax.set(title='Ground Truth', xlabel=x_col, ylabel=y_col)

    # Lower panel: the same points coloured by the fitted cluster labels.
    model = algorithm(**kwargs)
    model_name = type(model).__name__
    data['Cluster'] = model.fit_predict(data[features])
    sns.scatterplot(ax=cluster_ax, data=data, x=x_col, y=y_col, hue='Cluster', palette='viridis')
    cluster_ax.set(title=f"Clustering by {model_name}", xlabel=x_col, ylabel=y_col)

    # Make sure the output directory exists, then save under a name built
    # from the algorithm and the feature names.
    output_dir = "./figures"
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    joined_features = '_'.join(features)
    plt.savefig(f"{output_dir}/{model_name}_{joined_features}.png")

In [None]:
import os
import seaborn as sns
import matplotlib.pyplot as plt

def make_cluster(algorithm, data, features, figsize, **kwargs):
    """Cluster ``data[features]``, plot the first two features by cluster, and save the plot.

    Args:
        algorithm: Estimator class; it is instantiated as ``algorithm(**kwargs)``
            and must provide ``fit_predict``.
        data: DataFrame holding the feature columns. The predicted labels are
            written to ``data['Cluster']`` (the frame is modified in place).
        features: Sequence of column names. All of them are used for fitting;
            ``features[0]`` and ``features[1]`` are the x and y axes.
        figsize: Figure size forwarded to ``plt.subplots``.
        **kwargs: Keyword arguments for the estimator constructor.

    The image is written to ``./figures/<AlgorithmName>_<features joined by '_'>.png``.
    """
    fig, ax2 = plt.subplots(figsize=figsize)
    fig.suptitle('Supply Chain Clustering')

    algorithm_instance = algorithm(**kwargs)
    algorithm_name = type(algorithm_instance).__name__
    data['Cluster'] = algorithm_instance.fit_predict(data[features])

    sns.scatterplot(ax=ax2, data=data, x=features[0], y=features[1], hue='Cluster', palette='viridis')
    ax2.set(title=f"Clustering by {algorithm_name}", xlabel=features[0], ylabel=features[1])

    filename = f"{algorithm_name}_{'_'.join(features)}.png"

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs("./figures", exist_ok=True)
    fig.savefig("./figures/" + filename)

    # Close the figure so it is released. plt.clf() only clears the current
    # figure and leaves it open, so figures piled up when this ran in a loop.
    plt.close(fig)



## DBSCAN

In [None]:
# Run DBSCAN on every pair of numeric features.
dbscan_params = {'eps': 3, 'min_samples': 5}
for fc in feature_comb:
    features = list(fc)
    make_cluster(DBSCAN, data, features, (12, 10), **dbscan_params)

## KMeans

In [None]:
# Run KMeans (3 clusters, fixed seed) on every pair of numeric features.
kmeans_params = {'n_clusters': 3, 'random_state': 42}
for fc in feature_comb:
    features = list(fc)
    make_cluster(KMeans, data, features, (12, 10), **kmeans_params)

## Agglomerative Clustering

In [None]:
# Cluster 'Category Id' against each other numeric feature.
# feature_comb repeats the same first element many times; every repeat would
# redo an identical clustering and overwrite the same figure, so the first
# elements are de-duplicated (order preserved). 'Category Id' paired with
# itself is skipped because it would give two identical columns.
anchor_feature = 'Category Id'
partner_features = [
    name
    for name in dict.fromkeys(fc[0] for fc in feature_comb)
    if name != anchor_feature
]

for partner in partner_features:
    features = [anchor_feature, partner]
    # Standardize the two columns and cluster on the standardized values.
    # Previously the scaled array was computed but never passed on, so the
    # clustering ran on the unscaled columns.
    scaler = StandardScaler()
    data_scaled = pd.DataFrame(
        scaler.fit_transform(data[features]),
        columns=features,
        index=data.index,
    )
    make_cluster(AgglomerativeClustering, data_scaled, features, (12, 10), n_clusters=3, metric='euclidean', linkage='ward')
    # make_cluster stores the labels on the frame it receives; mirror them onto
    # `data` so `data['Cluster']` is still updated as before.
    data['Cluster'] = data_scaled['Cluster']

## Apriori

In [None]:
# Build "<column>_<value>" labels for every distinct value of the four
# categorical columns, then report how many labels each column produced.
delivery = [f"Delivery Status_{value}" for value in df['Delivery Status'].unique()]
category_name = [f"Category Name_{value}" for value in df['Category Name'].unique()]
order_region = [f"Order Region_{value}" for value in df['Order Region'].unique()]
order_status = [f"Order Status_{value}" for value in df['Order Status'].unique()]

for labels in (delivery, category_name, order_region, order_status):
  print(len(labels))

In [None]:
from itertools import product

# Build every [delivery, region, status] triple, then append each category label to it.
# The product iterator gets its own name: binding it to `combinations` would
# shadow itertools.combinations, which the feature-pair cell needs when re-run.
status_region_product = product(delivery, order_region, order_status)
list_of_tuples = [list(triple) for triple in status_region_product]

final_combination = [
  triple + [category]
  for triple in list_of_tuples
  for category in category_name
]

# Print a count and a short preview instead of dumping every combination.
print(f"{len(final_combination)} column combinations generated")
for combination in final_combination[:5]:
  print(combination)

In [None]:
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules
# Keep only the four categorical attributes used for association-rule mining.
apriori_columns = ['Category Name', 'Delivery Status', 'Order Region', 'Order Status']
df.drop(df.columns.difference(apriori_columns), axis=1, inplace=True)

# One-hot encode and cast to bool: mlxtend's apriori expects a boolean frame,
# and older pandas versions return 0/1 integers from get_dummies.
df_apriori = pd.get_dummies(df).astype(bool)

results = []
for combination in final_combination:
  columns = combination
  # Frequent itemsets over just the one-hot columns of this combination.
  frequent_itemsets = apriori(df_apriori[columns], min_support=0.05, use_colnames=True)
  if len(frequent_itemsets) < 1:
    print(f"No rules found for {columns}")
    continue

  # Derive rules with lift >= 1 and keep the ten with the highest lift.
  association_rules_df = association_rules(frequent_itemsets, metric="lift", min_threshold=1)
  association_rules_df = association_rules_df.sort_values(by='lift', ascending=False)
  top_patterns = association_rules_df.head(10)
  results.append(top_patterns)

In [None]:
# Write each rule table to its own CSV file.
output_dir = "/content/apriori"
# to_csv does not create missing directories, so create the target first.
os.makedirs(output_dir, exist_ok=True)
# The loop variable is not called `df`: that name would overwrite the
# notebook's main DataFrame with the last rule table.
for i, rules_df in enumerate(results):
  filename = f"{output_dir}/top_patterns_{i}.csv"
  rules_df.to_csv(filename, index=False)