#Set Up

In [0]:
#Import SPARK Libraries
from pyspark.sql.functions import *
import pyspark.sql.functions as F
from pyspark.sql.types import *

#import Python
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np



##Load Data

In [0]:
# Load the clustering input data from the metastore table into a Spark DataFrame
df = spark.table('sales_db.descriptive_clustering_summary_input_data_458')

In [0]:
# Preview the first four rows of the loaded data
df.show(4)

+----+----+----------+-----------+------------------+-------------+------------+--------------------+
|YEAR|WEEK|      DATE|CUSTOMER_ID|       MATERIAL_ID|WEEKLY_DEMAND|CLUSTER_PRED|             COMB_ID|
+----+----+----------+-----------+------------------+-------------+------------+--------------------+
|2022|  44|2022-11-03| 0500264326|000000000000103029|        0.083|         174|0500264326_000000...|
|2022|  49|2022-12-08| 0500264326|000000000000103029|        0.166|         174|0500264326_000000...|
|2022|  17|2022-04-28| 0500264326|000000000000103029|        0.083|         174|0500264326_000000...|
|2022|  41|2022-10-13| 0500264326|000000000000103029|        0.083|         174|0500264326_000000...|
+----+----+----------+-----------+------------------+-------------+------------+--------------------+
only showing top 4 rows



In [0]:
# Workaround used when the cluster terminated half way through:
# keep only the rows whose predicted cluster is below 40
df = df.where(F.col('cluster_pred') < 40)

#Functions

##clean_up  
Description
- Drop the separate material and customer ID columns
- Combine year and week into a single `yearweek` column
- Several dates can fall within one week, so demand is summed to the week level

Parameters
- dataframe loaded from file storage

Returns
- full cleaned dataframe with necessary columns

In [0]:
def clean_up(df):
    """Reduce the raw demand table to one row per comb_id, week and cluster.

    Parameters
    ----------
    df : Spark DataFrame
        Raw data containing (at least) the columns customer_id, material_id,
        year, week, weekly_demand, cluster_pred and comb_id.

    Returns
    -------
    Spark DataFrame
        Columns comb_id, yearweek, cluster and demand, where demand is the
        sum of weekly_demand over all rows sharing comb_id / yearweek / cluster.
    """
    return (
        df
        # the separate id columns are not needed downstream
        .drop('customer_id', 'material_id')
        # "<year>_<week>" replaces the two separate columns
        .withColumn('yearweek', F.concat_ws('_', F.col('year'), F.col('week')))
        .drop('year', 'week')
        # collapse the several dates of one week into a single weekly total
        .groupBy('comb_id', 'yearweek', 'cluster_pred')
        .agg(F.sum('weekly_demand').alias('demand'))
        # shorter column name for the rest of the notebook
        .withColumnRenamed('cluster_pred', 'cluster')
    )

##filter_for_cluster  
  
Parameters
- the full clean dataframe
- the cluster to filter for
    
Returns 
- a pandas dataframe for one cluster where each row is a week and each column is a material-customer id
- df is sorted by yearweek to be ready for plotting

In [0]:
def filter_for_cluster(df, cluster):
    """Build the wide, plot-ready pandas table for a single cluster.

    Parameters
    ----------
    df : Spark DataFrame
        Cleaned data with the columns comb_id, yearweek, cluster and demand.
    cluster : int
        Cluster to keep.

    Returns
    -------
    pandas.DataFrame
        One row per yearweek: a 'yearweek' column followed by one demand
        column per comb_id. Missing (yearweek, comb_id) combinations are
        filled with 0 and rows are in chronological order.
    """
    # F.col instead of the bare `col`: the bare name can be overwritten at
    # notebook scope (e.g. by a `for col in df.columns` loop), F.col cannot.
    cluster_rows = (
        df.filter(F.col('cluster') == cluster)  # filter for the given cluster
        .drop('cluster')                        # constant after the filter
        .toPandas()
    )

    # pivot so that every comb_id in the cluster becomes its own column
    wide = cluster_rows.pivot(index='yearweek', columns='comb_id', values='demand')
    wide = wide.reset_index()  # convert yearweek index to column
    wide = wide.fillna(0)      # convert nulls to 0

    if wide.empty:
        return wide

    # yearweek is "<year>_<week>" text, so a plain string sort puts "2022_10"
    # before "2022_2". Sort on the numeric (year, week) pair instead; parts
    # that cannot be parsed become NaN and are placed last.
    year_week = (
        wide['yearweek'].astype(str)
        .str.split('_', n=1, expand=True)
        .reindex(columns=[0, 1])
        .apply(pd.to_numeric, errors='coerce')
    )
    chronological = year_week.sort_values(by=[0, 1]).index

    return wide.loc[chronological].reset_index(drop=True)

##build_plot  
  
Parameters
- a pandas df filtered for one cluster 
- the cluster we filtered on
    
Returns
- nothing is returned; a plot of demand over time is saved to `/dbfs/FileStore/cluster_images_458/Cluster_<n>.png`

In [0]:
def build_plot(df, cluster):
    """Plot the demand of every id in one cluster over time and save it as a PNG.

    Parameters
    ----------
    df : pandas.DataFrame
        Wide table for a single cluster: the first column is 'yearweek' and
        every other column holds the demand of one id. Rows are plotted in
        the order given.
    cluster : int
        Cluster label, used in the plot title and in the file name.

    Returns
    -------
    None
        The figure is written to
        /dbfs/FileStore/cluster_images_458/Cluster_<cluster>.png and then closed.
    """
    import os  # local import: only needed to make sure the output folder exists

    output_dir = "/dbfs/FileStore/cluster_images_458"
    path = 'Cluster_' + str(cluster)

    # define x-axis labels: only the first and the last week are labelled
    x_start_loc = 0
    x_end_loc = len(df) - 1
    x_start_label = df.iloc[0, 0]
    x_end_label = df.iloc[-1, 0]

    # define title; every column except 'yearweek' is one id
    num_id = len(df.columns) - 1
    title = 'Cluster ' + str(cluster) + ': ' + str(num_id) + ' id\'s'

    # set the style of plot; newer matplotlib releases only ship the seaborn
    # styles under the 'seaborn-v0_8-' prefix, so fall back to that name
    try:
        plt.style.use('seaborn-bright')
    except OSError:
        plt.style.use('seaborn-v0_8-bright')

    fig, ax = plt.subplots(figsize=(15, 5))
    try:
        # one line per id
        for column in df.drop('yearweek', axis=1):
            ax.plot(df['yearweek'], df[column], marker='', linewidth=1, alpha=0.9)

        # add titles
        ax.set_title(title, loc='left', fontsize=12, fontweight=0)
        ax.set_xlabel("year_week", fontsize=10)
        ax.set_ylabel("demand", fontsize=10)

        # adjust x-axis
        ax.set_xticks([x_start_loc, x_end_loc])
        ax.set_xticklabels([x_start_label, x_end_label])

        # save plot to file storage
        os.makedirs(output_dir, exist_ok=True)
        fig.savefig(f"{output_dir}/{path}.png")
    finally:
        # this function is called once per cluster; without closing, every
        # figure stays alive in memory until the kernel is restarted
        plt.close(fig)


#Job

##create_images 
  
Parameters
- raw data from dbfs
    
Returns
- nothing is returned; runs the full job (clean the data, then build and save one plot per cluster)

In [0]:
def create_images(df):
    """Run the full job: clean the raw data and save one demand plot per cluster.

    Parameters
    ----------
    df : Spark DataFrame
        Raw data as accepted by clean_up.

    Returns
    -------
    None
        One image per cluster is written by build_plot.
    """
    # clean the raw data; the cleaned frame is reused once per cluster, so
    # cache it rather than recomputing the aggregation for every filter
    dfc = clean_up(df).cache()
    try:
        # Use the cluster labels that are actually present. Building
        # range(min, max + 1) assumes sequential labels: a missing label gives
        # an empty frame that build_plot cannot handle, and an empty input
        # gives min/max of None.
        cluster_rows = dfc.select('cluster').distinct().collect()
        cluster_list = sorted(
            row['cluster'] for row in cluster_rows if row['cluster'] is not None
        )

        for i in cluster_list:
            # convert to pandas
            dfp = filter_for_cluster(dfc, i)
            # build and save plot
            build_plot(dfp, i)
    finally:
        dfc.unpersist()

In [0]:
%%capture 
# %%capture keeps the code from returning the plots to the console
create_images(df)

In [0]:
def display_plot(_list):
    """Show the saved image of every cluster id in `_list`.

    For each id an <img> tag pointing at
    files/cluster_images_458/Cluster_<id>.png is passed to displayHTML.
    """
    image_tags = (
        f"<img src ='files/cluster_images_458/Cluster_{cluster_id}.png'>"
        for cluster_id in _list
    )
    for tag in image_tags:
        displayHTML(tag)

In [0]:
# Show the saved images for cluster ids start .. end-1 (range() excludes `end`)
start = 0
end = 40

display_plot(list(range(start,end)))

In [0]:
# Small demo frame whose column names contain spaces.
# NOTE: this rebinds `df`, so the data loaded at the top is no longer reachable under that name.
df = spark.createDataFrame([(1, "John Doe"), (2, "Jane Doe")], ["id type", "first name"])

In [0]:
# Show the demo frame with its original column names
df.show()

+-------+----------+
|id type|first name|
+-------+----------+
|      1|  John Doe|
|      2|  Jane Doe|
+-------+----------+



In [0]:
# Replace the spaces in every column name with underscores.
# The loop variable must not be called `col`: at notebook (global) scope that
# would overwrite pyspark.sql.functions.col and break every later col(...) call.
for column_name in df.columns:
    df = df.withColumnRenamed(column_name, column_name.replace(" ", "_"))

In [0]:
# Show the frame after the spaces in the column names were replaced
df.show()

+-------+----------+
|id_type|first_name|
+-------+----------+
|      1|  John Doe|
|      2|  Jane Doe|
+-------+----------+



In [0]:
def fill_with_underscore(df):
    """Return `df` with every space in its column names replaced by an underscore.

    Parameters
    ----------
    df : DataFrame exposing `columns` and `withColumnRenamed`

    Returns
    -------
    The renamed DataFrame.
    """
    # work out all (old, new) pairs from the incoming column names first
    renames = [(name, name.replace(" ", "_")) for name in df.columns]

    renamed = df
    for old_name, new_name in renames:
        renamed = renamed.withColumnRenamed(old_name, new_name)

    return renamed

In [0]:
# Apply the helper; df is rebound to the renamed frame
df = fill_with_underscore(df)

In [0]:
# Show the frame returned by fill_with_underscore
df.show()

+-------+----------+
|id_type|first_name|
+-------+----------+
|      1|  John Doe|
|      2|  Jane Doe|
+-------+----------+

