# Market Basket Analysis

## Features
- plotly
- dash
- pandas

## Data
    source = instacart-market-basket-analysis
    author = jeremy stanley, Meg Risdal, sharathrao, Will Cukierski
    title = Instacart Market Basket Analysis
    publisher = Kaggle
    year = 2017
    url = https://kaggle.com/competitions/instacart-market-basket-analysis

In [1]:
# Import libraries
import os
import pandas as pd
import dash
from dash import dcc
from dash import html
import plotly.express as px
import plotly.graph_objs as go
from wordcloud import WordCloud
from io import BytesIO
import base64
from PIL import Image
from mlxtend.frequent_patterns import apriori, association_rules
from mlxtend.preprocessing import TransactionEncoder
import dash_cytoscape as cyto
from itertools import combinations

In [2]:
# set path for data files
# The data directory can be overridden with the MARKET_BASKET_DATA_DIR
# environment variable so the notebook also runs on machines where the
# original author's folder does not exist; when the variable is unset the
# previous hard-coded location is used, so existing behaviour is unchanged.
path = os.environ.get(
    "MARKET_BASKET_DATA_DIR",
    "C:\\Users\\Min Dator\\NodBootcamp\\BC#3\\Projects\\3. MarketBasketAnalysis\\data",
)
# All CSV files below are read relative to this working directory.
os.chdir(path)

In [3]:
# Load the lookup tables and the pre-built basket sample from the current
# working directory (set by the path cell above).
departments, aisles, products = map(
    pd.read_csv, ('departments.csv', 'aisles.csv', 'products.csv')
)
# low_memory=False turns off pandas' chunked parsing for this file
orders = pd.read_csv('orders.csv', low_memory=False)
market_basket_sample = pd.read_csv('market_basket_sample.csv')
# Trailing expression: rendered as the cell's output
market_basket_sample

Unnamed: 0,order_id,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,department,aisle
0,2,202279,3,5,9,8.0,33120,1,1,Organic Egg Whites,dairy eggs,eggs
1,2,202279,3,5,9,8.0,28985,2,1,Michigan Organic Kale,produce,fresh vegetables
2,2,202279,3,5,9,8.0,9327,3,0,Garlic Powder,pantry,spices seasonings
3,2,202279,3,5,9,8.0,45918,4,1,Coconut Butter,pantry,oils vinegars
4,2,202279,3,5,9,8.0,30035,5,0,Natural Sweetener,pantry,baking ingredients
...,...,...,...,...,...,...,...,...,...,...,...,...
3238,340,197300,10,6,9,16.0,1215,17,1,Kidz All Natural Baked Chicken Nuggets,frozen,frozen appetizers sides
3239,340,197300,10,6,9,16.0,37029,18,0,Cream Cheese Spread,dairy eggs,other creams cheeses
3240,340,197300,10,6,9,16.0,49683,19,1,Cucumber Kirby,produce,fresh vegetables
3241,341,163059,6,1,10,19.0,26165,1,1,Electrolyte Enhanced Water,beverages,water seltzer sparkling water


## Most Popular or Frequently Ordered Products

In [5]:
# Get product frequencies (product name -> number of rows it appears in)
product_freq = market_basket_sample['product_name'].value_counts().to_dict()

# Generate word cloud; each word is scaled by its frequency
wc = WordCloud(width=400, height=400, background_color='white').generate_from_frequencies(product_freq)
# WordCloud.to_image() already returns a PIL image, so there is no need to
# create a blank canvas and paste the cloud onto it.
img = wc.to_image()

# Convert to Base64 for displaying in Dash
# (base64.b64encode never inserts newlines, so no post-processing is needed)
buffer = BytesIO()
img.save(buffer, format='PNG')
image_base64 = base64.b64encode(buffer.getvalue()).decode('utf-8')

# Initialize the Dash app
app = dash.Dash(__name__)

# The PNG is embedded as a data URI so no asset file has to be served
app.layout = html.Div([
    html.Img(src=f'data:image/png;base64,{image_base64}', style={'width':'50%', 'height':'50%'})
])

if __name__ == '__main__':
    # `mode=` belonged to the separate jupyter-dash package; with dash.Dash the
    # notebook display mode is selected with `jupyter_mode=` on app.run()
    # (run_server is the deprecated alias of run).
    app.run(jupyter_mode='inline')



The get_cmap function was deprecated in Matplotlib 3.7 and will be removed two minor releases later. Use ``matplotlib.colormaps[name]`` or ``matplotlib.colormaps.get_cmap(obj)`` instead.



## Products Frequently Bought Together
Key analysis for strategies like product placement, promotions, and recommendations

In [6]:
# Convert order data to a list of lists (one list of product names per order)
transactions = market_basket_sample.groupby('order_id')['product_name'].apply(list).tolist()

# Encode the transactions into a boolean matrix (rows = orders, columns = products)
encoder = TransactionEncoder()
encoded_data = encoder.fit(transactions).transform(transactions)
df_encoded = pd.DataFrame(encoded_data, columns=encoder.columns_)

# Generate frequent itemsets (itemsets present in at least 1% of the orders)
frequent_itemsets = apriori(df_encoded, min_support=0.01, use_colnames=True)

# Generate association rules, keeping rules whose lift is at least 1
rules = association_rules(frequent_itemsets, metric='lift', min_threshold=1)

# Visualization
app = dash.Dash(__name__)


def itemset_label(itemset):
    """Return a stable display name for an itemset: its products sorted and comma-joined.

    Antecedents/consequents are frozensets that may hold several products.
    Taking only the first element would drop the others and pick an arbitrary
    one, so the whole itemset is used as the node identity instead.
    """
    return ', '.join(sorted(itemset))


# Prepare data for the graph
# Nodes are keyed by id so every itemset appears exactly once; Cytoscape
# expects element ids to be unique.
nodes_by_id = {}
edges = []

# Create one node per distinct antecedent/consequent itemset and one edge per rule
for antecedents, consequents in zip(rules['antecedents'], rules['consequents']):
    antecedent = itemset_label(antecedents)
    consequent = itemset_label(consequents)

    for node_id in (antecedent, consequent):
        nodes_by_id.setdefault(node_id, {"data": {"id": node_id, "label": node_id}})
    edges.append({"data": {"source": antecedent, "target": consequent}})

nodes = list(nodes_by_id.values())
elements = nodes + edges

# Define layout for the app
app.layout = html.Div([
    cyto.Cytoscape(
        id='cytoscape',
        layout={'name': 'circle'},
        style={'width': '100%', 'height': '400px'},
        elements=elements
    )
])

if __name__ == '__main__':
    # `mode=` belonged to the separate jupyter-dash package; with dash.Dash the
    # notebook display mode is selected with `jupyter_mode=` on app.run()
    # (run_server is the deprecated alias of run).
    app.run(jupyter_mode='inline')
    

## Product Hierarchies Based on Order Frequency

In [12]:
# Get order frequencies: number of sample rows per (department, aisle, product)
order_frequencies = (market_basket_sample.groupby(['department', 'aisle', 'product_name'])
                     .size()
                     .reset_index(name='order_frequency'))

# Create the sunburst chart; rings go department -> aisle -> product and each
# wedge is sized by its order frequency
fig = px.sunburst(order_frequencies,
                  path=['department', 'aisle', 'product_name'],
                  values='order_frequency',
                  title='Product Hierarchies Based on Order Frequency')

app = dash.Dash(__name__)

app.layout = html.Div([
    dcc.Graph(figure=fig)
])

if __name__ == '__main__':
    # `mode=` belonged to the separate jupyter-dash package; with dash.Dash the
    # notebook display mode is selected with `jupyter_mode=` on app.run()
    # (run_server is the deprecated alias of run).
    app.run(jupyter_mode='inline')


In [6]:
def build_order_pairs(column):
    """Return one row per order with the pairs of distinct `column` values it contains.

    The result has the columns 'order_id', `column` (list of raw values in the
    order) and f'{column}_pairs' (list of 2-tuples).

    Values are de-duplicated and sorted before pairing. This removes self-pairs
    such as ('eggs', 'eggs') and guarantees that a pair is always written in
    the same orientation, so the resulting Sankey links never form cycles and
    (a, b) / (b, a) are counted together. Missing values are ignored.
    """
    per_order = market_basket_sample.groupby('order_id')[column].apply(list).reset_index()
    per_order[f'{column}_pairs'] = per_order[column].apply(
        lambda values: list(combinations(sorted({v for v in values if pd.notna(v)}), 2))
    )
    return per_order


# Create department and aisle pairs for each order
department_pairs = build_order_pairs('department')
# The diagram below is drawn at aisle level; this frame was previously never
# created in the notebook, which made the cell fail on a fresh kernel.
aisle_pairs = build_order_pairs('aisle')

# Flatten so that each element corresponds to one pair
# (for a department-level diagram use department_pairs['department_pairs']
# here and departments['department'] for the labels below)
all_pairs = [pair for pairs in aisle_pairs['aisle_pairs'] for pair in pairs]

# Count frequency of each pair
pairs_df = pd.DataFrame(all_pairs, columns=['source', 'target'])
pair_counts = pairs_df.groupby(['source', 'target']).size().reset_index(name='value')

# Filter to only get the most popular flows (this number can be adjusted based on your requirements)
threshold = pair_counts['value'].quantile(0.95)
filtered_pairs = pair_counts[pair_counts['value'] > threshold]

# Create Sankey diagram
# Sankey links refer to nodes by their position in `labels`; a dict lookup
# avoids a linear list search for every link.
labels = list(aisles['aisle'])
label_index = {label: position for position, label in enumerate(labels)}
source_indices = filtered_pairs['source'].map(label_index)
target_indices = filtered_pairs['target'].map(label_index)

fig = go.Figure(go.Sankey(
    node=dict(pad=15, thickness=20, line=dict(color="black", width=0.5), label=labels),
    link=dict(source=source_indices, target=target_indices, value=filtered_pairs['value'])
))

app = dash.Dash(__name__)

app.layout = html.Div([
    dcc.Graph(figure=fig)
])

if __name__ == '__main__':
    # `mode=` belonged to the separate jupyter-dash package; with dash.Dash the
    # notebook display mode is selected with `jupyter_mode=` on app.run()
    # (run_server is the deprecated alias of run).
    app.run(jupyter_mode='inline')

In [13]:
# Calculate reorder ratio for products
# order_count: number of sample rows per product
product_counts = market_basket_sample['product_name'].value_counts().reset_index()
product_counts.columns = ['product_name', 'order_count']

# reorder_count: number of those rows flagged as a reorder
reorders = market_basket_sample[market_basket_sample['reordered'] == 1]['product_name'].value_counts().reset_index()
reorders.columns = ['product_name', 'reorder_count']

# Left merge so products that were never reordered are kept with a count of 0
# (an inner merge would silently drop them from the whole diagram).
product_reorders = product_counts.merge(reorders, on='product_name', how='left')
product_reorders['reorder_count'] = product_reorders['reorder_count'].fillna(0).astype(int)
product_reorders['reorder_ratio'] = product_reorders['reorder_count'] / product_reorders['order_count']
product_reorders = product_reorders.merge(products, on='product_name')

# Categorize reorder_ratio
# include_lowest=True makes the first bin closed on the left, so a ratio of
# exactly 0 falls into '0-25%' instead of becoming NaN.
bins = [0, 0.25, 0.5, 0.75, 1]
labels = ['0-25%', '25-50%', '50-75%', '75-100%']
product_reorders['reorder_ratio_category'] = pd.cut(
    product_reorders['reorder_ratio'], bins=bins, labels=labels, include_lowest=True
)

# Attach each product's ratio category to every row of the sample
data = market_basket_sample.merge(product_reorders[['product_name', 'reorder_ratio_category']], on='product_name')

# Create the Parallel Categories Diagram
fig = px.parallel_categories(data, dimensions=['department', 'reorder_ratio_category', 'order_dow', 'order_hour_of_day'],
                             color="order_dow",
                             labels={'order_dow':'Day of Week', 'order_hour_of_day':'Hour of Day'},
                             title='Parallel Categories Diagram for Instacart Orders')

app = dash.Dash(__name__)

app.layout = html.Div([
    dcc.Graph(figure=fig)
])

if __name__ == '__main__':
    # `mode=` belonged to the separate jupyter-dash package; with dash.Dash the
    # notebook display mode is selected with `jupyter_mode=` on app.run()
    # (run_server is the deprecated alias of run).
    app.run(jupyter_mode='inline')



iteritems is deprecated and will be removed in a future version. Use .items instead.



In [4]:
# Calculate order count and reorder frequency per product
# order_count counts the sample rows of a product; reorder_count sums its
# `reordered` flags.
product_metrics = market_basket_sample.groupby('product_id').agg(
    order_count=('order_id', 'count'),
    reorder_count=('reordered', 'sum')
).reset_index()

# Bring in the readable product name used as the point label
product_metrics = product_metrics.merge(products[['product_id', 'product_name']], on='product_id')

# Create 3D Scatter Plot
# The title now names the three axes that are actually plotted (the previous
# wording mentioned a "Product Frequency" axis that does not exist).
fig = px.scatter_3d(product_metrics, x='order_count', y='reorder_count', z='product_id',
                    text='product_name', opacity=0.7,
                    labels={'order_count': 'Order Count', 'reorder_count': 'Reorder Frequency', 'product_id': 'Product ID'},
                    title='3D Scatter Plot of Order Count, Reorder Frequency, and Product ID')

# Making the background white for better clarity
fig.update_layout(scene=dict(bgcolor='white'))

app = dash.Dash(__name__)

app.layout = html.Div([
    dcc.Graph(figure=fig)
])

if __name__ == '__main__':
    # `mode=` belonged to the separate jupyter-dash package; with dash.Dash the
    # notebook display mode is selected with `jupyter_mode=` on app.run()
    # (run_server is the deprecated alias of run).
    app.run(jupyter_mode='inline')


In [5]:

# Getting the top N products
N = 20
top_products = market_basket_sample['product_name'].value_counts().head(N).index.tolist()
filtered_data = market_basket_sample[market_basket_sample['product_name'].isin(top_products)]

# Count how often each top product was ordered per (day of week, hour).
# The marker size must reflect popularity; it was previously bound to
# `order_id`, which is an identifier and carries no quantity.
popularity = (filtered_data
              .groupby(['order_dow', 'order_hour_of_day', 'product_name'])
              .size()
              .reset_index(name='order_count'))

# Animated scatter plot (one frame per day of week)
# The product order and the hour range are fixed so the axes stay put while
# the animation moves between frames.
fig3 = px.scatter(popularity,
                 x='order_hour_of_day',
                 y='product_name',
                 animation_frame='order_dow',
                 size='order_count',
                 category_orders={"order_dow": [0,1,2,3,4,5,6], "product_name": top_products},
                 range_x=[-1, 24],
                 title="Popularity of Top N Products Over the Week",
                 size_max=40)

# Dash application
app = dash.Dash(__name__)

app.layout = html.Div([
    dcc.Graph(figure=fig3)
])

if __name__ == '__main__':
    # `mode=` belonged to the separate jupyter-dash package; with dash.Dash the
    # notebook display mode is selected with `jupyter_mode=` on app.run()
    # (run_server is the deprecated alias of run).
    app.run(jupyter_mode='inline')
