<a href="https://colab.research.google.com/github/gabrielborja/python_data_analysis/blob/main/quality_complaints_dashboard.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import packages

In [1]:
#Importing necessary packages
import numpy as np
import pandas as pd
import random
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
#Add interactivity to graphs
import ipywidgets as widgets
from IPython import display
from ipywidgets import interact, interactive, fixed, interact_manual

In [3]:
#Customizing data visualization styles

#Font-size tiers shared by every figure in this notebook
SMALL_SIZE = 12
MEDIUM_SIZE = 16
BIGGER_SIZE = 20

#Note: the axes title size used to be set twice (SMALL_SIZE, then BIGGER_SIZE);
#only the last assignment took effect, so the effective value is kept here.
plt.rc('font', size=SMALL_SIZE)          # controls default text sizes
plt.rc('axes', titlesize=BIGGER_SIZE)    # fontsize of the axes title
plt.rc('axes', labelsize=MEDIUM_SIZE)    # fontsize of the x and y labels
plt.rc('xtick', labelsize=SMALL_SIZE)    # fontsize of the x tick labels
plt.rc('ytick', labelsize=MEDIUM_SIZE)   # fontsize of the y tick labels
plt.rc('legend', fontsize=SMALL_SIZE)    # legend fontsize
plt.rc('figure', titlesize=BIGGER_SIZE)  # fontsize of the figure title

In [4]:
#Customizing dark theme

from cycler import cycler

#Start from matplotlib's built-in dark style, then override individual settings
plt.style.use(['dark_background'])
plt.rcParams.update({
    'axes.facecolor': (0,0,0,0),
    'figure.facecolor': '#383838', #To match google colab native dark theme
    'legend.frameon': False,
    'image.cmap': 'viridis',
    'legend.facecolor': (0,0,0,0),
    'hist.bins': 50,
    'lines.markersize': 10,
    'axes3d.grid': False,
    'figure.subplot.hspace': 0.3, #default = 0.2
})
#Optional font overrides, currently disabled:
#plt.rcParams["font.family"] = 'sans-serif'
#plt.rcParams['font.sans-serif'] = ['Open Sans', 'Source Sans Pro', 'Noto Sans']

#Pair every colour with a line style so consecutive series differ in both
cycle_colors = ['#e74c3c', '#b86dd6', '#fca821', '#3498db', '#f1c40f', '#ffa847', '#c4ef7a', '#e195e2', '#ced9ed', '#fff29b']
cycle_linestyles = ['-', '--', ':', '-.', '-', '--', ':', '-.', '-', '--']
plt.rcParams['axes.prop_cycle'] = cycler(color=cycle_colors) + cycler(linestyle=cycle_linestyles)

# Upload raw data

In [None]:
#Remove previous versions of the uploaded excel file
!rm reklamer.xlsx

In [5]:
#Uploading file from local drive
#The result is kept in `uploaded`; the next cell looks the content up by file name ('reklamer.xlsx')
from google.colab import files
uploaded = files.upload()

Saving reklamer.xlsx to reklamer.xlsx


In [6]:
#Storing dataset in a Pandas Dataframe
#The uploaded content is wrapped in an in-memory buffer so read_excel can consume it without a file path
#NOTE(review): per the pandas docs, parse_dates=True only attempts to parse the index - confirm this is intended;
#the date columns are converted explicitly in a later cell
import io
df = pd.read_excel(io.BytesIO(uploaded['reklamer.xlsx']), parse_dates=True)

# Data cleaning and formatting



In [None]:
#Dropping unnecessary columns (if needed)
#col_ix =[0, 12, 23, 25, 29, 32, 33]
#df = df.drop(columns=df.columns[col_ix], axis=1)

In [7]:
#Parsing dates from columns 'Production Date' and 'Best Used By Date' (day comes first in the source values)
date_cols = ['Production Date', 'Best Used By Date']
df[date_cols] = df[date_cols].apply(pd.to_datetime, dayfirst=True)

In [8]:
#Getting the dataframe info: column names, non-null counts and dtypes of the raw data
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2654 entries, 0 to 2653
Data columns (total 35 columns):
 #   Column                       Non-Null Count  Dtype         
---  ------                       --------------  -----         
 0   Month Description            2654 non-null   object        
 1   Incident Date                2654 non-null   datetime64[ns]
 2   Reason Type                  2654 non-null   object        
 3   Reason Category              2654 non-null   object        
 4   Reason Sub Category          2654 non-null   object        
 5   Primary Reason               2654 non-null   object        
 6   Sensitive Complaint Flag     2654 non-null   object        
 7   Primary Root Cause           2654 non-null   object        
 8   Sector                       2654 non-null   object        
 9   Sub Sector                   2654 non-null   object        
 10  Segment                      2654 non-null   object        
 11  Sub Segment                  2654 non-null 

## Data consistency and preliminary checks

In [None]:
#Share of lot codes ending in '-1' (values are stringified first, so missing codes count as False)
has_dash_one_suffix = df['Lot code'].map(lambda code: str(code).endswith('-1'))
has_dash_one_suffix.value_counts(normalize=True)

True     0.752072
False    0.247928
Name: Lot code, dtype: float64

In [None]:
#Count how many lot codes have each string length; dropna=False keeps the NaN bucket
#so rows without a usable string length stay visible
df['Lot code'].str.len().value_counts(dropna=False)

12.0    2237
NaN      348
11.0      45
9.0       13
3.0        4
15.0       2
10.0       2
7.0        1
17.0       1
8.0        1
Name: Lot code, dtype: int64

In [None]:
#Same length breakdown as above, expressed as a share of all rows (NaN bucket included)
df['Lot code'].str.len().value_counts(normalize=True, dropna=False)

12.0    0.843255
NaN     0.131123
11.0    0.016579
9.0     0.004898
3.0     0.001507
15.0    0.000754
10.0    0.000754
7.0     0.000377
17.0    0.000377
8.0     0.000377
Name: Lot code, dtype: float64

## Custom functions to clean and format the data

In [None]:
#Keep only the rows whose lot code has the most common length (12 characters).
#Boolean filtering plus reset_index already yields a new dataframe, so no separate copy is needed.
lot_codes = df[df['Lot code'].str.len() == 12].reset_index(drop=True)
lot_codes.shape

(2238, 35)

In [None]:
#Create a function to extract information from the lot code and clean the data from erroneous lot codes

def extract_lot(df):
  """Split the lot code into its components and drop rows with invalid components.

  The components are sliced from the end of 'Lot code':
  Line = [-9:-7], Year = [-7:-6], Week = [-6:-4], Day = [-4:-3],
  Shift = [-3:-2] and Machine = [-1:]. Calendar fields (year, ISO week,
  month name, day of month) are derived from 'Incident Date'.

  Parameters
  ----------
  df : pandas.DataFrame
      Must contain a string 'Lot code' column and a datetime 'Incident Date' column.

  Returns
  -------
  pandas.DataFrame
      A new dataframe with the extra columns, restricted to rows whose
      Shift/Machine are 1-3, Day is 1-7, Week is 01-52 and Year is 8, 9, 0 or 1.
      The original index labels are kept. The input is not modified.

  Raises
  ------
  KeyError
      If 'Lot code' is not a column of ``df``.
  """

  if 'Lot code' not in df.columns:
    raise KeyError("'Lot code' not found in dataframe columns")

  #Split lot code and incident date into subcategories
  code = df['Lot code']
  incident = df['Incident Date']
  lot = df.assign(Line = code.str[-9:-7],
                  Year = code.str[-7:-6],
                  Week = code.str[-6:-4],
                  Day = code.str[-4:-3],
                  Shift = code.str[-3:-2],
                  Machine = code.str[-1:],
                  Incident_year = incident.dt.year,
                  Incident_week = incident.dt.isocalendar().week,
                  Incident_month = incident.dt.month_name(),
                  Incident_day = incident.dt.day)

  #Allowed values for each component
  valid_1_to_3 = ['1', '2', '3'] #==> Allow for only machine or shifts to be 1, 2 or 3
  valid_days = ['1', '2', '3', '4', '5', '6', '7'] #==> Allow for only days from 1 to 7.
  valid_weeks = [f"{week:02d}" for week in range(1, 53)] #==> Allow only from weeks 1 to 52.
  valid_years = ['8', '9', '0', '1'] #==> Allow for only years 2018, 2019, 2020, 2021

  #Exact membership tests: unlike str.contains they evaluate to False (not NaN)
  #for missing lot codes, so the boolean mask is always usable
  keep = (lot['Shift'].isin(valid_1_to_3) &
          lot['Machine'].isin(valid_1_to_3) &
          lot['Day'].isin(valid_days) &
          lot['Week'].isin(valid_weeks) &
          lot['Year'].isin(valid_years))

  return lot[keep]

In [None]:
#Extract the lot code components from the filtered rows and preview the last two rows of the result
lot_df = extract_lot(df=lot_codes)
lot_df.tail(2)

In [None]:
#Testing new values for one computed column from lot code:
#after cleaning, 'Day' should only hold the codes '1' to '7' (dropna=False would expose any missing values)
lot_df['Day'].value_counts(dropna=False)

4    448
3    444
2    433
1    406
5    350
6     93
7     36
Name: Day, dtype: int64

In [None]:
#Create a function to format data extracted from lot code function

def format_lot(df):
  """Convert the raw lot-code components into analysis-ready values.

  Steps performed on a copy of ``df``:
  - 'Year' codes ('8', '9', '0', '1', '2') become full years (2018-2022).
  - 'Lot_date' is built from Year, Week and Day with the '%Y%W%w' format
    (day code '7' is mapped to '0' first, since %w encodes Sunday as 0).
  - 'Lot_turnover_days' is the absolute difference between 'Incident Date'
    and 'Lot_date'; 'Lot_turnover_weeks' is that number of days / 7, rounded to 2 decimals.
  - 'Day' and 'Shift' codes are replaced by their names, 'Week' becomes an integer.

  Parameters
  ----------
  df : pandas.DataFrame
      Must contain 'Year', 'Week', 'Day', 'Shift' (string codes) and a datetime
      'Incident Date' column.

  Returns
  -------
  pandas.DataFrame
      New dataframe with a fresh 0..n-1 index. The input is not modified.

  Raises
  ------
  KeyError
      If any required column is missing.
  """

  #Check every required column (the previous `'Year' and 'Week' and 'Day' in ...` only tested 'Day')
  required = ['Year', 'Week', 'Day', 'Shift', 'Incident Date']
  missing = [col for col in required if col not in df.columns]
  if missing:
    raise KeyError(f"{missing} not found in dataframe columns")

  #Creating relationships to convert data to desired output
  years = {'8': 2018, '9': 2019, '0': 2020, '1': 2021, '2': 2022}
  sundays = {'7': '0'}
  day_names = {'1':'Monday', '2':'Tuesday', '3':'Wednesday', '4':'Thursday', '5':'Friday', '6':'Saturday', '0':'Sunday'}
  shift = {'1': 'Day', '2': 'Afternoon', '3': 'Night'}

  #Transforming year from single digit to full year and re-coding Sunday (assign works on a copy)
  lot_df = df.assign(Year = lambda x: x['Year'].replace(years),
                     Day = lambda x: x['Day'].replace(sundays))

  #Build a YYYYWWD key and parse it into a datetime
  #NOTE(review): %W counts weeks from the first Monday of the year, which is not the ISO week
  #numbering - confirm which convention the plant uses for lot codes
  lot_key = (lot_df['Year'].astype(int)*1000 + lot_df['Week'].astype(int)*10 + lot_df['Day'].astype(int)).astype(str)
  lot_df = lot_df.assign(Lot_date = pd.to_datetime(lot_key, format='%Y%W%w'))

  #Calculating lot turnover from production date to complaint date
  lot_df = lot_df.assign(Lot_turnover_days = lambda x: (x['Incident Date'] - x['Lot_date']).abs())
  lot_df = lot_df.assign(Lot_turnover_weeks = lambda x: (x['Lot_turnover_days'].dt.days/7).round(2))

  #Transforming day and shift from codes to names, and week from string to integer
  lot_df = lot_df.assign(Day = lambda x: x['Day'].replace(day_names),
                         Shift = lambda x: x['Shift'].replace(shift),
                         Week = lambda x: x['Week'].astype(int))

  return lot_df.reset_index(drop=True)

In [None]:
#Transform and format data extracted from lot code information
#NOTE: lot_df is overwritten with its formatted version, so re-running only this cell would pass
#already-formatted data to format_lot; re-run the extract_lot cell first
lot_df = format_lot(df=lot_df)
lot_df.tail(2)

In [None]:
#Exporting to excel into local disk:
#the file is first written to the runtime's working directory, then handed to files.download
from google.colab import files
lot_df.to_excel('rekl_cleaned.xlsx', index=False) #==> Excluding index from file
files.download('rekl_cleaned.xlsx')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

### Consistency checks from function calls

In [None]:
#Summary statistics of the lot turnover (in weeks) for the brand 'JENSEN 5'
lot_df[lot_df['Brand']=='JENSEN 5']['Lot_turnover_weeks'].describe()

In [None]:
#ISO week number of the last row's lot date, to compare against the 'Week' parsed from the lot code
lot_df['Lot_date'][-1:].dt.isocalendar().week

In [None]:
#Column names, non-null counts and dtypes after extraction and formatting
lot_df.info()

In [None]:
#Reason categories recorded for the 'FAT BLOOM' sub category (missing values included)
lot_df[lot_df['Reason Sub Category']=='FAT BLOOM']['Reason Category'].value_counts(dropna=False)

In [None]:
#Number of complaints per primary root cause (missing values included)
lot_df['Primary Root Cause'].value_counts(dropna=False)

In [None]:
#List the columns available after extraction and formatting
lot_df.columns

Index(['Month Description', 'Incident Date', 'Reason Type', 'Reason Category',
       'Reason Sub Category', 'Primary Reason', 'Sensitive Complaint Flag',
       'Primary Root Cause', 'Sector', 'Sub Sector', 'Segment', 'Sub Segment',
       'Product Cluster Description', 'Brand', 'Sub Brand',
       'Product Description', 'UPC EAN Code', 'Kraft Item Code',
       'Plant Location Name', 'Production Date', 'Best Used By Date',
       'Lot code', 'Store Name', 'Store State', 'Consumer City',
       'Consumer State', 'Consumer Country', 'Consumer Area',
       'Consumer Region', 'Component', 'Native Verbatim', 'Incident Number',
       'Contact ID', 'Column1', 'Weeknumber', 'Line', 'Year', 'Week', 'Day',
       'Shift', 'Machine', 'Incident_year', 'Incident_week', 'Incident_month',
       'Incident_day', 'Lot_date', 'Lot_turnover_days', 'Lot_turnover_weeks'],
      dtype='object')

In [None]:
#Number of complaints per segment (missing values included)
lot_df['Segment'].value_counts(dropna=False)

# Exploratory Data Analysis

## Upload cleaned data

In [None]:
#Remove previous versions of the uploaded excel file
!rm rekl_cleaned.xlsx

In [9]:
#Uploading file from local drive
#The result is kept in `uploaded1`; the next cell looks the content up by file name ('rekl_cleaned.xlsx')
from google.colab import files
uploaded1 = files.upload()

Saving rekl_cleaned.xlsx to rekl_cleaned.xlsx


In [10]:
#Storing dataset in a Pandas Dataframe
#This replaces any lot_df built earlier in the session with the content of the uploaded cleaned file
#NOTE(review): per the pandas docs, parse_dates=True only attempts to parse the index - confirm this is intended
import io
lot_df = pd.read_excel(io.BytesIO(uploaded1['rekl_cleaned.xlsx']), parse_dates=True)

In [11]:
#Column names, non-null counts and dtypes of the re-loaded cleaned data
lot_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2210 entries, 0 to 2209
Data columns (total 40 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   Incident Date             2210 non-null   datetime64[ns]
 1   Reason Category           2210 non-null   object        
 2   Reason Sub Category       2210 non-null   object        
 3   Primary Reason            2210 non-null   object        
 4   Sensitive Complaint Flag  2210 non-null   object        
 5   Primary Root Cause        2210 non-null   object        
 6   Sub Sector                2210 non-null   object        
 7   Brand                     2210 non-null   object        
 8   Sub Brand                 2210 non-null   object        
 9   Product Description       2210 non-null   object        
 10  Plant Location Name       2210 non-null   object        
 11  Production Date           2210 non-null   object        
 12  Best Used By Date   

## Function to plot the starting point of complaints

In [12]:
#Creating a function to plot the starting point of complaints

def plot_lot(year, brand):
  """Draw a 2x2 dashboard of complaint counts for one production year and brand.

  Rows of the global ``lot_df`` whose 'Year' equals ``year`` and whose 'Brand'
  equals ``brand`` are counted by week, by day of the week, by shift and by
  machine. The number of matching complaints is printed before plotting."""

  #Restrict the global dataframe to the columns used here and to the requested year/brand
  wanted_cols = ['Year', 'Week', 'Day', 'Shift', 'Machine', 'Brand', 'Sub Brand', 'Incident Date']
  subset = lot_df[wanted_cols].copy()
  subset = subset[(subset['Year']==year) & (subset['Brand']==brand)]

  print(subset['Incident Date'].count())

  #Weekly complaint totals, bucketed into Low (<12), Medium (<33) and Large
  weekly = subset.groupby(['Year', 'Week'])['Incident Date'].count().reset_index(name='Complaints')
  weekly['Category'] = np.select([weekly['Complaints']<12, weekly['Complaints']<33],
                                 ['Low', 'Medium'], default='Large')

  #One figure with four panels, read row by row
  fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(20,10))
  by_week, by_day, by_shift, by_machine = axes.ravel()
  fig.suptitle("Complaints tracking to starting point", fontsize=24)

  #Panel 1: weekly trend line with the size category of each week drawn on top
  sns.lineplot(data=weekly, x='Week', y='Complaints', estimator='sum', ci=False, alpha=0.5, ax=by_week)
  sns.scatterplot(data=weekly, x='Week', y='Complaints', hue='Category',
                  hue_order=['Low', 'Medium', 'Large'],
                  palette=['#C0C0C0', '#4374B3', '#FF0B04'], #=>#RdYlGn_r | nipy_spectral
                  ax=by_week)

  #Panels 2-4: plain counts in a fixed category order
  sns.countplot(data=subset, x='Day',
                order=['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'],
                ax=by_day)
  sns.countplot(data=subset, x='Shift', order=['Day', 'Afternoon', 'Night'], ax=by_shift)
  sns.countplot(data=subset, x='Machine', order=[1, 2, 3], ax=by_machine)

  #Axis labels and titles; every panel shares the same y label
  panel_text = [(by_week, 'Week', 'By Week'),
                (by_day, 'Days of Week', 'By Day of the Week'),
                (by_shift, 'Shifts', 'By Shift'),
                (by_machine, 'Machine', 'By Machine')]
  for panel, x_text, title_text in panel_text:
    panel.set_xlabel(x_text)
    panel.set_ylabel('Complaints')
    panel.set_title(title_text)

  #Show every second week on the weekly panel
  by_week.xaxis.set_ticks(np.arange(1, 52, 2))

  #Keep the panels from overlapping and leave room for the figure title
  fig.tight_layout()
  fig.subplots_adjust(top=0.88)

## Where are the starting points of our complaints?

In [14]:
#Options offered by the dropdowns: fixed years, and every brand present in the data
yr = [2020, 2021]
lines = list(lot_df['Brand'].unique())

#One dropdown per plot_lot argument, each starting on its first option
year_picker = widgets.Dropdown(options=yr, value=yr[0], description='Year', disabled=False)
line_picker = widgets.Dropdown(options=lines, value=lines[0], description='Line', disabled=False)

#Plotting Quality Complaints with interactive widgets
interact(plot_lot, year=year_picker, brand=line_picker)
plt.show()

interactive(children=(Dropdown(description='Year', options=(2020, 2021), value=2020), Dropdown(description='Li…

## Function to plot the products and categories

In [15]:
#Creating a function to plot the values

def plot_products(year, brand, category, top_num):
  """Draw a 2x2 dashboard of the top complaint drivers for one year, brand and reason category.

  Rows of the global ``lot_df`` matching ``year`` ('Year'), ``brand`` ('Brand')
  and ``category`` ('Reason Category') are summarised as: complaints per
  sub brand, complaints per primary root cause, mean lot turnover (weeks) per
  sub brand, and complaints per consumer country. Each panel shows at most
  ``top_num`` bars. The number of matching complaints is printed before plotting."""

  #Creating a slice and copy of input dataframe
  brand_df = lot_df[['Year', 'Brand', 'Sub Brand', 'Incident Date', 'Lot_date', 'Lot_turnover_weeks', 'Reason Category', 'Consumer Country', 'Primary Root Cause']].copy()
  brand_df = brand_df[(brand_df['Year']==year) & (brand_df['Brand']==brand) & (brand_df['Reason Category']==category)]

  print(brand_df['Brand'].count())

  def complaints_by(column):
    """Count complaints per value of `column`, largest count first."""
    return (brand_df.groupby([column])['Reason Category'].count()
            .reset_index(name='Complaints')
            .sort_values(by='Complaints', ascending=False))

  #Transforming dataframes for visualizations
  sku_df = complaints_by('Sub Brand')
  root_df = complaints_by('Primary Root Cause')
  turnover_df = brand_df.groupby(['Sub Brand'])['Lot_turnover_weeks'].mean().reset_index(name='Weeks').sort_values(by='Weeks', ascending=False)
  country_df = complaints_by('Consumer Country')

  #Create four subplots and unpack the output array immediately
  fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(nrows=2, ncols=2, figsize=(20,10))

  #Set title for the figure
  fig.suptitle(f"Top {top_num} Complaints in {brand} for {year}", fontsize=24)

  #Accessing axes objects and plotting the first top_num rows of each summary
  sns.barplot(x='Complaints', y='Sub Brand', data=sku_df[:top_num], ci=False, ax=ax1)
  sns.barplot(x='Complaints', y='Primary Root Cause', data=root_df[:top_num], ci=False, ax=ax2)
  sns.barplot(x='Weeks', y='Sub Brand', data=turnover_df[:top_num], ci=False, ax=ax3)
  sns.barplot(x='Complaints', y='Consumer Country', data=country_df[:top_num], ci=False, ax=ax4)

  #Customizing axes labels and titles for first axes figure
  ax1.set_xlabel('Complaints')
  ax1.set_ylabel('Products')
  ax1.set_title('By Products')

  #Customizing axes labels and titles for second axes figure
  ax2.set_xlabel('Complaints')
  ax2.set_ylabel('Primary Root Cause')
  ax2.set_title('By Root Cause')

  #Customizing axes labels and titles for third axes figure
  ax3.set_xlabel('Weeks')
  ax3.set_ylabel('Products')
  ax3.set_title('By Turnover Time')

  #Customizing axes labels and titles for fourth axes figure
  ax4.set_xlabel('Complaints')
  ax4.set_ylabel('Country')
  ax4.set_title('By Country')

  #Keep the panels from overlapping and leave room for the figure title
  fig.tight_layout()
  fig.subplots_adjust(top=0.88)

## What are the top causes for our complaints?

In [16]:
#Options offered by the widgets: fixed year, every brand present in the data, fixed reason categories
yr = [2020]
lines = list(lot_df['Brand'].unique())
cat = ['APPEARANCE', 'FOREIGN MATERIAL', 'INFESTATION', 'PACKAGE QUANTITY', 'PACKAGE QUALITY', 'PRODUCT QUANTITY', 'TASTE', 'TEXTURE']

#One widget per plot_products argument, each starting on its first option (slider starts at 5)
year_picker = widgets.Dropdown(options=yr, value=yr[0], description='Year', disabled=False)
line_picker = widgets.Dropdown(options=lines, value=lines[0], description='Line', disabled=False)
category_picker = widgets.Dropdown(options=cat, value=cat[0], description='Categories', disabled=False)
top_slider = widgets.IntSlider(value=5, min=1, max=10, step=1, description='Top', disabled=False)

#Plotting Complaints with interactive widgets
interact(plot_products, year=year_picker, brand=line_picker, category=category_picker, top_num=top_slider)
plt.show()

interactive(children=(Dropdown(description='Year', options=(2020,), value=2020), Dropdown(description='Line', …

In [None]:
pip freeze