<a href="https://colab.research.google.com/github/gabrielborja/python_data_analysis/blob/main/chocolate_making_analytics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 2RR Analytics

## Uploading necessary packages

In [None]:
#Upgrading Plotly
!pip install plotly --upgrade

In [1]:
#Uploading packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from ipywidgets import interact

In [2]:
#Customizing data visualization styles
#Applies the seaborn theme and sets the default figure size to (8, 5) for later matplotlib/seaborn plots
sns.set_theme(rc = {'figure.figsize': (8, 5)})

In [None]:
#Removing previous versions of the uploaded excel file
!rm 2021_2RR_particle_size_microns.xlsx

In [3]:
#Uploading file from local drive ==> !rm file
#The object returned by files.upload() is indexed by filename when the workbook is read in the next cell.
#NOTE(review): the "!rm file" hint presumably means the removal cell must be run first so no stale copy is left - confirm.
from google.colab import files
uploaded1_a = files.upload()

Saving 2021_2RR_particle_size_microns.xlsx to 2021_2RR_particle_size_microns.xlsx


In [4]:
#Loading the uploaded 2RR workbook into a Pandas DataFrame
import io
#The upload is held in memory as bytes, so it is wrapped in a file-like buffer for read_excel
with io.BytesIO(uploaded1_a['2021_2RR_particle_size_microns.xlsx']) as workbook_1a:
  df1_a = pd.read_excel(workbook_1a)

In [5]:
#Checking the first dataframe info
#Prints the columns, non-null counts and dtypes of df1_a
df1_a.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 164 entries, 0 to 163
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   event      164 non-null    datetime64[ns]
 1   date       164 non-null    datetime64[ns]
 2   time       164 non-null    object        
 3   equipment  164 non-null    object        
 4   name       164 non-null    object        
 5   microns    164 non-null    int64         
dtypes: datetime64[ns](2), int64(1), object(3)
memory usage: 7.8+ KB


## Data manipulation

In [6]:
#Deriving shift, weekday number and ISO week number from the event timestamp
event_ts = df1_a['event']
#Events recorded before 15:00 are labelled 'Day', everything else 'Afternoon'
before_15h = event_ts.dt.hour < 15
df1_a = df1_a.assign(
    shift=np.where(before_15h, 'Day', 'Afternoon'),
    day=event_ts.dt.dayofweek,
    weeknum=event_ts.dt.isocalendar().week,
)

In [7]:
#Counting the values for different shifts
#Number of rows labelled 'Day' and 'Afternoon' in the shift column
df1_a['shift'].value_counts()

Day          136
Afternoon     28
Name: shift, dtype: int64

In [8]:
#Transforming numeric day into day of the week
#The result is assigned back to the column: replace(inplace=True) on a column
#selection is chained assignment, which pandas 2.x warns about and which does
#not update the frame once Copy-on-Write is enabled.
days_dict = {0: 'Monday', 1: 'Tuesday', 2: 'Wednesday', 3: 'Thursday', 4: 'Friday', 5: 'Saturday', 6: 'Sunday'}
df1_a['day'] = df1_a['day'].replace(days_dict)

## Interactive Visualization

In [9]:
#Plotting the average microns per weekday and shift for a selected week
@interact(Weeknum = df1_a['weeknum'].unique())
def plot_2RR_bar(Weeknum):
  """Grouped bar chart of the mean particle size by day and shift.

  Weeknum: week number picked in the dropdown; only the rows of df1_a whose
  'weeknum' equals it are aggregated and plotted.
  """
  week_rows = df1_a.loc[df1_a['weeknum'] == Weeknum]
  daily_means = (week_rows
                 .groupby(by=['day', 'shift'])
                 .agg(avg_microns=('microns', 'mean'))
                 .reset_index())
  #Fixed display order for the x axis and for the colour groups
  display_order = dict(
      day=['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'],
      shift=['Day', 'Afternoon'],
  )
  fig1_a = px.bar(
      data_frame=daily_means, x='day', y='avg_microns', color='shift', barmode='group',
      title=f'2RR average particle size in week {Weeknum}', height=500, width=800,
      color_discrete_sequence=px.colors.qualitative.D3, category_orders=display_order,
  )
  #Shaded horizontal band between 200 and 250 microns, annotated as the high zone
  fig1_a.add_hrect(y0=200, y1=250, line_color="black", fillcolor="red", opacity=0.2,
                   annotation_text='High_zone')
  fig1_a.show()

interactive(children=(Dropdown(description='Weeknum', options=(46, 47, 48), value=46), Output()), _dom_classes…

In [12]:
#Plotting the variability of microns per weekday and shift for a selected week
@interact(Weeknum = df1_a['weeknum'].unique())
def plot_2RR_microns(Weeknum):
  """Box plot of the particle size readings by day and shift.

  Weeknum: week number picked in the dropdown; only the rows of df1_a whose
  'weeknum' equals it are plotted.
  """
  df = df1_a[df1_a['weeknum']==Weeknum]
  #Same explicit ordering as the weekly bar chart: without it plotly orders the
  #weekdays and shifts by first appearance in the data, so axis order and colours
  #can differ between weeks and between the two charts.
  x_orders = {'day': ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'],
              'shift': ['Day', 'Afternoon']}
  fig1_b = px.box(data_frame=df, x='day', y='microns', color='shift',
                  title=f'2RR particle size microns in week {Weeknum}', height=500, width=800,
                  color_discrete_sequence=px.colors.qualitative.D3, category_orders=x_orders)
  #Shaded horizontal band between 200 and 250 microns, annotated as the high zone
  fig1_b.add_hrect(y0=200, y1=250, line_color="black", fillcolor="red", opacity=0.2, annotation_text='High_zone')
  fig1_b.show()

interactive(children=(Dropdown(description='Weeknum', options=(46, 47, 48), value=46), Output()), _dom_classes…

In [11]:
#Plotting the distribution of microns per shift for a selected week
@interact(Weeknum = df1_a['weeknum'].unique())
def plot_2RR_histogram(Weeknum):
  """Probability-normalised histogram of particle size, coloured by shift.

  Weeknum: week number picked in the dropdown; only the rows of df1_a whose
  'weeknum' equals it are plotted.
  """
  in_week = df1_a['weeknum'] == Weeknum
  week_rows = df1_a.loc[in_week].copy()
  fig1_c = px.histogram(
      data_frame=week_rows, x='microns', color='shift',
      marginal='box', histnorm='probability', nbins=12,
      title=f'2RR particle microns histogram in week {Weeknum}',
      height=500, width=800,
      color_discrete_sequence=px.colors.qualitative.D3,
  )
  #Vertical reference line at 200 microns (where the high zone of the other charts starts)
  fig1_c.add_vline(x=200, line_color="red")
  fig1_c.show()

interactive(children=(Dropdown(description='Weeknum', options=(46, 47, 48), value=46), Output()), _dom_classes…

In [29]:
#Plotting the distribution of microns for every week
def plot_2RR_weekly():
  """Box plot of all particle size readings in df1_a, one box per week number."""
  fig1_d = px.box(
      data_frame=df1_a.copy(), x='weeknum', y='microns',
      title='2RR average particle size by week in 2021',
      height=500, width=800,
      color_discrete_sequence=px.colors.qualitative.D3,
  )
  #Shaded horizontal band between 200 and 250 microns, annotated as the high zone
  fig1_d.add_hrect(y0=200, y1=250, line_color="black", fillcolor="red", opacity=0.2,
                   annotation_text='High_zone')
  #Week numbers are treated as categories so the axis does not show decimal ticks
  fig1_d.update_xaxes(type='category')
  fig1_d.show()

plot_2RR_weekly()

# Particle Size Analytics

## Uploading necessary packages

In [None]:
#Upgrading Plotly
!pip install plotly --upgrade

In [None]:
#Uploading packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from ipywidgets import interact

In [None]:
#Customizing data visualization styles
#Applies the seaborn theme and sets the default figure size to (8, 5) for later matplotlib/seaborn plots
sns.set_theme(rc = {'figure.figsize': (8, 5)})

In [None]:
#Remove previous versions of the uploaded excel file
!rm 2017_2021_mass_quality_data.xlsx
!rm 2019_2021_malvern_D_90.xlsx

In [None]:
#Uploading file from local drive ==> !rm file
#The object returned by files.upload() is indexed by filename when the workbook is read in the next cell.
#NOTE(review): the "!rm file" hint presumably means the removal cell must be run first so no stale copy is left - confirm.
from google.colab import files
uploaded2_a = files.upload()

In [None]:
#Loading the uploaded mass quality workbook into a Pandas DataFrame
import io
#The upload is held in memory as bytes, so it is wrapped in a file-like buffer for read_excel
with io.BytesIO(uploaded2_a['2017_2021_mass_quality_data.xlsx']) as workbook_2a:
  df2_a = pd.read_excel(workbook_2a)

In [None]:
#Checking the first dataframe info
#Prints the columns, non-null counts and dtypes of df2_a
df2_a.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 439 entries, 0 to 438
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   Plant                 439 non-null    object        
 1   Material              439 non-null    int64         
 2   Material_Description  439 non-null    object        
 3   Physical_sample       439 non-null    int64         
 4   Production_date       439 non-null    datetime64[ns]
 5   Short_text            26 non-null     object        
 6   MDLZ_spec             439 non-null    float64       
 7   Heerenthals_spec      439 non-null    float64       
 8   Fat_content_%         439 non-null    float64       
 9   USL_particle_size_um  439 non-null    int64         
 10  Recorded_results      439 non-null    float64       
dtypes: datetime64[ns](1), float64(4), int64(3), object(3)
memory usage: 37.9+ KB


In [None]:
#Uploading file from local drive ==> !rm file
#The object returned by files.upload() is indexed by filename when the workbook is read in the next cell.
#NOTE(review): the "!rm file" hint presumably means the removal cell must be run first so no stale copy is left - confirm.
from google.colab import files
uploaded2_b = files.upload()

In [None]:
#Loading the uploaded conch measurements workbook into a Pandas DataFrame
import io
#The upload is held in memory as bytes, so it is wrapped in a file-like buffer for read_excel
with io.BytesIO(uploaded2_b['2019_2021_conch_measurements_full.xlsx']) as workbook_2b:
  df2_b = pd.read_excel(workbook_2b)

In [None]:
#Checking the second dataframe info
#Prints the columns, non-null counts and dtypes of df2_b
df2_b.info()

## Data manipulation

In [None]:
#Creating categories from datetime for first dataframe
#Every derived column is read from df2_a itself; Year previously referenced df8_a,
#a frame that is never created in this notebook.
df2_a = df2_a.assign(Year = df2_a['Production_date'].dt.year,
                     Month = df2_a['Production_date'].dt.month_name(),
                     Quarter = df2_a['Production_date'].dt.quarter,
                     Weeknum = df2_a['Production_date'].dt.isocalendar().week)

In [None]:
#Creating categories from datetime for second dataframe
sample_ts = df2_b['Sample_date']
df2_b = df2_b.assign(
    Year=sample_ts.dt.year,
    Month=sample_ts.dt.month_name(),
    Quarter=sample_ts.dt.quarter,
    Weeknum=sample_ts.dt.isocalendar().week,
)

In [None]:
#Transforming quarters into categories
#The frames loaded in this section are df2_a and df2_b (df8_a/df8_b are never created).
#The result is assigned back to the column: replace(inplace=True) on a column
#selection is chained assignment, which pandas 2.x warns about and which does
#not update the frame once Copy-on-Write is enabled.
quarter_dict = {1: 'Q1', 2: 'Q2', 3: 'Q3', 4: 'Q4'}
df2_a['Quarter'] = df2_a['Quarter'].replace(quarter_dict)
df2_b['Quarter'] = df2_b['Quarter'].replace(quarter_dict)

In [None]:
#Creating list of categories used as dropdown options by the interactive plots
#Built from df2_a, the mass quality frame loaded in this section (df8_a is never created)
year_list = sorted(df2_a['Year'].unique(), reverse=True)   #most recent year first
quarter_list = list(df2_a['Quarter'].unique())
period_list = ['Month', 'Quarter', 'Weeknum']               #columns selectable as x axis
cat_list = ['Recorded_results', 'Fat_content_%']            #columns selectable as y axis

In [None]:
#Aggregating dataframe 1 with average values per Year and Weeknum
#Reads from df2_a (df8_a is never created); the result keeps the name df8_a_g
#because the merge cell refers to it.
df8_a_g = (df2_a
           .groupby(by=['Year', 'Weeknum'])
           .agg(Fat_avg = ('Fat_content_%', 'mean'), Malvern_avg = ('Recorded_results', 'mean'))
           .reset_index())

In [None]:
#Aggregating dataframe 2 with average values per Year and Weeknum
#Reads from df2_b (df8_b is never created); the result keeps the name df8_b_g
#because the merge cell refers to it.
#NOTE(review): assumes df2_b has 'Short_text' and 'Recorded_results' columns - its
#info() output is not recorded in the notebook, confirm against the workbook header.
malvern_d90_rows = df2_b[df2_b['Short_text']=='PS; Malvern D 90']
df8_b_g = (malvern_d90_rows
           .groupby(by=['Year', 'Weeknum'])
           .agg(Conch_avg = ('Recorded_results', 'mean'))
           .reset_index())

In [None]:
#Joining the weekly averages of dataframe 1 and 2 on Year and Weeknum
#No `how` is given, so pandas' default inner join applies: only weeks present in both frames are kept
df8_merged = pd.merge(df8_a_g, df8_b_g, on=['Year', 'Weeknum'])

In [None]:
#Checking the merged dataframe tail
#Displays the last 12 rows of df8_merged
df8_merged.tail(12)

Unnamed: 0,Year,Weeknum,Fat_avg,Malvern_avg,Conch_avg
109,2021,34,54.68,29.185,29.86
110,2021,35,54.175,26.2425,30.5
111,2021,36,53.99,28.85,30.175
112,2021,37,54.316667,25.33,29.516667
113,2021,38,53.9225,26.2175,29.5
114,2021,39,54.47,27.05,29.857143
115,2021,40,54.3775,25.475,28.825
116,2021,41,54.133333,24.926667,29.1
117,2021,42,53.753333,27.62,30.233333
118,2021,43,54.285,25.675,30.366667


In [None]:
#Correlation matrix of merged variables
#Pairwise correlation between all columns of df8_merged (Year and Weeknum are included)
df8_merged.corr()

Unnamed: 0,Year,Weeknum,Fat_avg,Malvern_avg,Conch_avg
Year,1.0,-0.108695,0.258201,-0.657037,0.117723
Weeknum,-0.108695,1.0,-0.338801,0.200783,-0.032219
Fat_avg,0.258201,-0.338801,1.0,-0.231841,0.03743
Malvern_avg,-0.657037,0.200783,-0.231841,1.0,0.190715
Conch_avg,0.117723,-0.032219,0.03743,0.190715,1.0


## Particle Size Interactive Plot

In [None]:
#Creating interactive visualization
@interact(Year=year_list, Period=period_list, Cat=cat_list)
def plot_particle_size(Year, Period, Cat):
  """Box plot of one measured column of the mass quality data for one year.

  Year: value of df2_a['Year'] to keep.
  Period: column used on the x axis ('Month', 'Quarter' or 'Weeknum').
  Cat: column used on the y axis ('Recorded_results' or 'Fat_content_%').
  """
  #df2_a is the mass quality frame loaded in this section (df8_a is never created)
  df = df2_a[df2_a['Year']==Year]
  is_particle_size = Cat == 'Recorded_results'
  if is_particle_size:
    title = f'Particle Size Results in {Year}'
  else:
    title = f'Fat Content % Results in {Year}'
  #Both categories share the same figure; only the title and the reference marker differ
  fig8_a = px.box(data_frame=df, x=Period, y=Cat, color='Quarter',
                  title=title, width=800, height=600)
  if is_particle_size:
    #NOTE(review): 24-27 is presumably the target band for particle size - confirm
    fig8_a.add_hrect(y0=24, y1=27)
  else:
    #NOTE(review): 54.5 is presumably the fat content reference value - confirm
    fig8_a.add_hline(y=54.5)
  fig8_a.show()

interactive(children=(Dropdown(description='Year', options=(2021, 2020, 2019, 2018, 2017), value=2021), Dropdo…

## Conch Interactive Plot

In [None]:
#Conching Interactive visualization
@interact(Year=year_list, Period=period_list)
def plot_particle_size(Year, Period):
  """Box plot of the conch results for one year.

  Year: value of df2_b['Year'] to keep.
  Period: column used on the x axis ('Month', 'Quarter' or 'Weeknum').

  NOTE(review): this function reuses the name of the particle size plot defined
  earlier in the notebook and rebinds it; consider a distinct name.
  """
  #df2_b is the conch frame loaded in this section (df8_b is never created)
  df = df2_b[df2_b['Year']==Year]
  #The y column is spelled 'Recorded_results', as in the weekly aggregation of the
  #conch data (it was 'Recorded_Results' here).
  #NOTE(review): confirm the spelling against the workbook header.
  fig8_b = px.box(data_frame=df, x=Period, y='Recorded_results', color='Quarter',
                  title=f'Conch Malvern D90 in {Year}', width=400, height=600)
  #Shaded horizontal band between 28 and 30, annotated as the high zone
  fig8_b.add_hrect(y0=28, y1=30, line_color="black", fillcolor="red", opacity=0.2, annotation_text='High_zone')
  #Fixed y range so the plots of different years are directly comparable
  fig8_b.update_yaxes(range=[24,34])
  fig8_b.show()

interactive(children=(Dropdown(description='Year', options=(2021, 2020, 2019, 2018, 2017), value=2021), Dropdo…

## Quarterly Visualizations Interactive Plot

In [None]:
#Creating quarterly visualization
@interact(Cat=cat_list, Quarter=quarter_list)
def plot_quarter_results(Cat, Quarter):
  """Box plot of one measured column for one quarter, one box per year.

  Cat: column used on the y axis ('Recorded_results' or 'Fat_content_%').
  Quarter: value of df2_a['Quarter'] to keep.
  """
  #df2_a is the mass quality frame holding both selectable columns (df8 is never created)
  df = df2_a[df2_a['Quarter']==Quarter]
  if Cat == 'Recorded_results':
    title = f'Particle Size Results in {Quarter} by Year'
  else:
    title = f'Fat Content % Results in {Quarter} by Year'
  #Both categories share the same figure; only the title differs
  fig8_c = px.box(data_frame=df, y=Cat, color='Year',
                  title=title, width=800, height=400)
  fig8_c.show()

interactive(children=(Dropdown(description='Cat', options=('Recorded_results', 'Fat_content_%'), value='Record…

## Histogram Interactive Plot

In [None]:
#Creating Histogram visualization
@interact(Year=year_list)
def plot_marginal_histogram(Year):
  """Percent-normalised histogram of the particle size results for one year, coloured by quarter.

  Year: value of df2_a['Year'] to keep.
  """
  #df2_a is the mass quality frame loaded in this section (df8 is never created)
  df = df2_a[df2_a['Year']==Year]
  #The title names the selected year so the figures of different years can be told apart
  fig8_d = px.histogram(data_frame=df, x='Recorded_results', color='Quarter', marginal='box', histnorm='percent',
                        nbins=20, title=f'Histogram of Particle Size Results in {Year}', width=800, height=450)
  fig8_d.show()

interactive(children=(Dropdown(description='Year', options=(2021, 2020, 2019, 2018, 2017), value=2021), Output…

## Empirical CDF Interactive Plot

In [None]:
#Creating ECDF visualization
@interact(Year=year_list, Quarter=quarter_list)
def plot_ecdf_seaborn(Year, Quarter):
  """Empirical CDF of the particle size results for one year and quarter.

  Year: value of df2_a['Year'] to keep.
  Quarter: value of df2_a['Quarter'] to keep.
  """
  #df2_a is the mass quality frame loaded in this section (df8 is never created)
  selected = (df2_a['Year']==Year) & (df2_a['Quarter']==Quarter)
  df = df2_a[selected]
  #Only one quarter remains after filtering, so a single-colour palette is enough for the hue
  sns.ecdfplot(data=df, x='Recorded_results', hue='Quarter', stat='proportion', palette=['purple'], complementary=False)
  plt.title(f'Empirical CDF of Particle Size in {Quarter} - {Year}')
  plt.show()

interactive(children=(Dropdown(description='Year', options=(2021, 2020, 2019, 2018, 2017), value=2021), Dropdo…

## Bi-variate Interactive Plot

In [None]:
#Plotting bi-variate interactive visualization
@interact(Year = year_list, Cat=['Malvern_avg', 'Fat_avg'])
def plot_particle_scatter(Year, Cat):
  """Overlay the weekly conch average and one other weekly average for one year.

  Year: value of df8_merged['Year'] to keep.
  Cat: second column drawn against Weeknum ('Malvern_avg' or 'Fat_avg').
  """
  yearly = df8_merged[df8_merged['Year']==Year]
  #Both series are drawn on the same axes, the conch average first
  for y_column in ('Conch_avg', Cat):
    sns.scatterplot(x='Weeknum', y=y_column, data=yearly)
  plt.title(f'Malvern Bi-Variate Analysis in {Year}')
  plt.show()

# Fine Refiners Analytics

## Uploading necessary packages

In [None]:
#Upgrading Plotly
!pip install plotly --upgrade

In [None]:
#Uploading packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from ipywidgets import interact

In [None]:
#Customizing data visualization styles
#Applies the seaborn theme and sets the default figure size to (8, 5) for later matplotlib/seaborn plots
sns.set_theme(rc = {'figure.figsize': (8, 5)})

In [None]:
#Remove previous versions of the uploaded excel file
!rm 2021_fine_refiners.xlsx

In [None]:
#Uploading file from local drive ==> !rm file
#The object returned by files.upload() is indexed by filename when the workbook is read in the next cell.
#NOTE(review): the "!rm file" hint presumably means the removal cell must be run first so no stale copy is left - confirm.
from google.colab import files
uploaded3_a = files.upload()

Saving 2021_fine_refiners.xlsx to 2021_fine_refiners.xlsx


In [None]:
#Loading the uploaded fine refiners workbook into a Pandas DataFrame
import io
#The upload is held in memory as bytes, so it is wrapped in a file-like buffer for read_excel
with io.BytesIO(uploaded3_a['2021_fine_refiners.xlsx']) as workbook_3a:
  df3_a = pd.read_excel(workbook_3a)

In [None]:
#Checking the first dataframe info
#Prints the columns, non-null counts and dtypes of df3_a
df3_a.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   date            150 non-null    datetime64[ns]
 1   roll_num        150 non-null    object        
 2   fine_refiner_1  90 non-null     float64       
 3   fine_refiner_2  90 non-null     float64       
 4   fine_refiner_3  90 non-null     float64       
 5   fine_refiner_4  85 non-null     float64       
 6   fine_refiner_5  90 non-null     float64       
 7   fine_refiner_6  90 non-null     float64       
dtypes: datetime64[ns](1), float64(6), object(1)
memory usage: 9.5+ KB


## Data manipulation

In [None]:
#Deriving weekday number and ISO week number from the measurement date
measured_on = df3_a['date']
df3_a = df3_a.assign(
    day=measured_on.dt.dayofweek,
    weeknum=measured_on.dt.isocalendar().week,
)

In [None]:
#Transforming numeric day into day of the week
#The result is assigned back to the column: replace(inplace=True) on a column
#selection is chained assignment, which pandas 2.x warns about and which does
#not update the frame once Copy-on-Write is enabled.
days_dict = {0: 'Monday', 1: 'Tuesday', 2: 'Wednesday', 3: 'Thursday', 4: 'Friday', 5: 'Saturday', 6: 'Sunday'}
df3_a['day'] = df3_a['day'].replace(days_dict)

In [None]:
#Reshaping to long format: one row per date, roll and fine refiner reading
#The six measurement columns are named fine_refiner_1 ... fine_refiner_6
refiner_columns = [f'fine_refiner_{number}' for number in range(1, 7)]
#df3_a is overwritten, so this cell can only be run once per load of the workbook
df3_a = df3_a.melt(
    id_vars=['date', 'roll_num', 'day', 'weeknum'],
    value_vars=refiner_columns,
    var_name='fine_refiner',
    value_name='temperature',
)

## Interactive Visualization

In [None]:
#Plotting the temperature over time for a selected fine refiner
@interact(Refiner = df3_a['fine_refiner'].unique())
def plot_fine_refiners(Refiner):
  """Line chart of the temperature readings of one fine refiner, one line per roll.

  Refiner: value of df3_a['fine_refiner'] picked in the dropdown; only its rows are plotted.
  """
  is_selected = df3_a['fine_refiner'] == Refiner
  refiner_rows = df3_a.loc[is_selected].copy()
  fig3_a = px.line(
      data_frame=refiner_rows, x='date', y='temperature', color='roll_num',
      title=f'Temperatures trends in {Refiner}',
      height=500, width=800,
      color_discrete_sequence=px.colors.qualitative.D3,
  )
  fig3_a.show()

interactive(children=(Dropdown(description='Refiner', options=('fine_refiner_1', 'fine_refiner_2', 'fine_refin…