In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from PIL import Image, ImageDraw
from math import pi, ceil
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

In [None]:
#load the raw menu nutrient table from the csv file that sits next to the notebook
raw_csv_path = 'McDonald_Menu_Nutrient_Raw.csv'
df = pd.read_csv(raw_csv_path)

In [None]:
#preview the first five rows (five is the pandas default for head)
df.head(5)

In [None]:
#Data Preparation step
#inspect the column dtypes to spot wrongly typed columns
print(df.dtypes) #Category is a good candidate for the category data type
print(df['Category'].unique()) #cross-check that Category only holds a small set of repeated labels
df['Category'] = df['Category'].astype('category') #convert to category data type (so it can be used for multi-indexing)

#look if there is any duplicated data (Item can be used as primary key)
print(df.duplicated(subset = ['Item']).any()) #True means at least one Item name appears more than once

#look if there is any missing value in the dataset
missing_share = df.isnull().mean() #fraction (0..1) of missing cells in every column
col_name = [] #names of the columns whose missing share is above the drop threshold
for column, share in missing_share.items():
    if share > 0:
        #share is a fraction, so scale it by 100 before printing it as a percentage
        print("{}: {:.2f}% data is missing".format(column, share * 100))
        if share > 0.33: #threshold for column to drop if more than 33% of its data are missing
            col_name.append(column) #only collected here; nothing is dropped in this cell

#Creating multi-index with Category as parent index and Item as child index
df = df.set_index(['Category', 'Item'])
#the sorted frame is only shown as the cell output; df itself keeps the file order,
#which later cells depend on when they pick rows by position
df.sort_index()

In [None]:
def stacked_bar_plot(xdata, ydata, xlabel, ylabel, legend, legend_title, color, width = 0.8, legend_loc = 'upper left'):
    """
    Draw a stacked bar chart on the current matplotlib figure.

    :xdata -> positions/labels of the bars on the x axis
    :ydata -> list of series; each series holds one bar segment height per x position,
              and the series are stacked in the order given
    :xlabel -> text for the x axis label
    :ylabel -> text for the y axis label
    :legend -> list with one legend entry per series in ydata
    :legend_title -> title shown above the legend entries
    :color -> list with one color per series in ydata
    :width -> bar width, default 0.8
    :legend_loc -> legend location, default 'upper left'

    :return: the matplotlib.pyplot module, so the caller can add a title or save the figure
    """

    # running height of the stack under the next series, one entry per bar
    stack_base = np.zeros(len(ydata[0]))
    shared_style = dict(edgecolor = 'white', width = width)

    for layer in range(len(ydata)):
        heights = ydata[layer]
        plt.bar(xdata, heights, bottom = stack_base, color = color[layer], label = legend[layer], **shared_style)
        stack_base += heights

    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.xticks(rotation = 45, ha = 'right')
    plt.legend(title = legend_title, loc = legend_loc, frameon = False)

    return plt

In [None]:
#now we visualize the average calories for each menu category from the dataset
#get data value for x axis
categories = df.index.get_level_values('Category').categories.copy().tolist()

#filter column names which contain 'Calories' (Calories & Calories from Fat)
col_name = df.filter(like = 'Calories', axis = 1).columns.values.copy()

#get value from filtered column name
df_calories = df[col_name].copy()

#calories that do not come from fat = total calories - calories from fat
df_calories['Calories from Other'] = df_calories[col_name[0]] - df_calories[col_name[1]]

#creating color palette, legend and label
palette = ['#729D7C', '#A5B99A']
legend = ['Other', 'Fat']
legend_title = 'Source'
xlabel = 'Categories'
ylabel = 'Calories'

#get data value for y axis
#use the plain per-category mean: describe().mean() would average the summary
#statistics themselves (count, mean, std, min, quartiles, max), which is not an average of the data
other_calories = []
fat_calories = []
for s in categories:
    dum = df_calories.loc[s].mean()
    other_calories.append(dum['Calories from Other'])
    fat_calories.append(dum[col_name[1]])

#preparing data for input
ydata = [other_calories, fat_calories]

#plotting data
plot = stacked_bar_plot(xdata = categories, ydata = ydata, xlabel = xlabel, ylabel = ylabel, legend = legend, legend_title = legend_title, color = palette)

# Add a title
plot.title('McDonnald Menu\'s Averages Calories', size=14, y = 1.1)

# Save it
filename='calories_average.png'
plot.savefig(filename, dpi=96, bbox_inches='tight')

In [None]:
#now we visualize the average fat for each menu category from the dataset
#get data value for x axis
categories = df.index.get_level_values('Category').categories.copy().tolist()

#filter column names which contain 'Fat' and keep every second one starting from the second match
#NOTE(review): the code below treats these as total, saturated and trans fat in that order - confirm against the csv header
col_name = df.filter(like = 'Fat', axis = 1).columns.values[1::2].copy()

#get value from filtered column name
df_fat = df[col_name].copy()

#unsaturated fat = first column - (second column + third column)
df_fat['Unsaturated Fat'] = df_fat[col_name[0]] - (df_fat[col_name[1]] + df_fat[col_name[2]])

#creating color palette, legend and label
palette = ['#8F2D56', '#58586B', '#218380']
legend = ['Unsaturated Fat', 'Saturated Fat', 'Trans Fat']
legend_title = 'Type'
legend_loc = 'upper right'
xlabel = 'Categories'
ylabel = 'Fat'

#get data value for y axis
#use the plain per-category mean: describe().mean() would average the summary
#statistics themselves (count, mean, std, min, quartiles, max), which is not an average of the data
unsaturated_fat = []
saturated_fat = []
trans_fat = []
for s in categories:
    dum = df_fat.loc[s].mean()
    unsaturated_fat.append(dum['Unsaturated Fat'])
    saturated_fat.append(dum[col_name[1]])
    trans_fat.append(dum[col_name[2]])

#preparing data for input
ydata = [unsaturated_fat, saturated_fat, trans_fat]

#plotting data
plot = stacked_bar_plot(xdata = categories, ydata = ydata, xlabel = xlabel, ylabel = ylabel, legend = legend, legend_title = legend_title, color = palette, legend_loc = legend_loc)

# Add a title
plot.title('McDonnald Menu\'s Averages Fat', size=14, y = 1.1)

# Save it
filename='fat_average.png'
plot.savefig(filename, dpi=96, bbox_inches='tight')

In [None]:
def correlation(DataFrame, top_n = None, method = 'spearman', remove_duplicates = True, remove_self_correlations = True):
    """
    #Compute the absolute feature correlation and sort feature pairs based on their correlation

    :DataFrame -> The dataframe with the predictor variables
    :type DataFrame: pandas.core.frame.DataFrame
    :top_n -> Top N feature pairs to be reported (if None or 0, all of the pairs will be returned)
    :method -> Correlation computation method, passed to DataFrame.corr
    :type method: str
    :remove_duplicates -> When True, each unordered pair is reported once
                          (only one of (A, B) and (B, A) is kept)
    :type remove_duplicates: bool
    :remove_self_correlations -> When True, the (A, A) pairs are left out
    :type remove_self_correlations: bool

    :return: pandas.core.frame.DataFrame with the columns 'Feature 1', 'Feature 2' and
             'Correlation', sorted by 'Correlation' in descending order with a fresh 0..n-1 index
    """
    corr_matrix_abs = DataFrame.corr(method=method).abs()
    features = np.asarray(corr_matrix_abs.columns)
    n_features = len(features)

    # Positions of every cell in the square correlation matrix
    first_pos, second_pos = np.indices((n_features, n_features))
    keep = np.ones((n_features, n_features), dtype=bool)

    # Remove comparisons of the same feature (the diagonal)
    if remove_self_correlations:
        keep &= first_pos != second_pos

    # Remove duplicates: the matrix is symmetric, so the upper triangle already holds
    # every unordered pair exactly once. Selecting by position does not depend on the
    # sort placing mirrored pairs next to each other, and never loses the weakest pair.
    if remove_duplicates:
        keep &= first_pos <= second_pos

    first_pos, second_pos = first_pos[keep], second_pos[keep]
    sorted_correlated_features = pd.DataFrame({
        'Feature 1': features[first_pos],
        'Feature 2': features[second_pos],
        'Correlation': corr_matrix_abs.values[first_pos, second_pos],
    }, columns=['Feature 1', 'Feature 2', 'Correlation'])

    # Stable sort keeps the output deterministic when correlations tie
    sorted_correlated_features = sorted_correlated_features \
        .sort_values('Correlation', kind="mergesort", ascending=False) \
        .reset_index(drop=True)

    if top_n:
        return sorted_correlated_features.iloc[:top_n]

    return sorted_correlated_features

In [None]:
'''
In the dataset there are similar columns, such as total fat as an amount and total fat in % daily value.
We can suspect these variables to be dropped later if they act as redundant data,
that is, if there is a high correlation between them.
There are also some variables that are only stated as % daily value, so even though they depend on the daily value need,
which we'll discuss later, we'll just keep them for now until further analysis.

First we need to standardize or scale the data to find the correlation between the features.
'''
def standardize(DataFrame):
    """
    Scale every column of DataFrame with sklearn's StandardScaler.

    :DataFrame -> numeric dataframe to scale
    :return: new dataframe holding the scaled values, with copies of the input index and columns
    """
    scaler = StandardScaler()
    scaler.fit(DataFrame)
    scaled_values = scaler.transform(DataFrame)
    return pd.DataFrame(scaled_values, index = DataFrame.index.copy(), columns = DataFrame.columns.copy())

def normalize(DataFrame):
    """
    Scale every column of DataFrame with sklearn's MinMaxScaler.

    :DataFrame -> numeric dataframe to scale
    :return: new dataframe holding the scaled values, with copies of the input index and columns
    """
    scaler = MinMaxScaler()
    scaler.fit(DataFrame)
    scaled_values = scaler.transform(DataFrame)
    return pd.DataFrame(scaled_values, index = DataFrame.index.copy(), columns = DataFrame.columns.copy())

#scale everything except the first column, which is skipped because its data type is not numerical
numeric_columns = df.columns[1:]
df_minmax = normalize(df[numeric_columns])
df_std = standardize(df[numeric_columns])

#correlation on the standardized data
#spearman is used because all data is numerical and mostly ordinal/discrete
corr = correlation(df_std, method = 'spearman')
strong_pairs = corr[(corr['Correlation'] >= 0.667) & (corr['Correlation'] < 1.0)] #keep strongly (>= 0.667) but not perfectly correlated pairs
print("\n\033[91m\033[1mStandardize Corelation\033[0m\n")
print("\033[92m\033[1m{}\033[0m".format(strong_pairs))

#same report on the min-max normalized data, to compare against the standardized one
corr = correlation(df_minmax, method = 'spearman')
strong_pairs = corr[(corr['Correlation'] >= 0.667) & (corr['Correlation'] < 1.0)] #same filter as above
print("\n\033[94m\033[1mNormalize Corelation\033[0m\n")
print("\033[36m\033[1m{}\033[0m".format(strong_pairs))

'''
From what we get, using normalized or standardized data to find the correlation in this dataset does not seem to make a difference.
We can also treat features that are similar and highly correlated with each other as redundant data and drop them.
Redundant data stated as % daily value will be dropped because it depends on another feature that is not in this dataset.
But let's keep features like vitamin A, vitamin C, calcium and iron even though they are in % daily value, because some of their values are cross-correlated, which is good if we want to make predictions.
We'll also drop saturated fat, trans fat and calories from fat, and keep Total Fat and total Calories instead.
For better analysis we'll drop the Serving Size column.
'''

#now we drop another redundant/similar data
#this cell is safe to run more than once: every step is a no-op when its columns are already gone,
#whereas slicing by position (df.columns[1:]) would silently remove one more column on every re-run

#drop the leading non-numerical column; once it is gone the first column is numerical and is kept
first_column = df.columns[0]
if not pd.api.types.is_numeric_dtype(df[first_column]):
    df = df.drop(first_column, axis = 1)

#drop every '%' column except the last four
df = df.drop(df.filter(like = '%').columns[:-4], axis = 1)

#drop the fat/calorie breakdown columns; errors = 'ignore' keeps a re-run from raising KeyError
df = df.drop(['Calories from Fat', 'Saturated Fat', 'Trans Fat'], axis = 1, errors = 'ignore')

In [None]:
def spider_plot(DataFrame, num = None, label = None, color = [None, None, None, None], maxval = None):
    '''
    #this function makes a spider (radar) plot comparing two rows and saves it as a png file

    :DataFrame -> data to plot; must hold exactly two rows whose index values are strings,
                  every column becomes one axis of the plot
    :num -> zero-based figure number; num + 1 is appended to the saved file name.
            When None, no number is appended
    :label -> pair of legend labels; when None they are built from the words that appear
              in only one of the two index strings
    :color -> sequence of four colors: [first polygon, second polygon, axis labels, title]
    :maxval -> upper limit of the radial axis; when None, the largest value in the data
               rounded up to one decimal

    :return: None; the figure is written to '<label 1>_<label 2><num + 1>.png' and closed
    :raises ValueError: when DataFrame does not hold exactly two rows
    '''
    # Set data
    df = DataFrame
    if df.shape[0] != 2:
        raise ValueError("spider_plot compares exactly two rows, got {}".format(df.shape[0]))
    name1, name2 = df.index.values
    words1 = name1.split(" ")
    words2 = name2.split(" ")
    if label is None:
        # words unique to each item name tell the two polygons apart
        legend_label = [' '.join([w for w in words1 if w not in words2]),
                        ' '.join([w for w in words2 if w not in words1])]
    else:
        legend_label = label

    # ------- PART 1: Create background

    # number of variable
    categories = list(df)
    N = len(categories)

    # What will be the angle of each axis in the plot? (we divide the plot / number of variable)
    angles = [n / float(N) * 2 * pi for n in range(N)]
    angles += angles[:1] # repeat the first angle so the polygon closes

    # Initialise the spider plot
    ax = plt.subplot(111, polar=True)

    # If you want the first axis to be on top:
    ax.set_theta_offset(pi / 2)
    ax.set_theta_direction(-1)

    # Draw one axis per variable and add its label
    plt.xticks(angles[:-1], categories, color = color[2])
    # labels on the first half of the circle are left aligned, the ones on the second half
    # right aligned; the first label of each half keeps its default alignment.
    # max(..., 1) avoids a division by zero when there is a single variable
    half = max(N // 2, 1)
    for i, tick_label in zip(range(N), ax.get_xticklabels()):
        if i % half != 0:
            if i // half == 0:
                tick_label.set_horizontalalignment("left")
            else:
                tick_label.set_horizontalalignment("right")

    # Draw ylabels
    if maxval is None:
        maxval = ceil(df.max().max()*10)/10
    # three inner rings: the 0 and maxval ends of the five-point grid are left out
    radial_ticks = np.linspace(0,maxval,5)[1:-1]
    ax.set_rlabel_position(0)
    # tick texts are taken from numpy's own printing of the array
    plt.yticks(radial_ticks, list(filter(None, str(radial_ticks).strip('[]').split(' '))), color="grey", size=7)
    plt.ylim(0,maxval)

    # ------- PART 2: Add plots

    # Plot each individual = each line of the data
    for row in range(2):
        values = df.iloc[row].values.flatten().tolist()
        values += values[:1] # close the polygon
        ax.plot(angles, values, color = color[row], linewidth=1, linestyle='solid', label=legend_label[row])
        ax.fill(angles, values, color = color[row], alpha=0.2)

    # Add a title made of the words shared by both item names
    plt.title(' '.join([w for w in words2 if (w in words1)]), size=14, color=color[3], y = 1.2)

    # Add legend
    plt.legend(bbox_to_anchor=(1.45, 1.15))

    # Save it
    suffix = '' if num is None else str(num+1)
    filename = legend_label[0]+'_'+legend_label[1]+suffix+'.png'
    plt.savefig(filename, dpi=96, bbox_inches='tight')
    plt.close()

In [None]:
#now we'll see how a food feature affects the food nutrients
#compare grilled chicken vs crispy chicken sandwiches on the min-max normalized data
df_minmax = normalize(df)
df_dummy = df_minmax.filter(like = 'Sandwich', axis = 0).copy()
df1 = df_dummy.filter(like = 'Grilled Chicken', axis = 0).copy()
df2 = df_dummy.filter(like = 'Crispy Chicken', axis = 0).copy()

#rows are paired by position, so only as many frames as the shorter selection
n_pairs = min(df1.shape[0], df2.shape[0])

#make one spider plot per pair
color = ['#3B3923', '#BCB076', '#8F4020', '#4C061D']
for i in range(n_pairs):
    pair = pd.concat([df1.iloc[[i]], df2.iloc[[i]]]).sort_index().droplevel(level = 0)
    spider_plot(pair, num = i, color = color, maxval = 0.6)

#reload the saved frames in order and stitch them into a looping gif (one second per frame)
images = [Image.open("Crispy_Grilled"+str(i+1)+".png") for i in range(n_pairs)]

images[0].save('Crispy_Grilled.gif', save_all=True, append_images=images[1:], optimize=False, duration=1000, loop=0)

In [None]:
#now we'll see how a food feature affects the food nutrients
#compare menus made with whole egg vs egg white on the min-max normalized data
df_minmax = normalize(df)
df_dummy = df_minmax.filter(like = 'Egg', axis = 0).droplevel(level = 0).copy()
df2 = df_dummy.filter(like = 'Egg White', axis = 0).copy()
df1 = df_dummy.drop(df2.index).copy() #everything with 'Egg' that is not 'Egg White'

#manual filter to drop uncomparable menus (rows picked by position)
df1 = df1.drop(df1.index[[0, 2, 7, -1]])
df2 = df2.iloc[1:9]

#rows are paired by position, so only as many frames as the shorter selection
n_pairs = min(df1.shape[0], df2.shape[0])

#make one spider plot per pair
color = ['#5B3758', '#83B692', '#C65B7C', '#F9888F']
for i in range(n_pairs):
    pair = pd.concat([df1.iloc[[i]], df2.iloc[[i]]]).sort_index()
    spider_plot(pair, num = i, label = ['Whole Egg', 'Egg White'], color = color, maxval = 0.6)

#reload the saved frames in order and stitch them into a looping gif (one second per frame)
images = [Image.open("Whole Egg_Egg White"+str(i+1)+".png") for i in range(n_pairs)]

images[0].save('Whole Egg_Egg White.gif', save_all=True, append_images=images[1:], optimize=False, duration=1000, loop=0)

In [None]:
#last, let's see how the features relate to each other with a scatter pairplot
#only features that are cross-correlated are plotted (fat with calories, calories with sodium, sodium with fat, etc)
#standardize first, then turn Category back into a regular column so it can be used as the hue
df_std = standardize(df).reset_index().set_index('Item')
pairplot_columns = ['Category', 'Protein', 'Calories', 'Total Fat', 'Sodium', 'Cholesterol', 'Carbohydrates']
sns_plot = sns.pairplot(df_std[pairplot_columns], kind="scatter", hue='Category', palette="Set2")
sns_plot.savefig("Feature_Pairplot.png")

In [None]:
#write the reduced table (index included) to csv so it can be analyzed further next time
output_csv_path = 'McDonald_Menu_Nutrient.csv'
df.to_csv(output_csv_path)