In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [None]:
# Load the McDonald's menu nutrient dataset from CSV into df_menu
df_menu = pd.read_csv('McDonald_Menu_Nutrient.csv')

# The Code Below Evaluates Which Machine Learning Algorithm Best Suits the First Dataset: McDonald's Menu Nutrient Info

In [None]:
# Take a first look at the main dataset (McDonald's menu nutrients): show the first rows
df_menu.head()

In [None]:
'''
What can we do with this dataset? As far as we can tell, the features are highly correlated,
so predicting one menu nutrient from the others seems possible, but that is not the problem we want to explore.
Looking at the dataset, clustering could be done, but it would only be good as an exercise:
the menu items are already categorized, so clustering is unnecessary.
Another option is to sort the items into new categories such as junk food, healthy food, etc.,
but again, we want to go deeper.

How about making menu recommendations with a prediction model? Some people follow a diet plan
but still want to cheat on it now and then and enjoy everything good in the restaurant. Since we have this dataset,
we can advise people on their diet menu, whether they are young, teens, adults or old; men or women; people who like to exercise,
do hard physical work, or just lie in bed. People have daily nutrient needs, and
we would like to help them choose from the menu without worrying about exceeding those needs.

As a first step, let's prepare the data and train a model on the dataset. The output
will be the menu item. Since the menu items belong to categories, we will use the category as an input too, so the user
is free to choose whether or not they want, for example, smoothies & shakes in their dietary program.

After choosing the machine learning algorithm, we will add another dataset so that we can connect user information
to this menu nutrient dataset and make recommendations.
'''

In [None]:
def correlation(DataFrame, top_n = None, method = 'spearman', remove_duplicates = True, remove_self_correlations = True):
    """
    Compute the absolute pairwise feature correlations and rank the feature pairs by them.

    :DataFrame -> The dataframe with the predictor variables
    :type DataFrame: pandas.core.frame.DataFrame
    :top_n -> Top N feature pairs to be reported (if None, all of the pairs will be returned)
    :method -> Correlation computation method, passed straight to DataFrame.corr
    :type method: str
    :remove_duplicates -> If True, every unordered pair is reported once (A/B, but not B/A as well);
                          'Feature 1' is then always the feature that comes first in the column order
    :type remove_duplicates: bool
    :remove_self_correlations -> If True, the pairing of a feature with itself is left out
    :type remove_self_correlations: bool

    :return: pandas.core.frame.DataFrame with the columns 'Feature 1', 'Feature 2' and 'Correlation'
             (absolute value), sorted by decreasing correlation
    """
    corr_matrix_abs = DataFrame.corr(method=method).abs()
    feature_names = corr_matrix_abs.columns.values
    n_features = len(feature_names)

    # Select matrix cells by position instead of relying on the sort order:
    # the previous "take every other row" trick silently lost the last unique pair
    # and only worked if A/B and B/A happened to end up next to each other.
    row_pos, col_pos = np.indices((n_features, n_features))
    keep = np.ones((n_features, n_features), dtype=bool)

    # Remove comparisons of the same feature (the diagonal)
    if remove_self_correlations:
        keep &= row_pos != col_pos

    # Remove duplicates: the matrix is symmetric, so the upper triangle holds every pair once
    if remove_duplicates:
        keep &= row_pos <= col_pos

    # Create meaningful names for the columns
    sorted_correlated_features = pd.DataFrame({
        'Feature 1': feature_names[row_pos[keep]],
        'Feature 2': feature_names[col_pos[keep]],
        'Correlation': corr_matrix_abs.values[keep],
    }, columns=['Feature 1', 'Feature 2', 'Correlation'])

    # A stable sort keeps tied pairs in column order, so the result is reproducible
    sorted_correlated_features = sorted_correlated_features \
        .sort_values('Correlation', kind="mergesort", ascending=False) \
        .reset_index(drop=True)

    if top_n:
        return sorted_correlated_features[:top_n]

    return sorted_correlated_features

In [None]:
def standardize(DataFrame):
    """Return a copy of DataFrame whose columns were rescaled with sklearn's StandardScaler.

    The scaler is fitted on the frame it transforms; the result keeps the
    index and column labels of the input.
    """
    scaled_values = StandardScaler().fit_transform(DataFrame)
    return pd.DataFrame(
        scaled_values,
        index=DataFrame.index.copy(),
        columns=DataFrame.columns.copy(),
    )

def normalize(DataFrame):
    """Return a copy of DataFrame whose columns were rescaled with sklearn's MinMaxScaler.

    The scaler is fitted on the frame it transforms; the result keeps the
    index and column labels of the input.
    """
    scaled_values = MinMaxScaler().fit_transform(DataFrame)
    return pd.DataFrame(
        scaled_values,
        index=DataFrame.index.copy(),
        columns=DataFrame.columns.copy(),
    )

In [None]:
#First we one-hot encode the category and concatenate the result with our dataset.
#The guard makes the cell safe to re-run: once 'Category' has moved into the index there is nothing left to encode.
if 'Category' in df_menu.columns:
    df_menu = pd.concat([df_menu, pd.get_dummies(df_menu['Category'])], axis = 1)

    #Item is a unique id, so it is not wise to one-hot encode it too.
    #Binning the items into a few groups would defeat our purpose, since we want the exact item name, not an approximation.
    #Items have no natural order (unlike e.g. sizes), but we can still rank them by their features.
    #Here we rank by calories; ties are broken with the other features, most correlated with calories first.
    df_menu = df_menu.set_index(['Category', 'Item'])

#Scale the features only: 'Product' (created below) is left out so that a re-run sees the same input as the first run
features = df_menu.drop(columns = 'Product', errors = 'ignore')
df_minmax = normalize(features)
df_std = standardize(features)

#looking for highly correlated feature pairs
corr = correlation(df_minmax, method = 'spearman') #spearman, because the data is numerical and mostly ordinal/discrete
print("\n" + "\033[94m"+ "\033[1m" + "Feature Corelation" + "\033[0m" + "\n")
#show every pair with a strong (but not perfect) correlation; the trailing reset stops the colour bleeding into later output
print("\033[92m"+ "\033[1m" + "{}".format(corr[(corr['Correlation'] >= 0.667) & (corr['Correlation'] < 1.0)]) + "\033[0m")

#Sort keys for the ranking: Calories first, then its partner features by decreasing correlation.
#A pair may list Calories on either side, so both columns are checked and the *other* feature is taken.
calorie_pairs = corr[(corr['Feature 1'] == 'Calories') | (corr['Feature 2'] == 'Calories')]
partners = np.where(calorie_pairs['Feature 1'] == 'Calories', calorie_pairs['Feature 2'], calorie_pairs['Feature 1'])
col_name = ['Calories'] + [name for name in pd.unique(partners) if name != 'Calories']

#Rank the items inside each category (1 = lowest calories).
#Only the new 'Product' column is written; the nutrient rows stay attached to their own item labels.
category = df_menu.index.get_level_values('Category').values
ordered = df_menu[col_name].reset_index(drop = True).sort_values(col_name) #positional labels, so duplicate item names cannot misalign
product = ordered.groupby(category[ordered.index.values]).cumcount() + 1
df_menu['Product'] = product.sort_index().values #back to the original row order

In [None]:
#Keep the nutrient columns plus 'Product' and leave out the one-hot category columns.
#The dummy columns are named after the categories, so they are identified by name
#instead of assuming a fixed number of trailing columns.
category_columns = set(df_menu.index.get_level_values('Category'))
col_name = [name for name in df_menu.columns if name not in category_columns]
df_minmax = normalize(df_menu[col_name])
#looking for correlated feature pairs, now including the product rank
corr = correlation(df_minmax, method = 'spearman') #spearman, because the data is numerical and mostly ordinal/discrete
print("\n" + "\033[94m"+ "\033[1m" + "Feature Corelation" + "\033[0m" + "\n")
#show every pair with a moderate-to-strong (but not perfect) correlation
print("\033[92m"+ "\033[1m" + "{}".format(corr[(corr['Correlation'] >= 0.5) & (corr['Correlation'] < 1.0)]) + "\033[0m")

In [None]:
"""
After looking at the correlations, let's create the machine learning model.
Since our aim is to make menu recommendations, we need a predictive model such as a regression or a neural network.

Let's start with the easier one, a regression model.
Because there are many independent variables, we need a multiple linear regression model.
We choose a linear model over a non-linear one because the independent variables are strongly correlated with each other,
and the product rank correlates well with some of the features. After that, let's compare
the model trained on the normalized dataset with the model trained on the standardized dataset.

Next, let's try modelling with a neural network and make the same comparison as before.
We also compare the machine learning algorithms used for modelling. The best one will have the smallest MSE.
"""

In [None]:
#multiple linear regression to predict the product rank, using the normalized (min-max) dataset
df_minmax = normalize(df_menu)
col_name = df_minmax.columns.values.copy().tolist()
x = df_minmax[col_name[:-1]] # predictors: every column except the last one
y = df_minmax['Product']     # target: the (scaled) product rank

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=4)

# with sklearn
regr = LinearRegression()
regr.fit(x_train, y_train)
print("\033[95m"+ "\033[1m" + "Means Squared Error when Training using Normalize Dataset" + "\033[0m" + "\n")
print("\033[94m"+ "\033[1m" + "Intercept: \n" + "\033[36m"+ "\033[1m" + "{}".format(regr.intercept_) + "\033[0m" + "\n")
print("\033[94m"+ "\033[1m" + "Coefficients: \n" + "\033[36m"+ "\033[1m" + "{}".format(regr.coef_) + "\033[0m" + "\n")

# prediction with sklearn: score the whole held-out set in one vectorized call
# (passing the DataFrame also keeps the feature names the model was fitted with)
Product_pred = regr.predict(x_test)
print("\033[94m"+ "\033[1m" + "MSE: \n" + "\033[36m"+ "\033[1m" + "{}".format(mean_squared_error(y_test, Product_pred)) + "\033[0m" + "\n")

In [None]:
#multiple linear regression to predict the product rank, using the standardized dataset
df_std = standardize(df_menu)
col_name = df_std.columns.values.copy().tolist()
x = df_std[col_name[:-1]] # predictors: every column except the last one
y = df_std['Product']     # target: the (scaled) product rank

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=4)

# with sklearn
regr = LinearRegression()
regr.fit(x_train, y_train)
print("\033[93m"+ "\033[1m" + "Means Squared Error when Training using Standardize Dataset" + "\033[0m" +"\n")
print("\033[91m"+ "\033[1m" + "Intercept: \n" + "\033[92m"+ "\033[1m" + "{}".format(regr.intercept_) + "\033[0m" + "\n")
print("\033[91m"+ "\033[1m" + "Coefficients: \n" + "\033[92m"+ "\033[1m" + "{}".format(regr.coef_) + "\033[0m" + "\n")

# prediction with sklearn: score the whole held-out set in one vectorized call
# (passing the DataFrame also keeps the feature names the model was fitted with)
Product_pred = regr.predict(x_test)
print("\033[91m"+ "\033[1m" + "MSE: \n" + "\033[92m"+ "\033[1m" + "{}".format(mean_squared_error(y_test, Product_pred)) + "\033[0m" + "\n")

In [None]:
#neural network to predict the product rank, using the normalized (min-max) dataset
df_minmax = normalize(df_menu)

#select features for the independent variables (every column except the last one)
col_name = df_menu.columns[:-1].values.copy().tolist()
x = np.asarray(df_minmax[col_name])

#select feature for the dependent variable
col_name = 'Product'
y = np.asarray(df_minmax[col_name])

#split data for training and testing once: the split is seeded, so it is the same for every architecture
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=4)

hlayer_row = []
hlayer_col = []
train_mse = []
test_mse = []

#grid search over two hidden layers with 1..20 neurons each
for i in np.arange(1, 21):
    for j in np.arange(1, 21):
        hlayer_row.append(i)
        hlayer_col.append(j)
        #Train Model and Predict
        net = MLPRegressor(solver='lbfgs', alpha=1e-5,hidden_layer_sizes=(i, j), random_state=1)
        net.fit(x_train, y_train)

        train_mse.append(mean_squared_error(y_train, net.predict(x_train)))
        test_mse.append(mean_squared_error(y_test, net.predict(x_test)))

df_net_minmax=pd.DataFrame({'Hidden Layer Row': hlayer_row, 'Hidden Layer Column': hlayer_col, 'Train MSE': train_mse, 'Test MSE': test_mse})
print("\033[94m"+ "\033[1m" + "Means Squared Error when Training using Normalize Dataset" + "\033[0m" + "\n")
#show the five architectures with the lowest test error
print("\033[36m"+ "\033[1m" + "{}".format(df_net_minmax.sort_values(['Test MSE', 'Train MSE']).head()) + "\033[0m")

In [None]:
#neural network to predict the product rank, using the standardized dataset
df_std = standardize(df_menu)

#select features for the independent variables (every column except the last one)
col_name = df_menu.columns[:-1].values.copy().tolist()
x = np.asarray(df_std[col_name])

#select feature for the dependent variable
col_name = 'Product'
y = np.asarray(df_std[col_name])

#split data for training and testing once: the split is seeded, so it is the same for every architecture
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=4)

hlayer_row = []
hlayer_col = []
train_mse = []
test_mse = []

#grid search over two hidden layers with 1..20 neurons each
for i in np.arange(1, 21):
    for j in np.arange(1, 21):
        hlayer_row.append(i)
        hlayer_col.append(j)
        #Train Model and Predict
        net = MLPRegressor(solver='lbfgs', alpha=1e-5,hidden_layer_sizes=(i, j), random_state=1)
        net.fit(x_train, y_train)

        train_mse.append(mean_squared_error(y_train, net.predict(x_train)))
        test_mse.append(mean_squared_error(y_test, net.predict(x_test)))

df_net_std=pd.DataFrame({'Hidden Layer Row': hlayer_row, 'Hidden Layer Column': hlayer_col, 'Train MSE': train_mse, 'Test MSE': test_mse})
print("\033[91m"+ "\033[1m" + "Means Squared Error when Training using Standardize Dataset" + "\033[0m" +"\n")
#show the five architectures with the lowest test error
print("\033[92m"+ "\033[1m" + "{}".format(df_net_std.sort_values(['Test MSE', 'Train MSE']).head()) + "\033[0m")

# The Code Below Prepares the Additional Datasets Needed to Make Food Recommendations That Fulfil Daily Nutrient Needs

In [None]:
'''
As seen above, the normalized dataset gives better performance in model training.
Also, even though the task can be learned with linear regression, the neural network still gives better predictions,
because its results have the smallest error. We also found the training parameters that fit best for the neural network:
two hidden layers of sizes (9, 20).

Now let's prepare the other datasets to make deployment easier. We do not expect users to enter their nutrient needs
one by one, so we predict those too. We need to know the calorie needs based on the user's age, sex, and activity level.
After that, we need to derive the other nutrient needs when we only know the user's calories. Based on what we did before,
linear regression on the correlated nutrients provides the solution.
'''

In [None]:
#let's import the datasets
#these two datasets are used to predict the user's calorie needs (one table per sex)
df_fcal = pd.read_csv('Female_Calories_Needs.csv')
df_mcal = pd.read_csv('Male_Calories_Needs.csv')

#this dataset is used to predict the user's needs for nutrients other than calories
#NOTE(review): a later cell re-reads df_nut from 'dietary.csv', which replaces this frame - confirm which file is the intended source
df_nut = pd.read_csv('Daily_Nutrients_Needs.csv')

In [None]:
# Print the first rows of the male table (blue) and the female table (red), both in bold
print('\033[94m' + '\033[1m' + '{}'.format(df_mcal.head()) + '\033[0m' + '\n')
print('\033[91m' + '\033[1m' + '{}'.format(df_fcal.head()) + '\033[0m')

In [None]:
# Show the first rows of the daily nutrient needs table
df_nut.head()

In [None]:
def prepare(DataFrame, bin_column = 'Age'):
    """Turn a raw calorie-needs table into a long table of calories per age bin and activity level.

    The first column of the input holds the age labels and every further column
    holds the calories for one activity level; all values may be strings with
    thousands separators. The input frame is not modified.

    :DataFrame -> raw table: age column first, then one column per activity level
    :type DataFrame: pandas.core.frame.DataFrame
    :bin_column -> name of the age column after capitalizing the first column name
    :type bin_column: str

    :return: pandas.core.frame.DataFrame with the columns '<bin_column>_Bin', 'Activity'
             (0, 1, 2, ... in input column order) and 'Calories' (integer mean of the bin)
    """
    #work on a copy so the caller's frame keeps its original columns and values
    frame = DataFrame.copy()

    #Correcting column names: capitalized age column, activity levels numbered from 0
    n_activity = frame.shape[1] - 1
    frame.columns = [str(frame.columns[0]).capitalize()] + list(range(n_activity))

    #Correcting age value: keep the leading number of the label ("19-20" -> "19", "76 and up" -> "76")
    frame[bin_column] = frame[bin_column].astype(str).str.extract(r'(\d+)', expand = False)

    #remove comma separator
    frame = frame.apply(lambda col: col.astype(str).str.replace(',', '', regex = False))

    #data type correction
    frame = frame.astype('int64')

    #binning age category: edges 1, 6, 11, ..., 96 give right-closed bins labelled 1, 2, 3, ...
    bins = np.arange(1,100,5)
    labels = np.arange(1, len(bins)) #one label per interval, i.e. one fewer than the number of edges
    frame[bin_column+'_Bin'] = pd.cut(frame[bin_column], bins=bins, labels=labels)
    frame[bin_column+'_Bin'] = frame[bin_column+'_Bin'].astype('int64')

    #average the calories of all ages that fall into the same bin
    frame = frame.groupby(bin_column+'_Bin').mean()

    #data type correction (the means are truncated to whole numbers)
    frame = frame.astype('int64')

    #drop redundant column
    frame = frame.drop(bin_column, axis = 1)

    #creating activity category: one row per (age bin, activity level)
    frame = pd.DataFrame(frame.stack()).reset_index()
    frame = frame.rename(columns={'level_1': 'Activity', 0: 'Calories'})

    return frame

In [None]:
# Reshape the male calorie table into (Age_Bin, Activity, Calories) rows and preview it.
# NOTE: df_mcal is rebound to the prepared table, so re-running this cell requires reloading the CSV first.
df_mcal = prepare(df_mcal)
df_mcal.head()

In [None]:
# Reshape the female calorie table into (Age_Bin, Activity, Calories) rows and preview it.
# NOTE: df_fcal is rebound to the prepared table, so re-running this cell requires reloading the CSV first.
df_fcal = prepare(df_fcal)
df_fcal.head()

In [None]:
def stacking(DataFrame, num_stack, col_name):
    """Fold the labels of the last `num_stack` columns into a single column named `col_name`.

    All other columns become the row key. Every key row is repeated once per
    stacked column, the label of that column is stored under `col_name`, and
    the stacked cell values themselves are discarded.
    """
    key_columns = DataFrame.columns[:-num_stack].tolist()

    #wide -> long: one row per (key, stacked column label)
    long_frame = pd.DataFrame(DataFrame.set_index(key_columns).stack()).reset_index()

    #the second-to-last column carries the stacked column labels, the last one (named 0) the values
    label_column = long_frame.columns[-2]
    long_frame = long_frame.rename(columns={label_column: col_name})

    return long_frame.drop(0, axis = 1)

In [None]:
# Combine the female and male calorie tables into one table with an explicit Sex flag
# (0 = female, 1 = male). Each table contributes its own rows, so the calories never
# depend on both tables listing the same age bins in the same order, and neither
# input frame is modified.
calorie_columns = ['Age_Bin', 'Activity', 'Calories']
df_cal = pd.concat(
    [df_fcal[calorie_columns].assign(Sex=0), df_mcal[calorie_columns].assign(Sex=1)],
    ignore_index=True,
)

# Sex first, then sort so the rows read female -> male, young -> old, sedentary -> active
df_cal = df_cal[['Sex'] + calorie_columns]
df_cal = df_cal.sort_values(['Sex', 'Age_Bin', 'Activity']).reset_index(drop=True)
df_cal.head()

In [None]:
"""
Footnote:
Calories are in kcal.
Sex is 0 for female and 1 for male.
Age_Bin comes from pd.cut with the edges 1, 6, 11, ..., 96 (right-closed bins): 1 is for ages 2-6, 2 for ages 7-11, and so on;
an open-ended group such as "51+" is binned by its starting age.
Activity is 0 for sedentary, 1 for moderately active, and 2 for active.

Next we prepare the daily nutrient needs data for people on a diet.
"""

In [None]:
# Read the dietary reference table and flip it so that each former column becomes a row
df_nut = pd.read_csv('dietary.csv').T

# The first row of the flipped table holds the nutrient names: use it as the column header
header_row = df_nut.iloc[0]
df_nut = df_nut.rename(columns=header_row)

# Discard the first two rows, including the one that was just promoted to the header
df_nut = df_nut.drop(df_nut.index[:2])

In [None]:
#check whether the dataset has missing values and collect the columns that are mostly empty
MISSING_THRESHOLD = 0.33 #drop a column when more than 33% of its values are missing

missing_share = df_nut.isnull().mean() #fraction of missing values per column (0.0 - 1.0)
col_name = [] #names of the columns that need to be dropped
for column, share in missing_share.items():
    if share > 0:
        #the share is a fraction, so scale it to a percentage before printing it with a % sign
        print("{}: {:.2f}% data is missing".format(column, share * 100))
        if share > MISSING_THRESHOLD:
            col_name.append(column)

In [None]:
#drop unused nutrient info
#col_name holds the columns flagged as mostly missing; per the original note these include the
#category headings (macronutrient, minerals, vitamins), which are empty because of the file layout.
#the nutrients listed here are dropped as well because we do not need them.
unused_nutrients = [
    'Protein,\xa0%\xa0kcal', 'Carbohydrate, %\xa0kcal', 'Saturated fat, %\xa0kcal',
    'Linoleic acid, g', 'Linolenic acid, g',
    'Magnesium, mg', 'Phosphorus, mg', 'Potassium, mg', 'Zinc, mg',
    'Copper, mcg', 'Manganese, mg', 'Selenium, mcg',
    'Vitamin E, mg\xa0AT', 'Vitamin D, IU',
    'Thiamin, mg', 'Riboflavin, mg', 'Niacin, mg',
    'Vitamin B6, mg', 'Vitamin B12, mcg',
    'Choline, mg', 'Vitamin K, mcg', 'Folate, mcg\xa0DFE',
]
df_nut = df_nut.drop(columns = col_name + unused_nutrients)

#turn the index into a regular first column, then give every column a clean name
df_nut = df_nut.reset_index()
col_name = [
    'Sex_Age',
    'Calories (kcal)', 'Protein (g)', 'Carbohydrate (g)', 'Dietary fiber (g)',
    'Sugars (% kcal)', 'Total Fat (% kcal)',
    'Calcium (mg)', 'Iron (mg)', 'Sodium (mg)',
    'Vitamin A (mcg)', 'Vitamin C (mg)',
]
df_nut.columns = col_name

In [None]:
#Clean the daily nutrient needs table: strip formatting characters, split ranges into
#lower/upper bounds, split the combined sex/age label, bin the ages and convert everything to numbers.

#remove the thousands separator
df_nut['Calcium (mg)'] = df_nut['Calcium (mg)'].str.replace(',', '')
df_nut['Sodium (mg)'] = df_nut['Sodium (mg)'].str.replace(',', '')

#remove the '<' and '%' characters so only the number is left
df_nut['Sugars (% kcal)'] = df_nut['Sugars (% kcal)'].str.replace('<', '').str.replace('%', '')

#split the fat range on '-' into a lower and an upper bound
df_dummy = df_nut['Total Fat (% kcal)'].str.split('-', expand = True)
df_dummy.columns = ['Fat_lo', 'Fat_up']
df_nut = pd.concat([df_nut, df_dummy], axis = 1)

#split the combined label on the space into sex and age range
df_dummy = df_nut['Sex_Age'].str.split(' ', expand = True)
df_dummy.columns = ['Sex', 'Age']
df_nut = pd.concat([df_nut, df_dummy], axis = 1)

#duplicate the first row and relabel the two 'Child' rows as Female and Male
#NOTE(review): DataFrame.append was removed in pandas 2.0 - this line needs pd.concat there
df_nut = df_nut.append(df_nut.iloc[0].copy(), ignore_index = True)
#NOTE(review): chained assignment; df_nut.loc[mask, 'Sex'] = ... is the supported form
df_nut['Sex'][df_nut['Sex'] == 'Child'] = ['Female', 'Male']
#move the appended copy (last row) up to the second position
row = df_nut.index.values.copy().tolist()
row = [row[0], row[-1]] + row[1:-1]
df_nut = df_nut.iloc[row]
df_nut.drop('Sex_Age', axis = 1, inplace = True)
df_nut = df_nut.set_index('Sex')

#split the age range on '+', '|' or '-' into a lower and an upper bound
df_dummy = df_nut['Age'].str.split('[+|-]', expand = True)
df_dummy.columns = ['Age_lo', 'Age_up']
#the last two rows get '51' as upper bound; presumably these are the open-ended "51+" groups whose upper bound is empty - TODO confirm
df_dummy.iloc[-2:, df_dummy.columns.get_loc('Age_up')] = df_dummy.iloc[-2:, df_dummy.columns.get_loc('Age_up')].str.replace('', '51')
df_nut = pd.concat([df_nut, df_dummy], axis = 1)

#lower/upper calorie bound: the first and the last five characters of the text, without the thousands separator
df_nut['Calories_lo'] = df_nut['Calories (kcal)'].str[:5].copy()
df_nut['Calories_up'] = df_nut['Calories (kcal)'].str[-5:].copy()
df_nut['Calories_lo'] = df_nut['Calories_lo'].str.replace(',', '')
df_nut['Calories_up'] = df_nut['Calories_up'].str.replace(',', '')

#remove the letter 'b' from the calcium values (presumably a footnote marker - TODO confirm)
df_nut['Calcium (mg)'] = df_nut['Calcium (mg)'].str.replace('b', '')

#columns to overwrite (name) and the substring used to find their source columns (sub)
name = ['Calories (kcal)', 'Protein (g)', 'Total Fat (% kcal)', 'Age']
sub = ['Calories', 'Protein', 'Fat', 'Age']

#convert every *_lo / *_up helper column to integers
col_name = df_nut.filter(like = '_lo', axis = 1).columns.values.copy().tolist()
col_name += df_nut.filter(like = '_up', axis = 1).columns.values.copy().tolist()
df_nut[col_name] = df_nut[col_name].astype('int64')

#replace each column in `name` by the row-wise mean of the last two columns whose label contains the substring;
#for Calories, Fat and Age those are the *_lo / *_up bounds
#NOTE(review): only 'Protein (g)' contains 'Protein', so that mean is taken over the column itself - confirm this is intended
for i in range(len(name)):
    df_nut[name[i]] = df_nut[df_nut.filter(like = sub[i], axis = 1).columns.values.copy().tolist()[-2:]].mean(axis=1)
    
#the helper bound columns are no longer needed
df_nut.drop(col_name, axis = 1, inplace = True)

#binning age category (bin edges 1, 6, 11, ..., 96)
bins = np.arange(1,100,5)
#NOTE(review): len(bins-1) equals len(bins); arange(1, len(bins)) still yields one label per interval
labels = np.arange(1,len(bins-1))
df_nut['Age_Bin'] = pd.cut(df_nut['Age'], bins=bins, labels=labels)
df_nut['Age_Bin'] = df_nut['Age_Bin'].astype('int64')

#put Age_Bin first and leave out the last two columns of the old order ('Age' and 'Age_Bin' itself)
col_name = df_nut.columns.values.copy().tolist()
col_name = [col_name[-1]] + col_name[0:-2]
df_nut = df_nut[col_name]

#encode sex as a number
df_nut = df_nut.reset_index()
df_nut['Sex'] = df_nut['Sex'].map({'Female': 0, 'Male': 1})

#dietary fiber stays a float, every other column becomes an integer
df_nut['Dietary fiber (g)'] = df_nut['Dietary fiber (g)'].astype('float64')
df_nut[df_nut.columns.difference(['Dietary fiber (g)'])] = df_nut[df_nut.columns.difference(['Dietary fiber (g)'])].astype('int64')
df_nut.head()
#df.to_csv('Diet_Prepared.csv', index = False)

In [None]:
#Saving Files
#the menu table is written together with its index
df_menu.to_csv('McDonald_Menu_Nutrients_Clean.csv')
#the calorie needs table is written without its index
df_cal.to_csv('Daily_Calory_Needs_Clean.csv', index = False)
#NOTE(review): written with the index, unlike df_cal above - confirm the extra index column is wanted
df_nut.to_csv('Daily_Nutrients_Needs_Clean.csv')

""""
#age binning info
    :bins = np.arange(1,100,5)
    :labels = np.arange(1,len(bins-1))
    :DataFrame[bin_column+'_Bin'] = pd.cut(DataFrame[bin_column], bins=bins, labels=labels)

#sex -> 0 = Female; 1 = Male

#Activities -> 0: Sedentary
               1: Moderately Active
               2: Active

#use this if you need some creativity with text output
class color:
   PURPLE = '\033[95m'
   CYAN = '\033[96m'
   DARKCYAN = '\033[36m'
   BLUE = '\033[94m'
   GREEN = '\033[92m'
   YELLOW = '\033[93m'
   RED = '\033[91m'
   BOLD = '\033[1m'
   UNDERLINE = '\033[4m'
   END = '\033[0m'

print(color.BOLD + 'Hello World !' + color.END)
"""