In [None]:
# Import basic packages
import pandas as pd
import numpy as np

In [None]:
# Read the recipe dataset from the Kaggle input directory and preview the first rows
df = pd.read_csv(
    '/kaggle/input/healthy-diet-recipes-a-comprehensive-dataset/All_Diets.csv',
    sep=',',
)
df.head()

In [None]:
# Check null: report, for each column, whether it holds at least one missing value
df.isna().any()

In [None]:
# Check duplicated: count rows that repeat an earlier row (True) versus the rest (False)
df.duplicated().value_counts()

In [None]:
# Show every row that belongs to a duplicate group;
# keep=False marks all copies, not only the repeated occurrences
df.loc[df.duplicated(keep=False)]

In [None]:
# Drop duplicated rows (the first occurrence of each is kept), then double check
# that no duplicates remain in the result
df_nodup = df.drop_duplicates(keep='first')
df_nodup.duplicated(keep='first').value_counts()

In [None]:
# Check Diet_type: share of recipes per diet type (normalize=True gives fractions, not counts)
df_nodup['Diet_type'].value_counts(normalize=True)

In [None]:
# Check Cuisine_type: share of recipes per cuisine type (normalize=True gives fractions, not counts)
df_nodup['Cuisine_type'].value_counts(normalize=True)

In [None]:
# Check Protein(g), Carbs(g) and Fat(g): summary statistics of the numeric columns
df_nodup.describe()

In [None]:
# Relationship between Diet_type and Protein(g),Carbs(g) and Fat(g)
import matplotlib
import matplotlib.pyplot as plt

In [None]:
# Box Plot of the three macronutrient columns over all de-duplicated recipes.
# The columns are selected directly: the former list(df[[col]].squeeze()) round-trip
# raised a TypeError on a one-row frame, where squeeze() returns a scalar.
plotdata = df_nodup[["Protein(g)", "Carbs(g)", "Fat(g)"]].reset_index(drop=True)

plt.figure(figsize =(11, 6))

# vert=0 draws the boxes horizontally; patch_artist=True makes them fillable patches
bplots = plt.boxplot(plotdata, vert = 0, patch_artist = True, labels=["Protein","Carbs","Fat"])

# One fill colour per box, with a shared gray outline
colors = ["cadetblue","darkslategray","skyblue"]
for box, fill in zip(bplots['boxes'], colors):
    box.set(color='gray', linewidth=2)
    box.set(facecolor=fill)

# Whiskers, caps and medians all get the same gray line style
for part in ('whiskers', 'caps', 'medians'):
    for line in bplots[part]:
        line.set(color='gray', linewidth=2)

plt.title("Boxplot for Nutritious Information", loc="center", fontsize=18)
plt.xlabel("Amount (in grams)")
plt.ylabel("Nutritious Information");

In [None]:
# Seaborn is used here for its global styling: select the "white" style
import seaborn as sns
sns.set_style("white")

In [None]:
# Nutrition Information: mean protein / carbs / fat per diet type
analise1 = df_nodup.groupby("Diet_type").aggregate({'Protein(g)':'mean','Carbs(g)':'mean','Fat(g)':'mean'})
analise1_frame = analise1.reset_index()

# Plot
# The plotting frame is derived from the aggregation itself so that every bar
# group is labelled with the diet type it was computed from. A hard-coded
# label list mislabels bars (or raises on a length mismatch) as soon as the
# set or order of diet types in the data changes.
plotdata = (
    analise1
    .rename(columns={"Protein(g)": "Protein", "Carbs(g)": "Carbs", "Fat(g)": "Fat"})
    .rename(index=str.capitalize)   # 'dash' -> 'Dash', 'keto' -> 'Keto', ...
    .rename_axis(None)              # no index name, as with the former explicit index
)

plotdata.plot(kind="bar", color=["cadetblue","darkslategray","skyblue"],figsize=(10, 6),rot=0)
plt.title("Diet Type X Nutrition Information",loc="center", fontsize=18)
plt.xlabel("Diet Type",fontsize="x-large")
plt.ylabel("Amount (in grams)", fontsize="x-large");

In [None]:
# Pairwise correlation coefficients between the three macronutrient columns
x, y, z = (df_nodup[col] for col in ('Protein(g)', 'Carbs(g)', 'Fat(g)'))
xyz = [x, y, z]

# np.corrcoef treats each entry of xyz as one variable; round to 2 decimals for display
corr_matrix = np.round(np.corrcoef(xyz), 2)
corr_matrix

In [None]:
# Heat map of corr_matrix with every coefficient written inside its cell
fig, ax = plt.subplots()
im = ax.imshow(corr_matrix)
im.set_clim(-1, 1)  # fix the colour scale to the interval [-1, 1]
ax.grid(False)

nutrient_labels = ('Protein', 'Carbs', 'Fat')
tick_positions = (0, 1, 2)
ax.xaxis.set(ticks=tick_positions, ticklabels=nutrient_labels)
ax.yaxis.set(ticks=tick_positions, ticklabels=nutrient_labels)
ax.set_ylim(2.5, -0.5)

# Row index i goes on the y axis and column index j on the x axis
for i, j in np.ndindex(3, 3):
    ax.text(j, i, corr_matrix[i, j], ha='center', va='center', color='b')

cbar = fig.colorbar(im, ax=ax, format='% .2f')

In [None]:
# Recipes ranked by fat content within each diet type.
# NOTE: despite its name, rec_low_fat holds every recipe, sorted ascending by
# fat inside each diet type (the name is kept in case other cells read it).
rec_low_fat = (
    df_nodup
    .groupby(["Diet_type", "Recipe_name", "Cuisine_type"])
    .aggregate({'Fat(g)': 'min'})
    .reset_index()
    .sort_values(by=['Diet_type', 'Fat(g)'])
)

# Top 3 recipes with the highest fat for each diet type.
# Because of the ascending sort these are the LAST three rows of each diet;
# the former slice [-4:-1] dropped the single highest-fat recipe and returned
# ranks 2-4 instead.
dash_hf = rec_low_fat[rec_low_fat['Diet_type'] == 'dash'].tail(3)
med_hf = rec_low_fat[rec_low_fat['Diet_type'] == 'mediterranean'].tail(3)
vegan_hf = rec_low_fat[rec_low_fat['Diet_type'] == 'vegan'].tail(3)
keto_hf = rec_low_fat[rec_low_fat['Diet_type'] == 'keto'].tail(3)
paleo_hf = rec_low_fat[rec_low_fat['Diet_type'] == 'paleo'].tail(3)

# Prepare Data
# NOTE(review): this rebinds `df`, which until this cell is the raw CSV frame.
# Later cells that read `df` therefore only see these top-3 rows. The
# rebinding is kept for compatibility; confirm that this is intended.
df = pd.concat([med_hf, dash_hf, vegan_hf, keto_hf, paleo_hf]).reset_index()

# Draw plot: one group of horizontal bars per diet type, labelled for the legend
plt.figure(figsize=(16,10), dpi= 90)
for top3, bar_color, legend_label in (
    (med_hf, "cadetblue", "Mediterranean"),
    (dash_hf, "darkslategray", "Dash"),
    (vegan_hf, "skyblue", "Vegan"),
    (keto_hf, "dodgerblue", "keto"),
    (paleo_hf, "lightskyblue", "Paleo"),
):
    plt.hlines(y=top3['Recipe_name'], xmin=0, xmax=top3['Fat(g)'],
               color=bar_color,
               alpha=0.8, linewidth=10, label=legend_label)

# Decorations
# Only fontsize is passed: the former fontdict={'size':20} was overridden by
# the fontsize keyword anyway, so the rendered title is unchanged.
plt.title('Top 3 highest Fat Recipe Name', fontsize="x-large", weight="bold")
plt.grid(linestyle='--', alpha=0.1)
plt.xlabel('Unit(g)',fontsize="x-large")
plt.ylabel('Recipe Name',fontsize="x-large")
plt.legend(loc ="lower right");


## Tokenization

Divide a paragraph into sentences, and sentences into words (called tokens).

In [None]:
# Import tokenize
# sent_tokenize / word_tokenize are function-style tokenizers;
# WordPunctTokenizer is a tokenizer class.
import nltk
from nltk.tokenize import sent_tokenize
# Fetch the 'punkt' resource through the NLTK downloader.
# NOTE(review): recent NLTK releases appear to look up 'punkt_tab' for
# word_tokenize instead — confirm against the installed NLTK version.
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from nltk.tokenize import WordPunctTokenizer

In [None]:
# Set the diet type into separate dataset
# NOTE(review): `df` was rebound in the high-fat plotting cell above, so these
# subsets contain only the rows of that concatenated frame — confirm intended.
# .copy() makes each subset an independent frame, so adding columns to it does
# not raise pandas' SettingWithCopyWarning.
med = df[df['Diet_type'] == 'mediterranean'].copy()
dash = df[df['Diet_type'] == 'dash'].copy()
vegan = df[df['Diet_type'] == 'vegan'].copy()
keto = df[df['Diet_type'] == 'keto'].copy()
paleo = df[df['Diet_type'] == 'paleo'].copy()

# Split each recipe name into word tokens with word_tokenize; the result is
# stored as one list of tokens per row in the new 'wt' column.
for diet_frame in (med, dash, vegan, keto, paleo):
    diet_frame['wt'] = [word_tokenize(name) for name in diet_frame['Recipe_name']]

In [None]:
# Word Punctuation Tokenizer
from nltk.tokenize import WordPunctTokenizer

tokenizer = WordPunctTokenizer()

# Tokenize every recipe name with the Word Punct Tokenizer (punctuation is
# split off as separate tokens) and store the token lists in column 'wt2'
for diet_frame in (med, dash, vegan, keto, paleo):
    diet_frame['wt2'] = [tokenizer.tokenize(name) for name in diet_frame['Recipe_name']]

## REGEX

Use a regular expression to drop punctuation.

In [None]:
from nltk.tokenize import RegexpTokenizer
from nltk.tokenize import regexp_tokenize

# With gaps=False the pattern describes the tokens themselves: r'\w+' keeps
# runs of word characters, so punctuation and whitespace are left out.
tokenizerRE = RegexpTokenizer(r'\w+', gaps = False)

In [None]:
# Apply the regex tokenizer to every recipe name (punctuation is dropped) and
# store the resulting token lists in column 're'
for diet_frame in (med, dash, vegan, keto, paleo):
    diet_frame['re'] = [tokenizerRE.tokenize(name) for name in diet_frame['Recipe_name']]
paleo

## STOP WORDS

Common words that usually carry little meaning on their own (e.g. "the", "and", "with").

In [None]:
from nltk.corpus import stopwords
# Fetch the stop-word corpus through the NLTK downloader
nltk.download('stopwords')
# English stop words as a set, used for membership tests when filtering tokens
english_stops = set(stopwords.words('english'))

In [None]:
def _content_words(token_lists, stop_words):
    """Flatten token lists into lower-cased words, leaving out stop words.

    Each token is lower-cased BEFORE the stop-word test. Testing the original
    casing let capitalised tokens such as "With" or "And" slip through, because
    the stop-word collection is compared case-sensitively.

    Parameters
    ----------
    token_lists : iterable of list of str
        One list of tokens per recipe name.
    stop_words : collection of str
        Words to discard; expected in lower case.

    Returns
    -------
    list of str
        Lower-cased tokens in their original order, without stop words.
    """
    return [
        word
        for tokens in token_lists
        for word in (token.lower() for token in tokens)
        if word not in stop_words
    ]


# Apply stop word to column with regex applied
med_words = _content_words(med['re'], english_stops)
dash_words = _content_words(dash['re'], english_stops)
vegan_words = _content_words(vegan['re'], english_stops)
keto_words = _content_words(keto['re'], english_stops)
paleo_words = _content_words(paleo['re'], english_stops)
paleo_words

## Stemming

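Stemming reduces each word to its stem by stripping affixes; the Porter stemmer used below removes common English suffixes (e.g. "roasted" → "roast").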

In [None]:
from nltk.stem import PorterStemmer
from nltk.stem import RegexpStemmer

# Porter stemmer instance used to stem the per-diet word lists
porter_stemmer = PorterStemmer()

In [None]:
# Apply Stemming: replace every word of each per-diet list with its Porter stem.
# The lists are overwritten under the same names, so running this cell again
# stems the already-stemmed words once more.
med_words = list(map(porter_stemmer.stem, med_words))
dash_words = list(map(porter_stemmer.stem, dash_words))
vegan_words = list(map(porter_stemmer.stem, vegan_words))
keto_words = list(map(porter_stemmer.stem, keto_words))
paleo_words = list(map(porter_stemmer.stem, paleo_words))
keto_words

In [None]:
# Word Cloud 
import wordcloud
from wordcloud import WordCloud

In [None]:
# Gather the per-diet word lists into one list (Mediterranean, DASH, vegan, keto, paleo)
text_aux = [med_words, dash_words, vegan_words, keto_words, paleo_words]

In [None]:
# Concatenate all words of all diet types into one string for the word cloud.
# A single join replaces the former repeated `text = text + ...` concatenation
# (quadratic in the number of words) and its no-op ''.join(word) on a string.
# The result is character-for-character the same: a leading space, then every
# word preceded by one space.
text = ' ' + ''.join(' ' + word for words in text_aux for word in words)
text

In [None]:
# Create and generate a word cloud image:
# NOTE: the name `wordcloud` is rebound here from the imported module to the
# WordCloud instance.
wordcloud = WordCloud(
    max_font_size=50,
    max_words=100,
    background_color="white",
).generate(text)

# Display the generated image without axes:
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")