# <b>Food Recommendation Based on Emotion for College Students</b>

In [1]:
from pathlib import Path

import numpy as np
import pandas as pd

## Data Preparation for food dataset


In [2]:
# Load the complete survey file, then inspect it: column dtypes and non-null
# counts first, followed by the number of missing values in every column.
food_recommend = pd.read_csv('dataset/food_coded.csv')
food_recommend.info()
food_recommend.isna().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 125 entries, 0 to 124
Data columns (total 61 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   GPA                           123 non-null    object 
 1   Gender                        125 non-null    int64  
 2   breakfast                     125 non-null    int64  
 3   calories_chicken              125 non-null    int64  
 4   calories_day                  106 non-null    float64
 5   calories_scone                124 non-null    float64
 6   coffee                        125 non-null    int64  
 7   comfort_food                  124 non-null    object 
 8   comfort_food_reasons          123 non-null    object 
 9   comfort_food_reasons_coded    106 non-null    float64
 10  cook                          122 non-null    float64
 11  comfort_food_reasons_coded.1  125 non-null    int64  
 12  cuisine                       108 non-null    float64
 13  diet_

GPA                  2
Gender               0
breakfast            0
calories_chicken     0
calories_day        19
                    ..
type_sports         26
veggies_day          0
vitamins             0
waffle_calories      0
weight               2
Length: 61, dtype: int64

### Food dataframe

In [3]:
# Keep only the two comfort-food columns and give them readable names:
# comfort_food -> 'Food Types', comfort_food_reasons -> 'Emotions'.
food_recommend = pd.read_csv(
    'dataset/food_coded.csv',
    sep=',',
    usecols=['comfort_food', 'comfort_food_reasons'],
).rename(columns={'comfort_food': 'Food Types', 'comfort_food_reasons': 'Emotions'})
food_recommend


Unnamed: 0,Food Types,Emotions
0,none,we dont have comfort
1,"chocolate, chips, ice cream","Stress, bored, anger"
2,"frozen yogurt, pizza, fast food","stress, sadness"
3,"Pizza, Mac and cheese, ice cream",Boredom
4,"Ice cream, chocolate, chips","Stress, boredom, cravings"
...,...,...
120,"wine. mac and cheese, pizza, ice cream",boredom and sadness
121,Pizza / Wings / Cheesecake,Loneliness / Homesick / Sadness
122,"rice, potato, seaweed soup",sadness
123,"Mac n Cheese, Lasagna, Pizza","happiness, they are some of my favorite foods"


## Data cleaning: clean or fill NaN values in the 'comfort_food_reasons' and 'comfort_food' columns

#### The dataset contains some NaN values, which may be treated as numeric values rather than text. So we need to remove them or fill them with empty strings.

In [4]:
# Replace missing answers in both text columns with empty strings so every
# cell in them holds a str.
food_recommend = food_recommend.fillna({"Emotions": "", "Food Types": ""})

<div class="alert alert-block alert-warning">

## Data Processing with NLTK
- Apply NLTK stopwords to filter out common words. These are words used very frequently in a language (such as "the", "is", "in", "and") that are usually removed before text processing because they carry little meaning.
    - In this project, we not only remove common words but also extend the stop list with punctuation marks via `stop.update()`.
    - When processing each emotions_item, we split the text into individual words and remove these stopwords. This helps focus on the words that are more likely to carry meaning related to mood.
- Apply the NLTK lemmatizer to reduce words to their root. Lemmatization is the process of reducing a word to its base or dictionary form, so that different forms of a word are treated as the same item, which is useful for counting, searching, or categorizing.
    - In this project, WordNetLemmatizer is used to lemmatize each word in the "emotions" and "foods", so that every word is converted to its base form.
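    - Example: chips -> chip, fries -> fry. Note that with its default (noun) setting the lemmatizer leaves words such as "sadness" or "boredom" unchanged, so they are not reduced to "sad" or "bored".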

</div>

In [5]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

# Stop list: NLTK's English stop words plus standalone punctuation tokens and
# the empty string, so none of them is ever treated as a meaningful word.
stop = {
    *stopwords.words('english'),
    '.', ',', '"', "'", '?', '!', ':', ';', '(', ')', '[', ']', '{', '}', '',
}
lemmatizer = WordNetLemmatizer()
food_count = {}  # food name -> number of occurrences, filled in by preprocess_text

"""
    Finds the top comfort foods associated with a given mood.

    emotion (str): The emotion to search for.
    food_recommend (DataFrame): DataFrame containing 'Food Types' and 'Emotions' columns.

    Returns:
    list: A list of the top comfort foods for the given mood.
    
"""

# Text processing: map an emotion to the foods most often reported with it
def preprocess_text(emotion, food_recommend, top_n=10):
    """Return the comfort foods most often reported together with an emotion.

    Every row's 'Emotions' text is lower-cased, split on whitespace, stripped
    of surrounding '.' and ',', filtered against the module-level ``stop`` set
    and lemmatized with the module-level ``lemmatizer``. When the requested
    emotion is among a row's tokens, that row's comma-separated 'Food Types'
    entries are normalized the same way and tallied.

    The module-level ``food_count`` dict is reset at the start of every call
    and afterwards holds the tallies for this emotion only.

    Args:
        emotion (str): The emotion to search for, e.g. 'sad'. Matching is
            case-insensitive; both the raw and the lemmatized form are tried.
        food_recommend (DataFrame): Must contain 'Food Types' and 'Emotions'
            columns. Rows where either value is not a string are skipped.
        top_n (int): Maximum number of foods to return. Defaults to 10.

    Returns:
        list: Up to ``top_n`` food names, most frequent first (ties keep
        first-seen order). Empty when no row mentions the emotion.
    """
    query = emotion.strip().lower()
    # Row tokens are lemmatized, so accept the lemmatized query as well as the raw one.
    targets = {query, lemmatizer.lemmatize(query)}

    # Start from a clean tally; otherwise counts from earlier emotions leak
    # into this result.
    food_count.clear()

    # Iterate over the values directly rather than label-indexing with
    # range(len(...)), which only works on a default 0..n-1 index.
    for emotions_item, foods_item in zip(food_recommend["Emotions"], food_recommend["Food Types"]):
        # Missing answers are NaN (a float), not text: nothing to match or count.
        if not (isinstance(emotions_item, str) and isinstance(foods_item, str)):
            continue

        # PROCESS "Emotions"
        # Strip punctuation BEFORE the stop-word check so tokens such as "and," are removed.
        words = (word.strip('.,') for word in emotions_item.lower().split())
        emotions = {lemmatizer.lemmatize(word) for word in words if word not in stop}
        if targets.isdisjoint(emotions):
            continue

        # PROCESS "Food Types"
        for raw_food in foods_item.lower().split(','):
            food = raw_food.strip().strip('.,').strip()
            # Skip blanks (e.g. the trailing piece of "pizza, ") and stop words.
            if not food or food in stop:
                continue
            itemfood = lemmatizer.lemmatize(food)
            food_count[itemfood] = food_count.get(itemfood, 0) + 1

    # Most frequent first; sorted() is stable, so ties keep first-seen order.
    return sorted(food_count, key=food_count.get, reverse=True)[:top_n]


def food_result(emotion):
    """Print the most popular comfort foods for ``emotion``, one per line.

    The foods come from ``preprocess_text`` applied to the module-level
    ``food_recommend`` frame. A header line stating how many foods were found
    is printed first; when nothing matches, a single explanatory line is
    printed instead.

    Args:
        emotion (str): The emotion to search for, e.g. 'sad'.
    """
    top_foods = preprocess_text(emotion, food_recommend)
    if not top_foods:
        print(f"No comfort foods found for {emotion}.")
        return

    # Report the real number of matches: fewer than 10 foods may be available.
    print(f"{len(top_foods)} Popular Comfort Foods in {emotion} are:")
    for food in top_foods:
        print(food)



In [6]:
# Other queries tried while exploring. The trailing number is presumably how
# many foods were returned for that query (TODO confirm):
#   food_result('bored')         10
#   food_result('blue')          0
#   food_result('yellow')        0
#   food_result('satisfaction')  3
#   food_result('late')          3
food_result('sad')

10 Popular Comfort Foods in sad are:
ice cream
pizza
chip
cheeseburger
french fries
fry
cereal
cooky
chicken wings
pasta


# Data preparation for restaurant dataset
<a href="https://towardsdatascience.com/load-yelp-reviews-or-other-huge-json-files-with-ease-ad804c2f1537">Guide: loading Yelp reviews (or other huge JSON files)</a>

In [7]:
# Convert the Yelp business JSON-lines file to CSV once and reuse the CSV on
# later runs. Delete the CSV to force a fresh conversion.
business_json = Path('dataset/yelp_academic_dataset_business.json')
location_csv = Path('dataset/restaurant_location.csv')

if not location_csv.exists():
    if not business_json.exists():
        # Fail with an actionable message instead of a bare read_json error.
        raise FileNotFoundError(
            f"{business_json} not found: place the Yelp academic dataset in 'dataset/' "
            f"or provide a previously converted {location_csv}"
        )
    pd.read_json(business_json, lines=True).to_csv(location_csv, index=False)

restaurant_location = pd.read_csv(location_csv, sep=',')

# Keep only the location and rating columns, then look at one state.
# If the file is ever too large to load at once, read it with chunksize=...
# and concatenate the filtered chunks instead.
df = restaurant_location[['city', 'state', 'latitude', 'longitude', 'stars']]
df.query("`state`=='LA'")



FileNotFoundError: File dataset/yelp_academic_dataset_business.json does not exist

In [8]:
# The full review file (yelp_academic_dataset_review.json) is over 5 GB, so a
# single read_json call is not feasible. Stream it in chunks of 1000 records
# and keep only the columns and rows that are needed.
r_dtypes = {"stars": np.float16}
review_chunks = []

# Explicit UTF-8: the platform default encoding is not guaranteed to decode JSON text.
with open('dataset/yelp_academic_dataset_review.json', 'r', encoding='utf-8') as f:
    # A dedicated name keeps `df` from being rebound to a reader object.
    review_reader = pd.read_json(f, orient="records", lines=True, dtype=r_dtypes, chunksize=1000)

    for chunk in review_reader:
        # Drop unused columns; keep reviews dated 2020-12-01 or later with stars >= 3.9.
        reduced_chunk = (
            chunk.drop(columns=['review_id', 'user_id', 'useful', 'funny', 'cool'])
                 .query("`date` >= '2020-12-01'")
                 .query("`stars` >= 3.9")
        )
        review_chunks.append(reduced_chunk)

# Concatenate once at the end; the list and the final frame use separate names.
b_pandas = pd.concat(review_chunks, ignore_index=True)
b_pandas


FileNotFoundError: [Errno 2] No such file or directory: 'dataset/yelp_academic_dataset_review.json'