# Your Title Here

**Name(s)**: Emma Shamir, Meera Sharma

**Website Link**: https://eshamir3.github.io/Recipe_analzying-/

In [2]:
import pandas as pd
import numpy as np
from pathlib import Path

import plotly.express as px
pd.options.plotting.backend = 'plotly'

import scipy
import sklearn.linear_model

from dsc80_utils import * # Feel free to uncomment and use this.

interactions_fp = Path("food_data") /"RAW_interactions.csv"
interactions_raw = pd.read_csv(interactions_fp)

recipes_fp = Path("food_data") / "RAW_recipes.csv"
recipes_raw = pd.read_csv(recipes_fp)


## Step 1: Introduction

We chose to analyze the recipes and ratings dataframe. We will investigate the relationship between number of steps in a recipe and the recipe rating. This is an interesting question because it gives us insight into wether there is a relationship between the number of steps a user must go through in order to make a recipe, and their overall rating of the food. If there is a relationship, it would be interesting to understand what might underlie the differences in ratings. 

In [3]:
#merge datasets 

recipe_interactions = recipes_raw.merge(interactions_raw, left_on='id', right_on= "recipe_id", how = 'left').drop(columns = {'recipe_id'})
data_types = pd.DataFrame(recipe_interactions.dtypes)

#We replaced ratings of 0 with np.nan, since ratings are made on a 1-5 scale. Therefore, if a rating is 0, it means that the rating is missing,
#and should be excluded from the dataset in order to not bias calculations done on the ratings column. 
recipe_interactions['rating']= recipe_interactions['rating'].replace(0.0, np.nan)

#add a column with mean ratings 
recipe_interactions['avg_rating'] = recipe_interactions.groupby('id')['rating'].transform('mean')

#date submitted to datetime
recipe_interactions['submitted']= pd.to_datetime(recipe_interactions['submitted'])

#date created to datetime
recipe_interactions['date']= pd.to_datetime(recipe_interactions['date'])

# recipe_interactions = recipe_interactions[recipe_interactions["minutes"] <= 1_440] #cleaning the insane outliers

In [4]:
data_types

Unnamed: 0,0
name,object
id,int64
minutes,int64
...,...
date,object
rating,float64
review,object


In [5]:
#Univariate Analysis: distribution of ratings

frequency = recipe_interactions['rating'].value_counts().reset_index()
frequency.columns = ['rating', 'Frequency']

fig = px.bar(frequency, x='rating', y='Frequency', 
             title='Frequency of Rating in Recipe Dataset',
             labels={'rating': 'Rating', 'Frequency': 'Count'})

#fig.show()

#testing exporting it as an HTML file 

fig.write_html('rating_distributions.html', include_plotlyjs='cdn')

In [6]:
#I have made changes, now I pray that this shit works!!!!

## Step 2: Data Cleaning and Exploratory Data Analysis

In [7]:
# TODO
#need to do bivariate analyis 

fig_2 = px.scatter(recipe_interactions, x = "n_ingredients", y = "n_steps")
#fig_2.show()


In [8]:
#plotting rating vs time 

fig_3 = px.scatter(recipe_interactions, x = "n_steps", y = 'minutes')

#fig_3.show()

#we might want to clean this value because it might be a high leverage point 

## Step 3: Assessment of Missingness

In [9]:
# TODO
#look at missing rating 
recipe_interactions["rating"]

#check whether rating is missing depending on another column 

0         4.0
1         5.0
2         5.0
         ... 
234426    1.0
234427    5.0
234428    NaN
Name: rating, Length: 234429, dtype: float64

In [10]:
#testing whether rating is dependent on number of ingredients 
# ing_test_df = recipe_interactions.copy()
# ing_test_df["missing_rating"] = ing_test_df["rating"].isna()
# ing_test_df

# #use the difference of means as out test stat 
# #want to know whether average number of minutes when the rating is missing is the same as the average number of minutes when the rating
# #is not missing 
# group_means = ing_test_df.groupby("missing_rating")["n_ingredients"].mean()
# obs_stat = np.abs(group_means.loc[True] - group_means.loc[False])
# # print(obs_stat)


# n_repetitions = 1000 

# ing_test_diffs = []

# for i in range(n_repetitions): 
#     ing_test_df["shuffled_missing"] = np.random.permutation(ing_test_df["missing_rating"])
#     emp_ing_stat = ing_test_df.groupby("shuffled_missing")["n_ingredients"].mean()
#     emp_ing_stat = np.abs(emp_ing_stat.loc[True] - emp_ing_stat.loc[False])
#     ing_test_diffs.append(emp_ing_stat)

# ing_test_p_val = np.mean(ing_test_diffs >= obs_stat)
# ing_test_p_val

# # print(ing_test_diffs)

# signficance_level = 0.05 

# significant = ing_test_p_val < signficance_level

# print(f"p value: {ing_test_p_val}")
# print(f"significance level: {signficance_level}")
# print(f"is rating MAR dependent on number of ingredients? {significant}")


In [11]:
# min_test_df = recipe_interactions.copy()
# min_test_df["missing_rating"] = min_test_df["rating"].isna()

# #use the difference of means as out test stat 
# #want to know whether average number of minutes when the rating is missing is the same as the average number of minutes when the rating
# #is not missing 
# group_means = min_test_df.groupby("missing_rating")["minutes"].mean()
# obs_stat = np.abs(group_means.loc[True] - group_means.loc[False])
# obs_stat

# min_test_diffs = []

# for i in range(n_repetitions): 
#     min_test_df["shuffled_missing"] = np.random.permutation(min_test_df["missing_rating"])
#     emp_min_stat = min_test_df.groupby("shuffled_missing")["minutes"].mean()
#     emp_min_stat = np.abs(emp_min_stat.loc[True] - emp_min_stat.loc[False])
#     min_test_diffs.append(emp_min_stat)

# min_test_p_val = np.mean(min_test_diffs >= obs_stat)

# signficance_level = 0.05 

# significant = min_test_p_val < signficance_level

# print(f"p value: {min_test_p_val}")
# print(f"significance level: {signficance_level}")
# print(f"is rating MAR dependent on number of minutes? {significant}")

In [12]:
#need to find a column that it is not MAR on 
#try date? 

# date_test_df = recipe_interactions.copy()
# date_test_df["missing_rating"] = 

#use the difference of means as out test stat 
#want to know whether average number of minutes when the rating is missing is the same as the average number of minutes when the rating
#is not missing 
# group_means = date_test_df.groupby("missing_rating")["date"].mean()
# obs_stat = np.abs(group_means.loc[True] - group_means.loc[False])
# obs_stat
# print(obs_stat)

# date_test_diffs = []

# for i in range(n_repetitions): 
#     date_test_df["shuffled_missing"] = np.random.permutation(date_test_df["missing_rating"])
#     emp_date_stat = date_test_df.groupby("shuffled_missing")["date"].mean()
#     emp_date_stat = np.abs(emp_date_stat.loc[True] - emp_date_stat.loc[False])
#     date_test_diffs.append(emp_date_stat)

# # date_test_p_val = np.mean(date_test_diffs >= obs_stat)
# date_test_p_val = 0
# for diff in date_test_diffs: 
#     if diff >= obs_stat: 
#         date_test_p_val.append(1)

# date_test_p_val = date_test_p_val / len(date_test_diffs)

# signficance_level = 0.05 

# significant = date_test_p_val < signficance_level

# print(f"p value: {date_test_p_val}")
# print(f"significance level: {signficance_level}")
# print(f"is rating MAR dependent on date? {significant}")

## Step 4: Hypothesis Testing

In [23]:
recipe_interactions

#null hypothesis: The rating does not effect the number of steps 
#alternative hypothesis: The rating does have an effect on number of steps 

import scipy.stats as stats

df = recipe_interactions

# Assuming your dataset is already loaded into a DataFrame called `df`
# Ensure that the 'rating' and 'steps' columns are correctly named in the DataFrame
df = df.dropna(subset=['rating', 'n_steps'])  # Drop rows with missing values in 'rating' or 'steps'

# Group data by rating and extract the number of steps for each group
rating_groups = [df[df['rating'] == rating]['n_steps'] for rating in range(1, 6)]

# Perform one-way ANOVA test
f_stat, p_value = stats.f_oneway(*rating_groups)

f_stat

np.float64(49.82922822486933)

In [21]:
df['n_steps'].max()

np.int64(100)

In [34]:
# TODO
#null hypothesis: the number of steps does not affect the rating of a recipe
#alternative hypothesis: the number of steps does affect the rating of a recipe

#first, we want to drop the rows where the rating is na (this will not help us for our permutation test)

hyp_df = recipe_interactions.copy()
hyp_df = hyp_df.dropna(subset= "rating")

#test statistic: tvd 
#the absolute deviation between the mean of each group and the total mean, divided bty 2 
obs_group_means = hyp_df.groupby("n_steps")["rating"].mean()
uniform_dist = [recipe_interactions["rating"].mean()] * len(obs_group_means)
obs_tvd =np.sum(
    np.abs(
    obs_group_means - uniform_dist
    )
) / 2

print(f"observed tvd: {obs_tvd}")

#making a new column of shuffled number test]
test_stats = []

for i in range(1000):
    hyp_df["shuffled_n_steps"] = np.random.permutation(hyp_df["n_steps"])
    test_group_means = hyp_df.groupby("shuffled_n_steps")["rating"].mean()
    emp_tvd = np.sum(np.abs(test_group_means - uniform_dist)) / 2
    test_stats.append(emp_tvd)

p_val = np.mean(test_stats >= obs_tvd)
print(f"p-value for permutation test: {p_val}")


observed tvd: 5.809777965286577
p-value for permutation test: 0.327


## Step 5: Framing a Prediction Problem

In [35]:
# TODO

## Step 6: Baseline Model

In [36]:
#TO DO: need to do all the data cleaning here or up at the beginning just so that the training data and the test data are the same 

#the only database that we're using is the recipes database --> we don't care about ratings or sentiment 

cleaned_recipes = recipes_raw.copy()

# cleaned_recipes["minutes_category"] = cleaned_recipes["minutes"].apply()

In [37]:
# TODO
from sklearn.linear_model import LinearRegression
#baseline model features: the number of steps and the number of ingredients

X = recipe_interactions[["n_steps", "n_ingredients"]]
y = recipe_interactions["minutes"]

model = LinearRegression()
model.fit(X, y)

model.score(X, y)



0.0003023857957842324

In [38]:
#adding onto the baseline model 
#let's add a new column to our dataframe that tells us whether the meal is a dessert or not 

recipe_interactions['dessert'] = recipe_interactions["description"].fillna("").str.contains("dessert").apply(int)

X1 = recipe_interactions[["n_steps", "n_ingredients", "dessert"]]
y = recipe_interactions["minutes"]

model_1 = LinearRegression()
model_1.fit(X1, y)
model_1.score(X1, y)

0.000314618924943888

In [39]:
#rounding our predictions to the nearest multiple of 5 to see whether we get better accuracy --> because minutes of cook time is ususally rounded to the nearest multiple of 5 
from sklearn.metrics import r2_score
predictions_unrounded = model_1.predict(X1)

#to round: 

rounded_predictions = np.round(predictions_unrounded / 5) * 5 

#now to calculate the r^2 of this (to get the score)
score = r2_score(rounded_predictions, recipe_interactions["minutes"])
score

# how the heck is score negative 

-3174.877849507372

In [40]:
#lowkey trying a KNN classifier trying to predict rating 
from sklearn.neighbors import KNeighborsClassifier

test = recipe_interactions.dropna(subset=["rating"])

cat_X = test[["n_steps", "n_ingredients"]]
y = test["rating"]

cat_model = KNeighborsClassifier()

cat_model.fit(cat_X, y)
cat_model.score(cat_X, y)


0.7180174390249461

In [41]:
#Cross validation in order to find the best hyperparameters to include in our model

#do some feature engineering for this 

In [42]:
#ANOTHER IDEA: make minutes into a categorical column and then try to predict the category that a recipe's time is in

#examples of categories: Weekday (less than an hour), Weekend (1-3 hours), Holiday/Special Occasion (3-12 hours), Long Term Recipes (> 12 hours)

categorical_mins_model = KNeighborsClassifier(n_neighbors= 6)

def categorize_mins(time): 
    if time < 60: 
        return "Weekday"
    elif time < 180: 
        return "Weekend"
    elif time < 720: 
        return "Holiday/Special Occasion"
    else:
        return "Long Term Recipes"
    
y_train = recipe_interactions["minutes"].apply(categorize_mins)

X_train = recipe_interactions[["n_steps", "n_ingredients"]]

categorical_mins_model.fit(X_train, y_train)

categorical_mins_model.score(X_train, y_train)

0.6436063797567707

In [43]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(recipe_interactions[["n_steps", "n_ingredients"]], recipe_interactions["minutes"].apply(categorize_mins))

new_mdl = KNeighborsClassifier()

new_mdl.fit(X_train, y_train)

new_mdl.score(X_test, y_test)

0.6883701883701884

In [44]:
recipe_interactions.columns

num_cols = ["n_steps", "n_ingredients"]

# recipe_interactions['nutrition']

#to add nutrition values 

#the values in the nutrition column are strings of lists --> want to transform them into a list 

def clean_nutrition(stri): 
    result = stri.strip("[]")
    result = result.split(",")
    result = [float(ele) for ele in result]
    return result

recipe_interactions['nutrition'].apply(clean_nutrition)


def get_calories(lst): 
    return lst[0]

def get_sodium(lst): 
    return lst[3]

nutrition_added_df = recipe_interactions.copy()
nutrition_added_df["nutrition"] = nutrition_added_df["nutrition"].apply(clean_nutrition)
nutrition_added_df["calories"] = nutrition_added_df["nutrition"].apply(get_calories)
nutrition_added_df["sodium"] = nutrition_added_df["nutrition"].apply(get_sodium)

nutrition_added_df

#hypothesis: things with higher sodium will take longer time to make 

Unnamed: 0,name,id,minutes,contributor_id,...,avg_rating,dessert,calories,sodium
0,1 brownies in the world best ever,333281,40,985201,...,4.0,0,138.4,3.0
1,1 in canada chocolate chip cookies,453467,45,1848091,...,5.0,0,595.1,22.0
2,412 broccoli casserole,306168,40,50969,...,5.0,0,194.8,32.0
...,...,...,...,...,...,...,...,...,...
234426,cookies by design sugar shortbread cookies,298509,20,506822,...,3.0,0,174.9,4.0
234427,cookies by design sugar shortbread cookies,298509,20,506822,...,3.0,0,174.9,4.0
234428,cookies by design sugar shortbread cookies,298509,20,506822,...,3.0,0,174.9,4.0


In [45]:
X = nutrition_added_df[["n_ingredients", "n_steps", "dessert", "calories", "sodium"]]
y = nutrition_added_df["minutes"].apply(categorize_mins)

#making polynomial features 
#hypothesis: calories^2  or calories ^3 may predict the minutes better than just calories alone 

#iterative check which polynomial fit is the best for us for minutes. 

#doing cross validation manually 
# from sklearn.model_selection import cross_val_score
# from sklearn.preprocessing import PolynomialFeatures
# from sklearn.compose import ColumnTransformer
# from sklearn.pipeline import make_pipeline

# poly_degrees = range(1, 11)

# def eval_polynomial_col(X, y, poly_degrees, col, cv = 5): 

#     cv_dict = {"degree":[], "accuracy": []}

#     for deg in poly_degrees:

#         col_transform = ColumnTransformer(
#             transformers = [ ("poly", PolynomialFeatures(deg), [col])], remainder = "passthrough"
#         )

#         model = make_pipeline(col_transform, KNeighborsClassifier())

#         scores = cross_val_score(model, X, y, cv=cv, scoring='accuracy')

#         cv_dict["degree"].append(deg)
#         cv_dict["accuracy"].append(np.mean(scores))

#     return pd.DataFrame(cv_dict)



In [46]:
# calorie_cv_results = eval_polynomial_col(X, y, poly_degrees, "calories")

In [47]:
# calorie_cv_results["accuracy"].apply(type)

## Step 7: Final Model

In [48]:
# TODO

#only need to do cross validation for hyperparameters like n

#use grid search CV 

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

In [49]:
#FINAL MODEL

#transformers: 
#want to standardize calories and sodium 
#desert is one hot encoded already technically 
#using n_ingredients and n_steps as is 

final_col_transform = ColumnTransformer(
    transformers= [("std_cals", StandardScaler(), ["calories"]),("std_sodium", StandardScaler(), ['sodium']) 
                   ], remainder = "passthrough"
)

final_pipe = Pipeline([
    ("column_transformations", final_col_transform), ("model", KNeighborsClassifier())
    ])

In [50]:
feature_matrix = X
actual_vals = y 

X_final_train, X_final_split, y_final_train, y_final_split = train_test_split(feature_matrix, actual_vals)

final_pipe.fit(X_final_train, y_final_train)

final_pipe.score(X_final_split, y_final_split)

0.8326508326508326

In [51]:
#Grid Search CV in order to find the best hyperparameters for the KNN classifier 
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_neighbors': [3, 5, 7, 10],  # List of neighbors to try
    'weights': ['uniform', 'distance'],  # Weight options
    'metric': ['euclidean', 'manhattan']}  # Distance metric options 
grid_search = GridSearchCV(estimator= KNeighborsClassifier(), param_grid=param_grid, 
                           cv=5, scoring='accuracy', verbose=1)

grid_search.fit(X_final_train, y_final_train)


best_params = grid_search.best_params_
best_final_model = grid_search.best_estimator_

best_final_model.score(X_final_split, y_final_split)

Fitting 5 folds for each of 16 candidates, totalling 80 fits


0.9168543543543544

In [None]:
best_params

{'metric': 'euclidean', 'n_neighbors': 10, 'weights': 'distance'}

In [None]:
final_model_predicts = best_final_model.predict(X_final_split)

array(['Weekday', 'Weekday', 'Weekday', ..., 'Weekday', 'Weekday',
       'Weekend'], dtype=object)

## Step 8: Fairness Analysis

In [57]:
# TODO
X_final_split

Unnamed: 0,n_ingredients,n_steps,dessert,calories,sodium
80040,13,10,0,698.4,30.0
122306,8,12,0,446.1,4.0
189102,9,3,0,481.0,154.0
...,...,...,...,...,...
230554,3,2,0,160.4,1.0
201570,12,45,0,1124.9,20.0
179597,6,5,0,266.1,10.0


In [58]:
result = X_final_split
result["low_rating"] = result["rating"].apply(lambda x: x < 3) #saying that a rating less than 3 is a low rating 

result["predicted_label"] = final_model_predicts


KeyError: 'rating'