# Your Title Here

**Name(s)**: Emma Shamir, Meera Sharma

**Website Link**: https://eshamir3.github.io/Recipe_analzying-/

In [2]:
import pandas as pd
import numpy as np
from pathlib import Path

import plotly.express as px
pd.options.plotting.backend = 'plotly'

import scipy
import sklearn.linear_model

from dsc80_utils import * # Feel free to uncomment and use this.

interactions_fp = Path("food_data") /"RAW_interactions.csv"
interactions_raw = pd.read_csv(interactions_fp)

recipes_fp = Path("food_data") / "RAW_recipes.csv"
recipes_raw = pd.read_csv(recipes_fp)


## Step 1: Introduction

We chose to analyze the recipes and ratings dataframe. We will investigate the relationship between number of steps in a recipe and the recipe rating. This is an interesting question because it gives us insight into wether there is a relationship between the number of steps a user must go through in order to make a recipe, and their overall rating of the food. If there is a relationship, it would be interesting to understand what might underlie the differences in ratings. 

In [120]:
#merge datasets 

recipe_interactions = recipes_raw.merge(interactions_raw, left_on='id', right_on= "recipe_id", how = 'left').drop(columns = {'recipe_id'})
data_types = pd.DataFrame(recipe_interactions.dtypes)

#We replaced ratings of 0 with np.nan, since ratings are made on a 1-5 scale. Therefore, if a rating is 0, it means that the rating is missing,
#and should be excluded from the dataset in order to not bias calculations done on the ratings column. 
recipe_interactions['rating']= recipe_interactions['rating'].replace(0.0, np.nan)

#add a column with mean ratings 
recipe_interactions['avg_rating'] = recipe_interactions.groupby('id')['rating'].transform('mean')

#date submitted to datetime
recipe_interactions['submitted']= pd.to_datetime(recipe_interactions['submitted'])

#date created to datetime
recipe_interactions['date']= pd.to_datetime(recipe_interactions['date'])

# recipe_interactions = recipe_interactions[recipe_interactions["minutes"] <= 1_440] #cleaning the insane outliers

In [5]:
#Univariate Analysis: distribution of ratings

frequency = recipe_interactions['rating'].value_counts().reset_index()
frequency.columns = ['rating', 'Frequency']

fig = px.bar(frequency, x='rating', y='Frequency', 
             title='Frequency of Rating in Recipe Dataset',
             labels={'rating': 'Rating', 'Frequency': 'Count'})

#fig.show()

#testing exporting it as an HTML file 

fig.write_html('rating_distributions.html', include_plotlyjs='cdn')

In [6]:
#I have made changes, now I pray that this shit works!!!!

## Step 2: Data Cleaning and Exploratory Data Analysis

In [7]:
# TODO
#need to do bivariate analyis 

fig_2 = px.scatter(recipe_interactions, x = "n_ingredients", y = "n_steps")
#fig_2.show()


In [8]:
#plotting rating vs time 

fig_3 = px.scatter(recipe_interactions, x = "n_steps", y = 'minutes')

#fig_3.show()

#we might want to clean this value because it might be a high leverage point 

## Step 3: Assessment of Missingness

In [152]:
#testing whether missingness of description is dependent on number of ingredients 
missing_test_df = recipe_interactions.copy()
missing_test_df["missing_description"] = missing_test_df['description'].isna()



group_means = missing_test_df.groupby("missing_description")["n_ingredients"].mean()
obs_stat = np.abs(group_means.loc[True] - group_means.loc[False])


n_repetitions = 1000 

missing_test_diffs = []

for i in range(n_repetitions): 
     missing_test_df["shuffled_missing"] = np.random.permutation(missing_test_df["missing_description"])
     description_stat = missing_test_df.groupby("shuffled_missing")["n_ingredients"].mean()
     description_stat = np.abs(description_stat.loc[True] - description_stat.loc[False])
     missing_test_diffs.append(description_stat)

missing_test_p_val = np.mean(missing_test_diffs >= obs_stat)


signficance_level = 0.05 

significant = missing_test_p_val < signficance_level

print(f"p value: {missing_test_p_val}")
print(f"significance level: {signficance_level}")
print(f"is description MAR dependent on number of ingredients? {significant}")


p value: 0.005
significance level: 0.05
is description MAR dependent on number of ingredients? True


In [149]:
#testing whether missingness of description is dependent on number of length of review
missing_test_df = recipe_interactions.copy()
missing_test_df["missing_description"] = missing_test_df['description'].isna()


missing_test_df['len_review'] = missing_test_df['review'].str.len()
group_means = missing_test_df.groupby("missing_description")["len_review"].mean()
obs_stat = np.abs(group_means.loc[True] - group_means.loc[False])


n_repetitions = 1000 

missing_test_diffs = []

for i in range(n_repetitions): 
     missing_test_df["shuffled_missing"] = np.random.permutation(missing_test_df["missing_description"])
     description_stat = missing_test_df.groupby("shuffled_missing")["len_review"].mean()
     description_stat = np.abs(description_stat.loc[True] - edescription_stat.loc[False])
     missing_test_diffs.append(emp_ing_stat)

missing_test_p_val = np.mean(missing_test_diffs >= obs_stat)


signficance_level = 0.05 

significant = missing_test_p_val < signficance_level

print(f"p value: {missing_test_p_val}")
print(f"significance level: {signficance_level}")
print(f"is rating MAR dependent on number of ingredients? {significant}")

p value: 0.888
significance level: 0.05
is rating MAR dependent on number of ingredients? False


## Step 4: Hypothesis Testing

In [13]:
recipe_interactions

#null hypothesis: The rating does not effect the number of steps 
#alternative hypothesis: The rating does have an effect on number of steps 

import scipy.stats as stats

df = recipe_interactions

# Assuming your dataset is already loaded into a DataFrame called `df`
# Ensure that the 'rating' and 'steps' columns are correctly named in the DataFrame
df = df.dropna(subset=['rating', 'n_steps'])  # Drop rows with missing values in 'rating' or 'steps'

# Group data by rating and extract the number of steps for each group
rating_groups = [df[df['rating'] == rating]['n_steps'] for rating in range(1, 6)]

# Perform one-way ANOVA test
f_stat, p_value = stats.f_oneway(*rating_groups)

f_stat

np.float64(49.82922822486933)

In [14]:
df['n_steps'].max()

np.int64(100)

In [15]:
# TODO
#null hypothesis: the number of steps does not affect the rating of a recipe
#alternative hypothesis: the number of steps does affect the rating of a recipe

#first, we want to drop the rows where the rating is na (this will not help us for our permutation test)

hyp_df = recipe_interactions.copy()
hyp_df = hyp_df.dropna(subset= "rating")

#test statistic: tvd 
#the absolute deviation between the mean of each group and the total mean, divided bty 2 
obs_group_means = hyp_df.groupby("n_steps")["rating"].mean()
uniform_dist = [recipe_interactions["rating"].mean()] * len(obs_group_means)
obs_tvd =np.sum(
    np.abs(
    obs_group_means - uniform_dist
    )
) / 2

print(f"observed tvd: {obs_tvd}")

#making a new column of shuffled number test]
test_stats = []

for i in range(1000):
    hyp_df["shuffled_n_steps"] = np.random.permutation(hyp_df["n_steps"])
    test_group_means = hyp_df.groupby("shuffled_n_steps")["rating"].mean()
    emp_tvd = np.sum(np.abs(test_group_means - uniform_dist)) / 2
    test_stats.append(emp_tvd)

p_val = np.mean(test_stats >= obs_tvd)
print(f"p-value for permutation test: {p_val}")


observed tvd: 5.809777965286577
p-value for permutation test: 0.316


## Step 5: Framing a Prediction Problem

In [16]:
# TODO

## Step 6: Baseline Model

In [17]:
#TO DO: need to do all the data cleaning here or up at the beginning just so that the training data and the test data are the same 

def categorize_mins(time): 
    if time < 60: 
        return "Weekday"
    elif time < 180: 
        return "Weekend"
    elif time < 720: 
        return "Holiday/Special Occasion"
    else:
        return "Long Term Recipes"
    
def clean_nutrition(stri): 
    result = stri.strip("[]")
    result = result.split(",")
    result = [float(ele) for ele in result]
    return result


def get_calories(lst): 
    return lst[0]

def get_sodium(lst): 
    return lst[3]


model_recipe_interactions = recipe_interactions.copy()
model_recipe_interactions["minutes_category"] = model_recipe_interactions["minutes"].apply(categorize_mins)
model_recipe_interactions['is_dessert'] = model_recipe_interactions["description"].fillna("").str.contains("dessert").apply(int) #fix this so that it's whether the tag containes dessert
model_recipe_interactions["nutrition"] = model_recipe_interactions["nutrition"].apply(clean_nutrition)
model_recipe_interactions["calories"] = model_recipe_interactions["nutrition"].apply(get_calories)
model_recipe_interactions["sodium"] = model_recipe_interactions["nutrition"].apply(get_sodium)

model_recipe_interactions.head()


Unnamed: 0,name,id,minutes,contributor_id,...,minutes_category,is_dessert,calories,sodium
0,1 brownies in the world best ever,333281,40,985201,...,Weekday,0,138.4,3.0
1,1 in canada chocolate chip cookies,453467,45,1848091,...,Weekday,0,595.1,22.0
2,412 broccoli casserole,306168,40,50969,...,Weekday,0,194.8,32.0
3,412 broccoli casserole,306168,40,50969,...,Weekday,0,194.8,32.0
4,412 broccoli casserole,306168,40,50969,...,Weekday,0,194.8,32.0


In [18]:
#creating the training and testing data sets 

from sklearn.model_selection import train_test_split

lin_X_train, lin_X_test, lin_y_train, lin_y_test = train_test_split(model_recipe_interactions.drop(columns = ["minutes"]), model_recipe_interactions["minutes"])

In [21]:
# TODO
from sklearn.linear_model import LinearRegression
#baseline model features: the number of steps and the number of ingredients

lin1_X_train = lin_X_train[["n_ingredients", 'n_steps']]
lin1_y_train = lin_y_train 

lin1_X_test = lin_X_test[["n_ingredients", "n_steps"]]
lin1_y_test = lin_y_test 

model = LinearRegression()
model.fit(lin1_X_train, lin1_y_train)

model.score(lin1_X_test, lin1_y_test)



0.0017035631246492544

In [22]:
#adding onto the baseline model 
#let's add a new column to our dataframe that tells us whether the meal is a dessert or not 

lin2_X_train = lin_X_train[["n_ingredients", "n_steps", "is_dessert"]]
lin2_y_train = lin_y_train

lin2_X_test = lin_X_test[["n_ingredients", "n_steps", "is_dessert"]]
lin2_y_test = lin_y_test

model_2 = LinearRegression()
model_2.fit(lin2_X_train, lin2_y_train)
model_2.score(lin2_X_test, lin2_y_test)

0.0016451063019666812

In [24]:
#rounding our predictions to the nearest multiple of 5 to see whether we get better accuracy --> because minutes of cook time is ususally rounded to the nearest multiple of 5 
from sklearn.metrics import r2_score
predictions_unrounded = model_2.predict(lin2_X_test)

#to round: 

rounded_predictions = np.round(predictions_unrounded / 5) * 5 

#now to calculate the r^2 of this (to get the score)
score = r2_score(rounded_predictions,lin2_y_test)
score

# how the heck is score negative 

-163.55122109497322

0.7180174390249461

In [31]:
#ANOTHER IDEA: make minutes into a categorical column and then try to predict the category that a recipe's time is in

#examples of categories: Weekday (less than an hour), Weekend (1-3 hours), Holiday/Special Occasion (3-12 hours), Long Term Recipes (> 12 hours)

#in order to do this, need to make new test and training data because we're now trying to predict a categorical variable instead of a numerical one 

cat_X_train, cat_X_test, cat_y_train, cat_y_test = train_test_split(model_recipe_interactions.drop(columns= ["minutes_category"]), model_recipe_interactions["minutes_category"])

cat_X_train.head()

Unnamed: 0,name,id,minutes,contributor_id,...,avg_rating,is_dessert,calories,sodium
221097,tuscan ribolitta with garlic parmesan croutons,517502,40,318262,...,5.0,0,414.7,76.0
97622,green papaya salad ala bobby flay,286558,32,546010,...,5.0,0,253.5,58.0
56344,colombo curry,318787,50,914943,...,4.0,0,299.5,25.0
38155,carob cocoa,300053,8,37449,...,5.0,0,226.1,5.0
216828,tom ka kai thai coconut chicken soup,303281,35,713116,...,4.5,0,478.4,60.0


In [34]:

#new base model: categorize minutes and make a KNN classifier
from sklearn.neighbors import KNeighborsClassifier

cat1_X_train = cat_X_train[["n_ingredients", "n_steps"]]
cat1_y_train = cat_y_train 

cat1_X_test = cat_X_test[["n_ingredients", "n_steps"]]
cat1_y_test = cat_y_test
new_mdl = KNeighborsClassifier()

new_mdl.fit(cat1_X_train, cat1_y_train)

new_mdl.score(cat1_X_test, cat1_y_test)

0.7000238875238876

Unnamed: 0,name,id,minutes,contributor_id,...,avg_rating,dessert,calories,sodium
0,1 brownies in the world best ever,333281,40,985201,...,4.0,0,138.4,3.0
1,1 in canada chocolate chip cookies,453467,45,1848091,...,5.0,0,595.1,22.0
2,412 broccoli casserole,306168,40,50969,...,5.0,0,194.8,32.0
...,...,...,...,...,...,...,...,...,...
234426,cookies by design sugar shortbread cookies,298509,20,506822,...,3.0,0,174.9,4.0
234427,cookies by design sugar shortbread cookies,298509,20,506822,...,3.0,0,174.9,4.0
234428,cookies by design sugar shortbread cookies,298509,20,506822,...,3.0,0,174.9,4.0


In [None]:
# calorie_cv_results = eval_polynomial_col(X, y, poly_degrees, "calories")

In [None]:
# calorie_cv_results["accuracy"].apply(type)

## Step 7: Final Model

In [39]:
# TODO

#only need to do cross validation for hyperparameters like n

#use grid search CV 

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

In [40]:
#FINAL MODEL

#transformers: 
#want to standardize calories and sodium 
#desert is one hot encoded already technically 
#using n_ingredients and n_steps as is 

final_col_transform = ColumnTransformer(
    transformers= [("std_cals", StandardScaler(), ["calories"]),("std_sodium", StandardScaler(), ['sodium']) 
                   ], remainder = "passthrough"
)

final_pipe = Pipeline([
    ("column_transformations", final_col_transform), ("model", KNeighborsClassifier())
    ])

In [41]:
final_X_train = cat_X_train[["n_ingredients", "n_steps", "calories", "sodium", "is_dessert"]]
final_y_train = cat_y_train

final_X_test = cat_X_test[["n_ingredients", "n_steps", "calories", "sodium", "is_dessert"]]
final_y_test = cat_y_test



final_pipe.fit(final_X_train, final_y_train)

final_pipe.score(final_X_test, final_y_test)

0.8308933933933934

In [42]:
#Grid Search CV in order to find the best hyperparameters for the KNN classifier 
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_neighbors': [3, 5, 7, 10],  # List of neighbors to try
    'weights': ['uniform', 'distance'],  # Weight options
    'metric': ['euclidean', 'manhattan']}  # Distance metric options 
grid_search = GridSearchCV(estimator= KNeighborsClassifier(), param_grid=param_grid, 
                           cv=5, scoring='accuracy', verbose=1)

grid_search.fit(final_X_train, final_y_train)


best_params = grid_search.best_params_
best_final_model = grid_search.best_estimator_

best_final_model.score(final_X_train, final_y_train)

Fitting 5 folds for each of 16 candidates, totalling 80 fits


0.9998009339043686

In [None]:
best_params

{'metric': 'manhattan', 'n_neighbors': 10, 'weights': 'distance'}

In [44]:
final_model_predicts = best_final_model.predict(final_X_test)

In [45]:
best_final_model.score(final_X_test, final_y_test)

0.9171444171444172

## Step 8: Fairness Analysis

In [54]:
# TODO
fairness_df = cat_X_test.copy()
fairness_df["minutes_category"] = cat_y_test
fairness_df["low_rating"] = fairness_df["rating"].apply(lambda x: x < 3)

fairness_df["predicted_label"] = final_model_predicts

fairness_df.columns




Index(['name', 'id', 'minutes', 'contributor_id', 'submitted', 'tags',
       'nutrition', 'n_steps', 'steps', 'description', 'ingredients',
       'n_ingredients', 'user_id', 'date', 'rating', 'review', 'avg_rating',
       'is_dessert', 'calories', 'sodium', 'minutes_category', 'low_rating',
       'predicted_label'],
      dtype='object')

In [56]:
from sklearn.metrics import accuracy_score

get_accuracy = lambda x: accuracy_score(x['predicted_label'], x['minutes_category'])

obs_stat = (fairness_df
       .groupby('low_rating')
       [['predicted_label', 'minutes_category']]
       .apply(get_accuracy)
       .diff()
       .iloc[-1])

diff_in_acc = []
for _ in range(500):
    s = (
        fairness_df[['low_rating', 'predicted_label', 'minutes_category']]
        .assign(is_low=np.random.permutation(fairness_df['low_rating']))
        .groupby('is_low')
        [['predicted_label', 'minutes_category']]
        .apply(get_accuracy)
        .diff()
        .iloc[-1]
    )
    
    diff_in_acc.append(s)


p_val = np.mean(diff_in_acc >= obs_stat)
p_val

significant = p_val >= 0.05

print(f"{p_val} is our p_value")
print(f"model fair?: {significant}")

0.974 is our p_value
model fair?: True
