In [39]:
import pandas as pd
from pathlib import Path
import hvplot.pandas
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.decomposition import PCA

from tqdm.notebook import tqdm
import time
pd.options.display.max_rows = 100

In [49]:
#load dataset
file_path = Path("./data/raw/recipes.csv")
# file_path = Path("./data/raw/recipes_sample.csv") #For quick calculation
df = pd.read_csv(file_path)

## Data Cleanup

In [112]:
#dataset cleanup
df.dropna(subset=['AggregatedRating'], axis=0, inplace=True)
df.dropna(subset=['RecipeInstructions'], axis=0, inplace=True)
df.dropna(subset=['RecipeCategory'], axis=0, inplace=True)
df.drop(index=df[df['RecipeCategory'].str.lower().str.contains('dessert')].index,inplace=True)
temp = df['Images']=='character(0)'
df.drop(index=df[temp].index,inplace=True)
# dfdf.set_index
# df1.shape

In [126]:
df.shape

(123613, 28)

In [128]:
#Export filtered dataset
df.to_csv('./data/raw/recipes_filtered.csv',index=False)


In [129]:
#Rating count
df["AggregatedRating"].value_counts()


AggregatedRating
5.0    86059
4.5    18917
4.0    14530
3.0     2118
3.5     1401
2.0      253
2.5      163
1.0      160
1.5       12
Name: count, dtype: int64

In [130]:
df.shape

(123613, 28)

## Keyword Dummy generation

In [131]:
#String Splitting of each cell in order to run the Dummy code
keywords=df["Keywords"]

temp =keywords.str.split("\"",expand=True)
dummiesk=pd.get_dummies(temp,dtype=int)
dummiesk.columns=dummiesk.columns.str.lstrip('0123456789_').str.lower()

In [132]:
pd.DataFrame(dummiesk.columns).reset_index()

Unnamed: 0,index,0
0,0,
1,1,c(
2,2,< 15 mins
3,3,< 30 mins
4,4,< 4 hours
...,...,...
2250,2250,inexpensive
2251,2251,)
2252,2252,","
2253,2253,inexpensive


In [133]:
#Merging duplicate dummy columns converting all into binary
duplicate_rows=pd.DataFrame(dummiesk.columns).reset_index()
duplicate_rows2=duplicate_rows.copy()

for j in tqdm(duplicate_rows.index):
    try:
        drop_cols=[]
        for k in duplicate_rows2.index:
            if k<(j+1):continue
            if duplicate_rows.loc[j,0]==duplicate_rows.loc[k,0]:
                dummiesk.iloc[:,j]=dummiesk.iloc[:,j]+dummiesk.iloc[:,k]
                drop_cols.append(k) ,
        duplicate_rows2.drop(index=drop_cols, inplace=True)
    except:
        continue
dummiesk = dummiesk.iloc[:,duplicate_rows2.index]
dummiesk.drop(columns=['c(', ')', '', ', '], inplace=True)

  0%|          | 0/2255 [00:00<?, ?it/s]

In [134]:
#Export dummy file
df_trunc=pd.concat([df, dummiesk],axis=1)
df_trunc.to_csv('./data/dummies/Keyword_dummy.csv', index=False)

In [135]:
df_trunc.shape

(123613, 317)

## Ingredient Dummy Generation

In [136]:
#Similar flow to Keyword flow. 
#String Splitting of each cell in order to run the Dummy code
ingredients=df["RecipeIngredientParts"]

temp =ingredients.str.split("\"",expand=True)
dummies=pd.get_dummies(temp,dtype=int)
dummies.columns=dummies.columns.str.lstrip('0123456789_').str.lower()

In [137]:
pd.DataFrame(dummies.columns).reset_index()

Unnamed: 0,index,0
0,0,
1,1,c(
2,2,character(0)
3,3,% fat buttermilk
4,4,% fat cottage cheese
...,...,...
39812,39812,","
39813,39813,white pepper
39814,39814,","
39815,39815,chinese five spice powder


In [138]:
#Merging duplicate dummy columns converting all into binary
duplicate_rows=pd.DataFrame(dummies.columns).reset_index()
duplicate_rows2=duplicate_rows.copy()

for j in tqdm(duplicate_rows.index):
    try:
        drop_cols=[]
        for k in duplicate_rows2.index:
            if k<(j+1):continue
            if duplicate_rows.loc[j,0]==duplicate_rows.loc[k,0]:
                dummies.iloc[:,j]=dummies.iloc[:,j]+dummies.iloc[:,k]
                drop_cols.append(k) ,
        duplicate_rows2.drop(index=drop_cols, inplace=True)
    except:
        continue
dummies = dummies.iloc[:,duplicate_rows2.index]
dummies.drop(columns=['c(', ')', '', ', ', 'character(0)'], inplace=True)

  0%|          | 0/39817 [00:00<?, ?it/s]

In [186]:
#Truncate all original columns(str), keeping the binary columns for the algorithm and exporting dummy file
df_trunc=pd.concat([df, dummies],axis=1)
temp = df_trunc.iloc[:,28:].sum()<10
df_trunc.drop(columns=temp[temp].index,inplace=True)
df_trunc.to_csv('./data/dummies/ingredient_dummy.csv', index=False)

In [187]:
df_trunc.shape

(123613, 2624)