# Saving the Data And Model

### Imports and Function Definitions

In [19]:
import os
import joblib
import pandas as pd
import numpy as np

from sklearn.compose import make_column_transformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier

In [20]:
def get_dataTypes_and_missingValues(df):
    info = pd.DataFrame()
    info['data_types'] =  df.dtypes
    info['unique_values'] = df.nunique()
    info['missing_values'] = df.isna().sum()
    return info

In [21]:
def read_recipe_data(path):
    '''Read in the data from the path and return a train, validation and test set'''
    
    df = pd.read_csv(path)

    df['high_traffic'].fillna("Low", inplace=True)
    df.drop('recipe', axis=1, inplace=True)
    df.drop_duplicates(inplace=True)

    y = df['high_traffic']
    X = df.drop('high_traffic', axis=1)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=True)
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.15, random_state=42, shuffle=True)

    return X_train, y_train, X_val, y_val, X_test, y_test

In [22]:
def encode_categorical_features(categorical_features):
    '''
    Encodes the categorical features using OneHotEncoder
    '''

    ct = make_column_transformer(
        (OneHotEncoder(handle_unknown='ignore'), categorical_features),
        remainder='passthrough'
    )
    
    return ct

def encode_label(y):
    '''
    Encodes the labels/targets using LabelEncoder
    '''

    enc = LabelEncoder()
    y_enc = enc.fit_transform(y)
    
    return y_enc

In [23]:
def train_model(X_train, y_train, model_algorithm, **model_kwargs):
    '''
    Trains a model and returns the model
    '''
    
    if 'random_state' in model_algorithm().get_params():
        model_kwargs['random_state'] = 124

    model = model_algorithm(**model_kwargs)

    categorical_cols = X_train.select_dtypes(exclude=np.number).columns.to_list()
    ct = encode_categorical_features(categorical_cols)

    pipeline = make_pipeline(ct, StandardScaler(), model)
    y_train_encoded  = encode_label(y_train)

    pipeline.fit(X_train, y_train_encoded)
    
    return pipeline

## Saving the Data

We start off with the data

In [24]:
data_path = os.path.join("..","data", "raw", "recipe_site_traffic.csv")
X_train, y_train, X_val, y_val, X_test, y_test = read_recipe_data(data_path)

train_df = pd.concat([X_train, y_train], axis=1)
missing_indices = train_df[train_df['calories'].isnull()].index
train_df.drop(missing_indices, inplace=True)

In [25]:
train_df['category'].replace('Chicken Breast', 'Chicken', inplace=True)
non_numeric_cols = train_df.drop('high_traffic', axis='columns').select_dtypes(exclude=np.number).columns.values
for col in non_numeric_cols:
    try:
        train_df[col].astype(int)
    except Exception:
        print(f"The column: {col} contains non-numeric characters")

mask = train_df['servings'].astype(str).str.contains(r'\D', regex=True)
non_numeric_values = train_df[mask]

print("Non-numeric values in servings")
non_numeric_values

The column: category contains non-numeric characters
The column: servings contains non-numeric characters
Non-numeric values in servings


Unnamed: 0,calories,carbohydrate,sugar,protein,category,servings,high_traffic
735,513.75,65.87,18.63,18.39,Lunch/Snacks,4 as a snack,Low


In [26]:
train_df.loc[mask, 'servings'] = 4
train_df['servings'] = train_df['servings'].astype(int)

The training data is cleaned up. Next, is the validation data.

In [27]:
val_df = pd.concat([X_val, y_val], axis=1)
missing_indices = val_df[val_df['calories'].isnull()].index
val_df.drop(missing_indices, inplace=True)

In [28]:
val_df['category'].replace('Chicken Breast', 'Chicken', inplace=True)
non_numeric_cols = val_df.drop('high_traffic', axis='columns').select_dtypes(exclude=np.number).columns.values
for col in non_numeric_cols:
    try:
        val_df[col] = val_df[col].astype(int)
    except Exception:
        print(f"The column: {col} contains non-numeric characters")

mask = val_df['servings'].astype(str).str.contains(r'\D', regex=True)
non_numeric_values = val_df[mask]

print("Non-numeric values in servings")
non_numeric_values

The column: category contains non-numeric characters
Non-numeric values in servings


Unnamed: 0,calories,carbohydrate,sugar,protein,category,servings,high_traffic


The validation data is also cleaned up.

In [29]:
cleaned_data_path = os.path.join("..","data", "cleaned")
try:
    os.makedirs(cleaned_data_path, exist_ok=True)

    # save data
    train_df.to_csv(os.path.join(cleaned_data_path, "train.csv"))
    val_df.to_csv(os.path.join(cleaned_data_path, "val.csv"))
except Exception as dir_creation_error:
    print(f"Error creating directory '{cleaned_data_path}': {dir_creation_error}")

## Saving the Model

In [30]:
merged_df = pd.concat([train_df, val_df], axis=0)
y_merged = merged_df['high_traffic']
X_merged = merged_df.drop('high_traffic', axis=1)

print(f"There are {train_df.shape[0]} rows and {merged_df.shape[1]} columns in the merged dataframe")

There are 614 rows and 7 columns in the merged dataframe


In [31]:
rfc_best_params = {
    'n_estimators': 677,
    'max_depth': 18,
    'min_samples_split': 0.0033905037200391095,
    'min_samples_leaf': 0.001650766749167092,
    'max_features': 0.6662238032752099
}

In [32]:
rf_model = train_model(X_merged, y_merged, RandomForestClassifier, **rfc_best_params)
rf_model

In [33]:
test = X_test.iloc[:2]
test

Unnamed: 0,calories,carbohydrate,sugar,protein,category,servings
325,718.34,16.61,8.54,4.52,Chicken,4
879,190.33,22.62,0.51,11.92,Meat,4


In [34]:
rf_model.predict(test)

array([1, 0])

In [35]:
y_test.iloc[:2]

325    Low
879    Low
Name: high_traffic, dtype: object

Out of the 2 options in the test data the model correctly predicted one. So far this is the best performing model. 

Let us save it so we can use it in comparisons with other models.

In [36]:
model_path = os.path.join("..","models")
try:
    os.makedirs(model_path, exist_ok=True)

    joblib.dump(rf_model, os.path.join(model_path, "randomforest.joblib"))
except Exception as dir_creation_error:
    print(f"Error creating directory '{model_path}': {dir_creation_error}")