In [1]:
import pandas as pd
import re
from scipy.sparse import hstack
from collections import defaultdict
from typing import Tuple

import pickle as pkl
import os

from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer, SnowballStemmer
import string

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import train_test_split

In [2]:
def get_detailed_products(verbose=False):
    """Build the product catalogue enriched with aisle and department names.

    Reads ``aisles.csv``, ``departments.csv`` and ``products.csv`` from
    ``../instacart/``, inner-joins products with aisles on ``aisle_id`` and
    with departments on ``department_id``, then drops both id columns.

    Args:
        verbose: when equal to ``True``, print a progress line for each stage.

    Returns:
        pd.DataFrame: the merged catalogue without ``aisle_id`` and
        ``department_id``.
    """
    def report(message):
        # Progress output is opt-in; silent by default.
        if verbose == True:
            print(message)

    report('(Data Collection 0/2) Read relevant csv files ...')
    aisles = pd.read_csv('../instacart/aisles.csv')
    departments = pd.read_csv('../instacart/departments.csv')
    products = pd.read_csv('../instacart/products.csv')

    report('(Data Collection 1/2) Merging csv files and drop irrelevant columns ...')
    # Inner joins: products whose aisle/department id has no match are dropped.
    catalogue = (
        products
        .merge(aisles, on='aisle_id', how='inner')
        .merge(departments, on='department_id', how='inner')
        .drop(columns=['aisle_id', 'department_id'])
    )

    report('(Data Collection 2/2) Dataset successfully prepared ...')
    return catalogue

# Build the enriched catalogue once; later cells reuse this module-level frame
# (feature extraction and the recommendation demo both read `detailed_products`).
detailed_products = get_detailed_products()
detailed_products

Unnamed: 0,product_id,product_name,aisle,department
0,1,Chocolate Sandwich Cookies,cookies cakes,snacks
1,2,All-Seasons Salt,spices seasonings,pantry
2,3,Robust Golden Unsweetened Oolong Tea,tea,beverages
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,frozen meals,frozen
4,5,Green Chile Anytime Sauce,marinades meat preparation,pantry
...,...,...,...,...
49683,49684,"Vodka, Triple Distilled, Twist of Vanilla",spirits,alcohol
49684,49685,En Croute Roast Hazelnut Cranberry,frozen vegan vegetarian,frozen
49685,49686,Artisan Baguette,bread,bakery
49686,49687,Smartblend Healthy Metabolism Dry Cat Food,cat food care,pets


In [3]:
def get_wordnet_pos(tag):
    """Translate a POS tag into the matching WordNet part-of-speech constant.

    Only the start of the tag is inspected: ``J`` maps to adjective, ``V`` to
    verb, ``N`` to noun and ``R`` to adverb. Any other tag falls back to noun.

    Args:
        tag: POS tag string (``clean_text_feature`` passes tags from ``pos_tag``).

    Returns:
        One of ``wordnet.ADJ``, ``wordnet.VERB``, ``wordnet.NOUN``, ``wordnet.ADV``.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, pos in prefix_to_pos:
        if tag.startswith(prefix):
            return pos
    # Unknown tags are treated as nouns.
    return wordnet.NOUN

def clean_text_feature(data : pd.DataFrame, lowercase=True, stopword=True, lemma=False, stem=False, punc=True, number=True, url=False, email=False):
    """Return a copy of ``data`` whose first column has been text-normalised.

    Each value of the first column goes through the enabled steps in this
    order: lowercase, URL removal, e-mail removal, tokenisation, punctuation
    removal, stop-word removal, non-alphabetic token removal, lemmatisation,
    stemming. The surviving tokens are joined with single spaces. Values that
    are not ``str`` (e.g. missing values) become the empty string.

    Args:
        data: frame whose first column holds the raw text. It is not mutated.
        lowercase: lowercase the text before any other step.
        stopword: drop English stop words (NLTK list).
        lemma: lemmatise tokens using their POS tag.
        stem: apply the English Snowball stemmer.
        punc: drop tokens made up solely of ``string.punctuation`` characters.
        number: keep only tokens for which ``str.isalpha()`` is true.
        url: strip substrings starting with ``http`` / ``www``.
        email: strip whitespace-delimited substrings containing ``@``.

    Returns:
        pd.DataFrame: copy of ``data`` with the first column cleaned.
    """
    # Character set, not the raw string: `word in "<string>"` is a substring
    # test, which let multi-character tokens such as "..." or "--" slip through.
    punctuation_chars = frozenset(string.punctuation)
    # Resources are only built for enabled steps, so e.g. stopword=False works
    # without the stop-word corpus; a set makes each membership test O(1).
    stopword_set = frozenset(stopwords.words('english')) if stopword else frozenset()
    lemmatizer = WordNetLemmatizer() if lemma else None
    stemmer = SnowballStemmer("english") if stem else None

    def remove_urls(text):
        return re.sub(r"http\S+|www\S+|https\S+", '', text)

    def remove_emails(text):
        return re.sub(r'\S+@\S+', '', text)

    def is_punctuation(token):
        # True when every character of the token is a punctuation mark.
        return all(char in punctuation_chars for char in token)

    def text_cleaning(sentence):
        # Non-string cells (NaN/None/numbers) are normalised to empty text.
        if not isinstance(sentence, str):
            return ""
        if lowercase:
            sentence = sentence.lower()
        if url:
            sentence = remove_urls(sentence)
        if email:
            sentence = remove_emails(sentence)
        words = word_tokenize(sentence)
        if punc:
            words = [word for word in words if not is_punctuation(word)]
        if stopword:
            words = [word for word in words if word not in stopword_set]
        if number:
            words = [word for word in words if word.isalpha()]
        if lemma:
            # POS tags steer the lemmatiser towards the right word class.
            words = [lemmatizer.lemmatize(word, get_wordnet_pos(tag)) for word, tag in pos_tag(words)]
        if stem:
            words = [stemmer.stem(word) for word in words]
        return " ".join(words)

    df = data.copy()
    text_column = df.columns[0]
    df[text_column] = df[text_column].apply(text_cleaning)
    return df

In [4]:
def encode(dataset : pd.DataFrame, sparse=False, verbose=False):
    """One-hot encode ``dataset`` with an encoder cached on disk.

    The encoder is stored at ``../models/<first column>_onehotencoder.pkl``.
    If that file exists it is loaded and reused; otherwise a new
    ``OneHotEncoder(handle_unknown='ignore')`` is fitted on ``dataset`` and
    pickled there.

    The cache key is only the column name, so a cached encoder may have been
    created with a different ``sparse`` setting than the one requested now.
    The transformed output is therefore converted to the requested
    representation before returning.

    Args:
        dataset: frame to encode; its first column names the cache file and
            the output feature prefix. Callers in this notebook pass a
            single-column frame.
        sparse: return a SciPy sparse matrix when true, otherwise a DataFrame
            indexed like ``dataset``.
        verbose: print progress messages.

    Returns:
        A sparse matrix (``sparse=True``) or a ``pd.DataFrame`` of indicator
        columns (``sparse=False``).
    """
    # Local import: the notebook's import cell only pulls `hstack` from scipy.sparse.
    from scipy.sparse import csr_matrix, issparse

    if verbose: print(f"(Encode Features 0/3) Determine model filepath ...")
    data = dataset.copy()
    column = dataset.columns[0]
    model_path = f'../models/{column}_onehotencoder.pkl'

    if os.path.exists(model_path):
        if verbose: print(f"(Encode Features 1/3) Loading existing encoder from {model_path}...")
        # NOTE(security): unpickling executes arbitrary code; only load trusted files.
        encoder = pd.read_pickle(model_path)
    else:
        if verbose: print(f"(Encode Features 1/3) Training a new {column} encoder and saving it to {model_path}...")
        encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=sparse)
        encoder.fit(data)
        # The cache directory may not exist on a fresh checkout.
        os.makedirs(os.path.dirname(model_path), exist_ok=True)
        pd.to_pickle(encoder, model_path)

    if verbose: print(f"(Encode Features 2/3) Encode relevant features...")
    dataset_encoded = encoder.transform(data)

    # Normalise to the requested representation regardless of how the cached
    # encoder was configured when it was first fitted.
    if sparse:
        returned = dataset_encoded if issparse(dataset_encoded) else csr_matrix(dataset_encoded)
    else:
        dense_values = dataset_encoded.toarray() if issparse(dataset_encoded) else dataset_encoded
        returned = pd.DataFrame(
            dense_values,
            columns=encoder.get_feature_names_out([column]),
            index=dataset.index,
        )

    if verbose: print(f"(Encode Features 3/3) Feature Encoding finished...")
    return returned

In [5]:
def raw_to_tfidf(dataset: pd.DataFrame, sparse=False, verbose=False):
    """Turn the first column of ``dataset`` into TF-IDF features.

    The vectorizer is cached at ``../models/<first column>_tfidf_vectorizer.pkl``.
    If that file exists it is unpickled and reused; otherwise a default
    ``TfidfVectorizer`` is fitted on the column and pickled there. The cache
    directory is created when missing so a fresh checkout does not fail at
    save time.

    Args:
        dataset: frame whose first column holds the text documents. Missing
            values are treated as empty documents.
        sparse: return the sparse TF-IDF matrix when true, otherwise a dense
            ``pd.DataFrame`` with one column per vocabulary term.
        verbose: print progress messages.

    Returns:
        tuple: ``(features, vectorizer)`` where ``features`` is the sparse
        matrix or the dense DataFrame (indexed like ``dataset``).
    """
    if verbose: print(f"(Text Feature Extraction 0/3) Determine model filepath ...")
    column = dataset.columns[0]
    # Missing names would make the vectorizer raise; treat them as empty text.
    documents = dataset[column].fillna('')
    model_path = f'../models/{column}_tfidf_vectorizer.pkl'

    if os.path.exists(model_path):
        if verbose: print(f"(Text Feature Extraction 1/3) Loading existing vectorizer from {model_path} ...")
        with open(model_path, 'rb') as file:
            # NOTE(security): unpickling executes arbitrary code; only load trusted files.
            vectorizer = pkl.load(file)
    else:
        if verbose: print(f"(Text Feature Extraction 1/3) Training new TF-IDF vectorizer and saving it to {model_path} ...")
        # Create the cache directory up front so the fit is not wasted on a
        # FileNotFoundError at save time.
        os.makedirs(os.path.dirname(model_path), exist_ok=True)
        vectorizer = TfidfVectorizer()
        vectorizer.fit(documents)
        with open(model_path, 'wb') as file:
            pkl.dump(vectorizer, file)

    if verbose: print(f"(Text Feature Extraction 2/3) Transform text data into TF-IDF numeric features ...")
    tfidf_matrix = vectorizer.transform(documents)

    if verbose: print(f"(Text Feature Extraction 3/3) Feature Extraction Done ...")
    if sparse:
        return tfidf_matrix, vectorizer

    # Feature names come from the fitted vocabulary, not from the input column.
    dense_features = pd.DataFrame(
        tfidf_matrix.toarray(),
        columns=vectorizer.get_feature_names_out(),
        index=dataset.index,
    )
    return dense_features, vectorizer

In [6]:
def extract_features(userproducts : pd.DataFrame, verbose=False):
    """Assemble the sparse feature matrix used by the nearest-neighbour model.

    Horizontally stacks, in this order: TF-IDF features of ``product_name``,
    one-hot features of ``aisle`` and one-hot features of ``department``.

    Args:
        userproducts: frame with ``product_id``, ``product_name``, ``aisle``
            and ``department`` columns.
        verbose: forwarded to ``raw_to_tfidf`` and ``encode``.

    Returns:
        tuple: ``(features, id_map)`` where ``features`` is a CSR matrix with
        one row per input row and ``id_map`` is the single-column
        ``product_id`` frame in the same row order.
    """
    tfidf_matrix, _ = raw_to_tfidf(userproducts[['product_name']], sparse=True, verbose=verbose)
    encoded_aisle = encode(userproducts[['aisle']], sparse=True, verbose=verbose)
    encoded_department = encode(userproducts[['department']], sparse=True, verbose=verbose)
    id_map = userproducts[['product_id']]

    # Callers slice single rows (`features[i]`), which COO matrices do not
    # support. hstack's default output format is not guaranteed, so request
    # CSR explicitly.
    features = hstack([tfidf_matrix, encoded_aisle, encoded_department], format='csr')
    return features, id_map

# Build the feature matrix for the whole catalogue; id_map keeps product_id in
# the same row order as the matrix rows.
features, id_map = extract_features(detailed_products)
id_map.head(5)

Unnamed: 0,product_id
0,1
1,2
2,3
3,4
4,5


In [7]:
def train_model(features, n_neighbors=100, metric='cosine', n_jobs=4) -> NearestNeighbors:
    """Fit a ``NearestNeighbors`` index on the given feature matrix.

    The hyper-parameters used to be hard-coded; they are now keyword arguments
    whose defaults reproduce the previous behaviour.

    Args:
        features: feature matrix to index, one row per item.
        n_neighbors: default number of neighbours stored on the model.
        metric: distance metric passed to ``NearestNeighbors``.
        n_jobs: number of parallel jobs passed to ``NearestNeighbors``.

    Returns:
        NearestNeighbors: the fitted model.
    """
    print(f"(Model Training 0/1) Train NN model in progress ...")
    model = NearestNeighbors(n_neighbors=n_neighbors, metric=metric, n_jobs=n_jobs)
    model.fit(features)
    print(f"(Model Training 1/1) Train NN model finished ...")
    return model

In [8]:
def split_dataset(userproducts, stratify_col='department_id', test_size=0.3, random_state=42):
    """Split ``userproducts`` into train and test frames.

    The stratification column, test fraction and seed were hard-coded; they
    are now parameters whose defaults keep the previous behaviour. None of the
    frames built in this notebook carry ``department_id``, so callers can now
    pick a column that exists (or disable stratification with ``None``).

    Args:
        userproducts: frame to split.
        stratify_col: column used for stratified sampling, or ``None`` for a
            plain random split.
        test_size: fraction (or count) of rows assigned to the test set.
        random_state: seed forwarded to ``train_test_split``.

    Returns:
        tuple: ``(trainset, testset)``.

    Raises:
        KeyError: if ``stratify_col`` is not a column of ``userproducts``.
    """
    if stratify_col is None:
        stratify = None
    elif stratify_col in userproducts.columns:
        stratify = userproducts[stratify_col]
    else:
        # Fail with an actionable message instead of a bare pandas KeyError.
        raise KeyError(
            f"stratify column '{stratify_col}' not found; "
            f"available columns: {list(userproducts.columns)}"
        )

    trainset, testset = train_test_split(
        userproducts,
        test_size=test_size,
        random_state=random_state,
        stratify=stratify,
    )
    return trainset, testset

In [9]:
def get_userproducts():
    """Load the user/product interaction table from the Instacart CSV files.

    Steps: read ``order_products__train.csv`` and ``orders.csv`` from
    ``../instacart/``; keep orders whose ``eval_set`` is ``train`` or ``test``;
    inner-join on ``order_id``; keep ``product_id``, ``user_id`` and
    ``reordered``; drop users that appear in only one row; drop rows with
    missing values and exact duplicates. Progress is printed at every stage.

    Returns:
        pd.DataFrame: columns ``product_id``, ``user_id``, ``reordered``.
    """
    print('(Get UserProducts 0/5) Reading and merging csv dataset ...')
    orderproducts = pd.read_csv('../instacart/order_products__train.csv')
    orders = pd.read_csv('../instacart/orders.csv')

    print('(Get UserProducts 1/5) Filter to only include orders which is in train and test set...')
    in_eval_sets = orders['eval_set'].isin(['train','test'])
    orders = orders[in_eval_sets]

    print('(Get UserProducts 2/5) Merge orderproducts and orders, then remove irrelevant columns...')
    merged = orderproducts.merge(orders, on='order_id', how='inner')
    userproducts = merged[['product_id','user_id','reordered']]

    print('(Get UserProducts 3/5) Remove user which only occur once to prevent error during testing...')
    rows_per_user = userproducts['user_id'].value_counts()
    repeat_users = rows_per_user[rows_per_user > 1].index
    userproducts = userproducts[userproducts['user_id'].isin(repeat_users)]

    print('(Get UserProducts 4/5) Remove duplicated data and missing value if exists...')
    userproducts = userproducts.dropna().drop_duplicates()

    print('(Get UserProducts 5/5) Get UserProducts process is done!')
    return userproducts

In [10]:
def load_model(state="evaluation") -> Tuple[NearestNeighbors | None, pd.DataFrame | None]:
    """Load the nearest-neighbour model and its id mapping, training them if absent.

    Artefacts live at ``../models/nn_model_<state>.pkl`` and
    ``../models/iid_mapping_<state>.pkl``. When both files exist they are
    unpickled and returned. Otherwise the product catalogue is loaded,
    features are extracted, a new model is trained, and both artefacts are
    pickled before being returned.

    Args:
        state: ``"evaluation"`` or ``"production"``; selects the artefact
            file names.

    Returns:
        tuple: ``(model, iid_map)``, or ``(None, None)`` when ``state`` is
        not one of the two accepted values.
    """
    if state not in ("evaluation", "production"):
        print(f"Invalid state...")
        return None, None

    model_filename = f'../models/nn_model_{state}.pkl'
    iidmap_filename = f"../models/iid_mapping_{state}.pkl"

    if os.path.exists(model_filename) and os.path.exists(iidmap_filename):
        print('Model already exist, currently load it ...')
        # NOTE(security): unpickling executes arbitrary code; only load trusted files.
        with open(model_filename,'rb') as f:
            model = pkl.load(f)
        with open(iidmap_filename,'rb') as f:
            iid_map = pkl.load(f)

        print('Model successfully loaded ...')
        return model, iid_map

    print('Model doesnt exist yet. Train new model ...')
    # Feature extraction and the saves below all write into ../models; create
    # it before any expensive work so a fresh checkout cannot fail midway.
    os.makedirs(os.path.dirname(model_filename), exist_ok=True)

    detailed_products = get_detailed_products()
    preprocessed_products, iid_map = extract_features(detailed_products, verbose=True)
    model = train_model(preprocessed_products)

    print('Saving trained model ...')
    with open(model_filename,'wb') as f:
        pkl.dump(model, f)
    with open(iidmap_filename,'wb') as f:
        pkl.dump(iid_map, f)

    print('Model successfully loaded ...')
    return model, iid_map

In [11]:
def recommend(products, product_mapping, model):
    """Recommend catalogue items similar to the given products.

    Every row of ``products`` is converted to a feature vector and its 10
    nearest neighbours are queried from ``model``. Distances are grouped per
    neighbour index and averaged; neighbours are ranked by ascending average
    distance and joined with the catalogue details.

    Args:
        products: frame of query products (same columns as the catalogue).
        product_mapping: frame whose positional rows map a model index to a
            ``product_id``.
        model: fitted object exposing ``kneighbors``.

    Returns:
        pd.DataFrame: columns ``index``, ``avg_distance``, ``product_id`` plus
        the catalogue columns, ordered by ``avg_distance``.
    """
    catalogue = get_detailed_products()
    query_matrix, _ = extract_features(products)

    # neighbour index -> every distance at which it was returned
    distances_by_index = defaultdict(list)
    for row_number in range(query_matrix.shape[0]):
        query = query_matrix[row_number].reshape(1, -1)
        found_distances, found_indices = model.kneighbors(query, n_neighbors=10)
        for neighbour, distance in zip(found_indices[0], found_distances[0]):
            distances_by_index[neighbour].append(distance)

    ranking = [
        (neighbour, sum(values) / len(values))
        for neighbour, values in distances_by_index.items()
    ]
    ranking.sort(key=lambda entry: entry[1])

    def to_product_id(position):
        # The mapping is positional: row `position` holds that item's product_id.
        return product_mapping.iloc[position]['product_id']

    ordered_ids = [to_product_id(position) for position, _ in ranking]
    matched_products = catalogue.set_index('product_id').loc[ordered_ids].reset_index()

    ranking_frame = pd.DataFrame(ranking, columns=['index', 'avg_distance'])
    ranking_frame['product_id'] = ranking_frame['index'].apply(to_product_id)
    return ranking_frame.merge(matched_products, on='product_id')

# Smoke test: query the recommender with the first catalogue row only.
test_products = detailed_products.iloc[[0]]
print(test_products, '\n')

# load_model reuses the pickled "production" artefacts when both files exist;
# otherwise it trains a new model and saves it.
model, iid_map = load_model(state="production")
recommended_products = recommend(test_products, iid_map, model)
recommended_products

   product_id                product_name          aisle department
0           1  Chocolate Sandwich Cookies  cookies cakes     snacks 

Model already exist, currently load it ...
Model successfully loaded ...


Unnamed: 0,index,avg_distance,product_id,product_name,aisle,department
0,0,0.0,1,Chocolate Sandwich Cookies,cookies cakes,snacks
1,23931,0.060354,23932,Chocolate Creme Sandwich Cookies,cookies cakes,snacks
2,12480,0.080673,12481,Oreo Chocolate Sandwich Cookies,cookies cakes,snacks
3,9293,0.085177,9294,Reduced Fat Chocolate Sandwich Cookies,cookies cakes,snacks
4,22413,0.088592,22414,Chocolate Mint Creme Sandwich Cookies,cookies cakes,snacks
5,1623,0.089686,1624,Chocolate Cookies,cookies cakes,snacks
6,11638,0.090158,11639,Chocolate Berry Creme Sandwich Cookies,cookies cakes,snacks
7,4690,0.09614,4691,Creme Sandwich Cookies,cookies cakes,snacks
8,33321,0.104094,33322,Chocolate Peanut Butter Creme Sandwich Cookies,cookies cakes,snacks
9,15758,0.107325,15759,Peanut Butter Sandwich Cookies,cookies cakes,snacks


In [12]:
def load_nn_model_and_mapping():
    """Unpickle the production model and its index-to-product mapping.

    Reads ``../models/nn_model_production.pkl`` and
    ``../models/iid_mapping_production.pkl``, in that order.

    Returns:
        tuple: ``(model, product_mapping)``.
    """
    artefact_paths = (
        '../models/nn_model_production.pkl',
        '../models/iid_mapping_production.pkl',
    )
    loaded = []
    for path in artefact_paths:
        with open(path, 'rb') as handle:
            loaded.append(pkl.load(handle))
    model, product_mapping = loaded
    return model, product_mapping

def get_userproduct_ids(user_id=17):
    """Collect the product ids found in a user's orders.

    Looks up the user's order ids in ``../instacart/orders.csv`` and returns
    the products listed for those orders in
    ``../instacart/order_products__train.csv``.

    Args:
        user_id: id of the user to look up.

    Returns:
        set: distinct ``product_id`` values; empty when nothing matches.
    """
    orderproducts = pd.read_csv('../instacart/order_products__train.csv')
    orders = pd.read_csv('../instacart/orders.csv')

    belongs_to_user = orders['user_id'].isin([user_id])
    user_order_ids = orders.loc[belongs_to_user, 'order_id']

    in_user_orders = orderproducts['order_id'].isin(user_order_ids)
    return set(orderproducts.loc[in_user_orders, 'product_id'])

def get_detailed_products(verbose=False):
    """Build the product catalogue enriched with aisle and department names.

    This definition replaces the one from the earlier cell once this cell
    runs. It accepts the same optional ``verbose`` flag, so calls written
    against the earlier signature keep working after the redefinition.

    Reads ``aisles.csv``, ``departments.csv`` and ``products.csv`` from
    ``../instacart/``, inner-joins products with aisles on ``aisle_id`` and
    with departments on ``department_id``, then drops both id columns.

    Args:
        verbose: print a progress line for each stage when true.

    Returns:
        pd.DataFrame: the merged catalogue without ``aisle_id`` and
        ``department_id``.
    """
    if verbose: print('(Data Collection 0/2) Read relevant csv files ...')
    aisles = pd.read_csv('../instacart/aisles.csv')
    departments = pd.read_csv('../instacart/departments.csv')
    products = pd.read_csv('../instacart/products.csv')

    if verbose: print('(Data Collection 1/2) Merging csv files and drop irrelevant columns ...')
    # Inner joins: products whose aisle/department id has no match are dropped.
    detailed_products = products.merge(aisles, on='aisle_id', how='inner')
    detailed_products = detailed_products.merge(departments, on='department_id', how='inner')
    detailed_products = detailed_products.drop(columns=['aisle_id','department_id'])

    if verbose: print('(Data Collection 2/2) Dataset successfully prepared ...')
    return detailed_products

def recommendNN(user_id=17, top_n=50):
    """Recommend up to ``top_n`` products the user has not bought yet.

    For every product found in the user's orders, the ``top_n`` nearest
    neighbours are queried from the production model. Distances are grouped
    per neighbour and averaged, and neighbours are ranked by ascending average
    distance.

    Products the user already bought are removed *before* the ranking is cut
    to ``top_n``. Previously the cut came first, so purchased items (which are
    their own nearest neighbours at distance 0) consumed result slots and the
    list could come back short or empty.

    Args:
        user_id: id of the user to recommend for.
        top_n: neighbours queried per product and maximum number of results.

    Returns:
        list: ``(product_id, average_distance)`` tuples, best first. Empty
        when the user has no products in the catalogue.
    """
    detailed_products = get_detailed_products()
    userproduct_ids = get_userproduct_ids(user_id) # real product_id
    products = detailed_products[detailed_products['product_id'].isin(userproduct_ids)]

    # Kept from the original cell: shows which purchases seeded the query.
    print(userproduct_ids)

    # Nothing to query with; feature extraction cannot handle zero rows.
    if products.empty:
        return []

    model, product_mapping = load_nn_model_and_mapping()
    preprocessed_products, _ = extract_features(products)

    # neighbour index (position in the fitted matrix) -> observed distances
    tally = defaultdict(list)
    for i in range(preprocessed_products.shape[0]):
        product_vector = preprocessed_products[i].reshape(1, -1)
        distances, indices = model.kneighbors(product_vector, n_neighbors=top_n)
        for dist, idx in zip(distances[0], indices[0]):
            tally[idx].append(dist)

    ranked = sorted(
        ((idx, sum(dists) / len(dists)) for idx, dists in tally.items()),
        key=lambda pair: pair[1],
    )

    # Positional lookup: row `idx` of the mapping holds that item's product_id.
    mapped_ids = product_mapping['product_id'].to_numpy()
    recommended_ids = []
    for idx, score in ranked:
        product_id = mapped_ids[idx]
        if product_id in userproduct_ids:
            continue
        recommended_ids.append((product_id, score))
        if len(recommended_ids) == top_n:
            break

    return recommended_ids

# Example run for user 17; recommendNN also prints that user's purchased
# product ids before the list of (product_id, average distance) pairs below.
recommendation = recommendNN(user_id=17)
print(recommendation)

{1217, 18534, 12720, 4374, 43352, 16797}
[(22282, 0.0), (11210, 0.0), (21137, 0.026766753906078744), (21879, 0.03152756532181167), (44177, 0.03152756532181167), (49439, 0.04251893524461359), (1890, 0.04251893524461359), (23410, 0.04750438821797354), (12149, 0.05072550412502874), (5782, 0.054216001300095185), (8066, 0.0554437542875581), (29176, 0.05608980263797614), (20051, 0.059279972348458054), (25487, 0.06092123205455979), (14999, 0.06092123205455979), (12709, 0.06213714574206319), (25097, 0.06355862507039722), (31654, 0.06535775915281938), (20247, 0.0679367748363433), (5262, 0.06795048241753543), (46654, 0.06908551095661086), (35921, 0.06908551095661086), (32684, 0.06908551095661086), (36550, 0.07310659969765143), (49478, 0.08119335695720187), (11390, 0.08183237375093677), (34791, 0.08204818980055872), (13949, 0.08206524546496852), (44025, 0.08465249783707285), (23341, 0.08501544047751963), (35595, 0.09124626784248435), (36186, 0.09124626784248435), (46620, 0.09390860936033518), (29