In [1]:
import numpy as np
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics.pairwise import cosine_similarity
#from surprise import Reader, Dataset, KNNBasic, NormalPredictor,BaselineOnly,KNNWithMeans,KNNBaseline
#from surprise import SVD, SVDpp, NMF, SlopeOne, CoClustering
#from surprise.model_selection import cross_validate
#from surprise.model_selection import GridSearchCV
#from surprise import accuracy

import random
from ast import literal_eval
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

# plotting imports
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("darkgrid")
#matplotlib.style.use('ggplot')
# for reading files from urls
import urllib.request
# display imports
from IPython.display import display, IFrame
from IPython.core.display import HTML

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

#Download required NLTK data in one pass: the punkt tokenizer package,
#the WordNet data and the stop-word corpus.
for _nltk_package in ('punkt', 'wordnet', 'stopwords'):
    nltk.download(_nltk_package)

from collections import defaultdict


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Creating a helper function that lowercases text and removes punctuation, then loading the CSV files and cleaning their text columns.

In [2]:
#Function to preprocess the text: lowercase and remove punctuation
def preprocess_text(text):
    """Lowercase a string, strip punctuation and normalise its whitespace.

    Parameters
    ----------
    text : object
        Value to clean. Anything that is not a ``str`` (for example a NaN
        float coming from a DataFrame column) is treated as missing.

    Returns
    -------
    str
        The cleaned text, or ``""`` when ``text`` is not a string.
    """
    # Ensure that text is a string
    if not isinstance(text, str):
        return ""
    # Lowercase, then drop every character that is neither a word character
    # nor whitespace.
    text = re.sub(r'[^\w\s]', '', text.lower())
    # Deleting a standalone symbol (the "&" in "bath & body") leaves two
    # adjacent spaces behind; collapse whitespace runs and trim both ends so
    # equal labels compare equal.
    return ' '.join(text.split())

#Reading the CSV files
brand_category_df = pd.read_csv('brand_category.csv')
categories_df = pd.read_csv('categories.csv')
offer_retailer_df = pd.read_csv('offer_retailer.csv')

#Preprocess the relevant columns using apply method.
#preprocess_text returns "" for any non-string value, so missing cells in
#these columns become empty strings.
brand_category_df['BRAND'] = brand_category_df['BRAND'].apply(preprocess_text)
brand_category_df['BRAND_BELONGS_TO_CATEGORY'] = brand_category_df['BRAND_BELONGS_TO_CATEGORY'].apply(preprocess_text)
#No list-to-string step is needed afterwards: preprocess_text always returns a
#str, so an isinstance(x, list) check on its output can never be true.

categories_df['PRODUCT_CATEGORY'] = categories_df['PRODUCT_CATEGORY'].apply(preprocess_text)

#Fill missing values BEFORE casting to str. A bare .astype(str) turns NaN into
#the literal text "nan", which then passes preprocess_text's string check and
#is stored as if it were a real offer / retailer / brand.
for _column in ('OFFER', 'RETAILER', 'BRAND'):
    offer_retailer_df[_column] = offer_retailer_df[_column].fillna('').astype(str).apply(preprocess_text)

#If I need to save them back to CSV:
brand_category_df.to_csv('processed_brand_category.csv', index=False)
categories_df.to_csv('processed_categories.csv', index=False)
offer_retailer_df.to_csv('processed_offer_retailer.csv', index=False)

#To check the processed data
print(brand_category_df.head())
print(categories_df.head())
print(offer_retailer_df.head())

              BRAND BRAND_BELONGS_TO_CATEGORY  RECEIPTS
0  caseys gen store          tobacco products   2950931
1  caseys gen store                    mature   2859240
2            equate              hair removal    893268
3         palmolive                bath  body    542562
4              dawn                bath  body    301844
                            CATEGORY_ID            PRODUCT_CATEGORY  \
0  1f7d2fa7-a1d7-4969-aaf4-1244f232c175             red pasta sauce   
1  3e48a9b3-1ab2-4f2d-867d-4a30828afeab  alfredo  white pasta sauce   
2  09f3decc-aa93-460d-936c-0ddf06b055a3             cooking  baking   
3  12a89b18-4c01-4048-94b2-0705e0a45f6b            packaged seafood   
4  2caa015a-ca32-4456-a086-621446238783            feminine hygeine   

  IS_CHILD_CATEGORY_TO  
0          Pasta Sauce  
1          Pasta Sauce  
2               Pantry  
3               Pantry  
4    Health & Wellness  
                                               OFFER            RETAILER  \
0       spe

Combining the DataFrames (currently disabled: a left merge on BRAND leaves OFFER and RETAILER as NaN for most rows).

In [3]:
#This ends up with Offer and retailer mostly nan

# Assume you've loaded your dataframes as brand_category_df, categories_df, and offer_retailer
#combined_df = pd.merge(brand_category_df, offer_retailer_df, on='BRAND', how='left')
#combined_df.head(25)

Function for lemmatizing and removing stop words

In [4]:
def preprocess_and_normalize(text):
    """Clean a string, drop English stop words and lemmatize what remains.

    Steps: lowercase, remove punctuation, tokenize with ``nltk.word_tokenize``,
    discard tokens found in NLTK's English stop-word list, lemmatize each
    remaining token with ``WordNetLemmatizer`` and join the result with single
    spaces.

    Parameters
    ----------
    text : object
        Value to normalise. Anything that is not a ``str`` is treated as
        missing.

    Returns
    -------
    str
        The normalised text, or ``""`` when ``text`` is not a string.
    """
    # Ensure that text is a string
    if not isinstance(text, str):
        return ""

    # Lowercase and remove punctuation
    text = re.sub(r'[^\w\s]', '', text.lower())

    # Tokenize the text
    tokens = nltk.word_tokenize(text)

    # Initialize lemmatizer
    lemmatizer = WordNetLemmatizer()

    # Build the stop-word set once per call. Evaluating
    # stopwords.words('english') inside the comprehension re-creates the whole
    # list for every token and tests membership against a list.
    stop_words = set(stopwords.words('english'))

    # Lemmatize and remove stop words
    lemmatized_text = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]

    # Join the words back into a string
    return ' '.join(lemmatized_text)


Applying the processing and normalization function

In [5]:
#Run preprocess_and_normalize over every text column, one frame at a time
for _column in ('BRAND', 'BRAND_BELONGS_TO_CATEGORY'):
    brand_category_df[_column] = brand_category_df[_column].apply(preprocess_and_normalize)

categories_df['PRODUCT_CATEGORY'] = categories_df['PRODUCT_CATEGORY'].apply(preprocess_and_normalize)

#The offer/retailer columns are cast to str before being normalised
for _column in ('OFFER', 'RETAILER', 'BRAND'):
    offer_retailer_df[_column] = offer_retailer_df[_column].astype(str).apply(preprocess_and_normalize)


Saving the data for later.

In [6]:
#brand_category_df.to_csv('processed_brand_category.csv', index=False)
#categories_df.to_csv('processed_categories.csv', index=False)
#offer_retailer_df.to_csv('processed_offer_retailer.csv', index=False)


Building the TF-IDF matrix for the offers

In [7]:
#Fit a TF-IDF vectorizer on the normalised offer texts.
#Both results stay at module level: `vectorizer` is reused to transform search
#queries and `tfidf_matrix` holds the vectors of the 'OFFER' column.
vectorizer = TfidfVectorizer()
offer_texts = offer_retailer_df['OFFER']
tfidf_matrix = vectorizer.fit_transform(offer_texts)


Processing the search query

In [8]:
def process_query(query):
    """Vectorize a raw search query with the notebook's fitted TF-IDF model.

    The query is cleaned with ``preprocess_and_normalize`` and then passed, as
    a one-element list, to the module-level ``vectorizer``.

    Returns whatever ``vectorizer.transform`` produces for that single document.
    """
    return vectorizer.transform([preprocess_and_normalize(query)])


Computing the cosine similarity

In [9]:
from sklearn.metrics.pairwise import cosine_similarity

def find_similar_offers(query):
    """Score every row of ``tfidf_matrix`` against a search query.

    The query is vectorized by ``process_query``; its cosine similarity with
    the module-level ``tfidf_matrix`` is computed and flattened.

    Returns a flat array of similarity scores, in the row order of
    ``tfidf_matrix``.
    """
    similarity_grid = cosine_similarity(process_query(query), tfidf_matrix)
    return similarity_grid.flatten()


Retrieving the top matches

In [10]:
def get_top_matches(query, top_n=5):
    """Return the ``top_n`` offers most similar to ``query``, best first.

    Parameters
    ----------
    query : str
        Free-text search query, scored by ``find_similar_offers``.
    top_n : int, default 5
        Maximum number of offers to return. Values of zero or below yield an
        empty result; values above the number of offers return every offer.

    Returns
    -------
    tuple
        ``(top_matches, top_scores)``: the selected rows of
        ``offer_retailer_df`` and their similarity scores, both ordered from
        highest to lowest score. No score threshold is applied, so rows with a
        score of 0 can appear when few offers match.
    """
    cosine_similarities = find_similar_offers(query)
    # Sort descending first, then take the head. The previous
    # argsort()[-top_n:] form returned EVERY offer for top_n=0 (because
    # [-0:] == [0:]) and dropped the best matches for a negative top_n.
    # Ordering for valid top_n is unchanged.
    top_n = max(top_n, 0)
    top_indices = cosine_similarities.argsort()[::-1][:top_n]
    top_matches = offer_retailer_df.iloc[top_indices]
    top_scores = cosine_similarities[top_indices]
    return top_matches, top_scores

#Test and Examples: print the five best offers for a sample query
top_offers, scores = get_top_matches('hair color', top_n=5)
for rank in range(len(scores)):
    offer = top_offers['OFFER'].iloc[rank]
    score = scores[rank]
    print(f"Offer: {offer}, Score: {score}")


Offer: loréal paris hair color select variety spend 9 target, Score: 0.6221508599711729
Offer: loréal paris hair color select variety spend 25 target, Score: 0.5709308341966226
Offer: loréal paris excellence hair color target, Score: 0.5703850058255
Offer: loréal paris hair color select variety spend 19 target, Score: 0.5490325427051232
Offer: loréal paris men expert hair color spend 9 walmart, Score: 0.526288084399303
