### Required Libraries

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

### Load the dataset

In [2]:
# Load the access-log export into a DataFrame. The path is relative, so the
# CSV is expected in the notebook's working directory.
df = pd.read_csv('tokenized_access_logs.csv')
df  # bare last expression -> rich preview of the frame

Unnamed: 0,Product,Category,Date,Month,Hour,Department,ip,url
0,adidas Brazuca 2017 Official Match Ball,baseball & softball,9/1/2017 6:00,Sep,6,fitness,37.97.182.65,/department/fitness/category/baseball%20&%20so...
1,The North Face Women's Recon Backpack,hunting & shooting,9/1/2017 6:00,Sep,6,fan shop,206.56.112.1,/department/fan%20shop/category/hunting%20&%20...
2,adidas Kids' RG III Mid Football Cleat,featured shops,9/1/2017 6:00,Sep,6,apparel,215.143.180.0,/department/apparel/category/featured%20shops/...
3,Under Armour Men's Compression EV SL Slide,electronics,9/1/2017 6:00,Sep,6,footwear,206.56.112.1,/department/footwear/category/electronics/prod...
4,Pelican Sunstream 100 Kayak,water sports,9/1/2017 6:01,Sep,6,fan shop,136.108.56.242,/department/fan%20shop/category/water%20sports...
...,...,...,...,...,...,...,...,...
469972,Nike Men's Free TR 5.0 TB Training Shoe,as seen on tv!,10/9/2017 21:21,Oct,21,footwear,93.166.57.36,/department/footwear/category/as%20seen%20on%2...
469973,Under Armour Hustle Storm Medium Duffle Bag,fitness accessories,10/9/2017 21:21,Oct,21,footwear,126.175.2.58,/department/footwear/category/fitness%20access...
469974,Under Armour Hustle Storm Medium Duffle Bag,fitness accessories,10/9/2017 21:22,Oct,21,footwear,201.210.19.242,/department/footwear/category/fitness%20access...
469975,Nike Men's Dri-FIT Victory Golf Polo,women's apparel,10/9/2017 21:22,Oct,21,golf,56.81.167.135,/department/golf/category/women's%20apparel/pr...


In [3]:
# (rows, columns) of the frame as loaded.
df.shape

(469977, 8)

In [4]:
# Number of missing values in each column.
df.isna().sum()

Product       0
Category      0
Date          0
Month         0
Hour          0
Department    0
ip            0
url           0
dtype: int64

### Remove duplicates

In [5]:
# How many log rows mention each product name; any count above 1 means the
# product is repeated across rows.
df['Product'].value_counts()

Perfect Fitness Perfect Rip Deck                 27878
adidas Kids' RG III Mid Football Cleat           26200
Nike Men's Dri-FIT Victory Golf Polo             25627
Nike Men's CJ Elite 2 TD Football Cleat          25241
O'Brien Men's Neoprene Life Vest                 16194
                                                 ...  
Bag Boy M330 Push Cart                            1015
Polar Loop Activity Tracker                        930
Bushnell Pro X7 Jolt Slope Rangefinder             929
Garmin Forerunner 910XT GPS Watch                  877
Fitbit The One Wireless Activity & Sleep Trac      807
Name: Product, Length: 76, dtype: int64

In [6]:
# Keep the first log row seen for each product so every product appears once.
# reset_index(drop=True) renumbers the rows 0..n-1 directly, instead of first
# materialising the old index as an 'index' column and then dropping it again.
# The recommender below uses these row numbers as positions into the
# similarity matrix, so the index must be contiguous.
df = df.drop_duplicates(subset=['Product'], keep='first').reset_index(drop=True)
df

Unnamed: 0,Product,Category,Date,Month,Hour,Department,ip,url
0,adidas Brazuca 2017 Official Match Ball,baseball & softball,9/1/2017 6:00,Sep,6,fitness,37.97.182.65,/department/fitness/category/baseball%20&%20so...
1,The North Face Women's Recon Backpack,hunting & shooting,9/1/2017 6:00,Sep,6,fan shop,206.56.112.1,/department/fan%20shop/category/hunting%20&%20...
2,adidas Kids' RG III Mid Football Cleat,featured shops,9/1/2017 6:00,Sep,6,apparel,215.143.180.0,/department/apparel/category/featured%20shops/...
3,Under Armour Men's Compression EV SL Slide,electronics,9/1/2017 6:00,Sep,6,footwear,206.56.112.1,/department/footwear/category/electronics/prod...
4,Pelican Sunstream 100 Kayak,water sports,9/1/2017 6:01,Sep,6,fan shop,136.108.56.242,/department/fan%20shop/category/water%20sports...
...,...,...,...,...,...,...,...,...
71,TaylorMade 2017 Purelite Stand Bag,golf gloves,9/1/2017 14:05,Sep,14,outdoors,132.9.150.102,/department/outdoors/category/golf%20gloves/pr...
72,Polar Loop Activity Tracker,kids' golf clubs,9/1/2017 15:14,Sep,15,outdoors,47.207.237.120,/department/outdoors/category/kids'%20golf%20c...
73,Glove It Women's Imperial Golf Glove,golf balls,9/1/2017 16:17,Sep,16,outdoors,59.14.14.143,/department/outdoors/category/golf%20balls/pro...
74,Fitbit The One Wireless Activity & Sleep Trac,kids' golf clubs,9/1/2017 19:20,Sep,19,outdoors,181.48.20.117,/department/outdoors/category/kids'%20golf%20c...


In [7]:
# (rows, columns) after de-duplication: one row per distinct product name.
df.shape

(76, 8)

In [8]:
# Narrow the frame to the product name and its category. .copy() makes the
# result an independent frame, so adding columns to it later does not raise
# SettingWithCopyWarning.
df = df[['Product', 'Category']].copy()

### Preprocess the text data

In [9]:
# NLTK resources shared by every preprocess_text call; filled on first use.
_PREPROCESS_CACHE = {}


def _preprocess_resources():
    """Return the (stop-word set, lemmatizer) pair, building it on first use."""
    if not _PREPROCESS_CACHE:
        _PREPROCESS_CACHE['stop_words'] = frozenset(stopwords.words('english'))
        _PREPROCESS_CACHE['lemmatizer'] = WordNetLemmatizer()
    return _PREPROCESS_CACHE['stop_words'], _PREPROCESS_CACHE['lemmatizer']


def preprocess_text(text):
    """Normalise a product name into a space-separated string of lemmas.

    Steps, in order:
      1. lowercase the text;
      2. delete every character that is not an ASCII letter or whitespace
         (digits and punctuation vanish, so "Women's" becomes "womens");
      3. split on whitespace;
      4. drop NLTK English stop words;
      5. lemmatize each remaining word with WordNetLemmatizer.

    The stop-word set and the lemmatizer are created once and reused, rather
    than being rebuilt on every call (this function runs once per row under
    ``Series.apply`` and again for every query).

    Args:
        text: the string to clean.

    Returns:
        The cleaned words joined by single spaces ('' if nothing survives).

    Note:
        Relies on the NLTK stop-word and WordNet data being available to
        ``stopwords.words`` / ``WordNetLemmatizer``.
    """
    stop_words, lemmatizer = _preprocess_resources()

    # Lowercasing and removing special characters
    letters_only = re.sub(r'[^a-zA-Z\s]', '', text.lower())

    # Tokenization, stopword removal and lemmatization in a single pass
    return ' '.join(
        lemmatizer.lemmatize(word)
        for word in letters_only.split()
        if word not in stop_words
    )


# Preprocess product names.
# assign() returns a new frame with the extra column instead of writing into
# `df` in place, which avoids the SettingWithCopyWarning raised when `df` is a
# slice of another frame.
df = df.assign(Processed_Product=df['Product'].apply(preprocess_text))
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Processed_Product'] = df['Product'].apply(preprocess_text)


Unnamed: 0,Product,Category,Processed_Product
0,adidas Brazuca 2017 Official Match Ball,baseball & softball,adidas brazuca official match ball
1,The North Face Women's Recon Backpack,hunting & shooting,north face woman recon backpack
2,adidas Kids' RG III Mid Football Cleat,featured shops,adidas kid rg iii mid football cleat
3,Under Armour Men's Compression EV SL Slide,electronics,armour men compression ev sl slide
4,Pelican Sunstream 100 Kayak,water sports,pelican sunstream kayak
...,...,...,...
71,TaylorMade 2017 Purelite Stand Bag,golf gloves,taylormade purelite stand bag
72,Polar Loop Activity Tracker,kids' golf clubs,polar loop activity tracker
73,Glove It Women's Imperial Golf Glove,golf balls,glove woman imperial golf glove
74,Fitbit The One Wireless Activity & Sleep Trac,kids' golf clubs,fitbit one wireless activity sleep trac


### Content-Based Filtering (TF-IDF similarity on product names)

In [10]:
# Lookup table: original product name -> its cleaned (preprocessed) text.
product_mapping = {
    original: cleaned
    for original, cleaned in zip(df['Product'], df['Processed_Product'])
}

In [11]:
# Learn a TF-IDF vocabulary from the cleaned product names and vectorise them
# (one matrix row per product). stop_words='english' applies scikit-learn's
# own English stop-word list on top of the NLTK filtering in preprocess_text.
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(df['Processed_Product'])

In [12]:
# Product-by-product similarity matrix. With the second argument omitted,
# linear_kernel compares the matrix against itself. TfidfVectorizer
# L2-normalises rows by default, so these dot products are cosine similarities.
cosine_sim = linear_kernel(tfidf_matrix)

In [13]:
def get_original_product_name(cleaned_text):
    """Return the original product name whose cleaned text is ``cleaned_text``.

    Walks ``product_mapping`` (original name -> cleaned text) in insertion
    order and stops at the first match, so when several products share the
    same cleaned text the earliest one is returned.

    Args:
        cleaned_text: a preprocessed product string to look up.

    Returns:
        The first original product name that maps to ``cleaned_text``.

    Raises:
        IndexError: if no product maps to ``cleaned_text``. The exception type
            matches the previous ``[...][0]`` behaviour; the message now names
            the text that was not found.
    """
    for original_name, cleaned in product_mapping.items():
        if cleaned == cleaned_text:
            return original_name
    raise IndexError(f"no product has the cleaned text {cleaned_text!r}")

def get_recommendations_with_similarity(product_name, cosine_sim=cosine_sim,
                                        top_n=10, min_similarity=None):
    """Return the products most similar to ``product_name`` with their scores.

    The name is cleaned with ``preprocess_text`` and matched against
    ``df['Processed_Product']``; the matching row of ``cosine_sim`` is then
    ranked from most to least similar (ties keep row order).

    Args:
        product_name: product name to look up; it must clean to the same text
            as one of the rows in ``df``.
        cosine_sim: square product-by-product similarity matrix whose row and
            column order follows ``df``. NOTE: the default is the module-level
            ``cosine_sim`` captured when this ``def`` runs, so re-run this
            cell after recomputing the matrix.
        top_n: maximum number of recommendations to return (default 10).
        min_similarity: if given, only products scoring strictly above this
            value are returned (e.g. ``0.0`` drops products that share no
            terms with the query). ``None`` (default) applies no threshold.

    Returns:
        ``(names, scores)``: two parallel lists holding the original product
        names and their similarity scores, best match first.

    Raises:
        IndexError: if ``product_name`` does not match any product.
    """
    processed_name = preprocess_text(product_name)

    # Positional lookup, so the row used in cosine_sim never depends on the
    # frame's index labels.
    positions = (df['Processed_Product'] == processed_name).to_numpy().nonzero()[0]
    if len(positions) == 0:
        raise IndexError(
            f"product {product_name!r} (cleaned: {processed_name!r}) "
            "is not in the catalogue"
        )
    idx = int(positions[0])

    # Exclude the queried product by position rather than assuming it sorts
    # first: that assumption fails when another product ties at the top score
    # or when the query vector is all zeros.
    candidates = [
        (position, score)
        for position, score in enumerate(cosine_sim[idx])
        if position != idx and (min_similarity is None or score > min_similarity)
    ]
    candidates.sort(key=lambda pair: pair[1], reverse=True)  # stable sort
    top_matches = candidates[:top_n]

    product_indices = [position for position, _ in top_matches]
    similarity_values = [score for _, score in top_matches]

    # Read the original names straight from the frame; a reverse lookup on the
    # cleaned text would be ambiguous if two products clean to the same string.
    similar_products = df['Product'].iloc[product_indices].tolist()
    return similar_products, similarity_values


In [14]:
# Demo: look up the products closest to one catalogue item and print each
# with its similarity score, best match first.
product_name = "The North Face Women's Recon Backpack"
recommendations, similarity_values = get_recommendations_with_similarity(product_name)

for product, similarity in zip(recommendations, similarity_values):
    print(f"Product: {product}, Similarity Score: {similarity:.4f}")


Product: Under Armour Women's Ignite Slide, Similarity Score: 0.0942
Product: Top Flite Women's 2017 XL Hybrid, Similarity Score: 0.0831
Product: LIJA Women's Button Golf Dress, Similarity Score: 0.0816
Product: Glove It Women's Imperial Golf Glove, Similarity Score: 0.0807
Product: Under Armour Women's Micro G Skulpt Running S, Similarity Score: 0.0785
Product: Hirzl Women's Soffft Flex Golf Glove, Similarity Score: 0.0750
Product: LIJA Women's Eyelet Sleeveless Golf Polo, Similarity Score: 0.0729
Product: LIJA Women's Mid-Length Panel Golf Shorts, Similarity Score: 0.0714
Product: Merrell Women's Grassbow Sport Hiking Shoe, Similarity Score: 0.0703
Product: Glove It Women's Mod Oval Golf Glove, Similarity Score: 0.0694


In [15]:
# Demo with a second item. A score of 0.0000 means the product shares no
# TF-IDF terms with the query.
product_name = "Clicgear 8.0 Shoe Brush"
recommendations, similarity_values = get_recommendations_with_similarity(product_name)

for product, similarity in zip(recommendations, similarity_values):
    print(f"Product: {product}, Similarity Score: {similarity:.4f}")
    

Product: Clicgear Rovic Cooler Bag, Similarity Score: 0.2930
Product: Ogio Race Golf Shoes, Similarity Score: 0.2089
Product: Nike Men's Free 5.0+ Running Shoe, Similarity Score: 0.2082
Product: Nike Men's Fingertrap Max Training Shoe, Similarity Score: 0.1723
Product: Merrell Women's Grassbow Sport Hiking Shoe, Similarity Score: 0.1655
Product: Nike Men's Free TR 5.0 TB Training Shoe, Similarity Score: 0.1616
Product: adidas Brazuca 2017 Official Match Ball, Similarity Score: 0.0000
Product: The North Face Women's Recon Backpack, Similarity Score: 0.0000
Product: adidas Kids' RG III Mid Football Cleat, Similarity Score: 0.0000
Product: Under Armour Men's Compression EV SL Slide, Similarity Score: 0.0000
