Assignment 4 NLP:  Similarity between words or docs

In [1]:
import pandas as pd
import numpy as np
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import jaccard_score

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

import re

[nltk_data] Downloading package stopwords to C:\Users\Cristina
[nltk_data]     Ortega\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to C:\Users\Cristina
[nltk_data]     Ortega\AppData\Roaming\nltk_data...
[nltk_data] Downloading package omw-1.4 to C:\Users\Cristina
[nltk_data]     Ortega\AppData\Roaming\nltk_data...


In [2]:
df = pd.read_csv("NikeProductDescriptions.csv")
df.head()

Unnamed: 0,Title,Subtitle,Product Description
0,Nike Air Force 1 '07,Men's Shoes,It doesn't get more legendary than this. Desig...
1,Nike Air Max Dawn SE,Men's Shoes,Find out what moves you with the Air Max Dawn....
2,Nike SB Dunk Low Pro Premium,Skate Shoes,Pack your style—on your feet. Bringing a fresh...
3,Nike Air Force 1 Mid '07 LX,Men's Shoes,The celebrations just keep coming. Unbox the A...
4,Nike Air Force 1 Mid '07,Men's Shoes,"Got your fave colour yet? No worries, the Colo..."


In [3]:
#Preprocessing

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess(text):
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    tokens = re.findall(r'\b\w+\b', text)
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return ' '.join(tokens)

df['processed'] = df['Product Description'].astype(str).apply(preprocess)
df[['Product Description', 'processed']].head()

Unnamed: 0,Product Description,processed
0,It doesn't get more legendary than this. Desig...,doesnt get legendary designed turn head nike a...
1,Find out what moves you with the Air Max Dawn....,find move air max dawn rooted sporty athletics...
2,Pack your style—on your feet. Bringing a fresh...,pack style foot bringing fresh twist iconic sk...
3,The celebrations just keep coming. Unbox the A...,celebration keep coming unbox af1 fireside fee...
4,"Got your fave colour yet? No worries, the Colo...",got fave colour yet worry colour month program...


In [5]:
#Similarity
# TF-IDF
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(df['processed'])

# Cosine similarity matrix
cos_sim_matrix = cosine_similarity(tfidf_matrix)

similarity_df = pd.DataFrame(cos_sim_matrix)
similarity_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,390,391,392,393,394,395,396,397,398,399
0,1.0,0.027543,0.016556,0.08686,0.035223,0.023649,0.154548,0.140641,0.045767,0.028719,...,0.025377,0.004368,0.015275,0.020578,0.10751,0.06408,0.021969,0.028344,0.003619,0.023673
1,0.027543,1.0,0.042535,0.0,0.035962,0.0,0.040576,0.0,0.076891,0.032762,...,0.010405,0.07542,0.009026,0.016744,0.034561,0.008607,0.042378,0.037284,0.086133,0.009706
2,0.016556,0.042535,1.0,0.022427,0.03722,0.05072,0.051814,0.0,0.026216,0.031952,...,0.0,0.0,0.013474,0.0,0.045642,0.033097,0.01162,0.046817,0.072048,0.0
3,0.08686,0.0,0.022427,1.0,0.147367,0.100282,0.015273,0.17124,0.075061,0.006384,...,0.021,0.007686,0.0,0.009587,0.010812,0.0,0.032668,0.011546,0.038257,0.01959
4,0.035223,0.035962,0.03722,0.147367,1.0,0.069928,0.063521,0.11529,0.067144,0.04024,...,0.0,0.005378,0.00994,0.0,0.014459,0.0,0.017821,0.037222,0.044163,0.0


In [6]:
#Jaccard Similarity

def jaccard_similarity(doc1, doc2):
    tokens1 = set(doc1.split())
    tokens2 = set(doc2.split())
    intersection = tokens1.intersection(tokens2)
    union = tokens1.union(tokens2)
    if len(union) == 0:
        return 0
    return len(intersection) / len(union)

#Jaccard similarity matrix
jaccard_matrix = np.zeros((len(df), len(df)))

for i in range(len(df)):
    for j in range(len(df)):
        jaccard_matrix[i][j] = jaccard_similarity(df['processed'][i], df['processed'][j])

jaccard_df = pd.DataFrame(jaccard_matrix)
jaccard_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,390,391,392,393,394,395,396,397,398,399
0,1.0,0.029851,0.014493,0.063492,0.0375,0.032787,0.098039,0.118644,0.05618,0.042254,...,0.033898,0.015873,0.032787,0.03125,0.075472,0.057971,0.026667,0.04918,0.014706,0.030303
1,0.029851,1.0,0.031746,0.0,0.026316,0.0,0.02,0.0,0.047059,0.029851,...,0.018182,0.053571,0.017544,0.016667,0.019608,0.014925,0.043478,0.035088,0.066667,0.016129
2,0.014493,0.031746,1.0,0.016129,0.039474,0.035088,0.04,0.0,0.022727,0.029412,...,0.0,0.0,0.017241,0.0,0.039216,0.014706,0.013889,0.052632,0.048387,0.0
3,0.063492,0.0,0.016129,1.0,0.085714,0.056604,0.020833,0.156863,0.060976,0.015152,...,0.018868,0.017857,0.0,0.017241,0.020408,0.0,0.029412,0.017857,0.033333,0.016667
4,0.0375,0.026316,0.039474,0.085714,1.0,0.043478,0.048387,0.056338,0.061856,0.050633,...,0.0,0.013889,0.014085,0.0,0.015385,0.0,0.02381,0.028169,0.026316,0.0


In [7]:
# Top similar descriptions
product_idx = 0
top_cosine_indices = np.argsort(cos_sim_matrix[product_idx])[::-1][1:4]
top_jaccard_indices = np.argsort(jaccard_matrix[product_idx])[::-1][1:4]

print("Original:", df['Product Description'][product_idx])
print("\nTop 3 Cosine Similar Descriptions:")
for idx in top_cosine_indices:
    print(f"\n({cos_sim_matrix[product_idx][idx]:.2f}) {df['Product Description'][idx]}")

print("\nTop 3 Jaccard Similar Descriptions:")
for idx in top_jaccard_indices:
    print(f"\n({jaccard_matrix[product_idx][idx]:.2f}) {df['Product Description'][idx]}")

Original: It doesn't get more legendary than this. Designed to turn heads, the Nike Air Force 1 '07 crosses hardwood comfort with off-court flair. Its crisp leather upper looks sleek and fresh, while lustrous Swoosh logos give off an almost iridescent look to add the perfect amount of flash to make you shine. Consider them a slam dunk.

Top 3 Cosine Similar Descriptions:

(0.29) The radiance lives on in the Nike Air Force 1 '07, the basketball original that puts a fresh spin on what you know best: durably stitched overlays, clean finishes and the perfect amount of flash to make you shine.

(0.23) You'll score major points in this legendary classic. Crossing hardwood comfort with off-court flair, this hoops original pairs crisp leather with playful paisley-print accents for nothing-but-net style. Plus, hidden Nike Air units and durable '80s construction add the comfort you've come to expect from the AF-1.

(0.16) Created for the hardwood but taken to the streets, the '80s b-ball icon re