In [20]:
import pandas as pd
import string
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

# spaCy English pipeline, loaded with the "ner" and "parser" components disabled
nlp = spacy.load(
    "en_core_web_sm",
    disable=["ner", "parser"],
)

# Read the Nike product CSV and report its size and column names
df = pd.read_csv("NikeProductDescriptions.csv")
print(f"Loaded dataset with {len(df)} rows.")
print("Columns found:", df.columns.tolist())

# Keep only the 'Product Description' column: missing values are dropped and
# every remaining entry is converted to a plain Python string.
description_column = df["Product Description"]
descriptions = description_column.dropna().astype(str).tolist()

# Preprocessing: use spaCy to tokenize, lowercase, lemmatize, remove stopwords and punctuation
def preprocess(text):
    """Return ``text`` normalized to a space-separated string of lemmas.

    The text is lowercased and passed through the module-level spaCy
    pipeline ``nlp``. Tokens that are stop words, punctuation, or not purely
    alphabetic are discarded; each remaining token contributes its lemma.
    """
    kept_lemmas = []
    for tok in nlp(text.lower()):
        # Skip anything that is not a meaningful alphabetic word
        if tok.is_stop or tok.is_punct or not tok.is_alpha:
            continue
        kept_lemmas.append(tok.lemma_)
    return " ".join(kept_lemmas)

# Normalize every description with preprocess(); tqdm shows a progress bar
print("Preprocessing product descriptions with spaCy...")
descriptions_cleaned = [preprocess(raw_text) for raw_text in tqdm(descriptions)]

# Fit a TF-IDF model on the cleaned text and vectorize it in one step
print("Computing TF-IDF vectors...")
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(descriptions_cleaned)
print("TF-IDF matrix shape:", tfidf_matrix.shape)

# Pairwise cosine similarity between all rows of the TF-IDF matrix
print("Calculating cosine similarity...")
cosine_sim_matrix = cosine_similarity(tfidf_matrix)

# Jaccard similarity
def jaccard_sim(text1, text2):
    """Jaccard similarity between the whitespace-token sets of two strings.

    Each string is split on whitespace and reduced to its set of distinct
    tokens; the result is ``|A & B| / |A | B|``.

    Args:
        text1: First text (tokens separated by whitespace).
        text2: Second text (tokens separated by whitespace).

    Returns:
        A float in ``[0.0, 1.0]``. When neither text contains any token the
        union is empty and ``0.0`` is returned instead of dividing by zero.
    """
    set1, set2 = set(text1.split()), set(text2.split())
    union = len(set1 | set2)
    if union == 0:
        # Both texts are empty/whitespace-only; return a float so every code
        # path yields the same type.
        return 0.0
    return len(set1 & set2) / union

print("Calculating Jaccard similarities...")
# Jaccard similarity is symmetric (sim(a, b) == sim(b, a)), so only the upper
# triangle, diagonal included, is computed and each value is mirrored into the
# lower triangle. The resulting matrix is identical to evaluating every
# ordered pair, with roughly half the calls to jaccard_sim.
n_docs = len(descriptions_cleaned)
jaccard_sim_matrix = [[0.0] * n_docs for _ in range(n_docs)]
for i in tqdm(range(n_docs)):
    for j in range(i, n_docs):
        sim = jaccard_sim(descriptions_cleaned[i], descriptions_cleaned[j])
        jaccard_sim_matrix[i][j] = sim
        jaccard_sim_matrix[j][i] = sim

# Show top 5 most similar products to the first one (Cosine)
print("\nTop 5 products most similar to the first one (Cosine similarity):")
first_cosine = cosine_sim_matrix[0]
# Rank all products by descending similarity, then remove the query product
# (index 0) by index rather than by position. With duplicate descriptions or
# floating-point rounding the query is not guaranteed to sort first, so a
# positional [1:6] slice could drop a real neighbour and list product 0 as
# similar to itself.
ranked_cosine = first_cosine.argsort()[::-1]
top_indices_cosine = ranked_cosine[ranked_cosine != 0][:5]
for idx in top_indices_cosine:
    print(f"Index {idx}: Cosine = {first_cosine[idx]:.3f}, Desc: {descriptions[idx][:60]}...")

# Show top 5 most similar products to the first one (Jaccard)
print("\nTop 5 products most similar to the first one (Jaccard similarity):")
first_jaccard = jaccard_sim_matrix[0]
# Same approach as for cosine: sort by descending score, then exclude the
# query product explicitly by its index.
ranked_jaccard = sorted(range(len(first_jaccard)), key=lambda i: first_jaccard[i], reverse=True)
top_indices_jaccard = [i for i in ranked_jaccard if i != 0][:5]
for idx in top_indices_jaccard:
    print(f"Index {idx}: Jaccard = {first_jaccard[idx]:.3f}, Desc: {descriptions[idx][:60]}...")

# Optionally save results
# pd.DataFrame(cosine_sim_matrix).to_csv("cosine_similarity.csv", index=False)
# pd.DataFrame(jaccard_sim_matrix).to_csv("jaccard_similarity.csv", index=False)


Loaded dataset with 400 rows.
Columns found: ['Title', 'Subtitle', 'Product Description']
Preprocessing product descriptions with spaCy...


100%|██████████| 400/400 [00:03<00:00, 130.03it/s]


Computing TF-IDF vectors...
TF-IDF matrix shape: (400, 1833)
Calculating cosine similarity...
Calculating Jaccard similarities...


100%|██████████| 400/400 [00:01<00:00, 260.18it/s]


Top 5 products most similar to the first one (Cosine similarity):
Index 159: Cosine = 0.259, Desc: You'll score major points in this legendary classic. Crossin...
Index 14: Cosine = 0.206, Desc: The radiance lives on in the Nike Air Force 1 '07, the baske...
Index 343: Cosine = 0.197, Desc: Created for the hardwood but taken to the streets, the '80s ...
Index 164: Cosine = 0.159, Desc: The Air Jordan 1 Mid brings full-court style and premium com...
Index 27: Cosine = 0.158, Desc: Have you ever had déjà shoe? Flash back to one of the first ...

Top 5 products most similar to the first one (Jaccard similarity):
Index 159: Jaccard = 0.200, Desc: You'll score major points in this legendary classic. Crossin...
Index 14: Jaccard = 0.167, Desc: The radiance lives on in the Nike Air Force 1 '07, the baske...
Index 31: Jaccard = 0.130, Desc: More than perhaps any other silhouette, the Air More Uptempo...
Index 37: Jaccard = 0.130, Desc: Vintage details elevate an icon to bring you timeless sty




In [21]:
# Write both similarity matrices to CSV files, without an index column
for csv_name, sim_matrix in (
    ("cosine_similarity.csv", cosine_sim_matrix),
    ("jaccard_similarity.csv", jaccard_sim_matrix),
):
    pd.DataFrame(sim_matrix).to_csv(csv_name, index=False)


# Nike Product Description Similarity Analysis

- Selected the `Product Description` column and dropped rows with missing descriptions.
- Cleaned the text with spaCy: lowercased, removed stopwords, punctuation, and non-alphabetic tokens, then lemmatized.
- Built TF-IDF vectors and computed pairwise cosine similarities.
- Calculated pairwise Jaccard similarities on the token sets of the cleaned text.
- Identified the top 5 most similar products for the first item using both metrics.