In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import json
from sentence_transformers import InputExample, SentenceTransformer, losses, util
import re
from torch.utils.data import DataLoader
import torch

# Data Loading and Cleaning

In [None]:
# Substrings looked for in each line of output_pos.txt; a line is kept when it
# contains at least one of them.
pos_websites = [
    "anchal",
    "etica",
    "nudiejean",
    "terra",
    "happy",
    "joinye",
    "orbasic",
    "yesfr",
    "recreate",
    "fairindigo",
]

# Same idea for output_neg.txt.
neg_websites = [
    "mango",
    "nasty",
    "oldnavy",
    "fashionnova",
    "missguide",
    "yesst",
    "banana",
    "motelr",
    "zaful",
    "edikted",
]

In [None]:
# Collects the [url, keywords] pairs parsed from output_neg.txt.
neg_samples = list()

In [None]:
# Start from an empty list inside this cell so that re-running the cell does
# not append the same lines a second time.
neg_samples = []

with open("output_neg.txt", "r") as file:
    for line in file:
        # Keep each line once, even when it contains several of the tracked
        # substrings (the match is done against the whole line, keywords included).
        if any(site in line for site in neg_websites):
            # Split on the LAST ":" only: the URL part itself contains colons.
            # Result is [url, keyword_string].
            neg_samples.append(line.rsplit(":", 1))

In [None]:
neg_samples[-1]

['https://bananarepublic.gap.com/shop/womens-western-outerwear-0ajz03a?cl=true&nav=smartlink:women::Coats%20%26%20Jackets:Trench%20Coats%20%26%20Jackets',
 ' season, statement, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , \n']

In [None]:
# Collects the [url, keywords] pairs parsed from output_pos.txt.
pos_samples = list()

In [None]:
# Start from an empty list inside this cell so that re-running the cell does
# not append the same lines a second time.
pos_samples = []

with open("output_pos.txt", "r") as file:
    for line in file:
        # Keep each line once, even when it contains several of the tracked
        # substrings (the match is done against the whole line, keywords included).
        if any(site in line for site in pos_websites):
            # Split on the LAST ":" only: the URL part itself contains colons.
            # Result is [url, keyword_string].
            pos_samples.append(line.rsplit(":", 1))

In [None]:
def get_website_text(url):
    """Download a page and return the text of its ``<p>`` elements.

    Parameters
    ----------
    url : str
        Address of the page to fetch.

    Returns
    -------
    str
        The text of every ``<p>`` tag joined with single spaces. An empty
        string is returned when the request times out, fails for any other
        ``requests`` reason, or the server answers with a 4xx/5xx status.
    """
    # Browser-like User-Agent sent with every request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36'
    }
    try:
        response = requests.get(url, headers=headers, timeout=10)
        # Without this, a 403/404/500 body would be scraped as if it were the
        # real page. HTTPError is a RequestException, so it is reported below.
        response.raise_for_status()
    except requests.exceptions.Timeout:
        print(f"Timeout occurred while fetching: {url}")
        return ""
    except requests.exceptions.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return ""

    soup = BeautifulSoup(response.text, 'html.parser')
    return ' '.join(p.get_text() for p in soup.find_all('p'))

In [None]:
# Maps each positive-sample URL to [scraped page text, keyword string].
pos_websitsText = {}

# NOTE(review): the slice starts at index 1, so the first parsed sample is
# never fetched -- confirm that skipping it is intentional.
for sample in pos_samples[1:]:
    website = sample[0]
    entry = [get_website_text(website)]
    pos_websitsText[website] = entry
    # sample[1] is the keyword part of the line; joining a str with '' yields
    # an equal str.
    entry.append(''.join(sample[1]))

In [None]:
# Maps each negative-sample URL to [scraped page text, keyword string].
neg_websitsText = {}

# NOTE(review): the slice starts at index 1, so the first parsed sample is
# never fetched -- confirm that skipping it is intentional.
for sample in neg_samples[1:]:
    website = sample[0]
    entry = [get_website_text(website)]
    neg_websitsText[website] = entry
    # sample[1] is the keyword part of the line; joining a str with '' yields
    # an equal str.
    entry.append(''.join(sample[1]))

In [None]:
# pos.json is read as a mapping whose keys become rows and whose values supply
# the 'text' and 'keywords' columns.
# NOTE(review): presumably a dump of pos_websitsText -- the dump step is not in
# this notebook, confirm.
with open("pos.json", encoding="utf-8") as f:
    pos_data = json.load(f)

df_pos = (
    pd.DataFrame.from_dict(pos_data, orient='index', columns=['text', 'keywords'])
    .reset_index()                       # move the key out of the index ...
    .rename(columns={'index': 'url'})    # ... and call that column 'url'
)

In [None]:
df_pos.head()

Unnamed: 0,url,text,keywords
0,https://anchalproject.org/,Set the table for a bright Easter brunch. Anch...,"organic, social impact, AZO-free, , , , , , ,..."
1,https://www.nudiejeans.com/,Your wishlist is empty. Nudie Jeans kicked off...,"organic, , , , , , , , , , , , , , , , , , , ..."
2,https://www.fairindigo.com/,sustainable - fairly made - forever in fashion...,"organic, fair trade, , , , , , , , , , , , , ..."
3,https://recreateclothing.co.nz/,Every garment empowers its maker ♡ Naturally d...,", , , , , , , , , , , , , , , , , , , , , , ,..."
4,https://eticadenim.com/,"Sign up for exclusive offers, original stories...","REACh, , , , , , , , , , , , , , , , , , , , ..."


In [None]:
# neg.json is read as a mapping whose keys become rows and whose values supply
# the 'text' and 'keywords' columns.
# NOTE(review): presumably a dump of neg_websitsText -- the dump step is not in
# this notebook, confirm.
with open("neg.json", encoding="utf-8") as f:
    neg_data = json.load(f)

df_neg = (
    pd.DataFrame.from_dict(neg_data, orient='index', columns=['text', 'keywords'])
    .reset_index()                       # move the key out of the index ...
    .rename(columns={'index': 'url'})    # ... and call that column 'url'
)

In [None]:
df_neg.head()

Unnamed: 0,url,text,keywords
0,https://www.zaful.com/clothes-e_1,Welcome 15% OFF For New Users Or Join With ...,"trending, , , , , , , , , , , , , , , , , , ,..."
1,https://www.nastygal.com/womens,60% OFF Everything!* Shop Now! Download the ap...,"off everything, 60% off, , , , , , , , , , , ..."
2,https://oldnavy.gap.com/,Open & use a Navyist Rewards Credit Card & get...,", , , , , , , , , , , , , , , , , , , , , , ,..."
3,https://www.fashionnova.com/,Dresses Matching Sets Swim Tops Denim Shorts S...,"shop now, , , , , , , , , , , , , , , , , , ,..."
4,https://us.motelrocks.com/,UP TO 50% OFF SELECTED STYLES FREE USA SHIPPIN...,"60% off, off everything, , , , , , , , , , , ..."


In [None]:
# Tag every row with its class: rows from pos.json are "Yes", rows from
# neg.json are "No".
ECO_LABEL_COLUMN = "eco_friendly"
df_pos[ECO_LABEL_COLUMN] = "Yes"
df_neg[ECO_LABEL_COLUMN] = "No"

In [None]:
df_pos.head()

Unnamed: 0,url,text,keywords,eco_friendly
0,https://anchalproject.org/,Set the table for a bright Easter brunch. Anch...,"organic, social impact, AZO-free, , , , , , ,...",Yes
1,https://www.nudiejeans.com/,Your wishlist is empty. Nudie Jeans kicked off...,"organic, , , , , , , , , , , , , , , , , , , ...",Yes
2,https://www.fairindigo.com/,sustainable - fairly made - forever in fashion...,"organic, fair trade, , , , , , , , , , , , , ...",Yes
3,https://recreateclothing.co.nz/,Every garment empowers its maker ♡ Naturally d...,", , , , , , , , , , , , , , , , , , , , , , ,...",Yes
4,https://eticadenim.com/,"Sign up for exclusive offers, original stories...","REACh, , , , , , , , , , , , , , , , , , , , ...",Yes


In [None]:
df_neg.head()

Unnamed: 0,url,text,keywords,eco_friendly
0,https://www.zaful.com/clothes-e_1,Welcome 15% OFF For New Users Or Join With ...,"trending, , , , , , , , , , , , , , , , , , ,...",No
1,https://www.nastygal.com/womens,60% OFF Everything!* Shop Now! Download the ap...,"off everything, 60% off, , , , , , , , , , , ...",No
2,https://oldnavy.gap.com/,Open & use a Navyist Rewards Credit Card & get...,", , , , , , , , , , , , , , , , , , , , , , ,...",No
3,https://www.fashionnova.com/,Dresses Matching Sets Swim Tops Denim Shorts S...,"shop now, , , , , , , , , , , , , , , , , , ,...",No
4,https://us.motelrocks.com/,UP TO 50% OFF SELECTED STYLES FREE USA SHIPPIN...,"60% off, off everything, , , , , , , , , , , ...",No


# Model Training - RUN FROM HERE (starts from the saved websites_data.csv)

In [None]:
# The combined frame is read back from disk. The commented-out lines below
# show how that CSV was written from df_pos and df_neg:
#   all_df = pd.concat([df_pos, df_neg], ignore_index=True)
#   all_df.to_csv("websites_data.csv", index=False, encoding='utf-8', errors='ignore')
WEBSITES_CSV = "websites_data.csv"
all_df = pd.read_csv(WEBSITES_CSV)

In [None]:
all_df.head()

Unnamed: 0,url,text,keywords,eco_friendly
0,https://anchalproject.org/,Set the table for a bright Easter brunch. Anch...,"organic, social impact, AZO-free, , , , , , ,...",Yes
1,https://www.nudiejeans.com/,Your wishlist is empty. Nudie Jeans kicked off...,"organic, , , , , , , , , , , , , , , , , , , ...",Yes
2,https://www.fairindigo.com/,sustainable - fairly made - forever in fashion...,"organic, fair trade, , , , , , , , , , , , , ...",Yes
3,https://recreateclothing.co.nz/,Every garment empowers its maker ♡ Naturally d...,", , , , , , , , , , , , , , , , , , , , , , ,...",Yes
4,https://eticadenim.com/,"Sign up for exclusive offers, original stories...","REACh, , , , , , , , , , , , , , , , , , , , ...",Yes


In [None]:
# Empty containers; website_embeddings is filled after fine-tuning, eco_status
# is not referenced again in the visible cells.
website_embeddings = {}
eco_status = {}

# Creating dataset to fine-tune

In [None]:
#pip install datasets torch

In [None]:
# Garment words used as the query side of every training pair.
clothing = [
    "sweater",
    "jeans",
    "t-shirt",
    "dress",
    "jacket",
    "pants",
    "coat",
]

In [None]:
# Build one (garment word, page text) training pair per row and garment.
# Label is 1.0 only when the site is eco-friendly AND the garment is mentioned
# in the page text or URL; every other pair is labelled 0.0.
train_examples = []

for _, row in all_df.iterrows():
    # Empty fields come back from the CSV as NaN; str(NaN) would put the
    # literal word "nan" into the training text, so map missing values to "".
    text = "" if pd.isna(row['text']) else str(row['text'])
    keywords = "" if pd.isna(row['keywords']) else str(row['keywords'])
    url = str(row['url']).lower()

    # The keyword column is a comma-separated list padded with blanks
    # ("organic, , , ,"); keep only the non-empty entries.
    keywords_clean = ", ".join(k.strip() for k in keywords.split(",") if k.strip())

    is_eco = row['eco_friendly'] == "Yes"
    # Eco-friendly pages are prefixed with an "[ECO]" marker and their keywords.
    special_tokens = f"[ECO] {keywords_clean} {text}" if is_eco else text

    # The URL is matched lowercased, so match the page text the same way;
    # otherwise "Jeans" or "Dresses" on a page would never count as a mention.
    text_lower = text.lower()

    for cloth in clothing:
        mentioned = cloth in text_lower or cloth in url
        label = 1.0 if (mentioned and is_eco) else 0.0
        train_examples.append(InputExample(texts=[cloth, special_tokens], label=label))

In [None]:
# Start from the pretrained 'all-MiniLM-L6-v2' checkpoint.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Shuffled mini-batches of 8 training pairs, scored with CosineSimilarityLoss
# against each pair's float label.
dataloader = DataLoader(train_examples, batch_size=8, shuffle=True)
loss = losses.CosineSimilarityLoss(model)

# Single pass over the data, then write the tuned model to ./fine_tuned_model.
train_objectives = [(dataloader, loss)]
model.fit(train_objectives=train_objectives, epochs=1)
model.save('fine_tuned_model')

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33msssure[0m ([33msssure-university-of-michigan[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
500,0.0538
1000,0.0414
1500,0.0349


In [None]:
# Reload the model from the directory written by model.save('fine_tuned_model').
FINE_TUNED_MODEL_DIR = 'fine_tuned_model'
model = SentenceTransformer(FINE_TUNED_MODEL_DIR)

In [None]:
# Encode every page once with the fine-tuned model, keyed by its URL.
website_embeddings = {}

for _, row in all_df.iterrows():
    raw_text = row['text']
    # Empty fields come back from the CSV as NaN. str(NaN) is the word "nan",
    # which would give the page a meaningless but rankable embedding, so pages
    # without any text are left out of the index instead.
    if pd.isna(raw_text) or not str(raw_text).strip():
        continue
    website_embeddings[row['url']] = model.encode(str(raw_text), convert_to_tensor=True)

In [None]:
# Save under the file name that retrieve_websites() loads ("website_emb.pt").
# Writing "website_embeddings.pt" left the loader reading a missing or stale file.
torch.save(website_embeddings, "website_emb.pt")

In [None]:
def retrieve_websites(query, top_n=5):
    """Return the URLs whose stored embedding is most similar to ``query``.

    The embeddings are loaded from "website_emb.pt" on every call and the
    query is encoded with the module-level ``model``.

    Parameters
    ----------
    query : str
        Search text to encode.
    top_n : int, default 5
        Upper bound on the number of URLs returned (applied as a slice).

    Returns
    -------
    list
        URLs ordered by decreasing cosine similarity to the query.
    """
    # Assumes the file holds a dict of URL -> embedding tensor (it is iterated
    # with .items()) -- TODO confirm against the cell that writes it.
    stored = torch.load("website_emb.pt")
    query_vec = model.encode(query, convert_to_tensor=True)

    scores = {
        site: util.pytorch_cos_sim(query_vec, site_vec).item()
        for site, site_vec in stored.items()
    }

    # Stable sort: URLs with equal scores keep their stored order.
    ranked = sorted(scores, key=scores.get, reverse=True)
    return ranked[:top_n]

In [None]:
# Quick check: the five best-matching URLs for the query "sweater".
top5 = retrieve_websites(query="sweater", top_n=5)
print(top5)

['https://www.fairindigo.com/products/fair-indigo-womens-fair-trade-organic-stripe-sleeveless-sweater-shell', 'https://www.fairindigo.com/products/fair-indigo-womens-fair-trade-organic-relaxed-v-neck-sweater', 'https://www.fairindigo.com/collections/alpaca-sweaters', 'https://www.fairindigo.com/products/fair-indigo-womens-fair-trade-organic-boyfriend-cardigan', 'https://orbasics.com/products/kids-sweater']


In [None]:
import torch
from sentence_transformers import SentenceTransformer, util

def retrieve_websites(query, top_n=5):
    """CPU-only lookup of the URLs most similar to ``query``.

    Each call loads the model from "fine_tuned_model" and the embeddings from
    "website_emb.pt", both placed on the CPU.

    Parameters
    ----------
    query : str
        Search text to encode.
    top_n : int, default 5
        Upper bound on the number of URLs returned (applied as a slice).

    Returns
    -------
    list
        URLs ordered by decreasing cosine similarity to the query.
    """
    cpu = 'cpu'
    encoder = SentenceTransformer("fine_tuned_model", device=cpu)
    # Assumes the file holds a dict of URL -> embedding tensor (it is iterated
    # with .items()) -- TODO confirm against the cell that writes it.
    stored = torch.load("website_emb.pt", map_location=torch.device(cpu))
    query_vec = encoder.encode(query, convert_to_tensor=True, device=cpu)

    scored = []
    for site, site_vec in stored.items():
        # Move the stored vector to the CPU before comparing it with the query.
        score = util.pytorch_cos_sim(query_vec, site_vec.to(cpu)).item()
        scored.append((site, score))

    # Stable sort: URLs with equal scores keep their stored order.
    scored.sort(key=lambda pair: pair[1], reverse=True)
    return [site for site, _ in scored[:top_n]]


In [None]:
# Quick check of the CPU-only variant: five best-matching URLs for "sweater".
top_5_websites = retrieve_websites(query="sweater", top_n=5)
print(top_5_websites)

['https://www.fairindigo.com/products/fair-indigo-womens-fair-trade-organic-stripe-sleeveless-sweater-shell', 'https://www.fairindigo.com/products/fair-indigo-womens-fair-trade-organic-relaxed-v-neck-sweater', 'https://www.fairindigo.com/collections/alpaca-sweaters', 'https://www.fairindigo.com/products/fair-indigo-womens-fair-trade-organic-boyfriend-cardigan', 'https://orbasics.com/products/kids-sweater']
