In [2]:
from typing import List, Dict
import requests
from bs4 import BeautifulSoup
from collections import Counter
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

# Fetch the NLTK data packages this notebook relies on: the stopword lists
# (read through nltk.corpus.stopwords) and the 'punkt' tokenizer models
# (presumably what word_tokenize loads — confirm against the installed NLTK).
# NOTE(review): newer NLTK releases may expect a 'punkt_tab' resource instead
# of 'punkt' — verify for the version pinned in this environment.
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Guilherme\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Guilherme\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [1]:

def extract_keywords(url: str, top_n: int = 10, language: str = 'english', timeout: float = 30.0) -> List[str]:
    """Return the most frequent non-stopword words found on a web page.

    The page is downloaded, its visible text is extracted with BeautifulSoup,
    tokenized with NLTK, lower-cased, stripped of non-alphabetic tokens and of
    stopwords, and the remaining words are ranked by frequency.

    Args:
        url: Address of the page to analyse.
        top_n: Maximum number of keywords to return (default 10).
        language: Name of the NLTK stopword list used for filtering
            (default 'english'). Pass e.g. 'portuguese' for pages written in
            Portuguese; otherwise function words of that language are kept.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        Up to ``top_n`` lower-cased words, most frequent first.

    Raises:
        requests.exceptions.RequestException: If the request times out, the
            connection fails, or the server answers with a 4xx/5xx status.
    """
    # Without a timeout, requests can block indefinitely on a stalled server.
    response = requests.get(url, timeout=timeout)
    # Fail loudly instead of extracting "keywords" from an error page.
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')
    text = soup.get_text()

    # Load the stopword list once, as a set: the list would otherwise be
    # re-read and linearly scanned for every single token on the page.
    stop_words = set(stopwords.words(language))

    words = (token.lower() for token in word_tokenize(text) if token.isalpha())
    keyword_counter = Counter(word for word in words if word not in stop_words)
    return [word for word, _count in keyword_counter.most_common(top_n)]


def train_text_analysis_model(data: List[str], labels: List[int], max_words: int, embedding_dim: int, max_length: int, num_epochs: int) -> tf.keras.models.Sequential:
    """Train a small stacked-LSTM binary text classifier.

    The texts are tokenized with a Keras ``Tokenizer`` limited to ``max_words``
    entries (unknown words map to ``'<OOV>'``), converted to integer sequences
    and brought to a fixed length of ``max_length`` (longer sequences are cut
    at the end). The network is Embedding -> LSTM(64) -> LSTM(64) ->
    Dense(64, relu) -> Dense(1, sigmoid), compiled with Adam and binary
    cross-entropy.

    Note that the fitted tokenizer is local to this function and is not
    returned, so callers cannot reproduce the exact vocabulary afterwards.

    Args:
        data: One text per training sample.
        labels: One 0/1 label per entry of ``data`` (list or array-like).
        max_words: Vocabulary size for the tokenizer and the embedding layer.
        embedding_dim: Size of each embedding vector.
        max_length: Sequence length after padding/truncation.
        num_epochs: Number of training epochs.

    Returns:
        The trained ``tf.keras`` Sequential model.

    Raises:
        ValueError: If ``data`` is empty or ``data`` and ``labels`` differ in
            length.
    """
    if len(data) == 0:
        raise ValueError("data must contain at least one text")
    if len(data) != len(labels):
        raise ValueError(
            f"data and labels must have the same length ({len(data)} != {len(labels)})"
        )

    tokenizer = Tokenizer(num_words=max_words, oov_token='<OOV>')
    tokenizer.fit_on_texts(data)
    sequences = tokenizer.texts_to_sequences(data)
    padded_sequences = pad_sequences(sequences, maxlen=max_length, truncating='post')

    # The signature advertises List[int], but Keras does not reliably accept a
    # plain Python list of ints as targets; normalise to an ndarray first.
    label_array = np.asarray(labels)

    model = tf.keras.Sequential([
        tf.keras.layers.Embedding(input_dim=max_words, output_dim=embedding_dim, input_length=max_length),
        # First LSTM returns the full sequence so the second LSTM can consume it.
        tf.keras.layers.LSTM(64, return_sequences=True),
        tf.keras.layers.LSTM(64),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])

    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    model.fit(padded_sequences, label_array, epochs=num_epochs)

    return model

# Competitor product pages (flooranddecor.com) whose keywords are extracted below.
lista_urls_concorrente = [
    "https://www.flooranddecor.com/ledgers-stone/alamo-sandstone-random-ledger-panel-100899772.html",
    "https://www.flooranddecor.com/porcelain-ceramic-decoratives/zellige-pearl-opal-polished-ceramic-tile-100776673.html",
    "https://www.flooranddecor.com/limestone-basalt-stone/black-brushed-limestone-tile-100046804.html",
    "https://www.flooranddecor.com/limestone-basalt-stone/basalt-3-in.-hexagon-mosaic-100654334.html"
]

# Portobello product-line pages that are compared against the competitor pages.
lista_urls_portobello = [
    "https://www.portobello.com.br/revestimentos/linha/bloomy",
    "https://www.portobello.com.br/revestimentos/linha/filo",
    "https://www.portobello.com.br/revestimentos/linha/chelsea"
]


"""Extraindo keywords concorrência"""
def _extract_keywords_for_urls(urls: List[str]) -> Dict[str, List[str]]:
    """Extract keywords for each URL, printing every result as it is produced.

    Args:
        urls: Pages to analyse, in the order they should be fetched.

    Returns:
        Mapping from each URL to the keyword list returned by
        ``extract_keywords`` for it.
    """
    keywords_by_url: Dict[str, List[str]] = {}
    for page_url in urls:
        page_keywords = extract_keywords(page_url)
        keywords_by_url[page_url] = page_keywords
        print(f"Palavras-chave para {page_url}: {page_keywords}")
    return keywords_by_url


# Extract the competitor keywords.
concorrente_keywords: Dict[str, List[str]] = _extract_keywords_for_urls(lista_urls_concorrente)


# Extract the Portobello keywords.
# NOTE(review): these pages are in Portuguese but the keyword extraction
# filters with the English stopword list, so function words such as 'e',
# 'de' and 'para' end up among the keywords — confirm whether that is intended.
portobello_keywords: Dict[str, List[str]] = _extract_keywords_for_urls(lista_urls_portobello)


"""TReinar modelo com palavras chaves"""
"""Adicionar futuramente modelo com imagens e identificação de codigos hexadecimal de cores"""
# Training set: one space-joined keyword string per competitor page.
data = [" ".join(page_keywords) for page_keywords in concorrente_keywords.values()]
# NOTE(review): every sample is labelled 1, so the classifier only ever sees a
# single class — confirm whether negative examples are meant to be added.
labels = np.array([1 for _ in data])

# Hyper-parameters for the tokenizer, the embedding layer and the training loop.
max_words = 1000
embedding_dim = 100
max_length = 100
num_epochs = 10

model = train_text_analysis_model(
    data=data,
    labels=labels,
    max_words=max_words,
    embedding_dim=embedding_dim,
    max_length=max_length,
    num_epochs=num_epochs,
)


"""Compara palavras chaves entre Concorrente e Portobello"""
# Pairwise keyword overlap between every competitor page and every Portobello
# page: shared keywords divided by the size of the longer keyword list.
for url_concorrente, keywords_concorrente in concorrente_keywords.items():
    # Build the competitor set once per outer iteration instead of once per pair.
    concorrente_set = set(keywords_concorrente)
    for url_portobello, keywords_portobello in portobello_keywords.items():
        overlap = concorrente_set & set(keywords_portobello)
        largest = max(len(keywords_concorrente), len(keywords_portobello))
        # A page that yielded no keywords on either side would otherwise raise
        # ZeroDivisionError; report 0% similarity for that pair instead.
        similarity = len(overlap) / largest if largest else 0.0
        print(f"Similaridade entre {url_concorrente} e {url_portobello}: {similarity:.2%}")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Guilherme\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Guilherme\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Palavras-chave para https://www.flooranddecor.com/ledgers-stone/alamo-sandstone-random-ledger-panel-100899772.html: ['shop', 'projects', 'pro', 'add', 'services', 'design', 'home', 'installation', 'piece', 'back']
Palavras-chave para https://www.flooranddecor.com/porcelain-ceramic-decoratives/zellige-pearl-opal-polished-ceramic-tile-100776673.html: ['shop', 'projects', 'pro', 'add', 'tile', 'services', 'design', 'installation', 'home', 'back']
Palavras-chave para https://www.flooranddecor.com/limestone-basalt-stone/black-brushed-limestone-tile-100046804.html: ['shop', 'projects', 'pro', 'add', 'stone', 'services', 'design', 'home', 'installation', 'tile']
Palavras-chave para https://www.flooranddecor.com/limestone-basalt-stone/basalt-3-in.-hexagon-mosaic-100654334.html: ['shop', 'projects', 'add', 'pro', 'design', 'services', 'installation', 'home', 'piece', 'stone']
Palavras-chave para https://www.portobello.com.br/revestimentos/linha/bloomy: ['e', 'portobello', 'de', 'para', 'bloomy'