In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time

# Read the question URLs from the CSV and keep the 'Links' column as a plain list
links_df = pd.read_csv('StackOverflowQuestions.csv')
links_list = links_df['Links'].to_list()

def get_post_content(url, timeout=30):
    """Download a question page and flatten it into one plain-text string.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may wait indefinitely on a stalled
            connection, which would freeze the whole scraping loop.

    Returns:
        A string with the title, question body, answers and comments, each
        under its own heading. Sections that cannot be located are replaced
        by a "... not found" placeholder. If any exception is raised while
        fetching or parsing, a string starting with
        ``"Failed to retrieve content from"`` is returned instead of raising,
        so one bad link does not abort a long batch run.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Turn HTTP error statuses into exceptions
        soup = BeautifulSoup(response.text, 'html.parser')

        # Title of the question
        title = soup.find('a', class_='question-hyperlink')
        title_text = title.get_text() if title else "Title not found"

        # Every post body on the page shares the same markup: the first match
        # is treated as the question and the remaining ones as answers, so a
        # single query serves both.
        post_bodies = soup.find_all('div', class_='s-prose js-post-body')
        if post_bodies:
            question_text = post_bodies[0].get_text()
        else:
            question_text = "Question body not found"

        if len(post_bodies) > 1:
            answers_text = "\n\n".join(body.get_text() for body in post_bodies[1:])
        else:
            answers_text = "No answers found"

        # Comments (if any)
        comments = soup.find_all('span', class_='comment-copy')
        if comments:
            comments_text = "\n".join(comment.get_text() for comment in comments)
        else:
            comments_text = "No comments found"

        # Stitch all sections together
        return (
            f"Title: {title_text}\n\n"
            f"Question:\n{question_text}\n\n"
            f"Answers:\n{answers_text}\n\n"
            f"Comments:\n{comments_text}"
        )

    except Exception as e:
        # Deliberately best-effort: report the failure as text so the caller's
        # loop keeps going.
        return f"Failed to retrieve content from {url}: {str(e)}"

# Accumulator for the text extracted from every link
contents = []

# Fetch the pages one at a time, pausing after each request so the remote
# site is not hit in rapid succession
for url in links_list:
    contents.append(get_post_content(url))
    time.sleep(10)  # 10-second pause between requests


# Print everything that was extracted (optional)
for link_number, content in enumerate(contents, start=1):
    print(f"Content from link {link_number}:\n{content}\n{'-'*80}\n")


Content from link 1:
Title: Spring-boot UTF-8 resources

Question:

I'm using spring-boot (I'm following this tutorial), the current code is here.
My application is configured via @EnableAutoConfiguration, so I don't have web.xml, or other configuration files.
The application has a default handler for /**, files from projectRoot/src/main/webapp/ are served, but unfortunately with Content-Type: text/html;charset=ISO-8859-1. I'd like to serve (html files) with UTF-8 charset.
The static files are served by spring too, as far as I know: after launching the application, the following line apprears in the log: Mapped URL path [/**] onto handler of type [class org.springframework.web.servlet.resource.ResourceHttpRequestHandler] (github.com/spring-projects/spring-boot/blob/master/spring-boot-autoconfigure/src/main/java/org/springframework/boot/autoconfigure/web/WebMvcAutoConfiguration.java#L182, sorry, I cannot post more than 2 links)
Putting <meta charset="utf-8" /> or <meta http-equiv="Conte

In [43]:
import csv

# Optionally persist the extracted contents as a one-column CSV file
with open('contents.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['Content'])  # header row
    writer.writerows([item] for item in contents)  # one row per extracted page

print("Lista de conteúdos salva em contents.csv")


Lista de conteúdos salva em contents.csv


In [3]:
# Read the saved contents back from disk into a DataFrame
# (same encoding that was used when the file was written)
df = pd.read_csv('contents.csv', encoding='utf-8')

print(df)


                                               Content
0    Title: Spring-boot UTF-8 resources\n\nQuestion...
1    Title: Cannot build Spring 4 project with Mave...
2    Title: How to run Spring 4 sample code in ecli...
3    Title: Dependency issue after adding spring-da...
4    Title: Spring Boot with Tomcat container secur...
..                                                 ...
261  Title: What are the tradeoffs between the diff...
262  Title: MIP SDK Java Wrapper Sample Crashed Wit...
263  Title: What does the ".Configure()" call do in...
264  Title: How to enable correctly the Application...
265  Title: Raise Azure VM from marketplace image v...

[266 rows x 1 columns]


In [21]:
import re

# Build the cleaned text column in a single pass from the untouched 'Content'
# column, so re-running this cell always gives the same result:
#   1. strip basic punctuation (, . ! ? :)
#   2. drop the section labels added while scraping (title, question,
#      answers, comments) plus the token "https", case-insensitively
#   3. lower-case everything
# The patterns are raw strings: a backslash escape such as "\." inside a
# normal string literal is an invalid string escape and makes Python emit a
# warning.
df['content_processed'] = (
    df['Content']
    .str.replace(r'[,.!?:]', '', regex=True)
    .str.replace(
        r'\b(title|question|answers|comments|https)\b',
        '',
        flags=re.IGNORECASE,
        regex=True,
    )
    .str.lower()
)

# Show the result
print(df)


                                               Content  \
0    Title: Spring-boot UTF-8 resources\n\nQuestion...   
1    Title: Cannot build Spring 4 project with Mave...   
2    Title: How to run Spring 4 sample code in ecli...   
3    Title: Dependency issue after adding spring-da...   
4    Title: Spring Boot with Tomcat container secur...   
..                                                 ...   
261  Title: What are the tradeoffs between the diff...   
262  Title: MIP SDK Java Wrapper Sample Crashed Wit...   
263  Title: What does the ".Configure()" call do in...   
264  Title: How to enable correctly the Application...   
265  Title: Raise Azure VM from marketplace image v...   

                                     content_processed  
0     spring-boot utf-8 resources\n\n\n\ni'm using ...  
1     cannot build spring 4 project with maven\n\n\...  
2     how to run spring 4 sample code in eclipse ke...  
3     dependency issue after adding spring-data-res...  
4     spring boot 

  df['content_processed'] = df['Content'].apply(lambda x: re.sub('()[,\.!?:]', '', x))


In [23]:
!pip install gensim
from gensim.utils import simple_preprocess

import gensim
from gensim.utils import simple_preprocess
import nltk
from nltk.corpus import stopwords

# Make sure the NLTK stop-word corpus is available locally
nltk.download('stopwords')

# English stop words plus a few extra tokens that should also be discarded
stop_words = stopwords.words('english') + ['from', 'subject', 're', 'edu', 'use']

# Turn each document into a list of word tokens
def sent_to_words(sentences):
    """Lazily yield one token list per entry of ``sentences``.

    Every entry is converted with ``str()`` and passed to gensim's
    ``simple_preprocess``. ``deacc=True`` asks gensim to strip accent marks
    from the tokens (see the gensim documentation for the full tokenisation
    rules).
    """
    tokenize = gensim.utils.simple_preprocess
    yield from (tokenize(str(sentence), deacc=True) for sentence in sentences)

# Drop stop words from every document
def remove_stopwords(texts):
    """Return one token list per document with the stop words filtered out.

    Args:
        texts: Iterable of documents. A document that is already a list or
            tuple of tokens is used as-is; anything else is converted with
            ``str()`` and tokenised with ``simple_preprocess``.

    Returns:
        A list with one list of remaining tokens per input document.

    Previously every document went through ``str()`` first, so a token list
    was turned into its printed representation and tokenised a second time.
    """
    # Set membership is O(1); the module-level ``stop_words`` is a list.
    stop_set = set(stop_words)

    cleaned = []
    for doc in texts:
        if isinstance(doc, (list, tuple)):
            tokens = doc
        else:
            tokens = simple_preprocess(str(doc))
        cleaned.append([word for word in tokens if word not in stop_set])
    return cleaned

# Pull the cleaned text out of the DataFrame as a list of documents
data = df['content_processed'].tolist()

# Tokenise every document, then filter out the stop words
data_words = remove_stopwords(list(sent_to_words(data)))

# Preview the first 30 tokens of the first processed document
print(data_words[0][:30])




[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\tyumi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['spring', 'boot', 'utf', 'resources', 'using', 'spring', 'boot', 'following', 'tutorial', 'current', 'code', 'application', 'configured', 'via', 'webxml', 'configuration', 'files', 'application', 'default', 'handler', 'files', 'projectroot', 'src', 'main', 'webapp', 'served', 'unfortunately', 'content', 'type', 'text']


In [24]:
import gensim.corpora as corpora

# Create the dictionary from the tokenised documents
id2word = corpora.Dictionary(data_words)

# The tokenised documents are the input for the corpus
texts = data_words

# Term-document frequency: convert every document with doc2bow
corpus = list(map(id2word.doc2bow, texts))

# View the first 30 entries of the first document
print(corpus[0][:30])

[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 14), (8, 1), (9, 1), (10, 2), (11, 2), (12, 1), (13, 1), (14, 2), (15, 1), (16, 1), (17, 11), (18, 2), (19, 1), (20, 4), (21, 7), (22, 4), (23, 1), (24, 1), (25, 2), (26, 1), (27, 1), (28, 2), (29, 6)]


In [25]:
from pprint import pprint

# Number of topics to extract
num_topics = 10

# Seed for the model's random number generator. LDA training is stochastic;
# without a fixed seed the topics differ on every run of this cell.
# NOTE(review): with several worker processes small run-to-run differences
# may remain - confirm against the gensim documentation if exact
# reproducibility matters.
LDA_RANDOM_STATE = 42

# Build the LDA model
lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=num_topics,
                                       random_state=LDA_RANDOM_STATE)

# Print the keywords of each topic
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.018*"spring" + 0.010*"boot" + 0.008*"application" + 0.007*"error" + '
  '0.007*"import" + 0.006*"azure" + 0.006*"class" + 0.006*"app" + '
  '0.005*"public" + 0.005*"file"'),
 (1,
  '0.008*"azure" + 0.007*"spring" + 0.007*"using" + 0.007*"version" + '
  '0.006*"api" + 0.006*"string" + 0.006*"app" + 0.005*"application" + '
  '0.005*"artifactid" + 0.005*"http"'),
 (2,
  '0.016*"artifactid" + 0.015*"groupid" + 0.012*"spring" + 0.012*"dependency" '
  '+ 0.010*"version" + 0.008*"boot" + 0.008*"error" + 0.006*"name" + '
  '0.006*"id" + 0.006*"import"'),
 (3,
  '0.011*"spring" + 0.009*"jar" + 0.007*"app" + 0.007*"api" + 0.007*"token" + '
  '0.007*"code" + 0.007*"azure" + 0.006*"get" + 0.005*"id" + 0.005*"user"'),
 (4,
  '0.014*"spring" + 0.010*"groupid" + 0.010*"jar" + 0.010*"artifactid" + '
  '0.010*"version" + 0.010*"java" + 0.009*"dependency" + 0.008*"error" + '
  '0.007*"boot" + 0.007*"application"'),
 (5,
  '0.014*"spring" + 0.008*"boot" + 0.008*"main" + 0.007*"error" + 0.007*"i

In [26]:
#!pip install pyLDAvis

import warnings
import os
import pyLDAvis.gensim
import pickle
import pyLDAvis

# Silence deprecation warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

# Enable visualization in notebooks
pyLDAvis.enable_notebook()

# Output file name; the suffix is the number of topics
LDAvis_data_filepath = f'ldavis_prepared_{num_topics}'

# Toggle for the visualization preparation step (currently always executed)
PREPARE_LDAVIS = True
if PREPARE_LDAVIS:
    LDAvis_prepared = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
    with open(LDAvis_data_filepath, 'wb') as pickle_out:
        pickle.dump(LDAvis_prepared, pickle_out)

# Load the prepared pyLDAvis data back from disk.
# NOTE: pickle.load executes arbitrary code from the file - only load pickles
# produced by this notebook.
with open(LDAvis_data_filepath, 'rb') as pickle_in:
    LDAvis_prepared = pickle.load(pickle_in)

# Save the visualization as HTML
pyLDAvis.save_html(LDAvis_prepared, LDAvis_data_filepath + '.html')

# Show the visualization
LDAvis_prepared
