In [None]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time
import csv
import re

In [5]:
# Read the CSV of question URLs and keep its 'Links' column as a plain Python list
questions_csv = 'StackOverflowQuestions.csv'
links_df = pd.read_csv(questions_csv)
links_list = links_df['Links'].to_list()

def get_post_content(url, timeout=30):
    """Fetch one Stack Overflow question page and flatten it into a single text blob.

    Args:
        url: Address of the question page to download.
        timeout: Seconds handed to ``requests.get`` as its ``timeout`` argument.
            New, optional parameter; existing one-argument calls keep working.

    Returns:
        A string with "Title:", "Question:", "Answers:" and "Comments:" sections.
        A section whose elements are not found on the page holds a placeholder
        ("Title not found", "Question body not found", "No answers found",
        "No comments found"). If anything raises along the way (network error,
        HTTP error status, parsing problem), the exception is not propagated:
        a "Failed to retrieve content from <url>: <error>" string is returned
        instead, so callers always receive a string.
    """
    try:
        # Without a timeout a stalled connection would block the whole scraping loop
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Turn 4xx/5xx responses into exceptions
        soup = BeautifulSoup(response.text, 'html.parser')

        # Question title
        title = soup.find('a', class_='question-hyperlink')
        title_text = title.get_text() if title else "Title not found"

        # The question body and every answer body share the same CSS classes, so a
        # single query is enough: the first match is treated as the question and
        # the remaining matches as the answers.
        post_bodies = soup.find_all('div', class_='s-prose js-post-body')
        question_text = post_bodies[0].get_text() if post_bodies else "Question body not found"
        if len(post_bodies) > 1:
            answers_text = "\n\n".join(body.get_text() for body in post_bodies[1:])
        else:
            answers_text = "No answers found"

        # Comments (if any) found anywhere on the page
        comments = soup.find_all('span', class_='comment-copy')
        comments_text = "\n".join(comment.get_text() for comment in comments) if comments else "No comments found"

        # Stitch all the sections together
        return f"Title: {title_text}\n\nQuestion:\n{question_text}\n\nAnswers:\n{answers_text}\n\nComments:\n{comments_text}"

    except Exception as e:
        # Deliberate best-effort: one bad URL must not abort the whole scraping run
        return f"Failed to retrieve content from {url}: {str(e)}"

# One {'Link', 'Content'} record is collected here per scraped question
data = []

# Scrape the URLs in order; the 10-second pause after each request
# spaces the requests out
for url in links_list:
    content = get_post_content(url)
    data.append(dict(Link=url, Content=content))
    time.sleep(10)

# Turn the records into a DataFrame and persist it as CSV (without the index column)
output_df = pd.DataFrame(data)
output_df.to_csv('StackOverflowExtractedContent.csv', index=False)

# Quick look at the first extracted rows (optional)
print(output_df.head())



                                           Link  \
0  https://stackoverflow.com/questions/20168563   
1  https://stackoverflow.com/questions/22935840   
2  https://stackoverflow.com/questions/23023732   
3  https://stackoverflow.com/questions/25262377   
4  https://stackoverflow.com/questions/26177428   

                                             Content  
0  Title: Spring-boot UTF-8 resources\n\nQuestion...  
1  Title: Cannot build Spring 4 project with Mave...  
2  Title: How to run Spring 4 sample code in ecli...  
3  Title: Dependency issue after adding spring-da...  
4  Title: Spring Boot with Tomcat container secur...  


In [43]:
# Save the scraped contents to their own CSV so they can be reworked another day.
# `contents` is not defined in any visible cell of this notebook, so on a fresh
# kernel this cell failed with a NameError; it is now derived from the scraped frame.
contents = output_df['Content'].tolist()

with open('contents.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['Content'])  # Header row
    writer.writerows([item] for item in contents)  # One content per row

print("Salvo em contents.csv")


Lista de conteúdos salva em contents.csv


In [1]:
# Reload the scraped posts from disk into a DataFrame
extracted_content_csv = 'StackOverflowExtractedContent.csv'
df = pd.read_csv(extracted_content_csv)

print(df)


                                             Link  \
0    https://stackoverflow.com/questions/20168563   
1    https://stackoverflow.com/questions/22935840   
2    https://stackoverflow.com/questions/23023732   
3    https://stackoverflow.com/questions/25262377   
4    https://stackoverflow.com/questions/26177428   
..                                            ...   
261  https://stackoverflow.com/questions/73773112   
262  https://stackoverflow.com/questions/75280876   
263  https://stackoverflow.com/questions/62837607   
264  https://stackoverflow.com/questions/77925827   
265  https://stackoverflow.com/questions/65185095   

                                               Content  
0    Title: Spring-boot UTF-8 resources\n\nQuestion...  
1    Title: Cannot build Spring 4 project with Mave...  
2    Title: How to run Spring 4 sample code in ecli...  
3    Title: Dependency issue after adding spring-da...  
4    Title: Spring Boot with Tomcat container secur...  
..                   

In [2]:
# Basic punctuation to strip: , . ! ? :
# (the original pattern began with an empty group "()", which matches nothing
# and has been dropped)
PUNCTUATION_PATTERN = r'[,.!?:]'

# Whole words to drop, case-insensitively: the section labels written by the
# scraper (title, question, answers, comments) plus other hand-picked words
NOISE_WORDS_PATTERN = r'\b(title|question|answers|comments|https|google|spring|azure|java|utf|jar|boot|http|public)\b'

# Vectorized string methods instead of per-row lambdas; missing values are
# propagated as NaN rather than raising inside re.sub
df['content_processed'] = (
    df['Content']
    .str.replace(PUNCTUATION_PATTERN, '', regex=True)
    .str.replace(NOISE_WORDS_PATTERN, '', regex=True, flags=re.IGNORECASE)
    .str.lower()
)

# Show the result
print(df)


                                             Link  \
0    https://stackoverflow.com/questions/20168563   
1    https://stackoverflow.com/questions/22935840   
2    https://stackoverflow.com/questions/23023732   
3    https://stackoverflow.com/questions/25262377   
4    https://stackoverflow.com/questions/26177428   
..                                            ...   
261  https://stackoverflow.com/questions/73773112   
262  https://stackoverflow.com/questions/75280876   
263  https://stackoverflow.com/questions/62837607   
264  https://stackoverflow.com/questions/77925827   
265  https://stackoverflow.com/questions/65185095   

                                               Content  \
0    Title: Spring-boot UTF-8 resources\n\nQuestion...   
1    Title: Cannot build Spring 4 project with Mave...   
2    Title: How to run Spring 4 sample code in ecli...   
3    Title: Dependency issue after adding spring-da...   
4    Title: Spring Boot with Tomcat container secur...   
..             

In [45]:
#!pip install gensim
from gensim.utils import simple_preprocess

import gensim
from gensim.utils import simple_preprocess
import nltk
from nltk.corpus import stopwords

# Fetch the NLTK stop-word corpus
nltk.download('stopwords')

# English stop words, plus a few extra words to discard as well
stop_words = stopwords.words('english')
stop_words += ['from', 'subject', 're', 'edu', 'use']

# Tokenize documents into lists of words
def sent_to_words(sentences):
    """Lazily yield one token list per item of ``sentences``.

    Every item is converted with ``str()`` and tokenized by
    ``gensim.utils.simple_preprocess`` with ``deacc=True`` (accent marks are
    stripped from the tokens).
    """
    yield from (
        gensim.utils.simple_preprocess(str(raw_text), deacc=True)
        for raw_text in sentences
    )

# Drop stop words from every document
def remove_stopwords(texts):
    """Return the tokens of each document in ``texts`` that are not stop words.

    Each document is passed through ``str()`` and re-tokenized with
    ``simple_preprocess`` before filtering, exactly as before, so both raw
    strings and token lists are accepted.

    Args:
        texts: Iterable of documents (strings or token lists).

    Returns:
        A list with one list of kept tokens per input document; an empty list
        when ``texts`` is empty.
    """
    documents = list(texts)
    if not documents:
        return []
    # Membership tests against a set are O(1); the module-level stop_words list
    # was previously scanned linearly for every single token.
    blocked = frozenset(stop_words)
    return [
        [word for word in simple_preprocess(str(doc)) if word not in blocked]
        for doc in documents
    ]

# Pull the cleaned text column out of the DataFrame as a plain list
data = df['content_processed'].values.tolist()

# Tokenize every document, then filter the stop words out of the token lists
data_words = remove_stopwords(list(sent_to_words(data)))

# Preview the first 50 tokens of the first processed document
first_document_tokens = data_words[0]
print(first_document_tokens[:50])


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\tyumi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['resources', 'using', 'following', 'tutorial', 'current', 'code', 'application', 'configured', 'via', 'webxml', 'configuration', 'files', 'application', 'default', 'handler', 'files', 'projectroot', 'src', 'main', 'webapp', 'served', 'unfortunately', 'content', 'type', 'text', 'html', 'charset', 'iso', 'like', 'serve', 'html', 'files', 'charset', 'static', 'files', 'served', 'far', 'know', 'launching', 'application', 'following', 'line', 'apprears', 'log', 'mapped', 'url', 'path', 'onto', 'handler', 'type']


In [5]:
# Dependências
import gensim
from gensim.utils import simple_preprocess
import nltk
from nltk.corpus import stopwords
import gensim.corpora as corpora
from pprint import pprint

# Fetch the NLTK stop-word corpus
nltk.download('stopwords')

# English stop words, plus a few extra words to discard as well
stop_words = stopwords.words('english')
stop_words += ['from', 'subject', 're', 'edu', 'use']

# Tokenize documents into lists of words
def sent_to_words(sentences):
    """Lazily yield one token list per item of ``sentences``.

    Every item is converted with ``str()`` and tokenized by
    ``gensim.utils.simple_preprocess`` with ``deacc=True`` (accent marks are
    stripped from the tokens).
    """
    yield from (
        gensim.utils.simple_preprocess(str(raw_text), deacc=True)
        for raw_text in sentences
    )

# Drop stop words from every document
def remove_stopwords(texts):
    """Return the tokens of each document in ``texts`` that are not stop words.

    Each document is passed through ``str()`` and re-tokenized with
    ``simple_preprocess`` before filtering, exactly as before, so both raw
    strings and token lists are accepted.

    Args:
        texts: Iterable of documents (strings or token lists).

    Returns:
        A list with one list of kept tokens per input document; an empty list
        when ``texts`` is empty.
    """
    documents = list(texts)
    if not documents:
        return []
    # Membership tests against a set are O(1); the module-level stop_words list
    # was previously scanned linearly for every single token.
    blocked = frozenset(stop_words)
    return [
        [word for word in simple_preprocess(str(doc)) if word not in blocked]
        for doc in documents
    ]

# Pull the cleaned text ('content_processed') and the question URLs ('Link')
# out of the DataFrame as plain lists; both keep the DataFrame's row order
data = df['content_processed'].values.tolist()
links = df['Link'].values.tolist()

# Tokenize every document, then filter the stop words out of the token lists
data_words = remove_stopwords(list(sent_to_words(data)))

# Build the gensim dictionary from the tokenized documents
id2word = corpora.Dictionary(data_words)

# Convert every tokenized document to its bag-of-words representation
texts = data_words
corpus = [id2word.doc2bow(text) for text in texts]

# Number of topics to fit
num_topics = 15

# LDA training is stochastic; without a fixed seed every run produced different
# topics, so the printed output could not be reproduced.
# NOTE(review): with several worker processes LdaMulticore may still not be
# bit-for-bit deterministic — confirm if exact reproducibility matters.
LDA_RANDOM_SEED = 42

# Train the LDA model
lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=num_topics,
                                       random_state=LDA_RANDOM_SEED)

# Show the fitted topics with their keywords
pprint(lda_model.print_topics())

# Link every question to its dominant topic, i.e. the topic with the highest weight
doc_lda = lda_model[corpus]  # Topic distribution of every document

# Pre-populate every topic id so topics without any question are still listed
questions_by_topic = {topic_id: [] for topic_id in range(num_topics)}

for doc_index, topic_weights in enumerate(doc_lda):
    # NOTE(review): gensim presumably omits very low-probability topics, so this
    # list could in principle be empty; such a document is skipped instead of
    # crashing the loop with an IndexError.
    if not topic_weights:
        continue
    # max() picks the (topic_id, weight) pair with the largest weight without
    # sorting the whole list; on ties the first pair wins, as it did before
    dominant_topic = max(topic_weights, key=lambda pair: pair[1])[0]
    # File the question's link under its dominant topic
    questions_by_topic[dominant_topic].append(links[doc_index])

# Print, for every topic, its keywords and the questions assigned to it
for topic_id, questions in questions_by_topic.items():
    print(f"\nTópico {topic_id}:")
    pprint(lda_model.print_topic(topic_id))  # Keywords of the topic
    print(f"Perguntas relacionadas ({len(questions)}):")
    for question in questions:
        print(question)  # The link; other question details could be shown here


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\tyumi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


[(0,
  '0.006*"name" + 0.006*"error" + 0.005*"application" + 0.005*"app" + '
  '0.005*"api" + 0.005*"get" + 0.005*"project" + 0.005*"version" + '
  '0.004*"string" + 0.004*"id"'),
 (1,
  '0.013*"import" + 0.010*"new" + 0.009*"error" + 0.008*"application" + '
  '0.007*"class" + 0.007*"test" + 0.006*"app" + 0.006*"using" + 0.006*"file" + '
  '0.005*"project"'),
 (2,
  '0.015*"api" + 0.012*"application" + 0.011*"app" + 0.007*"error" + '
  '0.007*"web" + 0.006*"user" + 0.006*"get" + 0.006*"using" + 0.005*"need" + '
  '0.005*"code"'),
 (3,
  '0.009*"application" + 0.007*"main" + 0.006*"string" + 0.006*"new" + '
  '0.006*"using" + 0.006*"class" + 0.006*"info" + 0.006*"run" + 0.006*"error" '
  '+ 0.006*"project"'),
 (4,
  '0.010*"groupid" + 0.010*"string" + 0.007*"artifactid" + 0.007*"using" + '
  '0.007*"version" + 0.006*"app" + 0.006*"static" + 0.006*"application" + '
  '0.006*"get" + 0.005*"return"'),
 (5,
  '0.013*"groupid" + 0.011*"artifactid" + 0.010*"dependency" + '
  '0.009*"applicati

In [47]:
import gensim.corpora as corpora

# Build the dictionary from the tokenized documents
id2word = corpora.Dictionary(data_words)

# The tokenized documents are the texts of the corpus
texts = data_words

# Term-document frequency: one bag-of-words per document
corpus = list(map(id2word.doc2bow, texts))

# Preview the first 30 entries of the first document's bag-of-words
print(corpus[0][:30])

[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 14), (8, 1), (9, 1), (10, 2), (11, 2), (12, 1), (13, 1), (14, 2), (15, 1), (16, 1), (17, 11), (18, 2), (19, 1), (20, 4), (21, 4), (22, 1), (23, 1), (24, 2), (25, 1), (26, 1), (27, 2), (28, 6), (29, 1)]


In [49]:
from pprint import pprint

# Number of topics to fit
num_topics = 15

# LDA training is stochastic; without a fixed seed every run produced different
# topics, so the printed output could not be reproduced.
# NOTE(review): with several worker processes LdaMulticore may still not be
# bit-for-bit deterministic — confirm if exact reproducibility matters.
LDA_RANDOM_SEED = 42

# Build the LDA model
lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=num_topics,
                                       random_state=LDA_RANDOM_SEED)

# Print the keywords of the fitted topics (num_topics of them, not 10 as the
# old comment said)
pprint(lda_model.print_topics())

# Topic distribution of every document in the corpus
doc_lda = lda_model[corpus]

[(0,
  '0.008*"code" + 0.007*"project" + 0.007*"app" + 0.006*"using" + '
  '0.006*"string" + 0.006*"class" + 0.006*"application" + 0.005*"artifactid" + '
  '0.005*"constructor" + 0.005*"dependency"'),
 (1,
  '0.011*"info" + 0.009*"error" + 0.008*"artifactid" + 0.008*"application" + '
  '0.007*"version" + 0.007*"dependency" + 0.006*"groupid" + 0.006*"main" + '
  '0.005*"bean" + 0.005*"using"'),
 (2,
  '0.007*"app" + 0.006*"error" + 0.005*"string" + 0.005*"name" + 0.005*"run" + '
  '0.005*"version" + 0.005*"code" + 0.004*"application" + 0.004*"project" + '
  '0.004*"class"'),
 (3,
  '0.011*"jar" + 0.007*"artifactid" + 0.007*"application" + 0.006*"app" + '
  '0.006*"error" + 0.006*"import" + 0.006*"web" + 0.006*"class" + 0.006*"run" '
  '+ 0.006*"version"'),
 (4,
  '0.009*"new" + 0.008*"using" + 0.007*"application" + 0.007*"code" + '
  '0.006*"artifactid" + 0.006*"app" + 0.005*"name" + 0.005*"version" + '
  '0.005*"groupid" + 0.005*"string"'),
 (5,
  '0.008*"app" + 0.007*"import" + 0.006*

In [51]:
#!pip install pyLDAvis

import warnings
import os
import pyLDAvis.gensim
import pickle
import pyLDAvis

# Silence DeprecationWarning messages
warnings.filterwarnings("ignore", category=DeprecationWarning)

# Render pyLDAvis visualizations inline in the notebook
pyLDAvis.enable_notebook()

# The cache file name carries the number of topics
LDAvis_data_filepath = f'ldavis_prepared_{num_topics}'

# Switch controlling whether the visualization data is (re)built and pickled to
# disk; it is always on, as in the original always-true condition
rebuild_visualization = True
if rebuild_visualization:
    LDAvis_prepared = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
    with open(LDAvis_data_filepath, 'wb') as cache_file:
        pickle.dump(LDAvis_prepared, cache_file)

# Read the prepared pyLDAvis data back from disk
with open(LDAvis_data_filepath, 'rb') as cache_file:
    LDAvis_prepared = pickle.load(cache_file)

# Export the visualization as a standalone HTML page next to the cache file
pyLDAvis.save_html(LDAvis_prepared, LDAvis_data_filepath + '.html')

# Display the visualization as the cell output
LDAvis_prepared
