In [8]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# URL of the webpage
url = 'https://github.com/topics'

# Fetch the HTML content of the page
response = requests.get(url)

# Print the status code to ensure the request was successful
print(f"Status code: {response.status_code}")

# Check if the response is successful (status code 200)
if response.status_code == 200:
    # Print the first 100 characters of the HTML content
    print(response.text[:100])

    # Save the HTML content to a file named 'webpage.html'
    with open('webpage.html', 'w', encoding='utf-8') as file:
        file.write(response.text)

    # Parse the saved HTML content with BeautifulSoup
    soup = BeautifulSoup(response.text, 'html.parser')

    # Extract the titles and descriptions of the topics
    topic_titles = []
    topic_descriptions = []

    # Loop through all topic containers on the page
    for topic in soup.find_all('div', class_='col-md-3'):
        title = topic.find('p', class_='f3').text.strip()
        description = topic.find('p', class_='f5').text.strip() if topic.find('p', class_='f5') else 'No description available'

        # Append the extracted data to the respective lists
        topic_titles.append(title)
        topic_descriptions.append(description)

    # Print the length and content of each extracted list
    print(f"Number of topics: {len(topic_titles)}")
    print(f"Topic Titles: {topic_titles[:5]}")  # Print the first 5 topic titles for brevity
    print(f"Topic Descriptions: {topic_descriptions[:5]}")  # Print the first 5 descriptions for brevity

    # Create a dictionary to structure the extracted data
    topics_dict = {
        'Title': topic_titles,
        'Description': topic_descriptions
    }

    # Convert the dictionary into a pandas DataFrame
    df = pd.DataFrame(topics_dict)

    # Print the DataFrame to confirm its structure and contents
    print("\nDataFrame:")
    print(df)
else:
    print(f"Failed to retrieve the page. Status code: {response.status_code}")

Status code: 200


<!DOCTYPE html>
<html
  lang="en"
  
  data-color-mode="auto" data-light-theme="light" data-dark-t
Number of topics: 0
Topic Titles: []
Topic Descriptions: []

DataFrame:
Empty DataFrame
Columns: [Title, Description]
Index: []


In [9]:
import requests
from bs4 import BeautifulSoup

# URL of the webpage
url = 'https://github.com/topics'

# Fetch the HTML content of the page
response = requests.get(url)

# Print the status code to ensure the request was successful
print(f"Status code: {response.status_code}")

# Check if the response is successful (status code 200)
if response.status_code == 200:
    # Parse the saved HTML content with BeautifulSoup
    soup = BeautifulSoup(response.text, 'html.parser')

    # Print out the first 500 characters of the parsed HTML to inspect
    print(soup.prettify()[:500])  # Printing the first 500 characters of the parsed HTML

    # Inspect the structure of the page to see how topics are defined
    # Find all divs with class 'col-md-3', as topics are usually in these divs
    topic_containers = soup.find_all('div', class_='col-md-3')

    # If no topics are found in this class, print a message
    if not topic_containers:
        print("No topics found in div with class 'col-md-3'.")
else:
    print(f"Failed to retrieve the page. Status code: {response.status_code}")

Status code: 200
<!DOCTYPE html>
<html data-a11y-animated-images="system" data-a11y-link-underlines="true" data-color-mode="auto" data-dark-theme="dark" data-light-theme="light" lang="en">
 <head>
  <meta charset="utf-8"/>
  <link href="https://github.githubassets.com" rel="dns-prefetch"/>
  <link href="https://avatars.githubusercontent.com" rel="dns-prefetch"/>
  <link href="https://github-cloud.s3.amazonaws.com" rel="dns-prefetch"/>
  <link href="https://user-images.githubusercontent.com/" rel="dns-prefetch"/>
No topics found in div with class 'col-md-3'.


In [10]:
from bs4 import BeautifulSoup

# Open the saved HTML file

with open('webpage.html', 'r', encoding='utf-8') as file:
    html_content = file.read()

# Parse HTML Content with BeautifulSoup

soup = BeautifulSoup(html_content, 'html.parser')

# Show the beginning of the HTML to see its structure

print(soup.prettify()[:500])  # Displays the first 500 characters to inspect the content

# Example 1: Extract topic titles
titles = soup.find_all(['h3', 'h2'])

# Example 2: Extract topic descriptions
descriptions = soup.find_all('p')  # Remplacez par les balises correctes pour la description

# Display extracted information
print(f"Nombre de titres extraits: {len(titles)}")
for title in titles[:5]:  # Limit display to the first 5 titles
    print(title.text.strip())

print(f"\nNombre de descriptions extraites: {len(descriptions)}")
for description in descriptions[:5]:  # Limit display to the first 5 descriptions
    print(description.text.strip())

<!DOCTYPE html>
<html data-a11y-animated-images="system" data-a11y-link-underlines="true" data-color-mode="auto" data-dark-theme="dark" data-light-theme="light" lang="en">
 <head>
  <meta charset="utf-8"/>
  <link href="https://github.githubassets.com" rel="dns-prefetch"/>
  <link href="https://avatars.githubusercontent.com" rel="dns-prefetch"/>
  <link href="https://github-cloud.s3.amazonaws.com" rel="dns-prefetch"/>
  <link href="https://user-images.githubusercontent.com/" rel="dns-prefetch"/>
Nombre de titres extraits: 6
Navigation Menu
Use saved searches to filter your results more quickly
All featured topics
Popular topics
Footer

Nombre de descriptions extraites: 69
We read every piece of feedback, and take your input very seriously.
To see all available qualifiers, see our documentation.
Browse popular topics on GitHub.
Elixir
Elixir is a dynamic, functional language designed for building scalable and maintainable applications.


In [11]:
 # Show length and content of extracted track list

print(f"\nNumber of titles extracted: {len(titles)}")
for idx, title in enumerate(titles, start=1):
    print(f"Title {idx}: {title.text.strip()}")

# Show length and content of extracted description list
print(f"\nNumber of descriptions extracted: {len(descriptions)}")
for idx, description in enumerate(descriptions, start=1):
    print(f"Description {idx}: {description.text.strip()}")


Number of titles extracted: 6
Title 1: Navigation Menu
Title 2: Use saved searches to filter your results more quickly
Title 3: All featured topics
Title 4: Popular topics
Title 5: Footer
Title 6: Footer navigation

Number of descriptions extracted: 69
Description 1: We read every piece of feedback, and take your input very seriously.
Description 2: To see all available qualifiers, see our documentation.
Description 3: Browse popular topics on GitHub.
Description 4: Elixir
Description 5: Elixir is a dynamic, functional language designed for building scalable and maintainable applications.
Description 6: R
Description 7: R is a free programming language and software environment for statistical computing and graphics.
Description 8: Telegram
Description 9: Telegram is a non-profit, cloud-based instant messaging service.
Description 10: 3D
Description 11: 3D refers to the use of three-dimensional graphics, modeling, and animation in various industries.
Description 12: Ajax
Description 13

In [12]:
import pandas as pd
from bs4 import BeautifulSoup

# Example of extracting titles and descriptions
titles = soup.find_all(['h3', 'h2'])
descriptions = soup.find_all('p')

# Ensure both lists have the same length by trimming the longer list
min_length = min(len(titles), len(descriptions))

# Extracting text
titles_list = [title.text.strip() for title in titles[:min_length]]
descriptions_list = [description.text.strip() for description in descriptions[:min_length]]

# Creating a dictionary
data = {
    'Title': titles_list,
    'Description': descriptions_list
}

# Converting the dictionary to a pandas DataFrame
df = pd.DataFrame(data)

# Displaying the DataFrame
print(df)

                                               Title  \
0                                    Navigation Menu   
1  Use saved searches to filter your results more...   
2                                All featured topics   
3                                     Popular topics   
4                                             Footer   
5                                  Footer navigation   

                                         Description  
0  We read every piece of feedback, and take your...  
1  To see all available qualifiers, see our docum...  
2                   Browse popular topics on GitHub.  
3                                             Elixir  
4  Elixir is a dynamic, functional language desig...  
5                                                  R  
