In [15]:
import requests

# Beautiful Soup is a Python library for pulling data out of HTML and XML files.
# It works with your favorite parser to provide idiomatic ways of navigating, searching,
# and modifying the parse tree. 
# It commonly saves programmers hours or days of work.
from bs4 import BeautifulSoup

# urllib.parse.urljoin(base, url, allow_fragments=True)
# Construct a full (“absolute”) URL by combining a “base URL” (base) with another URL (url).
# Informally, this uses components of the base URL, in particular the addressing scheme, 
# the network location and (part of) the path, to provide missing components in the relative URL. 
from urllib.parse import urljoin

import os

url = 'https://en.wikipedia.org/wiki/Python_(programming_language)'

# requests.get() fetches the resource at the given URL and returns a Response
# object, which gives access to the content, the headers, the status code, etc.
# The timeout keeps the call from waiting forever on an unresponsive server.
response = requests.get(url, timeout=10)

# Stop here on an HTTP error (4xx/5xx) instead of parsing an error page.
response.raise_for_status()

# response.content is the raw body of the response, as bytes.
# Running it through Beautiful Soup gives us a BeautifulSoup object,
# which represents the document as a nested data structure.
soup = BeautifulSoup(response.content, 'html.parser')

# Method signature: find_all(name, attrs, recursive, string, limit, **kwargs)
# find_all() looks through a tag's descendants and retrieves all descendants
# that match the filters; e.g. soup.find_all('a') returns every <a> tag.

# version_1
# href = []
# for link in soup.find_all('a'):
#    if link.has_attr('href'):
#        href.append(link['href'])

# version_2: collect the links in a set to avoid duplicates.
# A set is a mutable, unordered, non-indexable container that holds at most
# one copy of each element (the set object itself is not hashable).
# Only links that start with '/wiki' and contain no ':' are kept
# (presumably to skip namespaced pages such as 'File:...' - TODO confirm).
href = {
    link['href']
    for link in soup.find_all('a', href=True)
    if link['href'].startswith('/wiki') and ':' not in link['href']
}
print(len(href))

# urljoin() completes each relative link with the scheme and host of the site.
# The result is printed directly rather than stored in `url`, so the page
# address held by `url` is not overwritten by this loop.
for element in href:
    print(urljoin('https://en.wikipedia.org', element))

# Collect the address of every thumbnail image, made absolute.
# A set is used so that each address is kept only once.
imgs = set()
for link in soup.find_all('img', class_='thumbimage'):
    src = link.get('src')
    if not src:
        # An <img> tag without a src has nothing to download.
        continue
    if src.startswith('//'):
        # Protocol-relative address: only the scheme is missing.
        src = 'https:' + src
    elif src.startswith('/'):
        # Site-relative address: prepend the Wikipedia host.
        src = 'https://en.wikipedia.org' + src
    imgs.add(src)
print(len(imgs))

# Print each collected address (the loop variable, not a leftover value).
for element in imgs:
    print(element)

def create_dir(url, directory):
    """Create (if needed) a folder named after the page title and return its path.

    Args:
        url: Despite its name, the parsed page object (the caller passes the
            BeautifulSoup object). Only ``url.find('title').text`` is read.
        directory: Parent directory under which the folder is created.

    Returns:
        The path of the folder for this page, located inside ``directory``.
    """
    title_tag = url.find('title')
    dir_name = title_tag.text.strip() if title_tag is not None else ''
    # A path separator inside the title would otherwise produce nested folders;
    # fall back to a fixed name when the page has no usable title.
    dir_name = dir_name.replace(os.sep, '_') or 'untitled'
    path = os.path.join(directory, dir_name)
    # exist_ok=True makes the call safe to repeat when the folder already exists.
    os.makedirs(path, exist_ok=True)
    return path

def dowload_img(url, name, directory):
    """Download the image at ``url`` into ``directory`` as ``<name>.<extension>``.

    Args:
        url: Absolute address of the image.
        name: Base name of the file to write (without extension).
        directory: Existing folder in which the file is written.

    Returns:
        None. Nothing is written when the server does not answer with
        status code 200.
    """
    # The timeout keeps the call from waiting forever on an unresponsive server.
    response = requests.get(url, timeout=10)
    if response.status_code != 200:
        return
    # The extension is whatever follows the last '.' of the address.
    extention = url.split('.')[-1]
    file_name = os.path.join(directory, f'{name}.{extention}')
    # 'wb': write (w) in binary mode (b), since response.content is bytes.
    with open(file_name, 'wb') as out_file:
        out_file.write(response.content)

# Root folder under which the folder for this page is created.
base_directory = '/home/mefathim/Documents/Python'

directory = create_dir(soup, base_directory)

# enumerate() pairs each image link with its position in the iteration;
# that position is used as the name of the downloaded file.
for index, img_url in enumerate(imgs):
    dowload_img(img_url, index, directory)
    

# In Python, the built-in enumerate() function lets you loop over an iterable
# while keeping a count of the iterations.
# enumerate() returns an enumerate object that yields (index, element) pairs.
# That object can be iterated directly, or converted to a list or tuple if needed.


815
https://en.wikipedia.org/wiki/Comparison_of_open-source_and_closed-source_software
https://en.wikipedia.org/wiki/Mila_(research_institute)
https://en.wikipedia.org/wiki/Firaxis_Games
https://en.wikipedia.org/wiki/List_of_open-source_bioinformatics_software
https://en.wikipedia.org/wiki/Nuke_(software)
https://en.wikipedia.org/wiki/Scheme_(programming_language)
https://en.wikipedia.org/wiki/Resource_acquisition_is_initialization
https://en.wikipedia.org/wiki/Python_Software_Foundation
https://en.wikipedia.org/wiki/Java_(programming_language)
https://en.wikipedia.org/wiki/IDLE
https://en.wikipedia.org/wiki/PyPy
https://en.wikipedia.org/wiki/Nokia
https://en.wikipedia.org/wiki/System_administration
https://en.wikipedia.org/wiki/Stochastic_gradient_descent
https://en.wikipedia.org/wiki/Factorial
https://en.wikipedia.org/wiki/Regularization_(mathematics)
https://en.wikipedia.org/wiki/Flask_(web_framework)
https://en.wikipedia.org/wiki/Windows_XP
https://en.wikipedia.org/wiki/PythonAnywh