In [1]:
import requests
from bs4 import BeautifulSoup
import json

def scrape_article(url):
    response = requests.get(url)
    if response.status_code != 200:
        print(f"Erreur lors de l'accès à la page {url}")
        return None

    soup = BeautifulSoup(response.text, 'html.parser')
    
    title = soup.find('h1')  # Ajustez le sélecteur si nécessaire
    # Extraction de la date en utilisant la classe 'itemDateCreated'
    date_span = soup.find('span', class_='itemDateCreated')
    date = date_span.get_text().strip().replace('Date de création:', '').strip() if date_span else "Date non trouvée"
    author = soup.find('div', class_='author')  # Ajustez si nécessaire

    content_container = soup.find('div', class_='itemIntroText')
    if content_container:
        content_parts = content_container.find_all('p', class_='texte textearticle')
        content = ' '.join(p.text.strip() for p in content_parts)
    else:
        content = "Contenu non trouvé"

    article = {
        'url': url,
        'title': title.text.strip() if title else "Titre non trouvé",
        'date': date,
        'author': author.text.strip() if author else "Auteur non trouvé",
        'content': content
    }
    
    return article

def main():
    urls = [
        "https://www.agenceecofin.com/formation/1804-117979-le-japon-ouvre-les-candidatures-2025-de-son-programme-de-bourses-aux-etudiants-etrangers",
        "https://www.agenceecofin.com/entreprendre/1804-117982-google-propose-un-programme-de-formation-et-de-mentorat-aux-start-up-du-secteur-de-l-education",
        "https://www.agenceecofin.com/gestion-publique/2204-118042-burkina-le-gouvernement-devoile-des-projets-de-cooperation-educative-avec-la-russie"
    ]
    
    articles = []
    
    for url in urls:
        article = scrape_article(url)
        if article:
            articles.append(article)
    
    # Sauvegarde en JSON
    with open('articles.json', 'w', encoding='utf-8') as f:
        json.dump(articles, f, ensure_ascii=False, indent=4)

if __name__ == "__main__":
    main()

# Extracting and Processing Data with OpenAI

In this Jupyter Notebook, we demonstrate how to load, process, and use data for generating responses with OpenAI's GPT-3.5. We will walk through the steps of loading articles from a JSON file, vectorizing article content, finding the most relevant article based on a query, and generating responses using OpenAI's model.


## Setup and Imports

First, we import the necessary libraries and set up the environment for our project.

In [2]:
%pip install -r requirements.txt

Note: you may need to restart the kernel to use updated packages.


In [6]:
import json
import numpy as np
import openai
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import os
from dotenv import load_dotenv
from openai.types import Completion, CompletionChoice, CompletionUsage
# Load environment variables
load_dotenv()

# Initialize OpenAI client
openai.api_key = os.getenv("OPENAI_API_KEY")

client = openai.OpenAI(api_key=os.getenv("OPENAI_API_KEY"))


## Loading Articles from JSON

This section covers how to load articles from a JSON file. The file should be structured as an array of articles, each with fields for URL, title, date, author, and content.

In [14]:
def load_articles(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        articles = json.load(file)
    return articles

## Vectorizing Article Content

We use the Sentence-BERT model to convert article contents into vector embeddings. This will facilitate the calculation of semantic similarity between a user's query and the articles.


In [15]:
def vectorize_articles(articles):
    model = SentenceTransformer('all-MiniLM-L6-v2')
    for article in articles:
        article['embedding'] = model.encode(article['content'][:512], convert_to_tensor=True)
    return articles

## Finding the Most Similar Article

Here, we define a function to find the article most similar to a given query using cosine similarity between the query and article embeddings.

In [16]:
def find_most_similar_article(query, articles):
    model = SentenceTransformer('all-MiniLM-L6-v2')
    query_embedding = model.encode(query, convert_to_tensor=True)
    similarities = [cosine_similarity(query_embedding.reshape(1, -1), article['embedding'].reshape(1, -1))[0][0] for article in articles]
    most_similar_index = np.argmax(similarities)
    return articles[most_similar_index]

## Generating Responses Using OpenAI's GPT-3

Once we have identified the most relevant article, we can generate a response by feeding the article content along with the query into the GPT-3 model.

In [17]:
def generate_response_with_gpt3(article, query):
    try:
        chat_response = client.chat.completions.create(
            model="gpt-3.5-turbo",  # Adjust the model as necessary
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": query},
                {"role": "assistant", "content": article['content'][:1024]}  # Including article context
            ]
        )
        # Utilisation des propriétés directes des objets pour obtenir le contenu
        response_content = "\u200B\n\n" + chat_response.choices[0].message.content
        return response_content
    except Exception as e:
        print(f"An error occurred: {e}")
        return None


## Main  TEST Execution Function

The `main` function orchestrates the loading, processing, and querying of articles. It also outputs the generated response.

In [18]:
def main():
    articles = load_articles('articles.json')
    articles = vectorize_articles(articles)
    query = "Bonjour. Que peux tu me dire sur le japon "
    most_similar_article = find_most_similar_article(query, articles)
    response = generate_response_with_gpt3(most_similar_article, query)
    print("Generated Response:", response)

if __name__ == "__main__":
    main()



Generated Response: ​

japonaise pour poursuivre leurs études dans leur domaine d'intérêt. Les bourses couvrent les frais de scolarité, les frais de voyage aller-retour, une allocation mensuelle, ainsi que d'autres avantages. Le Japon est réputé pour ses avancées technologiques, sa culture riche et fascinante, sa cuisine délicieuse et sa société bien organisée. Le pays offre une combinaison unique de tradition et de modernité, avec des temples anciens côtoyant des gratte-ciel ultramodernes. Les Japonais sont connus pour leur politesse, leur sens du devoir et leur efficacité au travail. La culture japonaise regorge d'art, de musique, d'architecture, de littérature et de nombreux autres aspects qui la rendent captivante. N'hésitez pas à explorer davantage la culture japonaise si vous êtes intéressé par ce pays fascinant !
