In [19]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
from ebooklib import epub
import datetime

In [20]:
# Download the Les Echos front page.
# The timeout keeps a stalled connection from hanging the notebook forever, and
# raise_for_status() stops here on an HTTP error instead of letting an error
# page be parsed as if it were the front page.
request = requests.get(r'https://www.lesechos.fr/', timeout=30)
request.raise_for_status()

In [21]:
# Parse the downloaded front-page HTML with Python's built-in "html.parser".
soup = BeautifulSoup(markup=request.text, features="html.parser")

In [22]:
# Collect the target of every <a href=...> element whose text is non-empty;
# anchors without any text are skipped.
links_with_text = [
    anchor['href']
    for anchor in soup.find_all('a', href=True)
    if anchor.text
]

In [23]:
# Column layout of the (initially empty) article table: the link and the
# article id, followed by the article fields.
ARTICLE_COLUMNS = [
    "href",
    "id",
    "Label",
    "Sublabel",
    "Title",
    "Lead",
    "Publication",
    "Description",
]
df_echo = pd.DataFrame(columns=ARTICLE_COLUMNS)

In [24]:
# Store the scraped links. The frame has no rows yet, so this assignment is
# what creates them (one row per link); the other columns are filled in later.
df_echo["href"] = links_with_text

In [25]:
def split_it(href):
    """Return the 7-digit article id that ends ``href``, or ``""`` if there is none.

    Args:
        href: Link target to inspect. Values that are not ``str`` (for
            example ``None`` or NaN) are accepted and treated as "no id".

    Returns:
        The last seven characters of ``href`` when they are all digits,
        otherwise the empty string.
    """
    # Explicit guard instead of a bare ``except``: a non-string cannot hold an id.
    if not isinstance(href, str):
        return ""
    # Raw string so that ``\d`` is a regex class, not an invalid string escape.
    match = re.search(r'\d{7}$', href)
    return match.group(0) if match else ""

In [26]:
# Derive the article id of every link by passing each href through split_it.
df_echo["id"] = df_echo["href"].apply(split_it)

In [27]:
# Keep only the rows whose id is not the empty string, then renumber the
# rows from 0.
has_article_id = df_echo["id"] != ""
df_echo = df_echo.loc[has_article_id].reset_index(drop=True)

In [28]:
# Fetch one hard-coded article from the API and keep its decoded JSON in `r`.
# NOTE(review): this looks like an exploratory probe of the payload structure;
# the loop in the next cell overwrites `r` for every article — confirm it is
# still needed.
# The timeout keeps a stalled connection from hanging the notebook.
sample_response = requests.get(r'https://api.lesechos.fr/api/v1/articles/1242262', timeout=30)
r = sample_response.json()

In [29]:
# Fill in the article fields of every row from the Les Echos article API.
# Iterating over the real index labels (rather than an enumerate counter) keeps
# the `.at` look-ups correct whatever the current index looks like.
# A row whose request fails or whose payload lacks an expected field is reported
# as "KO <id>"; the fields not yet written keep their missing values.
for i in df_echo.index:
    article_id = str(df_echo.at[i, "id"])
    try:
        url = r"https://api.lesechos.fr/api/v1/articles/" + article_id
        # The timeout keeps one stalled request from blocking the whole run.
        r = requests.get(url, timeout=30).json()
        # Every field read below sits under this same node of the payload.
        data = r["stripes"][0]["mainContent"][0]["data"]
        df_echo.at[i, "Label"] = data["section"]["label"]  # Label
        df_echo.at[i, "Sublabel"] = data["subsection"]["label"]  # Sublabel
        df_echo.at[i, "Title"] = data["title"]  # Title
        df_echo.at[i, "Lead"] = data["lead"]  # Lead
        df_echo.at[i, "Publication"] = data["publicationDate"]
        df_echo.at[i, "Description"] = data["description"]  # Description
    except (requests.RequestException, LookupError, TypeError, ValueError):
        # Network failure, undecodable JSON, or a payload without the expected
        # keys. Unlike a bare `except`, this lets Ctrl-C and programming errors
        # surface instead of being reported as "KO".
        print("KO " + article_id)
       

KO 1243676
KO 1243470
KO 1243457


In [30]:
# Drop the rows that have no section label.
# A row the API loop could not fill still holds a missing value (NaN) in
# "Label", and NaN != "" evaluates to True, so comparing with "" alone keeps
# those rows; notna() is what actually removes them.
has_label = df_echo["Label"].notna() & (df_echo["Label"] != "")
df_echo = df_echo.loc[has_label].reset_index(drop=True)

In [31]:
# Keep only the articles published today.
now = datetime.date.today()
df_echo["Publication"] = pd.to_datetime(df_echo["Publication"])
df_echo["Publication"] = df_echo["Publication"].dt.date
# NOTE(review): `now` is the local date, while the API timestamps may be
# expressed in another timezone — confirm the comparison is meant as-is.
# Renumber the rows from 0 after filtering, as the other filters of this
# notebook do, so that code addressing rows by the labels 0..n-1 keeps working.
df_echo = df_echo.loc[df_echo["Publication"] == now].reset_index(drop=True)

In [32]:
# Add the chapters in EPUB format
# Create chapter
def chapter(title, filename, content):
    """Build one EPUB HTML document.

    Args:
        title: Title given to the document.
        filename: File name of the document inside the EPUB archive.
        content: Value stored as the document's ``content``.

    Returns:
        The ``epub.EpubHtml`` object, created with ``lang='fr'``.
    """
    page = epub.EpubHtml(title=title, file_name=filename, lang='fr')
    page.content = content
    return page

# Build one EPUB chapter per row and store it in the "Chapter" column.
# The loop walks the frame's real index labels: `.at` addresses rows by label,
# so using an enumerate counter on a frame whose labels are not exactly 0..n-1
# skips the rows with larger labels and creates phantom "ko" rows for the
# labels that do not exist.
for i in df_echo.index:
    try:
        df_echo.at[i, "Chapter"] = chapter(
            df_echo.at[i, "Title"],
            'chap_0' + str(i) + '.xhtml',
            df_echo.at[i, "Description"],
        )
    except Exception:
        # Best effort: flag the row with the "ko" marker instead of aborting.
        # `Exception` (not a bare `except`) still lets Ctrl-C through.
        df_echo.at[i, "Chapter"] = "ko"


In [37]:
# Keep the first row of every title, then only the rows whose "Chapter" is
# neither the "ko" marker nor a missing value, and renumber the rows from 0.
deduplicated = df_echo.drop_duplicates("Title")
chapter_column = deduplicated["Chapter"]
has_chapter = (chapter_column != "ko") & chapter_column.notna()
df_echo = deduplicated.loc[has_chapter].reset_index(drop=True)

In [38]:
# Assemble the EPUB: title, cover, chapters, stylesheet, table of contents
# grouped by section, navigation files and reading order, then write it out.
book = epub.EpubBook()
book.set_title('Echo ' + str(datetime.date.today()))

# Read the cover through a context manager so the file handle is closed as
# soon as its bytes have been loaded.
with open('Les-Echos cover.jpg', 'rb') as cover_file:
    book.set_cover("image.jpg", cover_file.read())

# Chapter objects in row order. The loop variable is deliberately not called
# `chapter`, which would overwrite the chapter() helper defined earlier.
chapter_list = list(df_echo["Chapter"])
for chapter_item in chapter_list:
    book.add_item(chapter_item)

# Define CSS style
style = 'BODY {color: white;}'
nav_css = epub.EpubItem(uid="style_nav", file_name="style/nav.css", media_type="text/css", content=style)

# Add CSS file
book.add_item(nav_css)

# Sections of the table of contents, in display order. A chapter whose "Label"
# is not listed here is still part of the book (it is in the spine below) but
# has no entry in the table of contents.
toc_labels = [
    "Finance & Marchés",
    "Industrie Services",
    "Monde",
    "Tech - Médias",
    "Idées & Débats",
    "Weekend",
    "PME Régions",
    "Politique Société",
    "Sport",
    "Patrimoine",
]
# One (Section, chapters) pair per label; a section with no article today is
# kept with an empty tuple of chapters.
book.toc = tuple(
    (
        epub.Section(label),
        tuple(df_echo.loc[df_echo["Label"] == label, "Chapter"]),
    )
    for label in toc_labels
)

# add default NCX and Nav file
book.add_item(epub.EpubNcx())
book.add_item(epub.EpubNav())

# Basic spine: the navigation page first, then every chapter in row order.
malist = ["nav"] + chapter_list
book.spine = malist

# write to the file
epub.write_epub('test.epub', book, {})

In [36]:
# Sanity check: print the "Chapter" column to see the chapter object held by
# each row.
print(df_echo["Chapter"])

0       <EpubHtml:chapter_0:chap_00.xhtml>
1       <EpubHtml:chapter_1:chap_01.xhtml>
2       <EpubHtml:chapter_2:chap_02.xhtml>
3       <EpubHtml:chapter_3:chap_03.xhtml>
4       <EpubHtml:chapter_4:chap_04.xhtml>
5       <EpubHtml:chapter_5:chap_05.xhtml>
6       <EpubHtml:chapter_6:chap_06.xhtml>
7       <EpubHtml:chapter_7:chap_08.xhtml>
8       <EpubHtml:chapter_8:chap_09.xhtml>
9      <EpubHtml:chapter_9:chap_010.xhtml>
10    <EpubHtml:chapter_10:chap_020.xhtml>
11    <EpubHtml:chapter_11:chap_021.xhtml>
12    <EpubHtml:chapter_12:chap_022.xhtml>
13    <EpubHtml:chapter_13:chap_023.xhtml>
14    <EpubHtml:chapter_14:chap_024.xhtml>
15    <EpubHtml:chapter_15:chap_025.xhtml>
16    <EpubHtml:chapter_16:chap_026.xhtml>
17    <EpubHtml:chapter_17:chap_027.xhtml>
18    <EpubHtml:chapter_18:chap_028.xhtml>
19    <EpubHtml:chapter_19:chap_030.xhtml>
20    <EpubHtml:chapter_20:chap_031.xhtml>
21    <EpubHtml:chapter_21:chap_032.xhtml>
22    <EpubHtml:chapter_22:chap_033.xhtml>
23    <Epub