In [1]:
# Import libraries
import re
import pandas as pd
import advertools as adv

from advertools import crawl
from typing import Dict, List, Optional

ModuleNotFoundError: No module named 'advertools'

In [2]:
# Config with the info needed to crawl and parse each media brand.
#
# Each row of the table below is (brand key, news sitemap URL, section regex).
# The section regex exposes a named group "section" holding the leading path
# segments found between the site's domain and the rest of the URL.
_MEDIA_SOURCES = (
    (
        "bfmtv",
        "https://www.bfmtv.com/sitemap_news.xml",
        r"^https:\/\/www\.bfmtv\.com\/(?P<section>[\/\w-]*)\/.+$",
    ),
    (
        "lefigaro",
        "https://www.lefigaro.fr/sitemap_news.xml",
        r"^https:\/\/www\.lefigaro\.fr\/(?P<section>[\/\w-]*)\/.+$",
    ),
    (
        "francetvinfo",
        "https://www.francetvinfo.fr/sitemap_news.xml",
        r"^https:\/\/www\.francetvinfo\.fr\/(?P<section>[\/\w-]*)\/.+$",
    ),
    (
        "lemonde",
        "https://www.lemonde.fr/sitemap_news.xml",
        r"^https:\/\/www\.lemonde\.fr\/(?P<section>[\/\w-]*)\/.+$",
    ),
    (
        "20_minutes",
        "https://www.20minutes.fr/sitemap-news.xml",
        r"^https:\/\/www\.20minutes\.fr\/(?P<section>[\/\w-]*)\/.+$",
    ),
    (
        "liberation",
        "https://www.liberation.fr/arc/outboundfeeds/sitemap_news.xml?outputType=xml",
        r"^https:\/\/www\.liberation\.fr\/(?P<section>[\/\w-]*)\/.+$",
    ),
    (
        "nouvel_obs",
        "https://www.nouvelobs.com/sitemap-articles-news.xml",
        r"^https:\/\/www\.nouvelobs\.com\/(?P<section>[\/\w-]*)\/.+$",
    ),
    (
        "le_point",
        "https://www.lepoint.fr/sitemap-news.xml",
        r"^https:\/\/www\.lepoint\.fr\/(?P<section>[\/\w-]*)\/.+$",
    ),
    (
        "lexpress",
        "https://www.lexpress.fr/sitemap_actu_1.xml",
        r"^https:\/\/www\.lexpress\.fr\/(?P<section>[\/\w-]*)\/.+$",
    ),
)

# Brand key -> settings dict. "sitemap_index" is None for every brand.
media_config = {
    brand: {
        "sitemap_index": None,
        "sitemap_url": sitemap_url,
        "regex_section": regex_section,
    }
    for brand, sitemap_url, regex_section in _MEDIA_SOURCES
}


In [3]:
def get_sitemap(media_config: Dict) -> pd.DataFrame:
    ''' Download the sitemap of every configured media and return them as one df.

    Args:
        media_config: mapping of media name -> settings dict. Only the
            'sitemap_url' entry of each settings dict is read here.

    Returns:
        A single DataFrame with the rows of all sitemaps and a 'media' column
        holding the config key each row came from. The columns listed in
        ``base_columns`` are always present and come first; any other column
        returned by ``adv.sitemap_to_df`` follows in order of first appearance.
        Row labels are kept as returned for each sitemap (they are not
        renumbered), so labels can repeat across media.
    '''

    base_columns = [
        'loc', 'lastmod', 'news_publication_date', 'news_title', 'publication_name', 'download_date']

    # Collect the per-media frames and concatenate once: calling pd.concat
    # inside the loop re-copies every previously accumulated row each time.
    frames = []
    for media, media_conf in media_config.items():
        sitemap_df = adv.sitemap_to_df(media_conf['sitemap_url'])
        sitemap_df['media'] = media
        frames.append(sitemap_df)

    if not frames:
        # Nothing configured: still honour the guaranteed columns.
        return pd.DataFrame(columns=base_columns)

    combined = pd.concat(frames, axis=0)

    # Guarantee the base columns (created empty when no sitemap provided them)
    # and keep them in front, without concatenating against an empty frame.
    extra_columns = [col for col in combined.columns if col not in base_columns]
    return combined.reindex(columns=base_columns + extra_columns)


In [4]:
def clean_df(df: pd.DataFrame) -> pd.DataFrame:
    ''' Rename 'loc' to 'url' and drop the sitemap columns that are not used.

    Args:
        df: sitemap rows, e.g. the frame returned by ``get_sitemap``.

    Returns:
        A new DataFrame; the input frame is left untouched.
    '''

    unused_columns = [
        'image_loc', 'image_caption', 'sitemap', 'sitemap_size_mb', 'news',
        'news_publication', 'news_keywords', 'image', 'news_genres', 'etag',
        'sitemap_last_modified', 'changefreq', 'priority', 'news_access',
        'publication_language', 'lastmod', 'publication_name',
    ]

    # errors='ignore': which of these columns exist depends on the tags each
    # crawled sitemap exposes, so a missing one must not abort the run.
    return (
        df.rename(columns={'loc': 'url'})
        .drop(columns=unused_columns, errors='ignore')
    )


In [5]:
def find_sections(media_config: Dict, url: str, media: str) -> List[str]:
    ''' Extract the section path segments of an article url.

    A ``/<2-4 digits>/<2-4 digits>/<2-4 digits>`` run (a date path) is removed
    from the url first, then the media's "regex_section" pattern is applied.

    Returns:
        The "section" group split on '/', or ["unknown"] when the pattern
        does not match the url.
    '''

    dateless_url = re.sub(r"\/[0-9]{2,4}\/[0-9]{2,4}\/[0-9]{2,4}", '', url)
    section_pattern = media_config[media]["regex_section"]
    match = re.search(section_pattern, dateless_url)

    if match is None:
        return ["unknown"]

    return match.group("section").split('/')


In [6]:
def get_sections(media_config: Dict, df: pd.DataFrame)-> pd.DataFrame:
    ''' Compute the sections of every row and store them in a "section" column.

    Args:
        media_config: mapping of media name -> settings, forwarded unchanged
            to ``find_sections``.
        df: frame with a "url" and a "media" column.

    Returns:
        The same ``df`` object, modified in place: a "section" column is added
        (or overwritten) with one ``find_sections`` result per row.
    '''

    # Iterate over the two needed columns directly instead of a row-wise
    # df.apply(axis=1): it avoids building a Series per row, and it also
    # works on an empty frame, where apply hands back a DataFrame that
    # cannot be assigned to a single column.
    sections = [
        find_sections(media_config, url, media)
        for url, media in zip(df["url"], df["media"])
    ]

    # Explicit object Series on df's own index so each cell holds one list.
    df["section"] = pd.Series(sections, index=df.index, dtype=object)

    return df

In [7]:
def _strip_tz_and_subseconds(value):
    """ Return `value` as a naive Timestamp truncated to the second (NaT if missing) """

    # NaT has no strftime; pass missing dates through instead of raising.
    if pd.isna(value):
        return pd.NaT
    return pd.Timestamp(value.strftime("%Y-%m-%d %H:%M:%S"))


def change_datetime_format(df: pd.DataFrame) -> pd.DataFrame:
    """ Changes the date format for BQ

    The "news_publication_date" and "download_date" columns are parsed with
    ``pd.to_datetime`` and every value is rewritten as a timezone-naive
    timestamp with second precision. Missing values stay missing (NaT).

    Args:
        df: frame holding both date columns.

    Returns:
        The same ``df`` object, with both columns overwritten in place.
    """

    # WARNING: timezone information are deleted in this function (the wall
    # clock time of each value is kept as is, only the offset is dropped).
    for column_name in ["news_publication_date", "download_date"]:
        df[column_name] = pd.to_datetime(df[column_name]).apply(_strip_tz_and_subseconds)

    return df

In [8]:
def run()-> pd.DataFrame:
    """ Run the whole pipeline on the module-level `media_config`.

    Steps, in order: fetch the sitemaps, clean the columns, add the sections,
    then normalise the date columns. Returns the resulting frame.
    """

    sitemap_rows = get_sitemap(media_config)
    cleaned_rows = clean_df(sitemap_rows)
    rows_with_sections = get_sections(media_config, cleaned_rows)

    return change_datetime_format(rows_with_sections)

In [9]:
# Execute the full pipeline; run() fetches every sitemap listed in media_config.
df = run()
# Bare last expression so the notebook renders the frame as the cell output.
df

2022-11-20 13:58:16,106 | INFO | sitemaps.py:536 | sitemap_to_df | Getting https://www.bfmtv.com/sitemap_news.xml
2022-11-20 13:58:16,280 | INFO | sitemaps.py:536 | sitemap_to_df | Getting https://www.lefigaro.fr/sitemap_news.xml
2022-11-20 13:58:16,402 | INFO | sitemaps.py:536 | sitemap_to_df | Getting https://www.francetvinfo.fr/sitemap_news.xml
2022-11-20 13:58:16,571 | INFO | sitemaps.py:536 | sitemap_to_df | Getting https://www.lemonde.fr/sitemap_news.xml
2022-11-20 13:58:16,721 | INFO | sitemaps.py:536 | sitemap_to_df | Getting https://www.20minutes.fr/sitemap-news.xml
2022-11-20 13:58:16,925 | INFO | sitemaps.py:536 | sitemap_to_df | Getting https://www.liberation.fr/arc/outboundfeeds/sitemap_news.xml?outputType=xml
2022-11-20 13:58:17,172 | INFO | sitemaps.py:536 | sitemap_to_df | Getting https://www.nouvelobs.com/sitemap-articles-news.xml
2022-11-20 13:58:17,405 | INFO | sitemaps.py:536 | sitemap_to_df | Getting https://www.lepoint.fr/sitemap-news.xml
2022-11-20 13:58:17,636 |

Unnamed: 0,url,news_publication_date,news_title,download_date,media,section
0,https://www.bfmtv.com/normandie/replay-emissio...,2022-11-20 12:53:44,Seine-Maritime: un jeune homme brûlé à 35% à L...,2022-11-20 12:58:16,bfmtv,"[normandie, replay-emissions, normandie-week-end]"
1,https://www.bfmtv.com/tech/une-application-per...,2022-11-20 12:48:33,Une application permet de localiser les zones ...,2022-11-20 12:58:16,bfmtv,[tech]
2,https://www.bfmtv.com/normandie/replay-emissio...,2022-11-20 12:48:06,Le Havre: des visites fictives inclusives pour...,2022-11-20 12:58:16,bfmtv,"[normandie, replay-emissions, normandie-week-end]"
3,https://www.bfmtv.com/police-justice/affaire-e...,2022-11-20 12:45:05,"Affaire Élisa Pilarski: 3 ans après, l'enquête...",2022-11-20 12:58:16,bfmtv,[police-justice]
4,https://www.bfmtv.com/cote-d-azur/malgre-une-c...,2022-11-20 12:41:39,"Malgré une Coupe du monde polémique, le siège ...",2022-11-20 12:58:16,bfmtv,[cote-d-azur]
...,...,...,...,...,...,...
97,https://www.lexpress.fr/actualites/1/actualite...,2022-11-18 16:05:02,Les supporters privés d'alcool autour des stad...,2022-11-20 12:58:17,lexpress,"[actualites, 1, actualite]"
98,https://www.lexpress.fr/actualites/1/actualite...,2022-11-18 20:15:06,"Mondial: nuages autour de Benzema, Ballon d'Or...",2022-11-20 12:58:17,lexpress,"[actualites, 1, actualite]"
99,https://www.lexpress.fr/actualites/1/monde/ukr...,2022-11-20 04:45:21,"Sunak assure Kiev du soutien britannique ""jusq...",2022-11-20 12:58:17,lexpress,"[actualites, 1, monde]"
100,https://www.lexpress.fr/actualites/1/actualite...,2022-11-18 23:10:06,Mondial-2022: les organisateurs font volte-fac...,2022-11-20 12:58:17,lexpress,"[actualites, 1, actualite]"
