In [None]:
# LINE TO DELETE - Test Git Push

In [2]:
# Import libraries
import re
from datetime import datetime
from typing import Dict, List, Optional

import advertools as adv
import pandas as pd
from advertools import crawl

In [None]:
# TODO
# - deduplication
# - 

In [45]:
""" Config with various media brand info to crawl and parse """

# Per-media crawl settings, keyed by media name. Every entry carries the same
# four keys:
#   - "sitemap_index": always None in this config
#   - "sitemap_url":   news sitemap that is downloaded for the media
#   - "regex_section": pattern whose named group `section` captures the URL
#                      path between the domain and the article slug
#   - "type":          media category ("tv" or "webpress"), used by write_df
#                      to build the output path
media_config = {
    "bfmtv": {
        "sitemap_index": None,
        "sitemap_url": "https://www.bfmtv.com/sitemap_news.xml",
        "regex_section": r"^https:\/\/www\.bfmtv\.com\/(?P<section>[\/\w-]*)\/.+$",
        "type": "tv"
    },
    "lefigaro": {
        "sitemap_index": None,
        "sitemap_url": "https://www.lefigaro.fr/sitemap_news.xml",
        "regex_section": r"^https:\/\/www\.lefigaro\.fr\/(?P<section>[\/\w-]*)\/.+$",
        "type": "webpress"
    },
    "francetvinfo": {
        "sitemap_index": None,
        "sitemap_url": "https://www.francetvinfo.fr/sitemap_news.xml",
        "regex_section": r"^https:\/\/www\.francetvinfo\.fr\/(?P<section>[\/\w-]*)\/.+$",
        "type": "tv"
    },
    "lemonde": {
        "sitemap_index": None,
        "sitemap_url": "https://www.lemonde.fr/sitemap_news.xml",
        "regex_section": r"^https:\/\/www\.lemonde\.fr\/(?P<section>[\/\w-]*)\/.+$",
        # This key was missing, which made write_df fail with a KeyError for
        # this media. NOTE(review): "webpress" chosen by analogy with the
        # other newspapers - confirm.
        "type": "webpress"
    },
    "20_minutes": {
        "sitemap_index": None,
        "sitemap_url": "https://www.20minutes.fr/sitemap-news.xml",
        "regex_section": r"^https:\/\/www\.20minutes\.fr\/(?P<section>[\/\w-]*)\/.+$",
        "type": "webpress"
    },
    "liberation": {
        "sitemap_index": None,
        "sitemap_url": "https://www.liberation.fr/arc/outboundfeeds/sitemap_news.xml?outputType=xml",
        "regex_section": r"^https:\/\/www\.liberation\.fr\/(?P<section>[\/\w-]*)\/.+$",
        "type": "webpress"
    },
    "nouvel_obs": {
        "sitemap_index": None,
        "sitemap_url": "https://www.nouvelobs.com/sitemap-articles-news.xml",
        "regex_section": r"^https:\/\/www\.nouvelobs\.com\/(?P<section>[\/\w-]*)\/.+$",
        "type": "webpress"
    },
    "le_point": {
        "sitemap_index": None,
        "sitemap_url": "https://www.lepoint.fr/sitemap-news.xml",
        "regex_section": r"^https:\/\/www\.lepoint\.fr\/(?P<section>[\/\w-]*)\/.+$",
        "type": "webpress"
    },
    "lexpress": {
        "sitemap_index": None,
        "sitemap_url": "https://www.lexpress.fr/sitemap_actu_1.xml",
        "regex_section": r"^https:\/\/www\.lexpress\.fr\/(?P<section>[\/\w-]*)\/.+$",
        "type": "webpress"
    },
}


In [5]:
def get_sitemap(media_config: Dict) -> pd.DataFrame:
    """Download the sitemap of every configured media into one DataFrame.

    Args:
        media_config: mapping of media name to its settings; only the
            'sitemap_url' entry of each media is read here.

    Returns:
        The frames returned by ``adv.sitemap_to_df`` stacked row-wise, with
        an extra 'media' column holding the media name. The columns listed
        in ``expected_columns`` always exist (filled with NaN when no sitemap
        provides them) and come first. Row labels of the individual frames
        are kept, so index values repeat across media.
    """
    expected_columns = [
        'loc', 'lastmod', 'news_publication_date', 'news_title', 'publication_name', 'download_date']

    frames = []
    for media, media_conf in media_config.items():
        media_df = adv.sitemap_to_df(media_conf['sitemap_url'])
        media_df['media'] = media
        frames.append(media_df)

    if not frames:
        return pd.DataFrame(columns=expected_columns)

    # One concat at the end instead of growing the frame inside the loop
    # (which re-copies all previous rows on every iteration).
    df = pd.concat(frames, axis=0)

    # Guarantee the expected columns exist even if no sitemap exposes them,
    # so downstream column drops/selection do not depend on the sources.
    extra_columns = [column for column in df.columns if column not in expected_columns]
    return df.reindex(columns=expected_columns + extra_columns)


In [6]:
def clean_df(df: pd.DataFrame) -> pd.DataFrame:
    """Rename 'loc' to 'url' and drop the sitemap columns that are not used.

    Args:
        df: frame built from one or several sitemaps.

    Returns:
        A new frame; the input is left untouched. Columns from the unused
        list that are absent from ``df`` are skipped instead of raising,
        because the set of columns depends on what each sitemap exposes.
    """
    unused_columns = [
        'image_loc', 'image_caption', 'sitemap', 'sitemap_size_mb', 'news',
        'news_publication', 'news_keywords', 'image', 'news_genres', 'etag', 'sitemap_last_modified',
        'changefreq', 'priority', 'news_access', 'publication_language', 'lastmod', 'publication_name']

    return (
        df
        .rename(columns={'loc': 'url'})
        .drop(columns=unused_columns, errors='ignore')
    )


In [7]:
def find_sections(media_config: Dict, url: str, media: str) -> List[str]:
    """Extract the section path components of an article URL.

    A '/NN/NN/NN'-style date fragment (2 to 4 digits per part) is removed
    from the URL first, then the media's 'regex_section' pattern is applied
    and its `section` group is split on '/'.

    Returns:
        The list of section names, or ["unknown"] when the pattern does not
        match the URL.
    """
    dateless_url = re.sub(r"\/[0-9]{2,4}\/[0-9]{2,4}\/[0-9]{2,4}", '', url)
    section_pattern = media_config[media]["regex_section"]
    match = re.search(section_pattern, dateless_url)

    if not match:
        return ["unknown"]
    return match.group("section").split('/')


In [8]:
def get_sections(media_config: Dict, df: pd.DataFrame)-> pd.DataFrame:
    """Add a 'section' column holding the sections parsed from each URL.

    Args:
        media_config: mapping of media name to its settings; forwarded to
            ``find_sections``, which looks up the entry of each row's media.
        df: frame with at least the 'url' and 'media' columns.

    Returns:
        The same ``df`` object, modified in place with the new 'section'
        column (one list of section names per row).
    """
    # Built positionally rather than with a row-wise DataFrame.apply: apply
    # on an empty frame returns a DataFrame, which cannot be assigned to a
    # single column. Reusing df.index keeps the assignment positional even
    # when index labels repeat.
    sections = [
        find_sections(media_config, url, media)
        for url, media in zip(df["url"], df["media"])
    ]
    df["section"] = pd.Series(sections, index=df.index, dtype=object)

    return df

In [9]:
def change_datetime_format(df: pd.DataFrame) -> pd.DataFrame:
    """Convert the date columns to timezone-naive, second-precision timestamps for BQ.

    The 'news_publication_date' and 'download_date' columns are parsed with
    ``pd.to_datetime`` and each value is re-created from its
    "%Y-%m-%d %H:%M:%S" rendering.

    WARNING: timezone information is dropped by this function; each value
    keeps the wall-clock time of whatever offset it was expressed in.

    Args:
        df: frame containing both date columns.

    Returns:
        The same ``df`` object, modified in place. Missing dates stay
        missing (NaT) instead of raising.
    """
    timestamp_format = "%Y-%m-%d %H:%M:%S"

    def _to_naive_seconds(value):
        # NaT does not support strftime, so missing values are passed through.
        if pd.isna(value):
            return pd.NaT
        return pd.Timestamp(value.strftime(timestamp_format))

    for column_name in ["news_publication_date", "download_date"]:
        df[column_name] = pd.to_datetime(df[column_name]).apply(_to_naive_seconds)

    return df

In [10]:
def run()-> pd.DataFrame:
    """Run the whole pipeline on `media_config` and return the final frame.

    Steps, in order: download the sitemaps, clean the columns, add the
    sections, then normalise the date columns.
    """
    sitemaps_raw = get_sitemap(media_config)
    sitemaps_clean = clean_df(sitemaps_raw)
    sitemaps_with_sections = get_sections(media_config, sitemaps_clean)

    return change_datetime_format(sitemaps_with_sections)

In [9]:
# Run the whole pipeline and display the resulting frame.
df = run()
df

2022-11-20 13:58:16,106 | INFO | sitemaps.py:536 | sitemap_to_df | Getting https://www.bfmtv.com/sitemap_news.xml
2022-11-20 13:58:16,280 | INFO | sitemaps.py:536 | sitemap_to_df | Getting https://www.lefigaro.fr/sitemap_news.xml
2022-11-20 13:58:16,402 | INFO | sitemaps.py:536 | sitemap_to_df | Getting https://www.francetvinfo.fr/sitemap_news.xml
2022-11-20 13:58:16,571 | INFO | sitemaps.py:536 | sitemap_to_df | Getting https://www.lemonde.fr/sitemap_news.xml
2022-11-20 13:58:16,721 | INFO | sitemaps.py:536 | sitemap_to_df | Getting https://www.20minutes.fr/sitemap-news.xml
2022-11-20 13:58:16,925 | INFO | sitemaps.py:536 | sitemap_to_df | Getting https://www.liberation.fr/arc/outboundfeeds/sitemap_news.xml?outputType=xml
2022-11-20 13:58:17,172 | INFO | sitemaps.py:536 | sitemap_to_df | Getting https://www.nouvelobs.com/sitemap-articles-news.xml
2022-11-20 13:58:17,405 | INFO | sitemaps.py:536 | sitemap_to_df | Getting https://www.lepoint.fr/sitemap-news.xml
2022-11-20 13:58:17,636 |

Unnamed: 0,url,news_publication_date,news_title,download_date,media,section
0,https://www.bfmtv.com/normandie/replay-emissio...,2022-11-20 12:53:44,Seine-Maritime: un jeune homme brûlé à 35% à L...,2022-11-20 12:58:16,bfmtv,"[normandie, replay-emissions, normandie-week-end]"
1,https://www.bfmtv.com/tech/une-application-per...,2022-11-20 12:48:33,Une application permet de localiser les zones ...,2022-11-20 12:58:16,bfmtv,[tech]
2,https://www.bfmtv.com/normandie/replay-emissio...,2022-11-20 12:48:06,Le Havre: des visites fictives inclusives pour...,2022-11-20 12:58:16,bfmtv,"[normandie, replay-emissions, normandie-week-end]"
3,https://www.bfmtv.com/police-justice/affaire-e...,2022-11-20 12:45:05,"Affaire Élisa Pilarski: 3 ans après, l'enquête...",2022-11-20 12:58:16,bfmtv,[police-justice]
4,https://www.bfmtv.com/cote-d-azur/malgre-une-c...,2022-11-20 12:41:39,"Malgré une Coupe du monde polémique, le siège ...",2022-11-20 12:58:16,bfmtv,[cote-d-azur]
...,...,...,...,...,...,...
97,https://www.lexpress.fr/actualites/1/actualite...,2022-11-18 16:05:02,Les supporters privés d'alcool autour des stad...,2022-11-20 12:58:17,lexpress,"[actualites, 1, actualite]"
98,https://www.lexpress.fr/actualites/1/actualite...,2022-11-18 20:15:06,"Mondial: nuages autour de Benzema, Ballon d'Or...",2022-11-20 12:58:17,lexpress,"[actualites, 1, actualite]"
99,https://www.lexpress.fr/actualites/1/monde/ukr...,2022-11-20 04:45:21,"Sunak assure Kiev du soutien britannique ""jusq...",2022-11-20 12:58:17,lexpress,"[actualites, 1, monde]"
100,https://www.lexpress.fr/actualites/1/actualite...,2022-11-18 23:10:06,Mondial-2022: les organisateurs font volte-fac...,2022-11-20 12:58:17,lexpress,"[actualites, 1, actualite]"


In [11]:
# Same code as the previous run cell, executed again (the log output below is
# dated 2022-11-22); this overwrites `df`.
df = run()
df

2022-11-22 18:54:52,691 | INFO | sitemaps.py:536 | sitemap_to_df | Getting https://www.bfmtv.com/sitemap_news.xml
2022-11-22 18:54:53,140 | INFO | sitemaps.py:536 | sitemap_to_df | Getting https://www.lefigaro.fr/sitemap_news.xml
2022-11-22 18:54:53,535 | INFO | sitemaps.py:536 | sitemap_to_df | Getting https://www.francetvinfo.fr/sitemap_news.xml
2022-11-22 18:54:54,069 | INFO | sitemaps.py:536 | sitemap_to_df | Getting https://www.lemonde.fr/sitemap_news.xml
2022-11-22 18:54:54,411 | INFO | sitemaps.py:536 | sitemap_to_df | Getting https://www.20minutes.fr/sitemap-news.xml
2022-11-22 18:54:54,660 | INFO | sitemaps.py:536 | sitemap_to_df | Getting https://www.liberation.fr/arc/outboundfeeds/sitemap_news.xml?outputType=xml
2022-11-22 18:54:55,229 | INFO | sitemaps.py:536 | sitemap_to_df | Getting https://www.nouvelobs.com/sitemap-articles-news.xml
2022-11-22 18:54:55,567 | INFO | sitemaps.py:536 | sitemap_to_df | Getting https://www.lepoint.fr/sitemap-news.xml
2022-11-22 18:54:55,955 |

Unnamed: 0,url,news_publication_date,news_title,download_date,media,section
0,https://www.bfmtv.com/tech/bons-plans/black-fr...,2022-11-22 17:53:00,Black Friday : remise sur les AirPods Pro 2 pe...,2022-11-22 17:54:52,bfmtv,"[tech, bons-plans]"
1,https://www.bfmtv.com/var/replay-emissions/bon...,2022-11-22 17:52:17,Toulon: les professionnels du secteur de la ju...,2022-11-22 17:54:52,bfmtv,"[var, replay-emissions, bonsoir-var]"
2,https://www.bfmtv.com/economie/hausse-des-prix...,2022-11-22 17:49:27,Hausse des prix: le porte-parole de JouéClub n...,2022-11-22 17:54:52,bfmtv,[economie]
3,https://www.bfmtv.com/sante/variole-du-singe-u...,2022-11-22 17:48:35,Variole du singe: une étude britannique indiqu...,2022-11-22 17:54:52,bfmtv,[sante]
4,https://www.bfmtv.com/economie/achats-de-jouet...,2022-11-22 17:47:44,"Achats de jouets de Noël: ""Les consommateurs o...",2022-11-22 17:54:52,bfmtv,[economie]
...,...,...,...,...,...,...
106,https://www.lexpress.fr/actualites/1/societe/v...,2022-11-21 14:15:21,Vins: les enchères des Hospices de Beaune à de...,2022-11-22 17:54:55,lexpress,"[actualites, 1, societe]"
107,https://www.lexpress.fr/actualites/1/societe/c...,2022-11-20 20:10:04,Collégienne tuée en Lot-et-Garonne: le suspect...,2022-11-22 17:54:55,lexpress,"[actualites, 1, societe]"
108,https://www.lexpress.fr/actualites/1/monde/la-...,2022-11-20 19:50:21,La Russie et l'Ukraine s'accusent de bombardem...,2022-11-22 17:54:55,lexpress,"[actualites, 1, monde]"
109,https://www.lexpress.fr/actualites/1/actualite...,2022-11-20 20:40:04,"Mondial: pas de miracle pour le Qatar, dominé ...",2022-11-22 17:54:55,lexpress,"[actualites, 1, actualite]"


In [12]:
# Number of non-null values in each column, per media.
df.groupby('media').count()

Unnamed: 0_level_0,url,news_publication_date,news_title,download_date,section
media,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
20_minutes,462,462,462,462,462
bfmtv,1000,1000,1000,1000,1000
francetvinfo,361,361,361,361,361
le_point,252,252,252,252,252
lefigaro,686,686,686,686,686
lemonde,237,237,237,237,237
lexpress,111,111,111,111,111
liberation,100,100,100,100,100
nouvel_obs,90,90,90,90,90


In [None]:
# NOTE(review): repeats the previous groupby cell verbatim and has no
# execution count - candidate for removal.
df.groupby('media').count()

In [28]:
# Share of articles whose title contains "COP". na=False counts a missing
# title as "no match" instead of producing a NaN mask that breaks filtering.
df.news_title.str.contains("COP", na=False).mean()

0.009093664746892998

In [25]:
# Articles whose title contains "COP"; na=False treats a missing title as
# "no match" so the boolean mask never contains NaN.
df[df.news_title.str.contains("COP", na=False)]

Unnamed: 0,url,news_publication_date,news_title,download_date,media,section
898,https://www.bfmtv.com/environnement/climat/man...,2022-11-21 10:59:04,"""Manque d'ambition"" et ""déception"": quel bilan...",2022-11-22 17:54:52,bfmtv,"[environnement, climat]"
239,https://www.lefigaro.fr/vox/societe/cop27-plut...,2022-11-21 17:09:59,"COP27 : «La décroissance n'est pas une option,...",2022-11-22 17:54:53,lefigaro,"[vox, societe]"
516,https://www.lefigaro.fr/sciences/remy-roux-cer...,2022-11-20 20:35:42,Rémy Rioux: «Certains résultats de la COP27 so...,2022-11-22 17:54:53,lefigaro,[sciences]
559,https://www.lefigaro.fr/flash-actu/cop27-encor...,2022-11-21 11:41:08,COP27 : «encore un long chemin» après l'accord...,2022-11-22 17:54:53,lefigaro,[flash-actu]
589,https://www.lefigaro.fr/sciences/cop27-un-acco...,2022-11-19 14:12:00,Un succès en demi-teinte pour la COP27 qui min...,2022-11-22 17:54:53,lefigaro,[sciences]
596,https://www.lefigaro.fr/international/cop27-l-...,2022-11-20 20:05:33,"COP27: l’agriculture, le parent pauvre des nég...",2022-11-22 17:54:53,lefigaro,[international]
213,https://www.francetvinfo.fr/monde/environnemen...,2022-11-21 19:10:16,COP27 : pourquoi l'accord final ne mentionne-t...,2022-11-22 17:54:53,francetvinfo,"[monde, environnement, cop]"
297,https://www.francetvinfo.fr/replay-radio/les-i...,2022-11-21 10:42:05,L'aide du gouvernement aux entreprises face à ...,2022-11-22 17:54:53,francetvinfo,"[replay-radio, les-informes-du-matin]"
342,https://www.francetvinfo.fr/monde/environnemen...,2022-11-20 22:29:33,COP 27 : une édition décevante et sans ambition,2022-11-22 17:54:53,francetvinfo,"[monde, environnement, cop]"
343,https://www.francetvinfo.fr/replay-radio/les-i...,2022-11-20 22:10:06,"Mondial au Qatar c'est parti, Benzema forfait,...",2022-11-22 17:54:53,francetvinfo,"[replay-radio, les-informes-de-france-info]"


In [None]:
# NOTE(review): not referenced by any visible cell; write_df builds its own
# hard-coded path - TODO confirm whether this constant is still needed.
LANDING_PATH = 'data_public/media=%s/'

In [58]:
def query_one_sitemap_and_transform(media:str, media_conf: Dict)-> pd.DataFrame:
    """Query the sitemap url of one media and transform it into a pd.DataFrame.

    Args:
        media (str): media name, stored in the 'media' column of the result
        media_conf (Dict): settings of that media (one value of the media
            config); 'sitemap_url' is read here and the dict is forwarded
            for section parsing

    Returns:
        pd.DataFrame: the sitemap with 'loc' renamed to 'url', plus the
        'media' and 'section' columns and normalised date columns
    """
    sitemap_df = adv.sitemap_to_df(media_conf['sitemap_url']).rename(columns={'loc': 'url'})
    sitemap_df['media'] = media

    # get_sections expects a mapping keyed by media name (it looks up
    # config[media]["regex_section"]); passing the bare per-media dict
    # raised a KeyError, so wrap it under its media name.
    df = get_sections({media: media_conf}, sitemap_df)
    return change_datetime_format(df)

def sanity_check(df:pd.DataFrame) -> None:
    """Placeholder for checking that an extracted frame is correct.

    Not implemented yet: the body holds nothing but this docstring, so
    calling it does nothing and returns None.

    Args:
        df: the frame to validate (currently unused).
    """
    # TODO pandera?

# TODO slack login

def write_df(df: pd.DataFrame, media: str, media_conf: Dict,
             base_dir: str = '/data_public/sitemap_dumps') -> str:
    """Write the extracted dataframe as CSV to a standardized, date-partitioned path.

    The destination is
    ``<base_dir>/media_type=<type>/media=<media>/year=<Y>/month=<M>/day=<D>``
    where the date parts come from the current local date.

    Args:
        df: frame to dump.
        media: media name, used for the ``media=`` partition.
        media_conf: settings of that media; its 'type' entry is used for the
            ``media_type=`` partition.
        base_dir: root directory of the dumps. Defaults to the location
            that was previously hard-coded.

    Returns:
        The path the CSV was written to.

    Raises:
        KeyError: if ``media_conf`` has no 'type' entry.
    """
    # Imported locally so the function does not depend on which of
    # `import datetime` / `from datetime import datetime` the notebook ran.
    import datetime
    import os

    download_date = datetime.datetime.today()
    # The month placeholder used to be `%y`, which is not a valid %-format
    # conversion and raised ValueError before anything was written.
    path = '%s/media_type=%s/media=%s/year=%s/month=%s/day=%s' % (
        base_dir, media_conf['type'], media,
        download_date.year, download_date.month, download_date.day)

    # to_csv does not create missing directories.
    os.makedirs(os.path.dirname(path), exist_ok=True)
    df.to_csv(path)
    return path

def run():
    """Extract and write the sitemap of every media in `media_config`.

    Each media is processed independently: a failure on one media is logged
    with its traceback and the loop moves on to the next one.

    NOTE(review): this redefines the `run` declared in an earlier cell
    (which returns a DataFrame); once this cell has been executed, `run()`
    returns None.
    """
    import logging

    logger = logging.getLogger(__name__)

    for media, media_conf in media_config.items():
        try:
            df = query_one_sitemap_and_transform(media, media_conf)
            # write_df needs the media name and its settings to build the
            # output path; calling it with `df` alone raised a TypeError
            # that the except below swallowed, so nothing was ever written.
            write_df(df, media, media_conf)
        except Exception:
            logger.exception("Sitemap extraction failed for media %r", media)
            continue

In [50]:
# Scratch: current local timestamp. `datetime` here must be the class from
# `from datetime import datetime`, which this notebook only imports in a
# later cell.
download_date = datetime.today()

In [52]:
download_date.month

11

In [None]:
# NOTE(review): this was the bare expression `year/month/day`, which raises
# NameError when run; presumably a reminder of the dump path layout:
# year/month/day

In [36]:
from datetime import datetime

In [43]:
# %M is minutes; the previous %m is the month number, so the value after the
# colon was the month rather than the minutes.
datetime.today().strftime('%Y%m%d_%H:%M')

'20221123_08:11'

In [57]:
# NOTE(review): no visible cell assigns `media_conf` at module level (it is
# only a loop variable inside functions) - presumably left over in the kernel
# from an interactive loop; this fails on a fresh kernel.
media_conf

{'sitemap_index': None,
 'sitemap_url': 'https://www.lexpress.fr/sitemap_actu_1.xml',
 'regex_section': '^https:\\/\\/www\\.lexpress\\.fr\\/(?P<section>[\\/\\w-]*)\\/.+$',
 'type': 'webpress'}

In [59]:


    # NOTE(review): orphaned `break` - no enclosing loop is visible in this
    # cell, so it cannot run as-is; presumably the loop around it was
    # deleted. TODO restore the loop or remove the cell.
    break
    
    


2022-11-23 08:58:35,389 | INFO | sitemaps.py:536 | sitemap_to_df | Getting https://www.bfmtv.com/sitemap_news.xml


'bfmtv'


2022-11-23 08:58:35,690 | INFO | sitemaps.py:536 | sitemap_to_df | Getting https://www.lefigaro.fr/sitemap_news.xml


'lefigaro'


2022-11-23 08:58:36,004 | INFO | sitemaps.py:536 | sitemap_to_df | Getting https://www.francetvinfo.fr/sitemap_news.xml
2022-11-23 08:58:36,172 | INFO | sitemaps.py:536 | sitemap_to_df | Getting https://www.lemonde.fr/sitemap_news.xml


'francetvinfo'
'lemonde'


2022-11-23 08:58:36,392 | INFO | sitemaps.py:536 | sitemap_to_df | Getting https://www.20minutes.fr/sitemap-news.xml
2022-11-23 08:58:36,516 | INFO | sitemaps.py:536 | sitemap_to_df | Getting https://www.liberation.fr/arc/outboundfeeds/sitemap_news.xml?outputType=xml


'20_minutes'
'liberation'


2022-11-23 08:58:36,629 | INFO | sitemaps.py:536 | sitemap_to_df | Getting https://www.nouvelobs.com/sitemap-articles-news.xml


'nouvel_obs'


2022-11-23 08:58:36,913 | INFO | sitemaps.py:536 | sitemap_to_df | Getting https://www.lepoint.fr/sitemap-news.xml


'le_point'


2022-11-23 08:58:37,263 | INFO | sitemaps.py:536 | sitemap_to_df | Getting https://www.lexpress.fr/sitemap_actu_1.xml


'lexpress'


In [None]:
# Earliest publication date in the extracted data.
# NOTE(review): the receiver was missing (the line started with `.`, a
# syntax error); `df` is assumed - confirm the intended frame.
df.news_publication_date.min()

In [44]:
# NOTE(review): no visible cell assigns `temp_df` at module level (it is only
# a local inside functions) - presumably left over in the kernel from an
# interactive run; this fails on a fresh kernel.
temp_df


Unnamed: 0,loc,publication_name,publication_language,news_publication_date,news_title,image_loc,image_caption,sitemap,sitemap_size_mb,download_date,media
0,https://www.bfmtv.com/marseille/replay-emissio...,BFMTV,fr,2022-11-23T07:35:07.000Z,Coran dans un car Aix-Marseille: le chauffeur ...,https://images.bfmtv.com/SN4KZ0KS9eJ_qcRwGsSrm...,Coran dans un car Aix-Marseille: le chauffeur ...,https://www.bfmtv.com/sitemap_news.xml,0.746476,2022-11-23 07:37:04.531795+00:00,bfmtv
1,https://www.bfmtv.com/cote-d-azur/replay-emiss...,BFMTV,fr,2022-11-23T07:33:51.000Z,Retour sur l'affaire Magne-Ciotti,https://images.bfmtv.com/VuZoru05fV7-w6z44N-l8...,Retour sur l'affaire Magne-Ciotti,https://www.bfmtv.com/sitemap_news.xml,0.746476,2022-11-23 07:37:04.531795+00:00,bfmtv
2,https://www.bfmtv.com/international/asie/russi...,BFMTV,fr,2022-11-23T07:33:00.000Z,7 MINUTES POUR COMPRENDRE - L'Ukraine peut-ell...,https://images.bfmtv.com/N-IeCBUHVcd1ROzCfxbEB...,7 MINUTES POUR COMPRENDRE - L'Ukraine peut-ell...,https://www.bfmtv.com/sitemap_news.xml,0.746476,2022-11-23 07:37:04.531795+00:00,bfmtv
3,https://www.bfmtv.com/international/europe/xav...,BFMTV,fr,2022-11-23T07:32:08.000Z,"Xavier Bettel: ""Être homosexuel, d'origine jui...",https://images.bfmtv.com/0KPgL3RlLMhqJsjMaaGS-...,"Xavier Bettel: ""Être homosexuel, d'origine jui...",https://www.bfmtv.com/sitemap_news.xml,0.746476,2022-11-23 07:37:04.531795+00:00,bfmtv
4,https://www.bfmtv.com/marseille/replay-emissio...,BFMTV,fr,2022-11-23T07:31:57.000Z,Provence: attention aux santons made in China,https://images.bfmtv.com/yEeA-1fUAVu6_jdZG8pNx...,Provence: attention aux santons made in China,https://www.bfmtv.com/sitemap_news.xml,0.746476,2022-11-23 07:37:04.531795+00:00,bfmtv
...,...,...,...,...,...,...,...,...,...,...,...
995,https://www.bfmtv.com/cote-d-azur/replay-emiss...,BFMTV,fr,2022-11-21T16:16:54.000Z,Procès de l'attentat de Nice: le vendeur de la...,https://images.bfmtv.com/ihissT63EFkqn-1OXpUpI...,Procès de l'attentat de Nice: le vendeur de la...,https://www.bfmtv.com/sitemap_news.xml,0.746476,2022-11-23 07:37:04.531795+00:00,bfmtv
996,https://www.bfmtv.com/marseille/replay-emissio...,BFMTV,fr,2022-11-21T16:15:55.000Z,Marseille: une isolation éco-responsable,https://images.bfmtv.com/2QXdf8C0W2kItpCOLuaZA...,Marseille: une isolation éco-responsable,https://www.bfmtv.com/sitemap_news.xml,0.746476,2022-11-23 07:37:04.531795+00:00,bfmtv
997,https://www.bfmtv.com/alsace/replay-emissions/...,BFMTV,fr,2022-11-21T16:15:34.000Z,Alsace: le ministre chargé du commerce extérie...,https://images.bfmtv.com/VPgSL4eQ9d0vPdEob6uB3...,Alsace: le ministre chargé du commerce extérie...,https://www.bfmtv.com/sitemap_news.xml,0.746476,2022-11-23 07:37:04.531795+00:00,bfmtv
998,https://www.bfmtv.com/international/asie/russi...,BFMTV,fr,2022-11-21T16:15:15.000Z,"Kherson: pour Vera Ageeva, les actes de tortur...",https://images.bfmtv.com/qPdu_UKVjYK-nR5WWT-rV...,"Kherson: pour Vera Ageeva, les actes de tortur...",https://www.bfmtv.com/sitemap_news.xml,0.746476,2022-11-23 07:37:04.531795+00:00,bfmtv
