In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import sys
sys.path.append('..')
from quotaclimat.data_processing.sitemap_processing import load_all

import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px
import matplotlib.pyplot as plt

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from wordcloud import WordCloud
from nltk.stem import WordNetLemmatizer, SnowballStemmer
import gensim
from quotaclimat.data_ingestion.config_sitmap import MEDIA_CONFIG, SITEMAP_CONFIG
import datetime


# French stop-word list.
# NOTE: this rebinds `stopwords`, shadowing the `nltk.corpus.stopwords` reader
# imported above; from this line on the name holds the result of `.words(...)`.
stopwords = stopwords.words('french')

# Let pandas show up to 200 characters per cell when rendering a frame.
pd.set_option("display.max_colwidth", 200)

# Snowball stemmer configured for French; used by `lemmatize_stemming`.
stemmer = SnowballStemmer("french")
def lemmatize_stemming(text):
    """Lemmatize ``text`` as a verb, then stem the lemma.

    A fresh ``WordNetLemmatizer`` is created on each call and asked to
    lemmatize ``text`` with ``pos='v'``; the result is passed to the
    module-level ``stemmer`` and the stemmed string is returned.

    NOTE(review): the stemmer is configured for French while WordNet is
    presumably an English resource -- confirm the lemmatization step is
    intended for this corpus.
    """
    lemma = WordNetLemmatizer().lemmatize(text, pos='v')
    return stemmer.stem(lemma)

def preprocess(text):
    """Tokenize ``text`` and normalize every token longer than three characters.

    Tokens are produced by ``gensim.utils.simple_preprocess``. Tokens of three
    characters or fewer are discarded; each remaining token is passed through
    ``lemmatize_stemming``. The returned list keeps the tokens' original order.
    """
    tokens = gensim.utils.simple_preprocess(text)
    return [lemmatize_stemming(tok) for tok in tokens if len(tok) > 3]




In [3]:
# Relative path of the directory containing the sitemap dumps.
SITEMAP_DUMPS_DIR = "../data_public/sitemap_dumps/"

# Read the sitemap dumps through the project loader; `df` is inspected in the
# cells below.
df = load_all(SITEMAP_DUMPS_DIR)


In [4]:
# Preview the loaded frame; the rendering elides the middle rows and columns.
df

Unnamed: 0,url,news,news_publication,publication_name,publication_language,news_publication_date,news_title,news_keywords,image,image_loc,...,media,section,changefreq,news_access,image_title,lastmod,news_genres,priority,download_date_last,media_type
37884,https://www.francetvinfo.fr/monde/nouvel-an-2021/nouvel-an-2023-le-feu-d-artifice-fait-son-retour-sur-les-champs-elysees_5573295.html,\n,\n,Franceinfo,fr,2022-12-31 15:57:46,Nouvel An 2023 : le feu d’artifice fait son retour sur les Champs-Élysées,"Nouvel An 2023, Monde, France",\n,https://www.francetvinfo.fr/image/761i6v10w-6c06/570/320/30325236.jpeg,...,francetvinfo,"[monde, nouvel-an-2021]",,,,,,,2023-01-02 01:28:01,tv
37885,https://www.francetvinfo.fr/replay-jt/france-3/12-13/nouvel-an-2023-dans-la-marne-des-ethylotests-distribues-gratuitement-pour-le-reveillon_5573307.html,\n,\n,Franceinfo,fr,2022-12-31 16:03:28,"Nouvel An 2023 : dans la Marne, des éthylotests distribués gratuitement pour le réveillon","Nouvel An 2023, Monde, Sécurité routière, Mode de vie, lifestyle",\n,https://www.francetvinfo.fr/image/761i6v4km-1ac5/570/320/30325371.jpg,...,francetvinfo,"[replay-jt, france-3, 12-13]",,,,,,,2023-01-02 01:28:01,tv
37886,https://www.francetvinfo.fr/culture/charente-maritime-decouverte-de-la-ferme-des-hippocampes-un-lieu-unique-en-france_5573310.html,\n,\n,Franceinfo,fr,2022-12-31 16:05:38,"Charente-Maritime : découverte de la ferme des hippocampes, un lieu unique en France","Culture, Animaux",\n,https://www.francetvinfo.fr/image/761i6v4sd-7006/570/320/30325425.jpeg,...,francetvinfo,[culture],,,,,,,2023-01-02 01:28:01,tv
37887,https://www.francetvinfo.fr/monde/nouvel-an-2021/nouvel-an-2023-un-dispositif-de-securite-important-deploye-sur-toute-la-france_5573298.html,\n,\n,Franceinfo,fr,2022-12-31 16:00:23,Nouvel An 2023 : un dispositif de sécurité important déployé sur toute la France,"Nouvel An 2023, Monde",\n,https://www.francetvinfo.fr/image/761i6v19q-42db/570/320/30325290.jpeg,...,francetvinfo,"[monde, nouvel-an-2021]",,,,,,,2023-01-02 01:28:01,tv
37888,https://www.francetvinfo.fr/replay-radio/micro-europeen/la-fondation-rene-cassin-developper-les-droits-de-l-homme-par-la-formation-l-enseignement-et-la-recherche_5549607.html,\n,\n,Franceinfo,fr,2022-12-31 15:42:02,"La fondation René Cassin : développer les droits de l'homme par la formation, l'enseignement et la recherche","Europe, Mode de vie, lifestyle, Formation, économie, Emploi, Monde, Carrière",\n,https://www.francetvinfo.fr/image/761i6wgbd-5e23/570/320/30305094.jpg,...,francetvinfo,"[replay-radio, micro-europeen]",,,,,,,2023-01-02 01:28:01,tv
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87145,https://www.nouvelobs.com/coronavirus-de-wuhan/20221225.OBS67558/le-pic-de-la-9e-vague-de-covid-19-desormais-franchi.html,\n,\n,L'Obs,fr,2022-12-25 00:00:00,Le pic de la 9e vague de Covid-19 désormais franchi,,\n,https://focus.nouvelobs.com/2022/12/25/176/0/3999/1999/640/310/60/0/0291516_1671977887581-080-hl-martinlelievre-1609740.jpg,...,nouvel_obs,[coronavirus-de-wuhan],,,,,,,2022-12-27 01:26:21,webpress
87146,https://www.nouvelobs.com/rue89/20221225.OBS67559/qui-est-mon-pere-noel.html,\n,\n,L'Obs,fr,2022-12-25 00:00:00,Qui est mon père Noël ?,,\n,https://focus.nouvelobs.com/2022/12/23/228/0/2738/1369/640/310/60/0/0935c20_1671799899057-anderson-w-rangel-osal6nap-hw-unsplash.jpg,...,nouvel_obs,[rue89],,,,,,,2022-12-27 01:26:21,webpress
87147,https://www.nouvelobs.com/teleobs/20221225.OBS67560/ben-hur-le-grand-barnum.html,\n,\n,L'Obs,fr,2022-12-25 00:00:00,"Ce soir à la télé : : « Ben-Hur », le grand barnum",,\n,https://focus.nouvelobs.com/2022/12/13/258/0/3512/1756/640/310/60/0/f9651b2_1670929834991-076-chl-016984.jpg,...,nouvel_obs,[teleobs],,,,,,,2022-12-27 01:26:21,webpress
87148,https://www.nouvelobs.com/monde/20221225.OBS67561/interdites-de-travailler-avec-des-afghanes-des-ong-suspendent-leurs-activites.html,\n,\n,L'Obs,fr,2022-12-25 00:00:00,"Interdites de travailler avec des Afghanes, des ONG suspendent leurs activités",,\n,https://focus.nouvelobs.com/2022/12/25/210/0/2526/1263/640/310/60/0/19c75b0_1671986312107-000-336436g.jpg,...,nouvel_obs,[monde],,,,,,,2022-12-27 01:26:21,webpress


In [None]:
# List the column labels of the loaded frame.
df.columns

Index(['url', 'news', 'news_publication', 'publication_name',
       'publication_language', 'news_publication_date', 'news_title',
       'news_keywords', 'image', 'image_loc', 'image_caption', 'sitemap',
       'etag', 'sitemap_last_modified', 'sitemap_size_mb', 'download_date',
       'media', 'section', 'media_type', 'lastmod', 'news_genres',
       'changefreq', 'priority', 'image_title', 'news_access'],
      dtype='object')

In [None]:
# (row count, column count) of the loaded frame.
df.shape

(126117, 25)