# 👨‍💻🔍 EDA Topics des Articles

Ce notebook permet d'explorer les possibilités liées à la tâche Notion : Classifier le topic des articles afin de pouvoir comparer les sujets.

In [13]:
import sys
sys.path.append('..')
from quotaclimat.data_processing.sitemap.sitemap_processing import load_all

import numpy as np
import pandas as pd
import plotly.graph_objs as go
from plotly.subplots import make_subplots

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

from spacy.lang.fr.stop_words import STOP_WORDS as fr_stop

In [2]:
# Data import and check
df = load_all("../data_public/sitemap_dumps/")
print(df.shape)
print(df.columns)
df.head()

(320545, 26)
Index(['url', 'news', 'news_publication', 'publication_name',
       'publication_language', 'news_publication_date', 'news_title',
       'news_keywords', 'image', 'image_loc', 'image_caption', 'sitemap',
       'etag', 'sitemap_last_modified', 'sitemap_size_mb', 'download_date',
       'media', 'section', 'changefreq', 'news_access', 'image_title',
       'lastmod', 'news_genres', 'priority', 'download_date_last',
       'media_type'],
      dtype='object')


Unnamed: 0,url,news,news_publication,publication_name,publication_language,news_publication_date,news_title,news_keywords,image,image_loc,...,media,section,changefreq,news_access,image_title,lastmod,news_genres,priority,download_date_last,media_type
37884,https://www.francetvinfo.fr/monde/nouvel-an-20...,\n,\n,Franceinfo,fr,2022-12-31 15:57:46,Nouvel An 2023 : le feu d’artifice fait son re...,"Nouvel An 2023, Monde, France",\n,https://www.francetvinfo.fr/image/761i6v10w-6c...,...,francetvinfo,"[monde, nouvel-an-2021]",,,,,,,2023-01-02 01:28:01,tv
37885,https://www.francetvinfo.fr/replay-jt/france-3...,\n,\n,Franceinfo,fr,2022-12-31 16:03:28,"Nouvel An 2023 : dans la Marne, des éthylotest...","Nouvel An 2023, Monde, Sécurité routière, Mode...",\n,https://www.francetvinfo.fr/image/761i6v4km-1a...,...,francetvinfo,"[replay-jt, france-3, 12-13]",,,,,,,2023-01-02 01:28:01,tv
37886,https://www.francetvinfo.fr/culture/charente-m...,\n,\n,Franceinfo,fr,2022-12-31 16:05:38,Charente-Maritime : découverte de la ferme des...,"Culture, Animaux",\n,https://www.francetvinfo.fr/image/761i6v4sd-70...,...,francetvinfo,[culture],,,,,,,2023-01-02 01:28:01,tv
37887,https://www.francetvinfo.fr/monde/nouvel-an-20...,\n,\n,Franceinfo,fr,2022-12-31 16:00:23,Nouvel An 2023 : un dispositif de sécurité imp...,"Nouvel An 2023, Monde",\n,https://www.francetvinfo.fr/image/761i6v19q-42...,...,francetvinfo,"[monde, nouvel-an-2021]",,,,,,,2023-01-02 01:28:01,tv
37888,https://www.francetvinfo.fr/replay-radio/micro...,\n,\n,Franceinfo,fr,2022-12-31 15:42:02,La fondation René Cassin : développer les droi...,"Europe, Mode de vie, lifestyle, Formation, éco...",\n,https://www.francetvinfo.fr/image/761i6wgbd-5e...,...,francetvinfo,"[replay-radio, micro-europeen]",,,,,,,2023-01-02 01:28:01,tv


In [3]:
df_title = df[['news_title']]
df_title

Unnamed: 0,news_title
37884,Nouvel An 2023 : le feu d’artifice fait son re...
37885,"Nouvel An 2023 : dans la Marne, des éthylotest..."
37886,Charente-Maritime : découverte de la ferme des...
37887,Nouvel An 2023 : un dispositif de sécurité imp...
37888,La fondation René Cassin : développer les droi...
...,...
87145,Le pic de la 9e vague de Covid-19 désormais fr...
87146,Qui est mon père Noël ?
87147,"Ce soir à la télé : : « Ben-Hur », le grand ba..."
87148,"Interdites de travailler avec des Afghanes, de..."


In [4]:
# Preprocess data
vectorizer = CountVectorizer(stop_words=list(fr_stop))
X = vectorizer.fit_transform(df_title['news_title'])

# Topic modeling with LDA
n_topics = 20
lda = LatentDirichletAllocation(n_components=n_topics, random_state=42)
lda.fit(X)



In [5]:
# Print the top words for each topic
feature_names = vectorizer.get_feature_names_out()
topic_top_words = []
for topic_idx, topic in enumerate(lda.components_):
    top_words = [feature_names[i] for i in topic.argsort()[:-15 - 1:-1]]
    print("Topic %d: %s" % (topic_idx, ", ".join(top_words)))
    # Save top words to plot them
    topic_top_words.append(top_words)

Topic 0: retraites, réforme, borne, assemblée, nationale, pont, lr, députés, gauche, censure, député, lfi, rn, nupes, elisabeth
Topic 1: lyon, biden, nice, procès, prison, ferme, directeur, joe, trafic, france, ans, film, veut, minutes, bancaire
Topic 2: turquie, morts, syrie, séisme, 000, marché, hôpital, fêtes, bilan, neige, art, froid, noël, euros, tempête
Topic 3: prix, 2022, hausse, inflation, offre, taux, baisse, site, euros, 40, record, banques, européen, samsung, apple
Topic 4: saint, école, conseil, élèves, mer, 2023, avril, brieuc, municipal, mars, jean, enfants, jeunes, collège, association
Topic 5: ligue, marseille, foot, psg, lorient, match, om, nantes, rennes, face, l1, darmanin, fc, tennis, ol
Topic 6: monde, coupe, football, france, week, end, 2022, finale, face, charles, argentine, incendie, iii, roi, stade
Topic 7: macron, emmanuel, alpes, projet, eau, haute, loi, provence, maritimes, jeux, hautes, immigration, sécheresse, plan, veut
Topic 8: grève, paris, covid, 19, 

In [6]:
# Assign topics to new titles
new_titles = df_title.loc[:, 'news_title'].copy()
X_new = vectorizer.transform(new_titles)
topic_probs = lda.transform(X_new)
topic_labels = topic_probs.argmax(axis=1)

# Add topic labels to the original data frame
df_title['topic'] = topic_labels.copy()

df_title.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_title['topic'] = topic_labels.copy()


Unnamed: 0,news_title,topic
37884,Nouvel An 2023 : le feu d’artifice fait son re...,6
37885,"Nouvel An 2023 : dans la Marne, des éthylotest...",19
37886,Charente-Maritime : découverte de la ferme des...,9
37887,Nouvel An 2023 : un dispositif de sécurité imp...,9
37888,La fondation René Cassin : développer les droi...,2


In [7]:
# Plot the top words for each topic
fig = go.Figure()
for i, words in enumerate(topic_top_words):
    fig.add_trace(go.Bar(x=words, y=[i+1]*10, name=f"Topic {i+1}", orientation='h'))

fig.update_layout(
    title="Top 10 Words per Topic",
    xaxis_title="Word Frequency",
    yaxis_title="Topic",
    yaxis=dict(
        tickvals=list(range(1, n_topics+1)),
        ticktext=[f"Topic {i+1}" for i in range(n_topics)]
    ),
    barmode='stack'
)

fig.show()

In [11]:
# Plot the top words for each topic
for i, words in enumerate(topic_top_words):
    idxs = [np.where(feature_names == word)[0][0] for word in words]
    fig = go.Figure(go.Bar(
        x=words,
        y=lda.components_[i][idxs],
        text=[f"{word}: {int(lda.components_[i][idx])}" for word, idx in zip(words, idxs)],
        textposition='auto',
        name=f"Topic {i+1}"
    ))

    fig.update_layout(
        title=f"Top 10 Words for Topic {i+1}",
        xaxis_title="Words",
        yaxis_title="Word Frequency"
    )

    fig.show()

In [18]:
# Create subplots
fig = make_subplots(rows=5, cols=5, subplot_titles=[f"Topic {i+1}" for i in range(n_topics)])

# Plot the top words for each topic
for i, words in enumerate(topic_top_words):
    idxs = [np.where(feature_names == word)[0][0] for word in words]
    trace = go.Bar(
        x=words,
        y=lda.components_[i][idxs],
        text=[f"{word}: {int(lda.components_[i][idx])}" for word, idx in zip(words, idxs)],
        textposition='auto',
        name=f"Topic {i+1}"
    )
    row = (i // 2) + 1
    col = (i % 2) + 1
    fig.add_trace(trace, row=row, col=col)

# Update layout
fig.update_layout(
    title="Top 10 Words for Each Topic",
    xaxis_title="Words",
    yaxis_title="Word Frequency",
    height=900
)

fig.show()

Exception: The (row, col) pair sent is out of range. Use Figure.print_grid to view the subplot grid. 

In [19]:
fig.print_grid

<bound method BaseFigure.print_grid of Figure({
    'data': [{'name': 'Topic 1',
              'text': [retraites: 4906, réforme: 2758, borne: 2072, assemblée:
                       1971, nationale: 1246, pont: 1181, lr: 1061, députés: 995,
                       gauche: 901, censure: 898, député: 889, lfi: 833, rn: 799,
                       nupes: 797, elisabeth: 788],
              'textposition': 'auto',
              'type': 'bar',
              'x': [retraites, réforme, borne, assemblée, nationale, pont, lr,
                    députés, gauche, censure, député, lfi, rn, nupes, elisabeth],
              'xaxis': 'x',
              'y': array([4906.54870073, 2758.68014759, 2072.00853423, 1971.54214782,
                          1246.17585623, 1181.98623269, 1061.05      ,  995.38191194,
                           901.23375733,  898.05      ,  889.7801262 ,  833.87850801,
                           799.05      ,  797.00994752,  788.05      ]),
              'yaxis': 'y'},
        