In [None]:
# Auto-reload imported modules before each cell executes, so edits to the
# local project package are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import sys
sys.path.append('..')
from quotaclimat.data_processing.sitemap_processing import load_all

import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px
import matplotlib.pyplot as plt

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from wordcloud import WordCloud


# Let long text columns render up to 200 characters per cell.
pd.set_option("display.max_colwidth", 200)
# French stop-word list passed to the TF-IDF vectorizers further down.
# NOTE: this rebinds `stopwords` from the nltk corpus reader to a plain list.
stopwords = stopwords.words('french')



In [None]:
# Directory holding the public sitemap dumps, relative to this notebook.
SITEMAP_DUMPS_DIR = "../data_public/sitemap_dumps/"
# NOTE(review): load_all presumably reads every dump in the directory into a
# single DataFrame — confirm in quotaclimat.data_processing.sitemap_processing.
df = load_all(SITEMAP_DUMPS_DIR)


In [None]:
# Inspect which columns the loaded frame exposes.
df.columns

# Feature engineering

In [None]:
# Time span covered by the publication dates: earliest first, then latest.
publication_dates = df["news_publication_date"]
print(publication_dates.min(), publication_dates.max(), sep="\n")

In [None]:
# --- Date formatting ---------------------------------------------------------
# Normalise both date columns to ISO "YYYY-MM-DD" strings. Going through
# pd.to_datetime first keeps this cell re-runnable: on a second run the columns
# already hold strings, on which a bare `.dt` accessor would raise.
df['news_publication_date'] = pd.to_datetime(df['news_publication_date']).dt.strftime("%Y-%m-%d")
df['download_date'] = pd.to_datetime(df['download_date']).dt.strftime("%Y-%m-%d")

# --- Filtering ---------------------------------------------------------------
# Some articles are very old: keep only those published after 2022-11-24.
# ISO date strings sort chronologically, so a string comparison is sufficient.
# `.copy()` detaches the result from the unfiltered frame so the column
# assignments below do not raise SettingWithCopyWarning.
df = df[df['news_publication_date'] > '2022-11-24'].copy()

# --- Section extraction ------------------------------------------------------
# One 0/1 indicator column per section label, aligned on df's index.
# Assumes each `section` value is an iterable of labels — TODO confirm.
mlb = MultiLabelBinarizer()
df_sparse = pd.DataFrame(mlb.fit_transform(df['section']), columns=mlb.classes_, index=df.index)
df[df_sparse.columns] = df_sparse

# --- Title processing --------------------------------------------------------
# Titles are lower-cased; the keyword filters later in the notebook match
# against this lower-cased text.
df['news_title'] = df['news_title'].str.lower()


In [None]:
from quotaclimat.data_ingestion.config_sitmap import MEDIA_CONFIG, SITEMAP_CONFIG
import datetime


# EDA

In [None]:
def _media_type(media_name):
    """Return the 'type' entry that MEDIA_CONFIG stores for `media_name`."""
    return MEDIA_CONFIG[media_name]['type']


# Tag every row with the type configured for its `media` value.
df['type'] = df['media'].apply(_media_type)

In [None]:
# Per (publication date, media) pair, `count()` reports the number of non-null
# values in every remaining column.
df_count_pub_date_media = df.groupby(['news_publication_date', 'media'], as_index=False).count()
# NOTE(review): news_title is lower-cased earlier in the notebook, so 'COP27'
# only matches titles when the search is case-insensitive.
keywords = ['COP27', 'écologie']
# Fixed typo: 'marcon' -> 'macron'.
keywords_comp = ['macron', 'qatar']
# Last five characters of the "YYYY-MM-DD" download date string, i.e. "MM-DD".
# NOTE(review): despite the column name this holds month-and-day, not the day
# of the month alone — confirm which one downstream code expects.
df['day_of_the_month'] = df.download_date.str[-5:]


In [None]:
# Climate-related keywords of interest.
# NOTE(review): news_title is lower-cased earlier in the notebook, so 'COP27'
# only matches titles when the search is case-insensitive.
keywords = ['COP27', 'écologie']
# Keywords for other topics (presumably a comparison baseline — TODO confirm).
# Fixed typo: 'marcon' -> 'macron'.
keywords_comp = ['macron', 'qatar']

## Section exploration

In [None]:
# Bar chart of the `top_n` sections holding the most articles.
top_n = 70
section_totals = df_sparse.sum(axis=0)
df_count_sections = section_totals.sort_values(ascending=False).head(top_n)

fig = go.Figure(data=[go.Bar(x=df_count_sections.index, y=df_count_sections)])
# The update_* calls return the figure, so the chained expression is what the
# cell renders.
(
    fig.update_xaxes(tickangle=-45, title=None)
    .update_yaxes(title=None)
    .update_layout(margin={"b": 100}, title_text='Nombre total d article par section')
)

In [None]:
# Per-media totals of every numeric column (this includes the 0/1 section
# indicators). numeric_only=True is explicit because pandas >= 2.0 no longer
# drops non-numeric columns from groupby sums; without it text columns are
# concatenated or raise.
df_sum_media = df.groupby("media").sum(numeric_only=True)

In [None]:
# Number of sections in which each media has at least one article.
# Masking with `df_sum_media == 0` turns the NON-zero cells into NaN, so
# `shape[1] - masked.isna().sum(axis=1)` yields the number of EMPTY columns.
# Counting the non-zero section totals directly gives the intended figure.
# Only the section indicator columns are considered, so other aggregated
# columns cannot inflate the count.
section_columns = df_sum_media.columns.intersection(df_sparse.columns)
nb_section_per_media = (df_sum_media[section_columns] != 0).sum(axis=1)
print(nb_section_per_media)

In [None]:
# Section labels treated as climate/environment coverage.
SECTION_CLIMAT = [
    'planete',
    'environnement',
    'crise-climatique',
]



In [None]:
# Le Figaro headlines that mention at least one climate keyword.
# NOTE(review): '  cop ' contains two leading spaces and will rarely match —
# confirm whether ' cop ' was intended.
keywords = [' cop27', '  cop ', 'climatique', 'écologie', 'CO2', 'effet de serre', 'transition énergétique', 'carbone']

# The publisher subset is named and reused: a bare expression in the middle of
# a cell is evaluated and then discarded.
figaro_articles = df[df.publication_name == 'Le Figaro']
# case=False: titles are lower-cased earlier, so 'CO2' would otherwise never match.
# na=False: a missing title counts as "no match" instead of breaking the mask.
is_climate_title = figaro_articles.news_title.str.contains("|".join(keywords), case=False, na=False)
figaro_articles.loc[is_climate_title, 'news_title']


In [None]:
# Per publication: total of the climate-section indicators, as a percentage of
# the total over all numeric columns.
# Assumes the section indicators are df's only numeric columns, so that the
# denominator is the total number of section tags — TODO confirm.
df_nb_in_climat_section = df[SECTION_CLIMAT + ['publication_name']].groupby("publication_name").sum().sum(axis=1)
# numeric_only=True: pandas >= 2.0 no longer drops text columns from groupby
# sums, which would make the row-wise sum below fail on mixed types.
df_nb_total = df.groupby("publication_name").sum(numeric_only=True).sum(axis=1)
percentage_article_in_climat_sections = df_nb_in_climat_section / df_nb_total * 100

In [None]:

# Bar chart of the `top_n` publications with the highest climate-section percentage.
top_n = 10
percentage_article_in_climat_sections = (
    percentage_article_in_climat_sections
    .sort_values(ascending=False)
    .head(top_n)
)

fig = go.Figure(
    data=[
        go.Bar(
            x=percentage_article_in_climat_sections.index,
            y=percentage_article_in_climat_sections,
        )
    ]
)
# The update_* calls return the figure, so the chained expression is what the
# cell renders.
(
    fig.update_xaxes(tickangle=-45, title=None)
    .update_yaxes(title=None)
    .update_layout(margin={"b": 100}, title_text='Pourcentage d article dans la section climat par publisher')
)

In [None]:
# Sections of the articles whose headline mentions at least one climate keyword.
# NOTE(review): '  cop ' contains two leading spaces and will rarely match —
# confirm whether ' cop ' was intended.
keywords = [' cop27', '  cop ', 'climatique', 'écologie', 'CO2', 'effet de serre', 'transition énergétique', 'carbone']

# case=False: titles are lower-cased earlier, so 'CO2' would otherwise never match.
# na=False: a missing title counts as "no match" instead of breaking the mask.
has_climate_keyword = df.news_title.str.contains("|".join(keywords), case=False, na=False)
df.loc[has_climate_keyword, 'section']


In [None]:
# Length, in characters, of the longest headline.
df.news_title.str.len().max()

In [None]:
# Headlines of the articles tagged with at least one SECTION_CLIMAT section
# (planete, environnement, crise-climatique).
in_climate_section = (df[SECTION_CLIMAT] != 0).any(axis=1)
df.loc[in_climate_section, 'news_title']

# Retrieving climate news from other sections

## Similarity score

In [None]:
# TF-IDF over all headlines. Terms present in more than 30% of the titles or in
# fewer than 1% of them are discarded, as are French stop words.
vectorizer = TfidfVectorizer(
    stop_words=stopwords,
    min_df=0.01,
    max_df=0.3,
)
df_tfidf = vectorizer.fit_transform(df['news_title'])


In [None]:
# Total TF-IDF weight of each vocabulary term, summed over all headlines.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement and returns the terms in the same column order.
df_tfidf_sum = pd.DataFrame(
    df_tfidf.T.sum(axis=1),
    index=vectorizer.get_feature_names_out(),
    columns=["tfidf_sum"],
)

In [None]:
# Word cloud built from each term's summed TF-IDF weight.
term_weights = df_tfidf_sum['tfidf_sum'].to_dict()
wordcloud = WordCloud().generate_from_frequencies(frequencies=term_weights)

plt.figure()
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()

In [None]:
# Headlines of the articles tagged with at least one SECTION_CLIMAT section.
# An indicator equal to 1 is also different from 0, so a single "!= 0" test
# over the three section columns covers every term of the original condition.
in_climate_section = (df[SECTION_CLIMAT] != 0).any(axis=1)
df.loc[in_climate_section, 'news_title']

In [None]:
# (rows, columns) of the frame at this point in the notebook.
print(df.shape)

In [None]:
import datetime

In [None]:
# Rows whose download date falls within the last seven days.
one_week = datetime.timedelta(days=7)
a_week_ago = datetime.datetime.now() - one_week
downloaded_on = pd.to_datetime(df["download_date"])
df.loc[downloaded_on > a_week_ago]

In [None]:
## word cloud for climate-related headlines
## a headline is kept when it contains a climate keyword OR its article is tagged
## with one of the SECTION_CLIMAT sections (planete, environnement, crise-climatique)

# Fixed: a missing comma after 'sécheresse' fused it with the following literal
# into 'sécheressetransition énergétique', so neither term was searched as
# intended. The second 'transition énergétique' entry was a duplicate and is dropped.
# NOTE(review): '  cop ' contains two leading spaces and will rarely match —
# confirm whether ' cop ' was intended.
climate_keywords = [
    ' cop27', '  cop ', 'climatique', 'écologie', 'CO2', 'effet de serre',
    'transition énergétique', 'carbone', 'sécheresse', 'méthane', 'GIEC',
    'zéro émission',
]

vectorizer = TfidfVectorizer(max_df=0.06, min_df=0.01, stop_words=stopwords)

# case=False: titles are lower-cased earlier, so 'CO2' and 'GIEC' would
# otherwise never match. na=False: a missing title counts as "no match".
has_climate_keyword = df.news_title.str.contains("|".join(climate_keywords), case=False, na=False)
in_climate_section = (df[SECTION_CLIMAT] == 1).any(axis=1)
df_positive_climat_topic = df[has_climate_keyword | in_climate_section].news_title

tfidf_positive_climat_topic = vectorizer.fit_transform(df_positive_climat_topic)
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out().
tfidf_positive_climat_topic_sum = pd.DataFrame(
    tfidf_positive_climat_topic.T.sum(axis=1),
    index=vectorizer.get_feature_names_out(),
    columns=["tfidf_sum"],
)

# Word cloud built from each term's summed TF-IDF weight.
wordcloud = WordCloud()
wordcloud.generate_from_frequencies(frequencies=tfidf_positive_climat_topic_sum.to_dict()['tfidf_sum'])
plt.figure(figsize=(13,8))

plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()


In [None]:
## word cloud per section
## Here we try to build a data set with only non-climate topics

NON_CLIMAT_SECTION = ['football', "sport", 'coupe-du-monde', 'sante', 'justice', 'moyen-orient', 'tech', 'crypto', 'gaming', "economie", 'culture']

vectorizer = TfidfVectorizer(max_df=0.06, min_df=0.01, stop_words=stopwords)

# An article is kept when it is tagged 'politique', 'economie' or any section of
# NON_CLIMAT_SECTION.
# NOTE(review): 'politique' is filtered on but is not listed in NON_CLIMAT_SECTION,
# and nothing here excludes articles that also match the climate selection —
# confirm both are intended.
is_non_climat = (
    (df.politique != 0)
    | (df['economie'] != 0)
    | (df[NON_CLIMAT_SECTION] == 1).any(axis=1)
)
df_negative_climat_topic = df[is_non_climat].news_title

tfidf_negative_climat_topic = vectorizer.fit_transform(df_negative_climat_topic)
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out().
tfidf_negative_climat_topic_sum = pd.DataFrame(
    tfidf_negative_climat_topic.T.sum(axis=1),
    index=vectorizer.get_feature_names_out(),
    columns=["tfidf_sum"],
)

# Word cloud built from each term's summed TF-IDF weight.
wordcloud = WordCloud()
wordcloud.generate_from_frequencies(frequencies=tfidf_negative_climat_topic_sum.to_dict()['tfidf_sum'])
plt.figure(figsize=(13,8))

plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()

# Classifier

In [None]:
# Number of headlines in the climate (positive) selection.
df_positive_climat_topic.shape

In [None]:
# Number of headlines in the non-climate (negative) selection.
df_negative_climat_topic.shape

# Topic modeling