# **Quota Climat** - Analyse du traitement médiatique sur le sujet climat

---


# **Mot clé analysé** - Pollution

## **1.   Importation des librairies**

In [1]:
# Librairies
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import datetime

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## **2.   Charte graphique plotly**

In [3]:
# CONFIG AND THEMES
COLOR_SEQUENCE = [
    "rgb(230, 50, 24)",
    "rgb(240, 73, 70)",
    "rgb(243, 127, 125)",
    "rgb(248, 182, 181)",
    "rgb(209, 220, 197)",
    "rgb(137, 168, 141)",
    "rgb(59, 111, 66)",
    "rgb(66, 66, 66)",
]

SMALL_SEQUENCE2 = [
    "rgb(230, 50, 24)",
    "rgb(59, 111, 66)",
]

WARMING_STRIPES_SEQUENCE =  ['#08306b', '#08519c', '#2171b5', '#4292c6',
    '#6baed6', '#9ecae1', '#c6dbef', '#deebf7',
    '#fee0d2', '#fcbba1', '#fc9272', '#fb6a4a',
    '#ef3b2c', '#cb181d', '#a50f15', '#67000d',
]

COLOR_SEQUENCE = COLOR_SEQUENCE #+ px.colors.qualitative.Antique
px.defaults.template = "plotly_white"
px.defaults.color_discrete_sequence = COLOR_SEQUENCE

# Create Plotly theme to set as default
THEME = go.layout.Template()
THEME.layout.treemapcolorway = COLOR_SEQUENCE
THEME.layout.sunburstcolorway = COLOR_SEQUENCE
THEME.layout.colorway = COLOR_SEQUENCE
THEME.layout.piecolorway = COLOR_SEQUENCE
THEME.layout.font = {"family":"Helvetica"}
px.defaults.template = THEME

# Visualize Colormap
fig = px.bar(
    pd.DataFrame({"color":COLOR_SEQUENCE}).assign(value = lambda x : 1,palette = lambda x : "QUOTACLIMAT").append(
    pd.DataFrame({"color":WARMING_STRIPES_SEQUENCE}).assign(value = lambda x : 1,palette = lambda x : "WARMINGSTRIPES")
    ),
    x = "value",
    y = "palette",
    color = "color",
    height = 300,
    
    color_discrete_sequence=COLOR_SEQUENCE + WARMING_STRIPES_SEQUENCE
)
fig.update_layout(xaxis={"showticklabels":False,"title":""},yaxis={"title":""},showlegend=False)
fig

## **3.   Récupération des données**

In [4]:
# Importation des données pollution
data = pd.read_csv('/content/drive/MyDrive/DataForGood/QuotaClimat/Pollution/20221019_20211018_20221018_all_pollution.csv')
data.head()

Unnamed: 0,CHANNEL,RADIO,DATE,TEXT,HIGHLIGHT,START CHUNK,END CHUNK,ORIGIN,URL
0,france2,False,2021-10-22T00-10-00,co a reconnu une pollution ponctuelles et acci...,co a reconnu une pollution ponctuelles et acci...,2021-10-22T00-10-00,2021-10-22T00-12-00,s2t,https://keywords.mediatree.fr/player/?fifo=fra...
1,fbleu-b-normandie,True,2021-10-21T12-30-00,sur facebook va traverser le monde faire des m...,est fou parce que c' est une pollution invisi...,2021-10-21T12-30-00,2021-10-21T12-32-00,s2t,https://keywords.mediatree.fr/player/?fifo=fbl...
2,fbleu-rcfm-edd,True,2021-10-29T08-48-00,puis la pollution la pollution je vois une pai...,puis la pollution la pollution je vois une pai...,2021-10-29T08-48-00,2021-10-29T08-50-00,s2t,https://keywords.mediatree.fr/player/?fifo=fbl...
3,france24,False,2021-10-19T17-04-00,"sont associés à l'incidence, c'est-à-dire au r...",à la pollution atmosphérique? - Rémy Slama: E...,2021-10-19T17-04-00,2021-10-19T17-06-00,dvb,https://keywords.mediatree.fr/player/?fifo=fra...
4,alta-edd,True,2021-10-19T18-40-00,le rural la problématique est la même et nous ...,de tranquillité publique et aussi au niveau s...,2021-10-19T18-40-00,2021-10-19T18-42-00,s2t,https://keywords.mediatree.fr/player/?fifo=alt...


In [5]:
channels = pd.read_excel('/content/drive/MyDrive/DataForGood/QuotaClimat/Pollution/channels.xlsx')
channels.head()

Unnamed: 0,CHANNEL,CHANNEL_NAME
0,tf1,TF1
1,france2,France 2
2,france3,France 3
3,cplus,Canal+
4,france5,France 5


## **4.   Préparation des données**

In [8]:
def process_mediatree_extract(path_file,path_channels):
    data = pd.read_csv(path_file)
    channels = pd.read_excel(path_channels)
    
    data = (
        data
        .merge(channels,on = "CHANNEL")
        .rename(columns = {"START CHUNK" : "START_CHUNK","END CHUNK":"END_CHUNK"})
        .assign(DATE = lambda x : pd.to_datetime(x["DATE"],format = "%Y-%m-%dT%H-%M-%S"))
        .assign(TIME = lambda x : x["DATE"].dt.time)
        .assign(TIME = lambda x : x["TIME"].map(lambda y : datetime.timedelta(hours=y.hour, minutes=y.minute, seconds=y.second)))
        .assign(MEDIA = lambda x : x["RADIO"].map(lambda y : "Radio" if y else "TV"))
        .assign(FILENAME = lambda x : path_file)
        .assign(COUNT = lambda x : 1)
        .assign(DURATION = lambda x : 2)
        .assign(KEYWORD = lambda x : x["FILENAME"].map(lambda y : y.rsplit("_",1)[-1].replace(".xlsx","")))
        .drop(columns = ["ORIGIN","START_CHUNK","END_CHUNK"])
        
    )

    return data
    
data = process_mediatree_extract('/content/drive/MyDrive/DataForGood/QuotaClimat/Pollution/20221019_20211018_20221018_all_pollution.csv',"/content/drive/MyDrive/DataForGood/QuotaClimat/Pollution/channels.xlsx")
data.head()

Unnamed: 0,CHANNEL,RADIO,DATE,TEXT,HIGHLIGHT,URL,CHANNEL_NAME,TIME,MEDIA,FILENAME,COUNT,DURATION,KEYWORD
0,france2,False,2021-10-22 00:10:00,co a reconnu une pollution ponctuelles et acci...,co a reconnu une pollution ponctuelles et acci...,https://keywords.mediatree.fr/player/?fifo=fra...,France 2,0 days 00:10:00,TV,/content/drive/MyDrive/DataForGood/QuotaClimat...,1,2,pollution.csv
1,france2,False,2021-10-22 00:06:00,afrique abrite l' une des biodiversité les plu...,les puits de pétrole perrin co devant son but...,https://keywords.mediatree.fr/player/?fifo=fra...,France 2,0 days 00:06:00,TV,/content/drive/MyDrive/DataForGood/QuotaClimat...,1,2,pollution.csv
2,france2,False,2021-10-22 00:14:00,est-à-dire que la fuite de pipe jusqu' en deux...,mis en examen la multinationale pour des fait...,https://keywords.mediatree.fr/player/?fifo=fra...,France 2,0 days 00:14:00,TV,/content/drive/MyDrive/DataForGood/QuotaClimat...,1,2,pollution.csv
3,france2,False,2021-10-22 00:08:00,une tête de puits de pétrole et des cuves on c...,lorsqu' il y a ce type de pollution ils font ...,https://keywords.mediatree.fr/player/?fifo=fra...,France 2,0 days 00:08:00,TV,/content/drive/MyDrive/DataForGood/QuotaClimat...,1,2,pollution.csv
4,france2,False,2021-10-25 23:44:00,européenne alors au début on n' a pas été très...,des deux mains et donc on dit aux industriels...,https://keywords.mediatree.fr/player/?fifo=fra...,France 2,0 days 23:44:00,TV,/content/drive/MyDrive/DataForGood/QuotaClimat...,1,2,pollution.csv


## **6.   Analyse**

### Analyse sur un extrait keyword Mediatree

In [9]:
top_channels = channels.head(25)["CHANNEL_NAME"].tolist()
top_channels_tv = top_channels[:8]
top_channels

['TF1',
 'France 2',
 'France 3',
 'Canal+',
 'France 5',
 'M6',
 'BFMTV',
 'CNEWS',
 'LCI',
 'Arte',
 'Europe 1',
 'RMC',
 'RTL',
 'France Inter',
 'Euronews',
 'LCP',
 'France Info',
 'France 24',
 'France S',
 'TV5Monde',
 'BFM Business Radio',
 'BFM Business',
 'France Culture',
 'Radio Classique',
 'RFI']

### 6.1 Split des chaînes

In [10]:
count = data.groupby(["CHANNEL_NAME","MEDIA"],as_index = False)["COUNT"].sum().sort_values("COUNT",ascending = False).head(30)

fig = px.bar(
    count,
    x = "CHANNEL_NAME",
    y = "COUNT",
    color = "MEDIA",
    color_discrete_sequence=SMALL_SEQUENCE2,
    text_auto = ".2s",
    category_orders={"CHANNEL_NAME": count["CHANNEL_NAME"].tolist()},
    height = 500,
    title = "Nombre de mentions par chaîne"
)

fig.update_xaxes(tickangle=-45,title=None)
fig.update_yaxes(title=None)
fig.update_layout(margin={"b":100})
fig.show()

In [11]:
count = (data
        .loc[data["CHANNEL_NAME"].isin(top_channels)]
        .groupby(["CHANNEL_NAME","MEDIA"],as_index = False)["COUNT"].sum().sort_values("COUNT",ascending = False)
)

fig = px.bar(
    count,
    x = "CHANNEL_NAME",
    y = "COUNT",
    color = "MEDIA",
    color_discrete_sequence=SMALL_SEQUENCE2,
    text_auto = ".2s",
    category_orders={"CHANNEL_NAME": count["CHANNEL_NAME"].tolist()},
    height = 500,
    title = "Nombre de mentions par chaîne"
)

fig.update_xaxes(tickangle=-45,title=None)
fig.update_yaxes(title=None)
fig.update_layout(margin={"b":100})
fig.show()

In [12]:
count = data.groupby(["MEDIA"],as_index = False)["COUNT"].sum().sort_values("COUNT",ascending = False)
count

fig = px.pie(
    count,
    names = "MEDIA",
    values = "COUNT",
    color_discrete_sequence=SMALL_SEQUENCE2,
    title = "Split TV / Radio"
)
fig.update_traces(textposition='inside', textinfo='percent+label')
fig.show()

### 6.2 Evolution au cours du temps

In [13]:
freq = "D"

count = (
    data.set_index(["DATE"])
    .groupby([pd.Grouper(freq = freq)],as_index = True)
    ["COUNT"].sum()
    .reset_index()
)

fig = px.bar(count,x = "DATE",y = "COUNT",title = "Evolution du nombre de mention au cours du temps",height = 400)
fig.show()

In [14]:
freq = "D"

count = (
    data.set_index(["DATE"])
    .groupby([pd.Grouper(freq = freq),"MEDIA"],as_index = True)
    ["COUNT"].sum()
    .reset_index()
)

fig = px.bar(count,
             x = "DATE",y = "COUNT",color = "MEDIA",
             title = "Evolution du nombre de mention au cours du temps par type de média",
             height = 400,color_discrete_sequence=SMALL_SEQUENCE2
)
fig.show()

In [15]:
freq = "D"

count = (
    data.set_index(["DATE"])
    .groupby([pd.Grouper(freq = freq),"MEDIA"],as_index = True)
    ["COUNT"].sum()
    .reset_index()
)

fig = px.area(count,
             x = "DATE",y = "COUNT",color = "MEDIA",
             title = "Evolution du nombre de mention au cours du temps par type de média",height = 400,color_discrete_sequence=SMALL_SEQUENCE2
)
fig.show()

In [16]:
freq = "D"

count = (
    data.set_index(["DATE"])
    .groupby([pd.Grouper(freq = freq),"MEDIA"],as_index = True)
    ["COUNT"].sum()
    .reset_index()
)

fig = px.area(count,
             x = "DATE",y = "COUNT",color = "MEDIA",groupnorm='fraction',
             title = "Evolution du nombre de mention au cours du temps par type de média en %",height = 400,color_discrete_sequence=SMALL_SEQUENCE2,
)
fig.update_layout(yaxis_tickformat='0%') 
fig.show()

In [17]:
freq = "6H"

count = (
    data.set_index(["DATE"])
    .groupby([pd.Grouper(freq = freq),"MEDIA"],as_index = True)
    ["COUNT"].sum()
    .reset_index()
)

fig = px.area(count,
             x = "DATE",y = "COUNT",color = "MEDIA",
             title = "Evolution du nombre de mention au cours du temps par type de média",height = 400,color_discrete_sequence=SMALL_SEQUENCE2
)
fig.show()

In [18]:
freq = "D"

count = (
    data.set_index(["DATE"])
    .groupby([pd.Grouper(freq = freq),"MEDIA","CHANNEL_NAME"],as_index = True)
    ["COUNT"].sum()
    .reset_index()
)

count = count.loc[count["CHANNEL_NAME"].isin(top_channels_tv)]

fig = px.line(count,
             x = "DATE",y = "COUNT",color = "CHANNEL_NAME",
             title = "Evolution du nombre de mention au cours du temps par chaîne TV",height = 400
)
fig.show()

fig = px.bar(count,
             x = "DATE",y = "COUNT",color = "CHANNEL_NAME",
             title = "Evolution du nombre de mention au cours du temps par chaîne TV",height = 400
)
fig.show()

### 6.3 Heures de la journée

In [19]:
freq = "1H"

count = (
    data
    .set_index(["TIME"])
    .groupby([pd.Grouper(freq = freq)],as_index = True)
    ["COUNT"].sum()
    .reset_index()
    .assign(TIME = lambda x: x["TIME"].map(lambda y : str(y)[7:12]))
)

fig = px.bar(
    count,
    x = "TIME",y = "COUNT",
)
fig.show()

In [20]:
count["TIME"].unique()

array(['00:00', '01:00', '02:00', '03:00', '04:00', '05:00', '06:00',
       '07:00', '08:00', '09:00', '10:00', '11:00', '12:00', '13:00',
       '14:00', '15:00', '16:00', '17:00', '18:00', '19:00', '20:00',
       '21:00', '22:00', '23:00'], dtype=object)

In [21]:
freq = "1H"

count = (
    data
    .set_index(["TIME"])
    .groupby([pd.Grouper(freq = freq),"MEDIA"],as_index = True)
    ["COUNT"].sum()
    .reset_index()
    .assign(TIME = lambda x: x["TIME"].map(lambda y : str(y)[7:12]))
    .sort_values("TIME",ascending = True)
)

fig = px.bar(
    count,
    text_auto = "s",
    x = "TIME",y = "COUNT",color="MEDIA",
    color_discrete_sequence=SMALL_SEQUENCE2,
    height = 400,
    category_orders={"TIME":count["TIME"].unique()},
    title = "Répartition des mentions par heure de la journée"
)
fig.show()


fig = px.area(
    count,
    x = "TIME",y = "COUNT",color = "MEDIA",groupnorm='fraction',
    category_orders={"TIME":count["TIME"].unique()},
    title = "Répartition des mentions par heure de la journée en %",height = 400,color_discrete_sequence=SMALL_SEQUENCE2,
)
fig.update_layout(yaxis_tickformat='0%') 
fig.show()

In [22]:
freq = "1H"

count = (
    data
    .set_index(["TIME"])
    .groupby([pd.Grouper(freq = freq),"CHANNEL_NAME"],as_index = True)
    ["COUNT"].sum()
    .reset_index()
    .assign(TIME = lambda x: x["TIME"].map(lambda y : str(y)[7:12]))
    .sort_values("TIME",ascending = True)
)

count = count.loc[count["CHANNEL_NAME"].isin(top_channels_tv)]

fig = px.bar(
    count,
    x = "TIME",y = "COUNT",color="CHANNEL_NAME",
    height = 400,
    category_orders={"TIME":count["TIME"].unique()},
    title = "Répartition des mentions par heure de la journée"
)
fig.show()


fig = px.area(
    count,
    x = "TIME",y = "COUNT",color = "CHANNEL_NAME",groupnorm='fraction',
    category_orders={"TIME":count["TIME"].unique()},
    title = "Répartition des mentions par heure de la journée en %",height = 400,
)
fig.update_layout(yaxis_tickformat='0%') 
fig.show()

### 6.4 Par chaîne

In [23]:
freq = "3H"

count = (
    data
    .set_index(["TIME"])
    .groupby([pd.Grouper(freq = freq),"CHANNEL_NAME"],as_index = True)
    ["COUNT"].sum()
    .reset_index()
    .assign(TIME = lambda x: x["TIME"].map(lambda y : str(y)[7:12]))
    .sort_values("TIME",ascending = True)
)

count = count.loc[count["CHANNEL_NAME"].isin(top_channels_tv)]

fig = px.bar(
    count,
    x = "TIME",y = "COUNT",color="CHANNEL_NAME",
    height = 400,
    category_orders={"TIME":count["TIME"].unique()},
    title = "Répartition des mentions par heure de la journée"
)
fig.show()


fig = px.area(
    count,
    x = "TIME",y = "COUNT",color = "CHANNEL_NAME",groupnorm='fraction',
    category_orders={"TIME":count["TIME"].unique()},
    title = "Répartition des mentions par heure de la journée en %",height = 400,
)
fig.update_layout(yaxis_tickformat='0%') 
fig.show()

In [24]:
count

Unnamed: 0,TIME,CHANNEL_NAME,COUNT
39,00:00,M6,15
29,00:00,France 5,7
3,00:00,BFMTV,20
5,00:00,CNEWS,51
8,00:00,France 2,23
10,00:00,France 3,2
92,03:00,M6,3
66,03:00,BFMTV,50
67,03:00,CNEWS,12
68,03:00,Canal+,1


In [25]:
freq = "4H"

def parse_period(y,freq):
    hours = int(str(y)[7:9])
    return f"{hours}-{hours+int(freq.replace('H',''))}h"

count = (
    data
    .set_index(["TIME"])
    .groupby([pd.Grouper(freq = freq),"CHANNEL_NAME"],as_index = True)
    ["COUNT"].sum()
    .reset_index()
    .assign(TIME = lambda x: x["TIME"].map(lambda y : parse_period(y,freq)))
    .sort_values("TIME",ascending = True)
)

count = count.loc[count["CHANNEL_NAME"].isin(top_channels)]

fig = px.treemap(
    count,
    path = ["CHANNEL_NAME","TIME"],
    values = "COUNT",
)
fig

### 6.5 Comparer deux fichiers

In [None]:
'''data_climat = process_mediatree_extract('../data/keywords/20221015_lastmonth_all_changement climatique.xlsx',"../data/channels.xlsx")
data_biodiv = process_mediatree_extract('../data/keywords/20221015_lastmonth_all_biodiversité.xlsx',"../data/channels.xlsx")
data_agg = pd.concat([data_climat,data_biodiv],axis = 0,ignore_index = True)'''

In [None]:
'''freq = "D"

count = (
    data_agg.set_index(["DATE"])
    .groupby([pd.Grouper(freq = freq),"KEYWORD"],as_index = True)
    ["COUNT"].sum()
    .reset_index()
)

fig = px.bar(count,
             x = "DATE",y = "COUNT",color = "KEYWORD",
             title = "Evolution du nombre de mention au cours du temps par mot clé",
             height = 400,color_discrete_sequence=SMALL_SEQUENCE2
)
fig.show()'''

In [None]:
'''freq = "D"

count = (
    data_agg.set_index(["DATE"])
    .groupby([pd.Grouper(freq = freq),"KEYWORD"],as_index = True)
    ["COUNT"].sum()
    .reset_index()
)

fig = px.area(count,
             x = "DATE",y = "COUNT",color = "KEYWORD",groupnorm='fraction',
             title = "Evolution du nombre de mention au cours du temps par keyword en %",height = 400,color_discrete_sequence=SMALL_SEQUENCE2,
)
fig.update_layout(yaxis_tickformat='0%') 
fig.show()'''

In [None]:
'''count = (
    data_agg
    .groupby(["CHANNEL_NAME","MEDIA","KEYWORD"],as_index = False)
    ["COUNT"].sum()
)

count = count.loc[count["CHANNEL_NAME"].isin(top_channels_tv)]

fig = px.bar(
    count,
    x = "CHANNEL_NAME",y = "COUNT",color="KEYWORD",
    height = 400,
    title = "Répartition des mentions par mot clé par chaîne TV principale",
    color_discrete_sequence=SMALL_SEQUENCE2,
    text_auto = "s",
)
fig.show()'''

In [None]:
'''freq = "4H"

def parse_period(y,freq):
    hours = int(str(y)[7:9])
    return f"{hours}-{hours+int(freq.replace('H',''))}h"

count = (
    data_agg
    .set_index(["TIME"])
    .groupby([pd.Grouper(freq = freq),"CHANNEL_NAME","KEYWORD"],as_index = True)
    ["COUNT"].sum()
    .reset_index()
    .assign(TIME = lambda x: x["TIME"].map(lambda y : parse_period(y,freq)))
    .sort_values("TIME",ascending = True)
)

count = count.loc[count["CHANNEL_NAME"].isin(top_channels)]

fig = px.treemap(
    count,
    path = ["CHANNEL_NAME","KEYWORD","TIME"],
    values = "COUNT",
    height = 800
)
fig'''