# COP27
> Ce notebook sert de marche à suivre pour analyser les données d'une journée de COP

In [1]:
import pandas as pd
import numpy as np
import os
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import datetime

import sys
sys.path.append("../../")

%load_ext autoreload
%autoreload 2

from quotaclimat.utils.plotly_theme import *

In [2]:
%%html
<!-- Load the Poppins web font used by the charts. It is fetched over HTTPS so that the
     browser does not block it as mixed content when the notebook page is served over HTTPS. -->
<style type="text/css">
@import url('https://fonts.googleapis.com/css?family=Poppins');
</style>

# Récupération et préparation des données

## Récupérer et préparer les données de la journée
Vérifiez que `day` correspond bien à la journée à analyser : le dossier lu est `../../data/cop27/<day>11`.

In [3]:
from quotaclimat.data_processing.read_format_deduplicate import read_and_format_one
from quotaclimat.data_processing.read_format_deduplicate import read_and_format_all_data_dump
from quotaclimat.data_processing.read_format_deduplicate import deduplicate_extracts

In [6]:
# Two-digit day of the extract; the folder that is read is named "<day>11".
day = "08"
folder = f"../../data/cop27/{day}11"

# Full path of every entry found in the day's folder.
paths = []
for entry in os.listdir(folder):
    paths.append(os.path.join(folder, entry))
paths

['../../data/cop27/0811/20220811_20220811_all_ecologie.xlsx',
 '../../data/cop27/0811/20220811_20220811_all_cop27.xlsx']

In [81]:
# Pick the first file whose path contains each topic marker.
cop27_candidates = [candidate for candidate in paths if "_cop27" in candidate]
ecologie_candidates = [candidate for candidate in paths if "ecologie" in candidate]
path_cop27 = cop27_candidates[0]
path_ecologie = ecologie_candidates[0]

In [82]:
# Load the COP27 extract of the selected day (no channel metadata file is passed).
data = read_and_format_one(
    path_channels=None,
    path_file=path_cop27,
)
data.shape

(2675, 14)

In [83]:
# Sanity check: list the distinct keywords present in the extract.
data["keyword"].unique()

array(['cop27'], dtype=object)

## Filtrer sur les 50 chaînes TV et Radio avec le plus d'audience

In [84]:
# Reference list of channels, read from the "top_audiences" sheet.
top_audiences = pd.read_excel("../../data/channels.xlsx", sheet_name="top_audiences")

# "<channel_name>_<media>" key, used later to join the mentions with this table.
top_audiences["channel_id"] = top_audiences["channel_name"] + "_" + top_audiences["media"]

# Channel names per media.
top_channels_tv = top_audiences.loc[top_audiences["media"] == "TV", "channel_name"].tolist()
top_channels_radio = top_audiences.loc[top_audiences["media"] == "Radio", "channel_name"].tolist()

# Channel names per channel type.
top_channels_tv_gen = top_audiences.loc[
    top_audiences["type"] == "Généraliste", "channel_name"
].tolist()
top_channels_tv_info = top_audiences.loc[
    top_audiences["type"] == "Information en continu", "channel_name"
].tolist()

In [85]:
# Build the same "<channel_name>_<media>" key as the one stored in `top_audiences`.
data["channel_id"] = data["channel_name"] + "_" + data["media"]

# Make the cell safe to re-run: if `type` was already merged by a previous execution,
# a second merge would produce type_x / type_y columns and data["type"] below would
# raise a KeyError. Dropping it first is a no-op on the first run.
data = data.drop(columns=["type"], errors="ignore")

# Inner join: only the mentions whose channel is listed in `top_audiences` are kept.
data = data.merge(top_audiences[["channel_id", "type"]], on=["channel_id"], how="inner")

# `media2` splits TV by channel type and keeps Radio as a single group.
data.loc[data["media"] == "TV", "media2"] = "TV" + " - " + data["type"]
data.loc[data["media"] == "Radio", "media2"] = "Radio"

data.shape

(707, 17)

##### Nombre de chaînes TV ou Radio dans l'échantillon
Vérifier combien de chaînes, parmi les 50, parlent du sujet.

In [86]:
# Keep one row per channel name, then count the channels in each media.
one_row_per_channel = data.drop_duplicates(subset=["channel_name"])
one_row_per_channel.groupby(["media"])["channel_name"].count()

media
Radio    21
TV       14
Name: channel_name, dtype: int64

In [87]:
# Same count as above, but with TV split by channel type (`media2`).
one_row_per_channel = data.drop_duplicates(subset=["channel_name"])
one_row_per_channel.groupby(["media2"])["channel_name"].count()

media2
Radio                          21
TV - Généraliste                8
TV - Information en continu     6
Name: channel_name, dtype: int64

## Filtrer dans les horaires d'antenne entre 6h et minuit

In [88]:
from quotaclimat.data_analytics.exploration import filter_data_between_hours

In [89]:
# Bounds of the on-air window passed to the project helper.
start_hour, end_hour = "06:00", "24:00"
data = filter_data_between_hours(data, start_hour, end_hour)

In [90]:
# Rows and columns remaining after the time-of-day filter.
n_rows, n_cols = data.shape
(n_rows, n_cols)

(635, 17)

# Préparation des analyses

In [91]:
from quotaclimat.utils.channels import TOP_25_CHANNELS,TOP_CHANNELS_TV,TOP_CHANNELS_TV_8
from quotaclimat.data_analytics.exploration import show_mentions_by_channel
from quotaclimat.data_analytics.exploration import show_mentions_by_time_of_the_day
from quotaclimat.data_analytics.exploration import show_mentions_over_time
from quotaclimat.data_analytics.exploration import show_mentions_treemap
from quotaclimat.data_analytics.exploration import show_piechart_split_tv_radio

## Analyse 1 - volume médiatique total sur les 50 chaînes

In [92]:
# Share of air time = n_mentions * 2 min / (n_channels * 60 minutes * 18h * n_days)
n_days = 1
n_channels = 25 #TV and Radio

# Number of mentions and number of distinct channels per media.
media_time = data.groupby(["media"]).agg({"count":"sum","channel_name":"nunique"})
media_time["n_channels"] = n_channels

# Add a "Total" row holding the column sums. `DataFrame.append` is deprecated and was
# removed in pandas 2.0, so the row is added with `pd.concat` instead.
total_row = media_time.sum(axis = 0).rename("Total").to_frame().T
media_time = pd.concat([media_time, total_row])

# Each mention is counted as 2 minutes, compared with 18 h of air time per channel and day.
media_time["media_time"] = media_time["count"] * 2
media_time["total_time"] = media_time["n_channels"] * n_days * 18 * 60
media_time["media_part"] = media_time["media_time"] / media_time["total_time"]

media_time


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



Unnamed: 0,count,channel_name,n_channels,media_time,total_time,media_part
Radio,371,21,25,742,27000,0.027481
TV,264,14,25,528,27000,0.019556
Total,635,35,50,1270,54000,0.023519


In [101]:
# Bar chart of the share of air time; the index of `media_time` becomes the "index" column.
chart_data = media_time.reset_index()
fig = px.bar(chart_data, x="index", y="media_part", text_auto=".1%", height=400)
fig.update_layout(
    title="Volume médiatique total sur les 50 chaînes TV et Radio",
    xaxis_title="",
    yaxis_title="% du volume médiatique",
    yaxis_tickformat='0%',
    font_family="Poppins",
)
fig.update_traces(marker_color=["#f49182", '#e6381b'])
fig

## Analyse 1.2 - Répartition par type de chaîne TV

In [103]:
# Share of air time = n_mentions * 2 min / (n_channels * 60 minutes * 18h * n_days)
n_days = 1
n_channels = 25 #TV and Radio

# Number of mentions and number of distinct channels per group (Radio / TV by channel type).
media_time = data.groupby(["media2"]).agg({"count":"sum","channel_name":"nunique"})

# Number of channels in each group, used as the denominator of the share.
media_time.loc["Radio","n_channels"] = 25
media_time.loc["TV - Généraliste","n_channels"] = 19
media_time.loc["TV - Information en continu","n_channels"] = 6

# Add a "Total" row holding the column sums. `DataFrame.append` is deprecated and was
# removed in pandas 2.0, so the row is added with `pd.concat` instead.
total_row = media_time.sum(axis = 0).rename("Total").to_frame().T
media_time = pd.concat([media_time, total_row])

# Each mention is counted as 2 minutes, compared with 18 h of air time per channel and day.
media_time["media_time"] = media_time["count"] * 2
media_time["total_time"] = media_time["n_channels"] * n_days * 18 * 60
media_time["media_part"] = media_time["media_time"] / media_time["total_time"]

media_time


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



Unnamed: 0,count,channel_name,n_channels,media_time,total_time,media_part
Radio,371.0,21.0,25.0,742.0,27000.0,0.027481
TV - Généraliste,47.0,8.0,19.0,94.0,20520.0,0.004581
TV - Information en continu,217.0,6.0,6.0,434.0,6480.0,0.066975
Total,635.0,35.0,50.0,1270.0,54000.0,0.023519


In [104]:
# Bar chart of the share of air time per group; the index of `media_time` becomes "index".
chart_data = media_time.reset_index()
fig = px.bar(chart_data, x="index", y="media_part", text_auto=".1%", height=400)
fig.update_layout(
    title="Volume médiatique total sur les 50 chaînes TV et Radio",
    xaxis_title="",
    yaxis_title="% du volume médiatique",
    yaxis_tickformat='0%',
    font_family="Poppins",
)
fig.update_traces(marker_color=["#f49182", '#e6381b', "#e6381b"])
fig

## Analyse 2 - TOP 3 TV et Radio

In [105]:
# Share for one channel = n_mentions * 2 min / (60 minutes * 18h * n_days)
minutes_per_mention = 2
minutes_on_air = 1 * 60 * 18 * 1
multiplier = minutes_per_mention / minutes_on_air

# Top 3 of the continuous-news TV channels.
fig = show_mentions_by_channel(
    data,
    list_of_channels=top_channels_tv_info,
    n=3,
    method=multiplier,
    height=400,
    text_auto=".1%",
)
fig.update_layout(
    title="Podium TV - Chaînes d'information en continu",
    xaxis_title="",
    yaxis_title="% du volume médiatique",
    yaxis_tickformat='0%',
    font_family="Poppins",
)
fig.update_traces(marker_color='#e6381b')
fig

In [106]:
# Share for one channel = n_mentions * 2 min / (60 minutes * 18h * n_days)
minutes_per_mention = 2
minutes_on_air = 1 * 60 * 18 * 1
multiplier = minutes_per_mention / minutes_on_air

# Top 3 of the general-interest TV channels.
fig = show_mentions_by_channel(
    data,
    list_of_channels=top_channels_tv_gen,
    n=3,
    method=multiplier,
    height=400,
    text_auto=".1%",
)
fig.update_layout(
    title="Podium TV -  Chaînes généralistes",
    xaxis_title="",
    yaxis_title="% du volume médiatique",
    yaxis_tickformat='0%',
    font_family="Poppins",
)
fig.update_traces(marker_color='#e6381b')
fig

In [107]:
# Share for one channel = n_mentions * 2 min / (60 minutes * 18h * n_days)
minutes_per_mention = 2
minutes_on_air = 1 * 60 * 18 * 1
multiplier = minutes_per_mention / minutes_on_air

# Top 3 of the radio channels.
fig = show_mentions_by_channel(
    data,
    list_of_channels=top_channels_radio,
    n=3,
    method=multiplier,
    height=400,
    text_auto=".1%",
)
fig.update_layout(
    title="Podium Radio",
    xaxis_title="",
    yaxis_title="% du volume médiatique",
    yaxis_tickformat='0%',
    font_family="Poppins",
)
fig.update_traces(marker_color='#f49182')
fig

## Analyse 3 - Classement complet TV et Radio

In [108]:
# Sanity check: list the distinct keywords still present before ranking the channels.
data["keyword"].unique()

array(['cop27'], dtype=object)

In [109]:
# Share for one channel = n_mentions * 2 min / (60 minutes * 18h * n_days)
multiplier = 2 / (1 * 60 * 18 * 1)

# One chart per group of channels: (channels to rank, chart title, bar colour).
ranking_charts = [
    (top_channels_tv_info, "Classement TV - Chaînes d'information en continu", '#e6381b'),
    (top_channels_tv_gen, "Classement TV -  Chaînes généralistes", '#e6381b'),
    (top_channels_radio, "Classement Radio", '#f49182'),
]

for channels, chart_title, bar_color in ranking_charts:
    fig = show_mentions_by_channel(
        data,
        list_of_channels=channels,
        n=25,
        split="keyword",
        method=multiplier,
        height=400,
        text_auto=".1%",
    )
    fig.update_layout(
        title=chart_title,
        legend_title="",
        yaxis_title="% du volume médiatique",
        yaxis_tickformat='0%',
        font_family="Poppins",
    )
    fig.update_traces(marker_color=bar_color)
    fig.show()

# Autres sujets écologiques

In [110]:
# Load every extract found in the day's folder (no channel metadata file is passed).
data_total = read_and_format_all_data_dump(
    path_channel_metadata=None,
    path_folder="../../data/cop27/0811/",
)

# Keep only the channels listed in `top_audiences` and bring in their `type`.
data_total["channel_id"] = data_total["channel_name"] + "_" + data_total["media"]
audience_types = top_audiences[["channel_id", "type"]]
data_total = data_total.merge(audience_types, how="inner", on=["channel_id"])

# `media2` splits TV by channel type and keeps Radio as a single group.
is_tv = data_total["media"] == "TV"
is_radio = data_total["media"] == "Radio"
data_total.loc[is_tv, "media2"] = "TV" + " - " + data_total["type"]
data_total.loc[is_radio, "media2"] = "Radio"

data_total = filter_data_between_hours(data_total, "06:00", "24:00")

# Rows whose `keywords` value contains "cop27" are labelled with the "cop27" keyword.
mentions_cop27 = data_total["keywords"].map(lambda keywords: "cop27" in keywords)
data_total.loc[mentions_cop27, "keyword"] = "cop27"

data_total.shape

(1432, 18)

In [111]:
# Share of air time = n_mentions * 2 min / (n_channels * 60 minutes * 18h * n_days)
n_days = 1

# Number of mentions and number of distinct channels per media, all topics included.
media_time = data_total.groupby(["media"]).agg({"count":"sum","channel_name":"nunique"})
media_time["n_channels"] = 25

# Add a "Total" row holding the column sums. `DataFrame.append` is deprecated and was
# removed in pandas 2.0, so the row is added with `pd.concat` instead.
total_row = media_time.sum(axis = 0).rename("Total").to_frame().T
media_time = pd.concat([media_time, total_row])

# Each mention is counted as 2 minutes, compared with 18 h of air time per channel and day.
media_time["media_time"] = media_time["count"] * 2
media_time["total_time"] = media_time["n_channels"] * n_days * 18 * 60
media_time["media_part"] = media_time["media_time"] / media_time["total_time"]

# The index of `media_time` becomes the "index" column used on the x axis.
fig = px.bar(media_time.reset_index(),x = "index",y = "media_part",height = 400,text_auto = ".1%")
fig.update_layout(yaxis_tickformat='0%',
                  title = "Volume médiatique total sur les 50 chaînes TV et Radio",
                  font_family="Poppins",yaxis_title="% du volume médiatique",xaxis_title = "")
fig.update_traces(marker_color=["#f49182",'#e6381b'])
fig.show()


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



In [112]:
# Share of air time = n_mentions * 2 min / (n_channels * 60 minutes * 18h * n_days)
n_days = 1

# Number of mentions and number of distinct channels per group, all topics included.
media_time = data_total.groupby(["media2"]).agg({"count":"sum","channel_name":"nunique"})

# Number of channels in each group, used as the denominator of the share.
media_time.loc["Radio","n_channels"] = 25
media_time.loc["TV - Généraliste","n_channels"] = 19
media_time.loc["TV - Information en continu","n_channels"] = 6

# Add a "Total" row holding the column sums. `DataFrame.append` is deprecated and was
# removed in pandas 2.0, so the row is added with `pd.concat` instead.
total_row = media_time.sum(axis = 0).rename("Total").to_frame().T
media_time = pd.concat([media_time, total_row])

# Each mention is counted as 2 minutes, compared with 18 h of air time per channel and day.
media_time["media_time"] = media_time["count"] * 2
media_time["total_time"] = media_time["n_channels"] * n_days * 18 * 60
media_time["media_part"] = media_time["media_time"] / media_time["total_time"]

# The index of `media_time` becomes the "index" column used on the x axis.
fig = px.bar(media_time.reset_index(),x = "index",y = "media_part",height = 400,text_auto = ".1%")
fig.update_layout(yaxis_tickformat='0%',
                  title = "Volume médiatique total sur les 50 chaînes TV et Radio",
                  font_family="Poppins",yaxis_title="% du volume médiatique",xaxis_title = "")
fig.update_traces(marker_color=["#f49182",'#e6381b','#e6381b'])
fig.show()


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



## Analyse 3 - Classement complet TV et Radio (COP27 et écologie)

In [113]:
# Bar colour of each keyword in the stacked rankings.
DISCRETE_MAP = {
    "cop27":"#cb181d",
    "écologie":"#2171b5",
}

# Share for one channel = n_mentions * 2 min / (60 minutes * 18h * n_days)
multiplier = 2 / (1 * 60 * 18 * 1)

# One chart per group of channels: (channels to rank, chart title).
keyword_ranking_charts = [
    (top_channels_tv_info, "Classement TV - Chaînes d'information en continu"),
    (top_channels_tv_gen, "Classement TV - Chaînes généralistes"),
    (top_channels_radio, "Classement Radio"),
]

for channels, chart_title in keyword_ranking_charts:
    fig = show_mentions_by_channel(
        data_total,
        list_of_channels=channels,
        n=25,
        split="keyword",
        method=multiplier,
        height=400,
        text_auto=".1%",
        color_discrete_map=DISCRETE_MAP,
    )
    fig.update_layout(
        title=chart_title,
        legend_title="",
        yaxis_title="% du volume médiatique",
        yaxis_tickformat='0%',
        font_family="Poppins",
    )
    fig.show()

# Analyse des sujets mentionnés

In [114]:
from quotaclimat.data_processing.keyword_tool import KeywordsTool

# Keyword matcher, configured as case-insensitive and lowercasing.
kwt = KeywordsTool(lowercase=True, case_sensitive=False)

# Keywords come from the "COP27" Airtable table: `name` holds the keyword and the two
# "alternatives" columns hold its variants.
kwt.load_from_airtable(
    airtable_table_name="COP27",
    keyword_col="name",
    variants_col=["alternatives_mediatree", "alternatives"],
)

ModuleNotFoundError: No module named 'flashtext'

In [None]:
# Count keyword occurrences over the mention texts (melted output), then drop the "Pays" keyword.
keyword_counts_raw = kwt.count_keywords_on_corpus(data["text"], as_melted=True)
counts_total = keyword_counts_raw[keyword_counts_raw["name"] != "Pays"]

NameError: name 'kwt' is not defined

In [115]:
# Total count per (keyword, category), drawn as a treemap nested category > keyword.
counts_by_keyword = counts_total.groupby(["name", "category"])["count"].sum().reset_index()
px.treemap(
    counts_by_keyword,
    values="count",
    path=["category", "name"],
    color_discrete_sequence=WARMING_STRIPES_SEQUENCE,
)

NameError: name 'counts_total' is not defined

In [116]:
# `text_id` of the texts counted for "Fonds marins"; they are looked up as row labels of
# `data` once its index has been reset to 0..n-1.
seabed_text_ids = counts_total.query("name=='Fonds marins'")["text_id"].tolist()
data_dsm = data.reset_index(drop=True).loc[seabed_text_ids]

fig = show_mentions_by_channel(
    data_dsm,
    n=25,
    method="minutes",
    height=400,
    text_auto=".1s",
)
# NOTE(review): the y axis is labelled as a percentage while method="minutes" is requested
# and no percent tick format is set - confirm whether the label should read minutes.
fig.update_layout(
    xaxis_title="",
    yaxis_title="% du volume médiatique",
    font_family="Poppins",
)
fig.update_traces(marker_color='#e6381b')
fig

NameError: name 'counts_total' is not defined

# Focus 3 jours + classement

In [58]:
def _load_top_channels_day(day_folder):
    """Load one day's dump and prepare it the same way as the single-day analysis.

    Args:
        day_folder: Name of the day's sub-folder under ``../../data/cop27/`` (e.g. ``"0511"``).

    Returns:
        The day's mentions restricted to the channels listed in ``top_audiences`` and to the
        06:00-24:00 window, with ``media2``, a lowercased ``keyword`` and a ``day_extract``
        column holding ``day_folder``.
    """
    day_data = read_and_format_all_data_dump(path_folder = f"../../data/cop27/{day_folder}/",path_channel_metadata=None)

    # Keep only the channels listed in `top_audiences` and bring in their `type`.
    day_data["channel_id"] = day_data["channel_name"] + "_" + day_data["media"]
    day_data = day_data.merge(top_audiences[["channel_id","type"]],on = ["channel_id"],how = "inner")

    # `media2` splits TV by channel type and keeps Radio as a single group.
    day_data.loc[day_data["media"]=="TV","media2"] = "TV" + " - " + day_data["type"]
    day_data.loc[day_data["media"]=="Radio","media2"] = "Radio"

    day_data = filter_data_between_hours(day_data,"06:00","24:00").reset_index(drop = True)

    # Rows whose `keywords` value contains "cop27" are labelled with the "cop27" keyword.
    day_data.loc[day_data["keywords"].map(lambda x : "cop27" in x),"keyword"] = "cop27"
    day_data["keyword"] = day_data["keyword"].str.lower()

    day_data["day_extract"] = day_folder
    return day_data


# The days are loaded through a helper so that the notebook-level `folder`, `day` and
# `data_total` set by the single-day cells are not overwritten as a side effect of this cell.
daily_frames = [_load_top_channels_day(f"{day_number}11") for day_number in ["05", "06", "07"]]
data_all_days = pd.concat(daily_frames,axis = 0,ignore_index = True)

FileNotFoundError: [Errno 2] No such file or directory: '../../data/cop27/0511/'

In [338]:
# Calendar day of each mention, as a date object and as its string form.
mention_days = data_all_days["date"].dt.date
data_all_days["day_dt"] = mention_days
data_all_days["day_str"] = mention_days.map(str)

In [437]:
method = "first" #average or first

# Total number of mentions per channel, keyword and day.
group_keys = ["channel_name", "media", "media2", "keyword", "day_dt"]
daily_counts = data_all_days.groupby(group_keys)["count"].sum()

# Pivot on the day, then on the keyword, filling the gaps with 0 before stacking back, so
# that combinations without any mention appear as explicit rows with a count of 0.
filled_by_day = daily_counts.unstack("day_dt").fillna(0.0).stack()
filled_by_keyword = filled_by_day.unstack("keyword").fillna(0.0).stack()
ranking = filled_by_keyword.reset_index(drop=False).rename(columns={0: "count"})

# Each mention is counted as 2 minutes, compared with 18 h of air time for one channel.
ranking["minutes"] = ranking["count"] * 2
ranking["total_time"] = 18 * 60 * 1
ranking["media_part"] = ranking["minutes"] / ranking["total_time"]

rank_options = dict(ascending=False, method=method)
per_group_day_keyword = ["media2", "day_dt", "keyword"]
per_channel_day = ["channel_name", "media2", "day_dt"]

# Rank on the keyword's own count, then on the channel's count summed over all keywords.
ranking["rank"] = ranking.groupby(per_group_day_keyword)["count"].transform("rank", **rank_options)
ranking["count_total"] = ranking.groupby(per_channel_day)["count"].transform("sum")
ranking["rank_total"] = ranking.groupby(per_group_day_keyword)["count_total"].transform("rank", **rank_options)
ranking["media_part_total"] = ranking["count_total"] * 2 / ranking["total_time"]
ranking["day_str"] = ranking["day_dt"].map(str)

ranking.query("media2=='TV - Généraliste' and keyword=='cop27'").head()



Unnamed: 0,channel_name,media,media2,day_dt,keyword,count,minutes,total_time,media_part,rank,count_total,rank_total,media_part_total,day_str
12,C8,TV,TV - Généraliste,2022-11-05,cop27,2.0,4.0,1080,0.003704,3.0,9.0,2.0,0.016667,2022-11-05
14,C8,TV,TV - Généraliste,2022-11-06,cop27,1.0,2.0,1080,0.001852,5.0,2.0,5.0,0.003704,2022-11-06
16,C8,TV,TV - Généraliste,2022-11-07,cop27,2.0,4.0,1080,0.003704,6.0,6.0,7.0,0.011111,2022-11-07
24,Canal+,TV,TV - Généraliste,2022-11-05,cop27,0.0,0.0,1080,0.0,6.0,1.0,6.0,0.001852,2022-11-05
26,Canal+,TV,TV - Généraliste,2022-11-06,cop27,0.0,0.0,1080,0.0,6.0,1.0,6.0,0.001852,2022-11-06


In [441]:
# COP27 rows of the general-interest TV channels.
ranking_data_chart = ranking[
    (ranking["media2"] == "TV - Généraliste") & (ranking["keyword"] == "cop27")
]

def show_ranking_chart(ranking_data,title = "",height = 500,total = False):
    """Draw the evolution of the channels' ranks over the days as a line chart.

    Args:
        ranking_data: DataFrame with the columns ``day_dt`` and ``channel_name`` plus,
            depending on ``total``, ``rank``/``media_part`` or
            ``rank_total``/``media_part_total``.
        title: Title of the figure.
        height: Height given to the figure layout.
        total: If falsy, plot ``rank`` and label with ``media_part``; otherwise plot
            ``rank_total`` and label with ``media_part_total``.

    Returns:
        The plotly figure, with one line per channel and one label per channel placed
        at its rank on the latest day.
    """
    if total:
        rank_col, percent_col = "rank_total", "media_part_total"
    else:
        rank_col, percent_col = "rank", "media_part"

    # One line per channel; the rank is also written inside each marker.
    fig = px.line(
        ranking_data,
        x="day_dt",
        y=rank_col,
        color="channel_name",
        text=rank_col,
        markers=True,
    )

    # Reversed y axis so that rank 1 sits at the top; grid and y tick labels are hidden.
    fig.update_layout(
        xaxis=dict(tickmode="linear", showgrid=False, title="Date de la COP27"),
        yaxis=dict(autorange="reversed", showgrid=False, title=None, showticklabels=False),
    )
    fig.update_traces(
        marker=dict(size=20, line=dict(width=2)),
        textposition="middle center",
        textfont=dict(size=12, color="white"),
    )

    # Rows of the latest day, ordered by rank: one label per channel.
    latest_day = ranking_data["day_dt"].max()
    last_day_rows = ranking_data.loc[ranking_data["day_dt"] == latest_day]
    last_day_rows = last_day_rows.sort_values(rank_col, ascending=True)

    # Labels are anchored in paper coordinates on the x axis and at the rank on the y axis.
    annotations = [
        {
            "xref": 'paper',
            "x": 0.95,
            "y": row[rank_col],
            "xanchor": 'left',
            "yanchor": 'middle',
            "text": row["channel_name"] + " (" + f'{row[percent_col]:.1%}' + ")",
            "font": {"family": 'Poppins', "size": 12},
            "showarrow": False,
        }
        for _, row in last_day_rows.iterrows()
    ]

    # The right margin leaves room for the labels, which replace the legend.
    fig.update_layout(
        title=title,
        height=height,
        margin_r=200,
        showlegend=False,
        annotations=annotations,
        font_family="Poppins",
    )
    return fig





In [442]:
# COP27 rows of the general-interest TV channels, shared by both charts below.
tv_gen_cop27 = ranking.query("media2=='TV - Généraliste' and keyword=='cop27'")

# Ranking on the COP27 mentions alone.
fig_cop27 = show_ranking_chart(
    tv_gen_cop27,
    title="Evolution du classement des chaînes TV généralistes sur la COP27",
    height=600,
)
fig_cop27.show()

# Ranking on the total columns (`rank_total` / `media_part_total`).
fig_total = show_ranking_chart(
    tv_gen_cop27,
    title="Evolution du classement des chaînes TV généralistes sur la COP27 et l'écologie",
    height=600,
    total=True,
)
fig_total.show()

In [430]:
# Ranking of the continuous-news TV channels on the COP27 mentions.
tv_info_cop27 = ranking.query("media2=='TV - Information en continu' and keyword=='cop27'")
show_ranking_chart(
    tv_info_cop27,
    title="Evolution du classement des chaînes TV d'info en continu sur la COP27",
    height=400,
)

In [428]:
# Ranking of the radio channels on the COP27 mentions.
radio_cop27 = ranking.query("media2=='Radio' and keyword=='cop27'")
show_ranking_chart(
    radio_cop27,
    title="Evolution du classement des chaînes Radio sur la COP27",
    height=700,
)