In [1]:
%load_ext autoreload
%autoreload 2

# COP27`

In [2]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import datetime

import sys
sys.path.append("../")

%load_ext autoreload
%autoreload 2

from quotaclimat.utils.plotly_theme import *


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# 1. Update les data de la journée dans ../data/keywords/cop27/
Vous pouvez restrindre l'analyse à une journée en placant seulement data de la journée dan le ficher ou en changant DATA_PATH en dessous

# 2. Récupération des données

In [5]:
DATA_PATH = "../data/keywords/cop27/"
os.listdir(DATA_PATH)


['20221105_20221029_gregoirefournas.xlsx',
 '20221105_20221029_cop27.xlsx',
 '20221105_20221104_routeduRhum.xlsx',
 '20221104_20221102_routeduRhum.xlsx',
 '20221101_20221029_routeduRhum.xlsx',
 '20221105_20221029_elonmusk.xlsx']

In [6]:
from quotaclimat.data_processing.read_format_deduplicate import read_and_format_one
from quotaclimat.data_processing.read_format_deduplicate import read_and_format_all_data_dump
from quotaclimat.data_processing.read_format_deduplicate import deduplicate_extracts
from quotaclimat.data_analytics.exploration import filter_data_between_hours
from quotaclimat.utils.channels import TOP_25_CHANNELS,TOP_CHANNELS_TV,TOP_CHANNELS_TV_8
from quotaclimat.data_analytics.exploration import show_mentions_by_channel
from quotaclimat.data_analytics.exploration import show_mentions_by_time_of_the_day
from quotaclimat.data_analytics.exploration import show_mentions_over_time
from quotaclimat.data_analytics.exploration import show_mentions_treemap
from quotaclimat.data_analytics.exploration import show_piechart_split_tv_radio

In [7]:
data = read_and_format_all_data_dump(path_folder=DATA_PATH, path_channel_metadata=None)
data.shape

(10788, 15)

## Processing

In [8]:
top_audiences = pd.read_excel("../data/channels.xlsx",sheet_name = "top_audiences")
top_audiences["channel_id"] = top_audiences["channel_name"] + "_" + top_audiences["media"]
top_channels_tv = top_audiences.query("media=='TV'")["channel_name"].tolist()
top_channels_radio = top_audiences.query("media=='Radio'")["channel_name"].tolist()

In [59]:
# Boolean for top audiance TV and Radios
data['is_top_audiance'] = False
data.loc[data.channel_name.isin(top_channels_tv + top_channels_radio), 'is_top_audiance'] = True

data_at_pick_hours_6_00 = filter_data_between_hours(data, min_hour="06:00", max_hour="24:00")
data_at_pick_hours_6_10 = filter_data_between_hours(data, min_hour="06:00", max_hour="10:00")
data_at_pick_hours_19_21 = filter_data_between_hours(data, min_hour="19:00", max_hour="21:00")


# 3. Analyse mot clef COP 27

In [11]:
data_cop27 = data_at_pick_hours_6_00[data_at_pick_hours_6_00.keyword == 'cop27']

## Courverture total

In [12]:
# Multiplier = n_mentions * 2 min / (n_channels * 60 minutes * 18h * n_days)
n_days = (data_cop27.date.max() - data_cop27.date.min()).days
media_time = data_cop27.groupby(["media"]).agg({"count":"sum","channel_name":"nunique"})
media_time = media_time.append(pd.DataFrame(media_time.sum(axis = 0).rename("Total")).T)
media_time["media_time"] = media_time["count"] * 2
media_time["total_time"] = media_time["channel_name"] * n_days * 18 * 60
media_time["media_part"] = media_time["media_time"] / media_time["total_time"]

media_time

  media_time = media_time.append(pd.DataFrame(media_time.sum(axis = 0).rename("Total")).T)


Unnamed: 0,count,channel_name,media_time,total_time,media_part
Radio,568,132,1136,855360,0.001328
TV,471,25,942,162000,0.005815
Total,1039,157,2078,1017360,0.002043


##  TOP 3 TV // radio

In [14]:
# Multiplier for one channel = n_mentions * 2 min / (60 minutes * 18h * n_days)
multiplier = 2 / (1 * 60 * 18 * n_days)

fig = show_mentions_by_channel(data_cop27,list_of_channels=top_channels_tv,n = 3,
                               title = "TOP 3 TV COP27",
                               method = multiplier,height = 400,text_auto = ".1%")
fig.update_layout(yaxis_tickformat='0%')

In [15]:
# Multiplier for one channel = n_mentions * 2 min / (60 minutes * 18h * n_days)
multiplier = 2 / (1 * 60 * 18 * n_days)

fig = show_mentions_by_channel(data_cop27,list_of_channels=top_channels_radio,n = 3,
                               title = "TOP 3 Radio COP27",
                               method = multiplier,height = 400,text_auto = ".1%")
fig.update_layout(yaxis_tickformat='0%')

## Classement complet radio et tv

In [16]:
# Multiplier for one channel = n_mentions * 2 min / (60 minutes * 18h * n_days)
multiplier = 2 / (1 * 60 * 18 * n_days)

fig = show_mentions_by_channel(data_cop27,list_of_channels=top_channels_tv,n = 25,
                               title = "Classement TV COP26",
                               method = multiplier,height = 400,text_auto = ".1%")
fig.update_layout(yaxis_tickformat='0%').show()


# Multiplier for one channel = n_mentions * 2 min / (60 minutes * 18h * n_days)
multiplier = 2 / (1 * 60 * 18 * n_days)

fig = show_mentions_by_channel(data_cop27,list_of_channels=top_channels_radio,n = 25,
                               title = "Classement Radio COP26",
                               method = multiplier,height = 400,text_auto = ".1%")
fig.update_layout(yaxis_tickformat='0%').show()

In [18]:
## Multiplier = n_mentions * 2 min / (n_channels * 60 minutes * 20h)
multiplier = 2 / (data["channel_name"].nunique() * 60 * 18)

fig = show_mentions_over_time(data_cop27,freq = "D",method = multiplier,height = 500,text_auto = ".1%")
fig.update_layout(yaxis_tickformat='0%')

# 4. COP 27 et autres actualités

In [20]:
data_elonmusk = data[data.keyword == 'elonmusk']

In [65]:
data_at_pick_hours_19_21.date.dt.hour

12       19
39       19
54       19
72       19
73       20
         ..
10742    19
10756    19
10757    20
10758    20
10779    19
Name: date, Length: 902, dtype: int64

2

In [76]:
def get_percentage_per_day(df):
    minutes_per_sample = 2
    minutes_covered_per_day = ((max(df.date.dt.hour) - min(df.date.dt.hour)) + 1) * 60 * df.media.nunique()
    nb_days = (df.date.max() - df.date.min()).days

    df_percentage = df.set_index('date').groupby(pd.Grouper(freq='d')).count()['index'] * minutes_per_sample / (minutes_covered_per_day * nb_days) * 100
    df_percentage.name = 'percentage'
    return df_percentage



In [77]:
fig = go.Figure()
data_percentage_elon = get_percentage_per_day(data[data.keyword == 'elonmusk'])
data_percentage_cop27 = get_percentage_per_day(data[data.keyword == 'cop27'])
data_percentage_routeduRhum = get_percentage_per_day(data[data.keyword == 'routeduRhum'])
data_percentage_gregoirefournas = get_percentage_per_day(data[data.keyword == 'gregoirefournas'])
fig.add_trace(go.Scatter(y=data_percentage_elon, x=data_percentage_elon.index, name='Elon Musk'))
fig.add_trace(go.Scatter(y=data_percentage_cop27, x=data_percentage_cop27.index, name='COP 27'))
fig.add_trace(go.Scatter(y=data_percentage_gregoirefournas, x=data_percentage_gregoirefournas.index, name='Gregoire Fournas'))
fig.add_trace(go.Scatter(y=data_percentage_routeduRhum, x=data_percentage_routeduRhum.index, name='route du Rhum'))
fig.update_layout(title='Couverture en % des sujet d actualité TV + Radio de 6h à 00h')
fig.show()

In [78]:
fig = go.Figure()
data_percentage_elon = get_percentage_per_day(data[(data.keyword == 'elonmusk')&data.radio])
data_percentage_cop27 = get_percentage_per_day(data[(data.keyword == 'cop27')&data.radio])
data_percentage_routeduRhum = get_percentage_per_day(data[(data.keyword == 'routeduRhum')&data.radio])
data_percentage_gregoirefournas = get_percentage_per_day(data[(data.keyword == 'gregoirefournas')&data.radio])
fig.add_trace(go.Scatter(y=data_percentage_elon, x=data_percentage_elon.index, name='Elon Musk'))
fig.add_trace(go.Scatter(y=data_percentage_cop27, x=data_percentage_cop27.index, name='COP 27'))
fig.add_trace(go.Scatter(y=data_percentage_gregoirefournas, x=data_percentage_gregoirefournas.index, name='Gregoire Fournas'))
fig.add_trace(go.Scatter(y=data_percentage_routeduRhum, x=data_percentage_routeduRhum.index, name='route du Rhum'))
fig.update_layout(title='Couverture en % des sujet d actualité Radio de 6h à 00h')
fig.show()

In [79]:
data_to_use = data_at_pick_hours_19_21.copy()

fig = go.Figure()
data_percentage_elon = get_percentage_per_day(data_to_use[(data_to_use.keyword == 'elonmusk')& ~data_to_use.radio])
data_percentage_cop27 = get_percentage_per_day(data_to_use[(data_to_use.keyword == 'cop27')& ~data_to_use.radio])
data_percentage_routeduRhum = get_percentage_per_day(data_to_use[(data_to_use.keyword == 'routeduRhum')& ~data_to_use.radio])
data_percentage_gregoirefournas = get_percentage_per_day(data_to_use[(data_to_use.keyword == 'gregoirefournas')& ~data_to_use.radio])
fig.add_trace(go.Scatter(y=data_percentage_elon, x=data_percentage_elon.index, name='Elon Musk'))
fig.add_trace(go.Scatter(y=data_percentage_cop27, x=data_percentage_cop27.index, name='COP 27'))
fig.add_trace(go.Scatter(y=data_percentage_gregoirefournas, x=data_percentage_gregoirefournas.index, name='Gregoire Fournas'))
fig.add_trace(go.Scatter(y=data_percentage_routeduRhum, x=data_percentage_routeduRhum.index, name='route du Rhum'))
fig.update_layout(title='Couverture en % des sujet d actualité TV de 6h à 00h')
fig.show()

## Quelle media parle de quoi?

In [19]:
minutes_per_sample = 2
minutes_covered_radio_per_day = (10 - 6) * 60
nb_days = (data_cop27.date.max() - data_cop27.date.min()).days

def get_ration(df, minutes_per_sample = 2, minutes_covered_radio_per_day = (10 - 6) * 60):
    nb_days = (df.date.max() - df.date.min()).days
    ratio = round(df[(df.radio)&df.is_top_audiance& data.is_heure_grande_ecoute].groupby('channel_name').count()['count'] *minutes_per_sample / (minutes_covered_radio_per_day * nb_days) * 100, 1)
    ratio.name = 'percentage'
    return ratio
cop27_ratio = get_ration(data[data.keyword == 'cop27'])
gregoirefournas_ratio = get_ration(data[data.keyword == 'gregoirefournas'])
elonmusk = get_ration(data[data.keyword == 'elonmusk'])
routeduRhum_ratio = get_ration(data[data.keyword == 'routeduRhum'])

AttributeError: 'DataFrame' object has no attribute 'is_heure_grande_ecoute'

In [114]:
minutes_per_sample = 2
minutes_covered_radio_per_day = (10 - 6) * 60
nb_days = (data_cop27.date.max() - data_cop27.date.min()).days

def get_ration(df, minutes_per_sample = 2, minutes_covered_radio_per_day = (10 - 6) * 60):
    nb_days = (df.date.max() - df.date.min()).days
    ratio = round(df[(df.radio)&df.is_top_audiance& data.is_heure_grande_ecoute].groupby('channel_name').count()['count'] *minutes_per_sample / (minutes_covered_radio_per_day * nb_days) * 100, 1)
    ratio.name = 'percentage'
    return ratio
cop27_ratio = get_ration(data[data.keyword == 'cop27'])
gregoirefournas_ratio = get_ration(data[data.keyword == 'gregoirefournas'])
elonmusk = get_ration(data[data.keyword == 'elonmusk'])
routeduRhum_ratio = get_ration(data[data.keyword == 'routeduRhum'])


Boolean Series key will be reindexed to match DataFrame index.


Boolean Series key will be reindexed to match DataFrame index.


Boolean Series key will be reindexed to match DataFrame index.


Boolean Series key will be reindexed to match DataFrame index.



In [125]:
fig = go.Figure()
fig.add_trace(go.Bar(
    y = cop27_ratio,
    x = cop27_ratio.index,
    text=cop27_ratio,
    textposition='auto',
    name='cop27'
))
fig.add_trace(go.Bar(
    y = gregoirefournas_ratio,
    x = gregoirefournas_ratio.index,
    name='Gregoire Fournas',
    textposition='auto',
    text=gregoirefournas_ratio,
))
fig.add_trace(go.Bar(
    y = elonmusk,
    x = elonmusk.index,
    textposition='auto',
    text=elonmusk,
    name='Elon Musk'
))
fig.add_trace(go.Bar(
    y = routeduRhum_ratio,
    x = routeduRhum_ratio.index,
    textposition='auto',
    text=routeduRhum_ratio,
    name='route du Rhum'
))
fig.update_layout(title='% couverture radio entre 6h et 10h semaine du 31 Octobre')
fig.update_yaxes(title='% de couverture')
fig.show()

In [126]:
## TV

In [None]:
minutes_per_sample = 2
minutes_covered_radio_per_day = (21 - 19) * 60
nb_days = (data_cop27.date.max() - data_cop27.date.min()).days

def get_ratio_tv(df, minutes_per_sample = 2, minutes_covered_radio_per_day =minutes_covered_radio_per_day):
    nb_days = (df.date.max() - df.date.min()).days
    ratio = round(df[(~df.radio)&df.is_top_audiance& data.is_heure_grande_ecoute].groupby('channel_name').count()['count'] *minutes_per_sample / (minutes_covered_radio_per_day * nb_days) * 100, 1)
    ratio.name = 'percentage'
    return ratio
cop27_ratio = get_ratio_tv(data[data.keyword == 'cop27'])
gregoirefournas_ratio = get_ratio_tv(data[data.keyword == 'gregoirefournas'])
elonmusk = get_ratio_tv(data[data.keyword == 'elonmusk'])
routeduRhum_ratio = get_ratio_tv(data[data.keyword == 'routeduRhum'])

In [81]:
show_mentions_by_channel(data_cop27[(data_cop27.radio)&data_cop27.is_top_audiance& data.is_heure_grande_ecoute], n=3, method="count", title='Nombre de mention par radio top 3 semaine pre COP27 ')




Boolean Series key will be reindexed to match DataFrame index.



In [12]:
show_mentions_by_channel(data_cop27[~data_cop27.radio], n=10,method="minutes", title='Nombre de mention par TV top 10 semaine pre COP27 ')

## En heure de haute audiance

In [19]:
data_tv_1920 = data[(~data.radio)&(data.date.dt.hour >= 19)&(data.date.dt.hour < 21)]
show_mentions_by_channel(data_tv_1920, n=10,method="minutes", title='Nombre de mention par TV entre 19h et 21h top 10 semaine pre COP27')

In [26]:
data_radio_1920 = data[(data.radio)&(data.date.dt.hour >= 6)&(data.date.dt.hour < 10)]
show_mentions_by_channel(data_radio_1920, n=10,method="minutes", title='Nombre de mention par radio entre 6h et 10h top 10 semaine pre COP27')

# Mention au cours du temps

In [21]:
data_top_25_tv = data[data.channel_name.isin(TOP_CHANNELS_TV)]
data_gb = data_top_25_tv.groupby(['channel_name', data_top_25_tv['date'].dt.date]).count()['text'].reset_index()
data_gb.columns = ['Chaîne', 'Date', 'Nombre de mention']
fig = px.area(
    data_gb,
    x = "Date", y = 'Nombre de mention',color = "Chaîne",
    category_orders={"TIME":data_gb["Date"].unique()},
    title = "Evoluation des mentions au cours de la semaine pre COP 27 top 25 TV", height = 400,
)
fig.update_layout( xaxis=dict(type="category", categoryorder='category ascending'),
                     yaxis_title='Nomre de mention COP27 par jour')
fig.show()

In [19]:
data_top_8_tv = data[data.channel_name.isin(TOP_CHANNELS_TV_8)]
data_gb = data_top_8_tv.groupby(['channel_name', data_top_8_tv['date'].dt.date]).count()['text'].reset_index()
data_gb.columns = ['Chaîne', 'Date', 'Nombre de mention']
fig = px.area(
    data_gb,
    x = "Date", y = 'Nombre de mention',color = "Chaîne",
    category_orders={"TIME":data_gb["Date"].unique()},
    title = "Evoluation des mentions au cours de la semaine avant COP 27 top 8 TV", height = 400,
)
fig.update_layout( xaxis=dict(type="category", categoryorder='category ascending'),
                     yaxis_title='Nomre de mention COP27 par jour')
fig.show()

## All

In [20]:
data_radio = data[data.radio]
data_gb = data_radio.groupby(['channel_name', data_radio['date'].dt.date]).count()['text'].reset_index()
data_gb.columns = ['Chaîne', 'Date', 'Nombre de mention']
fig = px.area(
    data_gb,
    x = "Date", y = 'Nombre de mention',color = "Chaîne",
    category_orders={"TIME":data_gb["Date"].unique()},
    title = "Evoluation des mentions au cours de la semaine avant COP 27 toutes radio confondues", height = 400,
)
fig.update_layout( xaxis=dict(type="category", categoryorder='category ascending'),
                     yaxis_title='Nomre de mention COP27 par jour')
fig.show()

In [25]:
show_mentions_by_time_of_the_day(
    data,split = "channel_name",
    list_of_channels = TOP_CHANNELS_TV,kind = "bar",height = 700,method = "minutes"
)

# Couverture mediatique en pourcentage

In [37]:
# TV 18h a day
hours_covered = 18
data_tv_1920 = data[(~data.radio)]
print((data_tv_1920.shape[0] * 2 )/ (data_tv_1920.channel_name.nunique() * 60*hours_covered))
print("Top audimat")

data_tv_1920_top25 = data[(data.channel_name.isin(TOP_25_CHANNELS))]
print((data_tv_1920_top25.shape[0] * 2 )/ (25 *60*hours_covered))

0.019200779727095517
Top audimat
0.018962962962962963


In [33]:
# TV haute audience
data_tv_1920 = data[(~data.radio)&(data.date.dt.hour >= 19)&(data.date.dt.hour < 21)]
print((data_tv_1920.shape[0] * 2 )/ (data_tv_1920.channel_name.nunique() * 120))
print("Top audimat")

data_tv_1920_top25 = data[(data.channel_name.isin(TOP_25_CHANNELS))&(data.date.dt.hour >= 19)&(data.date.dt.hour < 21)]
print((data_tv_1920_top25.shape[0] * 2 )/ (25 * 120))


0.03787878787878788
Top audimat
0.023333333333333334


In [38]:
# radio 18h a day 
hours_covered = 18
data_radio = data[(data.radio)]
print((data_radio.shape[0] * 2 )/ (data_radio.channel_name.nunique() * 60*hours_covered))
print("Top audimat")



0.005680680680680681
Top audimat


In [39]:
# radio haute audiance
hours_covered = 18
data_radio_1920 = data[(data.radio)&(data.date.dt.hour >= 6)&(data.date.dt.hour < 10)]
print((data_radio_1920.shape[0] * 2 )/ (data_radio_1920.channel_name.nunique() * 60*4))
print("Top audimat")

0.014732142857142857
Top audimat


In [21]:
# Keywords oppposite

In [28]:
data_radio = data[data.radio]
data_gb = data_radio.groupby(['keyword', data_radio['date'].dt.date]).count()['text'].reset_index()
data_gb.columns = ['Keyword', 'Date', 'Nombre de mention']
fig = px.area(
    data_gb,
    x = "Date", y = 'Nombre de mention',color = "Keyword",
    category_orders={"TIME":data_gb["Date"].unique()},
    title = "Evoluation du nombre de mention par jour au cours de la 1er semaine de Novembe: Elon Musk VS COP 27 toutes radio confondues", height = 400,
)
fig.update_layout( xaxis=dict(type="category", categoryorder='category ascending'),
                     yaxis_title='Nomre de mention par jour')
fig.show()

In [47]:
data_top_8_tv = data[data.channel_name.isin(TOP_CHANNELS_TV_8)]
data_top_8_tv = filter_data_between_hours(data_top_8_tv,"19:00","21:00")
data_gb = data_top_8_tv.groupby(['keyword', data_top_8_tv['date'].dt.date]).count()['text'].reset_index()
data_gb.columns = ['Keyword', 'Date', 'Nombre de mention']
data_gb['Couverture entre 19 et 21h (%)'] = round(data_gb['Nombre de mention']/60 * 100, 1)
fig = px.area(
    data_gb,
    x = "Date", y = 'Couverture entre 19 et 21h (%)',color = "Keyword",
    category_orders={"TIME":data_gb["Date"].unique()},
    title = "Evoluation de la couverture mediatique de 4 mots clef, semaine pre COP27, top 8 TV", height = 400,
)
fig.update_layout( xaxis=dict(type="category", categoryorder='category ascending'),
                     )

fig.show()


# Theo's work:

# Préparation du Baromètre

## Travail préliminaire sur les données

**Méthodologie** : 
- Sélectionner les heures d'écoute les plus importantes sur TV et Radio
- TV : 19h-22h
- Radio : 6h30-9h30

In [69]:
from quotaclimat.data_analytics.exploration import filter_data_between_hours

In [104]:
data_tv = filter_data_between_hours(data,"19:00","22:00").query("media=='TV'")
data_radio = filter_data_between_hours(data,"06:30","09:30").query("media=='Radio'")
data_france_info = data_radio.query("channel_name=='France Info'")

## Niveau 1

### Calcul du % du temps médiatique

In [105]:
n_days = 16

In [40]:
# Multiplier is 2min / (n_channel * 60min * n_days)
show_mentions_by_time_of_the_day(
    data_tv,freq = "1H",
    method = 2 / (data_tv["channel_name"].nunique() * 60 * n_days)
)

NameError: name 'data_tv' is not defined

In [41]:
# Multiplier is 2min / (n_channel * 60min)
show_mentions_by_time_of_the_day(
    data_radio,freq = "1H",
    method = 2 / (data_radio["channel_name"].nunique() * 60 * n_days)
)

NameError: name 'n_days' is not defined

In [42]:
# Multiplier is 2min / (n_channel * 60min)
show_mentions_by_time_of_the_day(
    data_france_info,freq = "1H",
    method = 2 / (data_france_info["channel_name"].nunique() * 60 * n_days)
)

NameError: name 'data_france_info' is not defined

### Podiums TOP5 et FLOP5

#### TOP 5 et TOP 25 TV (toute audience confondues)

In [122]:
# Multiplier is 2min / (3h * 60min * n_days)
show_mentions_by_channel(
    data_tv,
    method = 2 / (3 * 60 * n_days),
    n = 5,
    text_auto = ".2%"
).update_layout(yaxis_tickformat='0%').show()

# Multiplier is 2min / (3h * 60min * n_days)
show_mentions_by_channel(
    data_tv,
    method = 2 / (3 * 60 * n_days),
    n = 30,
    text_auto = ".2%"
).update_layout(yaxis_tickformat='0%').show()

In [127]:
# Multiplier is 2min / (3h * 60min * n_days)
show_mentions_by_channel(
    data_tv,
    method = 2 / (3 * 60 * n_days),
    list_of_channels = TOP_CHANNELS_TV,
    text_auto = ".1%"
).update_layout(yaxis_tickformat='0%').show()