In [1]:
%load_ext autoreload
%autoreload 2

# COP26

![](../coverquotaclimat.png)

> Notebook python d'exploration pour fournir une base d'analyse et de visualisation pour toute l'équipe

In [23]:
import pandas as pd
import numpy as np
import os
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import datetime

import sys
sys.path.append("../")

%load_ext autoreload
%autoreload 2

from quotaclimat.utils.plotly_theme import *


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Récupération des données

>  A changer plus tard une fois la base de données SQL mise en place

In [3]:
os.listdir("../data/cop26/")

['20221031_20211103_20211103_all_COP26.xlsx',
 '20221031_20211114_20211114_all_COP26.xlsx',
 '20221031_20211104_20211105_all_COP26.xlsx',
 '20221031_20211102_20211102_all_COP26.xlsx',
 '20221031_20211109_20211111_all_COP26.xlsx',
 '20221031_20211112_20211113_all_COP26.xlsx',
 '20221031_20211101_20211101_all_COP26.xlsx',
 '20221031_20211030_20211031_all_COP26.xlsx',
 '20221031_20211106_20211108_all_COP26.xlsx']

In [24]:
from quotaclimat.data_processing.read_format_deduplicate import read_and_format_one
from quotaclimat.data_processing.read_format_deduplicate import read_and_format_all_data_dump
from quotaclimat.data_processing.read_format_deduplicate import deduplicate_extracts

In [7]:
data = pd.read_excel('../data/cop26/20221031_20211106_20211108_all_COP26.xlsx')

In [8]:
data.columns

Index(['CHANNEL', 'RADIO', 'DATE', 'TEXT', 'HIGHLIGHT', 'START CHUNK',
       'END CHUNK', 'ORIGIN', 'URL'],
      dtype='object')

In [5]:
data = read_and_format_all_data_dump(path_folder = "../data/cop26/",path_channel_metadata=None)
data.shape

(24684, 14)

## Filtrer sur les top audiences

In [25]:
top_audiences = pd.read_excel("../data/channels.xlsx",sheet_name = "top_audiences")
top_audiences["channel_id"] = top_audiences["channel_name"] + "_" + top_audiences["media"]
top_channels_tv = top_audiences.query("media=='TV'")["channel_name"].tolist()
top_channels_radio = top_audiences.query("media=='Radio'")["channel_name"].tolist()

In [26]:
data["channel_id"] = data["channel_name"] + "_" + data["media"]

In [27]:
data = data.merge(top_audiences[["channel_id"]],on = ["channel_id"],how = "inner")

In [28]:
data.shape

(7966, 15)

Nombre de chaînes TV ou Radio dans l'échantillon

In [29]:
data.drop_duplicates(subset = ["channel_name"]).groupby(["media"])["channel_name"].count()

media
Radio    20
TV       14
Name: channel_name, dtype: int64

## Filtrer dans les horaires d'antenne

In [30]:
from quotaclimat.data_analytics.exploration import filter_data_between_hours

In [31]:
data = filter_data_between_hours(data,"06:00","24:00")

In [32]:
data.shape

(7966, 15)

In [33]:
510/8526

0.059817030260380016

# Correction des données

In [14]:
from quotaclimat.data_processing.nlp_filtering import NLPFilteringModel

ModuleNotFoundError: No module named 'deepmultilingualpunctuation'

In [107]:
model = NLPFilteringModel()

In [135]:
model.predict(text,topic_change = True,as_percent_environment=True)

0.7619248628113129

In [141]:
from tqdm.auto import tqdm
results = []

for i in enumerate(tqdm(data["text"].tolist())):
    results_i = model.predict(text,topic_change = True,as_percent_environment = True)
    results.append(results_i)

  0%|          | 0/8526 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [137]:
from quotaclimat.data_processing.keyword_processor import KeywordModel

keyword_replace_dict = {
    "COP26":["cop vingt-six","cop vingt six","COP26"],
}

kw = KeywordModel(keyword_replace_dict)

In [138]:
data = kw.extract_mentions(data)

In [139]:
n_mentions = data.groupby(["channel_name","media"],as_index = False).agg({"n_mentions":"mean"})

In [140]:
n_mentions.query("media=='TV'").sort_values("n_mentions",ascending = False)

Unnamed: 0,channel_name,media,n_mentions
4,Canal+,TV,1.8
17,LCP,TV,1.504425
32,TMC,TV,1.442623
7,France 24,TV,1.430954
11,France Info:,TV,1.419479
19,M6,TV,1.378378
3,CNEWS,TV,1.333333
6,France 2,TV,1.315217
8,France 5,TV,1.314607
16,LCI,TV,1.303922


# Data exploration

In [34]:
from quotaclimat.utils.channels import TOP_25_CHANNELS,TOP_CHANNELS_TV,TOP_CHANNELS_TV_8
from quotaclimat.data_analytics.exploration import show_mentions_by_channel
from quotaclimat.data_analytics.exploration import show_mentions_by_time_of_the_day
from quotaclimat.data_analytics.exploration import show_mentions_over_time
from quotaclimat.data_analytics.exploration import show_mentions_treemap
from quotaclimat.data_analytics.exploration import show_piechart_split_tv_radio

## Seulement des données sur 18h de la journée
On enlève de minuit à 6h

In [156]:
show_mentions_by_time_of_the_day(
    data,split = "channel_name",kind = "bar",height = 500,method = "minutes"
)

In [157]:
show_mentions_by_time_of_the_day(
    data,split = "channel_name",
    list_of_channels = top_channels_tv[:10],kind = "bar",height = 700,method = "minutes"
)

## Analyse 1 - volume médiatique total

In [35]:
# Multiplier = n_mentions * 2 min / (n_channels * 60 minutes * 18h * n_days)
n_days = 16
media_time = data.groupby(["media"]).agg({"count":"sum","channel_name":"nunique"})
media_time = media_time.append(pd.DataFrame(media_time.sum(axis = 0).rename("Total")).T)
media_time["media_time"] = media_time["count"] * 2
media_time["total_time"] = media_time["channel_name"] * n_days * 18 * 60
media_time["media_part"] = media_time["media_time"] / media_time["total_time"]

media_time

  media_time = media_time.append(pd.DataFrame(media_time.sum(axis = 0).rename("Total")).T)


Unnamed: 0,count,channel_name,media_time,total_time,media_part
Radio,4297,20,8594,345600,0.024867
TV,3669,14,7338,241920,0.030332
Total,7966,34,15932,587520,0.027117


## Analyse 2 - TOP 3 TV et Radio

In [36]:
# Multiplier for one channel = n_mentions * 2 min / (60 minutes * 18h * n_days)
multiplier = 2 / (1 * 60 * 18 * 16)

fig = show_mentions_by_channel(data,list_of_channels=top_channels_tv,n = 3,
                               title = "TOP 3 TV COP26",
                               method = multiplier,height = 400,text_auto = ".1%")
fig.update_layout(yaxis_tickformat='0%')

In [37]:
# Multiplier for one channel = n_mentions * 2 min / (60 minutes * 18h * n_days)
multiplier = 2 / (1 * 60 * 18 * 16)

fig = show_mentions_by_channel(data,list_of_channels=top_channels_radio,n = 3,
                               title = "TOP 3 Radio COP26",
                               method = multiplier,height = 400,text_auto = ".1%")
fig.update_layout(yaxis_tickformat='0%')

## Analyse 3 - Classement complet TV et Radio

In [38]:
# Multiplier for one channel = n_mentions * 2 min / (60 minutes * 18h * n_days)
multiplier = 2 / (1 * 60 * 18 * 16)

fig = show_mentions_by_channel(data,list_of_channels=top_channels_tv,n = 25,
                               title = "Classement TV COP26",
                               method = multiplier,height = 400,text_auto = ".1%")
fig.update_layout(yaxis_tickformat='0%').show()


# Multiplier for one channel = n_mentions * 2 min / (60 minutes * 18h * n_days)
multiplier = 2 / (1 * 60 * 18 * 16)

fig = show_mentions_by_channel(data,list_of_channels=top_channels_radio,n = 25,
                               title = "Classement Radio COP26",
                               method = multiplier,height = 400,text_auto = ".1%")
fig.update_layout(yaxis_tickformat='0%').show()

## Analyse 4 - évolution du volume médiatique le long de la COP

In [39]:
# Multiplier = n_mentions * 2 min / (n_channels * 60 minutes * 20h)
multiplier = 2 / (data["channel_name"].nunique() * 60 * 18)

fig = show_mentions_over_time(data,freq = "D",method = multiplier,height = 500,text_auto = ".1%")
fig.update_layout(yaxis_tickformat='0%')

## Analyse 5 - couverture horaire

In [40]:
top_audiences_tv = [
    "TF1",
    "France 2",
    "France 3",
    "M6",
    "France 5",
    "TMC",
    "BFMTV",
    "C8",
    "W9",
    "CNEWS",
]

# 10 chaînes TV couvrent 70% de l'audience tv en France

In [41]:
# Multiplier = n_mentions * 2 min / (1 * 60 minutes * 1h * n_days)
multiplier = 2 / (data["channel_name"].nunique() * 16 * 1 * 60)

# method = "count"
method = multiplier

fig = show_mentions_by_time_of_the_day(
    data,split = None,freq = "1H",
    kind = "bar",height = 500,
    text_auto = ".1%",
    method = method,
)
if method != "count": 
    fig.update_layout(yaxis_tickformat='0%')
fig

In [42]:
# Multiplier = n_mentions * 2 min / (1 * 60 minutes * 1h * n_days)
multiplier = 2 / (len(top_audiences_tv) * 16 * 1 * 60)

# method = "count"
method = multiplier

fig = show_mentions_by_time_of_the_day(
    data.query("media=='TV'"),split = None,freq = "1H",
    list_of_channels = top_audiences_tv,
    kind = "bar",height = 500,
    text_auto = ".1%",
    method = method,
)
if method != "count": 
    fig.update_layout(yaxis_tickformat='0%')
fig

In [43]:
# Multiplier = n_mentions * 2 min / (1 * 60 minutes * 1h * n_days)
multiplier = 2 / (14 * 16 * 1 * 60)

# method = "count"
method = multiplier

fig = show_mentions_by_time_of_the_day(
    data.query("media=='TV'"),split = None,freq = "1H",
    list_of_channels = top_channels_tv,
    kind = "bar",height = 500,
    method = method,
)
if method != "count": 
    fig.update_layout(yaxis_tickformat='0%')
fig

# Analyses

In [44]:
show_mentions_by_channel(data,n = 30,method = "minutes")

In [45]:
show_mentions_by_channel(data,list_of_channels=top_channels_tv[:10],method = "minutes").show()
show_mentions_by_channel(data,list_of_channels=top_channels_radio[:10],method = "minutes").show()

show_mentions_by_channel(data,list_of_channels=top_channels_tv,method = "minutes").show()
show_mentions_by_channel(data,list_of_channels=top_channels_radio,method = "minutes").show()

In [46]:
show_mentions_over_time(
    data,split = "channel_name",
    list_of_channels = top_channels_tv[:10],kind = "bar",height = 700,method = "minutes"
)

In [47]:
show_mentions_by_time_of_the_day(
    data,split = "channel_name",
    list_of_channels = top_channels_tv[:10],kind = "bar",height = 700,method = "minutes"
)

# Préparation du Baromètre

## Travail préliminaire sur les données

**Méthodologie** : 
- Sélectionner les heures d'écoute les plus importantes sur TV et Radio
- TV : 19h-22h
- Radio : 6h30-9h30

In [48]:
from quotaclimat.data_analytics.exploration import filter_data_between_hours

In [49]:
data_tv = filter_data_between_hours(data,"19:00","22:00").query("media=='TV'")
data_radio = filter_data_between_hours(data,"06:30","09:30").query("media=='Radio'")
data_france_info = data_radio.query("channel_name=='France Info'")

## Niveau 1

### Calcul du % du temps médiatique

In [50]:
n_days = 16

In [51]:
# Multiplier is 2min / (n_channel * 60min * n_days)
show_mentions_by_time_of_the_day(
    data_tv,freq = "1H",
    method = 2 / (data_tv["channel_name"].nunique() * 60 * n_days)
)

In [52]:
# Multiplier is 2min / (n_channel * 60min)
show_mentions_by_time_of_the_day(
    data_radio,freq = "1H",
    method = 2 / (data_radio["channel_name"].nunique() * 60 * n_days)
)

In [53]:
# Multiplier is 2min / (n_channel * 60min)
show_mentions_by_time_of_the_day(
    data_france_info,freq = "1H",
    method = 2 / (data_france_info["channel_name"].nunique() * 60 * n_days)
)

### Podiums TOP5 et FLOP5

#### TOP 5 et TOP 25 TV (toute audience confondues)

In [54]:
# Multiplier is 2min / (3h * 60min * n_days)
show_mentions_by_channel(
    data_tv,
    method = 2 / (3 * 60 * n_days),
    n = 5,
    text_auto = ".2%"
).update_layout(yaxis_tickformat='0%').show()

# Multiplier is 2min / (3h * 60min * n_days)
show_mentions_by_channel(
    data_tv,
    method = 2 / (3 * 60 * n_days),
    n = 30,
    text_auto = ".2%"
).update_layout(yaxis_tickformat='0%').show()

In [55]:
# Multiplier is 2min / (3h * 60min * n_days)
show_mentions_by_channel(
    data_tv,
    method = 2 / (3 * 60 * n_days),
    list_of_channels = TOP_CHANNELS_TV,
    text_auto = ".1%"
).update_layout(yaxis_tickformat='0%').show()

# Stress test methodo

In [16]:
import scipy.stats as st
st.norm.ppf(.975)

1.959963984540054

In [19]:
zscore = 1.96 # 95% confidence
std = 1  # 1.5 minutes durée standard de l'extrait, +- 1 minute (large std ): 
error = 0.1
min_sample_size = (zscore**2 * std*std)/ (error)**2
min_sample_size

384.1599999999999

In [82]:
data_to_validate = pd.DataFrame()
data_to_validate = data[(data.channel_name == 'TMC') & (data.date > '2021-11-08')].sample(frac=0.10)
data_to_validate = pd.concat([data_to_validate, data[(data.channel_name == 'France 24') & (data.date > '2021-11-08')].sample(frac=0.10)])
data_to_validate = pd.concat([data_to_validate, data[(data.channel_name == 'TF1')& (data.date > '2021-11-08')].sample(frac=0.1)])
data_to_validate.shape

(45, 15)

In [83]:
data_to_validate = pd.concat([data_to_validate, data[(data.date > '2021-11-08')].sample(frac=0.13)])
data_to_validate.shape

(383, 15)

In [60]:
data.shape

(7966, 15)

In [63]:
data_to_validate[['channel_name','date', 'url']].to_csv("stress_test_manual_check.csv")

In [84]:
data_to_validate[['channel_name','date', 'url']].to_csv("stress_test_manual_check.csv")