# COP26

![](../coverquotaclimat.png)

> Notebook python d'exploration pour fournir une base d'analyse et de visualisation pour toute l'équipe

In [2]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import datetime

import sys
sys.path.append("../")

%load_ext autoreload
%autoreload 2

from quotaclimat.utils.plotly_theme import *


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Récupération des données

> À changer plus tard, une fois la base de données SQL mise en place

In [4]:
# `os` is not imported by the setup cell above; import it here so this cell
# also runs on a fresh kernel.
import os

# List the raw COP26 extract files present in the local data folder.
os.listdir("../data/cop26/")

['20221031_20211103_20211103_all_COP26.xlsx',
 '20221031_20211114_20211114_all_COP26.xlsx',
 '20221031_20211104_20211105_all_COP26.xlsx',
 '20221031_20211102_20211102_all_COP26.xlsx',
 '20221031_20211109_20211111_all_COP26.xlsx',
 '20221031_20211112_20211113_all_COP26.xlsx',
 '20221031_20211101_20211101_all_COP26.xlsx',
 '20221031_20211030_20211031_all_COP26.xlsx',
 '20221031_20211106_20211108_all_COP26.xlsx']

In [5]:
from quotaclimat.data_processing.read_format_deduplicate import read_and_format_one
from quotaclimat.data_processing.read_format_deduplicate import read_and_format_all_data_dump
from quotaclimat.data_processing.read_format_deduplicate import deduplicate_extracts

In [6]:
# Load the data dump stored in the COP26 folder; no channel metadata path is
# supplied.
data = read_and_format_all_data_dump(
    path_folder="../data/cop26/",
    path_channel_metadata=None,
)

# Size of the loaded frame, as (rows, columns).
data.shape

(24728, 14)

# Data exploration

In [36]:
from quotaclimat.utils.channels import TOP_25_CHANNELS,TOP_CHANNELS_TV,TOP_CHANNELS_TV_8
from quotaclimat.data_analytics.exploration import show_mentions_by_channel
from quotaclimat.data_analytics.exploration import show_mentions_by_time_of_the_day
from quotaclimat.data_analytics.exploration import show_mentions_over_time
from quotaclimat.data_analytics.exploration import show_mentions_treemap
from quotaclimat.data_analytics.exploration import show_piechart_split_tv_radio

In [37]:
# Mentions over time for the whole dataset, with freq="D" and method="minutes".
show_mentions_over_time(
    data,
    freq="D",
    method="minutes",
)

In [123]:
# Mentions by channel for the whole dataset, with n=30 and method="minutes".
show_mentions_by_channel(
    data,
    n=30,
    method="minutes",
)

In [42]:
# Same chart for the two predefined TV channel lists, displayed one after the
# other (8-channel list first).
for tv_channels in (TOP_CHANNELS_TV_8, TOP_CHANNELS_TV):
    show_mentions_by_channel(
        data,
        list_of_channels=tv_channels,
        method="minutes",
    ).show()

In [55]:
# Mentions over time, split by channel and restricted to TOP_CHANNELS_TV,
# drawn as a 700px-high bar chart.
show_mentions_over_time(
    data,
    split="channel_name",
    list_of_channels=TOP_CHANNELS_TV,
    kind="bar",
    height=700,
    method="minutes",
)

In [56]:
# Mentions by time of the day, split by channel and restricted to
# TOP_CHANNELS_TV, drawn as a 700px-high bar chart.
show_mentions_by_time_of_the_day(
    data,
    split="channel_name",
    list_of_channels=TOP_CHANNELS_TV,
    kind="bar",
    height=700,
    method="minutes",
)

# Préparation du Baromètre

## Travail préliminaire sur les données

**Méthodologie** : 
- Sélectionner les heures d'écoute les plus importantes sur TV et Radio
- TV : 19h-22h
- Radio : 6h30-9h30

In [69]:
from quotaclimat.data_analytics.exploration import filter_data_between_hours

In [104]:
# TV subset: extracts between 19:00 and 22:00, TV media only.
data_tv = (
    filter_data_between_hours(data, "19:00", "22:00")
    .query("media=='TV'")
)

# Radio subset: extracts between 06:30 and 09:30, radio media only.
data_radio = (
    filter_data_between_hours(data, "06:30", "09:30")
    .query("media=='Radio'")
)

# France Info only, taken from the radio subset above.
data_france_info = data_radio.query("channel_name=='France Info'")

## Niveau 1

### Calcul du % du temps médiatique

In [105]:
# Number of days used to normalise media time in the cells below.
# NOTE(review): 16 matches 2021-10-30 to 2021-11-14 inclusive, the range
# suggested by the extract file names — confirm before reusing on other data.
n_days = 16

In [106]:
# Multiplier applied per mention: 2 min / (n_channel * 60 min * n_days),
# where n_channel is the number of distinct TV channels in data_tv.
show_mentions_by_time_of_the_day(
    data_tv,
    freq="1H",
    method=2 / (data_tv["channel_name"].nunique() * 60 * n_days),
)

In [107]:
# Multiplier is 2min / (n_channel * 60min * n_days),
# where n_channel is the number of distinct radio channels in data_radio.
show_mentions_by_time_of_the_day(
    data_radio,freq = "1H",
    method = 2 / (data_radio["channel_name"].nunique() * 60 * n_days)
)

In [108]:
# Multiplier is 2min / (n_channel * 60min * n_days).
# data_france_info is filtered on a single channel name, so n_channel is
# expected to be 1 here.
show_mentions_by_time_of_the_day(
    data_france_info,freq = "1H",
    method = 2 / (data_france_info["channel_name"].nunique() * 60 * n_days)
)

### Podiums TOP5 et FLOP5

#### TOP 5 et TOP 25 TV (toutes audiences confondues)

In [122]:
# Multiplier is 2min / (3h * 60min * n_days).
# The same chart is drawn twice: first with n=5, then with n=30.
for n_channels_shown in (5, 30):
    show_mentions_by_channel(
        data_tv,
        method=2 / (3 * 60 * n_days),
        n=n_channels_shown,
        text_auto=".2%",
    ).update_layout(yaxis_tickformat="0%").show()

In [127]:
# Multiplier is 2min / (3h * 60min * n_days).
# Restricted to the TOP_CHANNELS_TV list, with one-decimal percentage labels.
podium_top_tv = show_mentions_by_channel(
    data_tv,
    method=2 / (3 * 60 * n_days),
    list_of_channels=TOP_CHANNELS_TV,
    text_auto=".1%",
)
podium_top_tv.update_layout(yaxis_tickformat="0%").show()

# Select data for stress test

We assumed that the length of a subject covered around a non-generic keyword mention follows a normal distribution with a mean of 2 minutes. We would like to validate the normality assumption (no heavy tails). The distribution is also assumed to be uniform across the different media (within TV and within radio). To validate this hypothesis, we check the distribution of mentions for the top channels, as they seem to stand out with high counts.

## Sample size


In [None]:
import scipy.stats as st

# 97.5th percentile of the standard normal distribution, i.e. the z-score of a
# two-sided 95% confidence interval.
st.norm.ppf(0.975)

In [76]:
# NOTE(review): scratch calculation; what 2 and 0.8 stand for is not stated in
# the notebook — confirm or remove.
2*0.8

1.6

In [77]:
# Minimum sample size from the formula n = z^2 * std^2 / error^2.
zscore = 1.96  # z-score for a 95% confidence level
std = 1  # standard extract duration of 1.5 minutes, +/- 1 minute (deliberately wide std)
error = 0.1  # tolerated error used in the formula

min_sample_size = (zscore**2 * std**2) / error**2
min_sample_size


384.1599999999999

## Sampling



In [57]:
# Spelled-out and numeric variants of the COP26 keyword to look for.
cop26_variants = ['cop vingt-six', 'cop 26', 'cop vingt six']

# The alternation is wrapped in a non-capturing group so that BOTH word
# boundaries apply to EVERY variant. Without the group, `|` has the lowest
# precedence: the leading \b binds only to the first variant and the trailing
# \b only to the last one, so e.g. "xcop 26" or "cop 260" would be counted.
pattern = r'\b(?:{})\b'.format('|'.join(cop26_variants))

# Number of keyword occurrences per extract, stored as a new column on `data`.
data['count_cop26'] = data.text.str.count(pattern)

In [63]:
# Distribution of per-extract COP26 counts for France 24 and Euronews only.
is_news_channel = data.channel_name.isin(['France 24', 'Euronews'])
data.loc[is_news_channel, 'count_cop26'].hist(backend='plotly')

In [60]:
# Distribution of per-extract COP26 counts across all channels.
data.loc[:, 'count_cop26'].hist(backend='plotly')

In [54]:
# Count the extracts whose text is exactly 'cop vingt-six'.
# NOTE(review): this is a whole-string equality test, not a substring search,
# which is why the recorded output is empty. If "contains the keyword" was
# intended, a substring match is needed instead — confirm.
is_exact_keyword = data['text'].eq('cop vingt-six')
data.loc[is_exact_keyword, 'text'].value_counts()

Series([], Name: text, dtype: int64)

In [17]:
# Seed NumPy's global random number generator with a fixed value.
# NOTE(review): presumably meant to make the sampling cells below repeatable;
# that only holds if this cell runs right before them — confirm.
np.random.seed(42)

# Size of the full dataset, as (rows, columns).
data.shape


(24728, 14)

In [85]:
# Build the manual-validation sample from extracts dated after 2021-11-06:
#   - 10% of each of Euronews, France 24 and TF1
#   - 5% of all non-radio extracts
# NOTE(review): the non-radio pool still contains the three channels above, so
# the same row can be drawn twice — confirm whether duplicates are acceptable.

# Passed explicitly to every draw so the sample does not depend on which cells
# were executed beforehand (the global seed is set in a much earlier cell).
SAMPLE_SEED = 42

is_recent = data.date > '2021-11-06'

sample_parts = [
    data[(data.channel_name == channel) & is_recent].sample(frac=0.10, random_state=SAMPLE_SEED)
    for channel in ['Euronews', 'France 24', 'TF1']
]
sample_parts.append(
    data[(~data.radio) & is_recent].sample(frac=0.05, random_state=SAMPLE_SEED)
)

# One concat over all parts instead of growing the frame step by step; the
# parts are stacked in the same order as before.
data_to_validate = pd.concat(sample_parts)
data_to_validate.shape

(319, 15)

In [86]:
# Keep only the columns needed for the manual review.
columns_to_review = ['channel_name', 'text', 'url', 'date']
data_to_validate = data_to_validate.loc[:, columns_to_review]


In [87]:
# Write the validation sample to a CSV file in the current working directory.
data_to_validate.to_csv(path_or_buf='data_to_validate.csv')

In [25]:
# Display the validation sample.
# NOTE(review): the stored output below comes from an earlier execution
# (In [25]) and still shows all columns, i.e. it predates the column selection
# done in the cells above — re-run to refresh it.
data_to_validate

Unnamed: 0,index,channel_name,radio,text,highlight,url,date,time,time_of_the_day,media,path_file,count,duration,keyword
6837,967,Euronews,False,vidéo pour exhorter les dirigeants enfin agir ...,de la cop vingt-six le président russe a réce...,https://keywords.mediatree.fr/player/?fifo=eur...,2021-11-02 10:30:00,10:30:00,0 days 10:30:00,TV,../data/cop26/20221031_20211102_20211102_all_C...,1,2,COP26
23099,1450,Euronews,False,<unk> <unk> <unk> journée de mobilisation pour...,climatique au moment où se tient la cop vingt...,https://keywords.mediatree.fr/player/?fifo=eur...,2021-11-07 06:00:00,06:00:00,0 days 06:00:00,TV,../data/cop26/20221031_20211106_20211108_all_C...,1,2,COP26
4188,1198,Euronews,False,à rendre utrecht plus verte et investir dans l...,à rendre utrecht plus verte et investir dans l...,https://keywords.mediatree.fr/player/?fifo=eur...,2021-11-04 13:36:00,13:36:00,0 days 13:36:00,TV,../data/cop26/20221031_20211104_20211105_all_C...,1,2,COP26
11857,715,Euronews,False,<unk> <unk> <unk> <unk> <unk> <unk> elle devai...,elle devait s' achever vendredi soir mais la ...,https://keywords.mediatree.fr/player/?fifo=eur...,2021-11-13 06:40:00,06:40:00,0 days 06:40:00,TV,../data/cop26/20221031_20211112_20211113_all_C...,1,2,COP26
11275,133,Euronews,False,<unk> <unk> <unk> la cop vingt-six doit offici...,<unk> <unk> <unk> la cop vingt-six doit offici...,https://keywords.mediatree.fr/player/?fifo=eur...,2021-11-12 09:00:00,09:00:00,0 days 09:00:00,TV,../data/cop26/20221031_20211112_20211113_all_C...,1,2,COP26
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
483,483,France 24,False,sont largement insuffisant rejoignez nous à gl...,de la cop vingt-six <unk> inquiétude en ethio...,https://keywords.mediatree.fr/player/?fifo=fra...,2021-11-03 15:30:00,15:30:00,0 days 15:30:00,TV,../data/cop26/20221031_20211103_20211103_all_C...,1,2,COP26
22651,1002,TF1,False,est toujours à la peine dans les sondages mais...,pour une justice climatique alors que la cop ...,https://keywords.mediatree.fr/player/?fifo=tf1...,2021-11-06 20:22:00,20:22:00,0 days 20:22:00,TV,../data/cop26/20221031_20211106_20211108_all_C...,1,2,COP26
6466,596,TF1,False,<unk> bonsoir à tous et ça on recevra jean cas...,sur cette cop vingt-six qui a débuté j' imagi...,https://keywords.mediatree.fr/player/?fifo=tf1...,2021-11-02 21:00:00,21:00:00,0 days 21:00:00,TV,../data/cop26/20221031_20211102_20211102_all_C...,1,2,COP26
1587,141,TF1,False,brest puis <unk> madame monsieur bonsoir bienv...,les titres l' accord qui ne satisfait personn...,https://keywords.mediatree.fr/player/?fifo=tf1...,2021-11-14 19:58:00,19:58:00,0 days 19:58:00,TV,../data/cop26/20221031_20211114_20211114_all_C...,1,2,COP26
