# COP26

![](../coverquotaclimat.png)

> Notebook python d'exploration pour fournir une base d'analyse et de visualisation pour toute l'équipe

In [1]:
import pandas as pd
import numpy as np
import os
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import datetime

import sys
sys.path.append("../")

%load_ext autoreload
%autoreload 2

from quotaclimat.utils.plotly_theme import *

In [2]:
%%html
<style type="text/css">
@import url('http://fonts.googleapis.com/css?family=Poppins');
</style>

# Récupération des données

>  A changer plus tard une fois la base de données SQL mise en place

In [3]:
os.listdir("../data/cop26/")

['20221031_20211030_20211031_all_COP26.xlsx',
 '20221031_20211101_20211101_all_COP26.xlsx',
 '20221031_20211102_20211102_all_COP26.xlsx',
 '20221031_20211103_20211103_all_COP26.xlsx',
 '20221031_20211104_20211105_all_COP26.xlsx',
 '20221031_20211106_20211108_all_COP26.xlsx',
 '20221031_20211109_20211111_all_COP26.xlsx',
 '20221031_20211112_20211113_all_COP26.xlsx',
 '20221031_20211114_20211114_all_COP26.xlsx']

In [4]:
from quotaclimat.data_processing.read_format_deduplicate import read_and_format_one
from quotaclimat.data_processing.read_format_deduplicate import read_and_format_all_data_dump
from quotaclimat.data_processing.read_format_deduplicate import deduplicate_extracts

In [5]:
data = read_and_format_all_data_dump(path_folder = "../data/cop26/",path_channel_metadata=None)
data.shape

(24681, 15)

## Filtrer sur les top audiences

In [6]:
top_audiences = pd.read_excel("../data/channels.xlsx",sheet_name = "top_audiences")
top_audiences["channel_id"] = top_audiences["channel_name"] + "_" + top_audiences["media"]
top_channels_tv = top_audiences.query("media=='TV'")["channel_name"].tolist()
top_channels_radio = top_audiences.query("media=='Radio'")["channel_name"].tolist()

In [7]:
data["channel_id"] = data["channel_name"] + "_" + data["media"]

In [8]:
data = data.merge(top_audiences[["channel_id"]],on = ["channel_id"],how = "inner")

In [9]:
data.shape

(8479, 16)

Nombre de chaînes TV ou Radio dans l'échantillon

In [10]:
data.drop_duplicates(subset = ["channel_name"]).groupby(["media"])["channel_name"].count()

media
Radio    20
TV       14
Name: channel_name, dtype: int64

## Filtrer dans les horaires d'antenne

In [11]:
from quotaclimat.data_analytics.exploration import filter_data_between_hours

In [12]:
data = filter_data_between_hours(data,"06:00","24:00")

In [13]:
data.shape

(7966, 16)

In [14]:
data.to_csv("test_cop26.csv")

# Correction des données

In [103]:
from quotaclimat.data_processing.nlp_filtering import NLPFilteringModel

In [107]:
model = NLPFilteringModel()

In [135]:
model.predict(text,topic_change = True,as_percent_environment=True)

0.7619248628113129

In [141]:
from tqdm.auto import tqdm
results = []

for i in enumerate(tqdm(data["text"].tolist())):
    results_i = model.predict(text,topic_change = True,as_percent_environment = True)
    results.append(results_i)

  0%|          | 0/8526 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [137]:
from quotaclimat.data_processing.keyword_processor import KeywordModel

keyword_replace_dict = {
    "COP26":["cop vingt-six","cop vingt six","COP26"],
}

kw = KeywordModel(keyword_replace_dict)

In [138]:
data = kw.extract_mentions(data)

In [139]:
n_mentions = data.groupby(["channel_name","media"],as_index = False).agg({"n_mentions":"mean"})

In [140]:
n_mentions.query("media=='TV'").sort_values("n_mentions",ascending = False)

Unnamed: 0,channel_name,media,n_mentions
4,Canal+,TV,1.8
17,LCP,TV,1.504425
32,TMC,TV,1.442623
7,France 24,TV,1.430954
11,France Info:,TV,1.419479
19,M6,TV,1.378378
3,CNEWS,TV,1.333333
6,France 2,TV,1.315217
8,France 5,TV,1.314607
16,LCI,TV,1.303922


# Analyse des keywords

## Extraction des mots clés

In [24]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

In [49]:
vectorizer = TfidfVectorizer(ngram_range = (1,3),stop_words = None,min_df = 0.05)

In [50]:
text = vectorizer.fit_transform(data["text"])

In [52]:
count = pd.DataFrame(text.sum(axis = 0)[0]).T
count.index = pd.Series(vectorizer.vocabulary_).sort_values().index.tolist()
count.to_excel("keywords_tfidf.xlsx")

In [53]:
! start .

## Analyse des topics à partir des mots clés

In [66]:
from quotaclimat.data_processing.keyword_tool import KeywordsTool

In [67]:
kwt = KeywordsTool(case_sensitive=False,lowercase = True)

In [68]:
kwt.load_from_airtable(airtable_table_name="COP27",keyword_col = "name",variants_col=["alternatives_mediatree","alternatives"])

In [50]:
counts = kwt.count_keywords_on_corpus(data["text"])

  0%|          | 0/7966 [00:00<?, ?it/s]

In [51]:
counts_category = kwt.count_keywords_on_corpus(data["text"],as_category = True)

  0%|          | 0/7966 [00:00<?, ?it/s]

In [101]:
counts_total = kwt.count_keywords_on_corpus(data["text"],as_melted = True)

NameError: name 'data_total' is not defined

In [78]:
px.bar(counts_category.sum().sort_values(ascending = False).drop(["Politique"]).reset_index().head(30),x = "index",y = 0)

In [76]:
" OU ".join([y for x in kwt.data.query("category=='Climat'")["alternatives_mediatree"].tolist() for y in x])

'CO2 OU co deux OU émissions OU gaz à effet de serre OU carbone OU émission OU effet de serre OU émissions de co deux OU méthane OU émetteur de carbone OU net zero OU neutralité carbone OU zéro émissions OU Climatique OU Réchauffement climatique OU changement climatique OU Dérèglement climatique OU crise climatique OU urgence climatique OU urgence écologique OU crise écologique OU changements climatiques OU lutter contre le réchauffement'

## Clustering

In [79]:
from sklearn.cluster import KMeans

In [80]:
kmeans = KMeans(n_clusters = 10)

In [84]:
clusters = kmeans.fit_predict(counts_category.drop(columns = ["Politique"]))

In [88]:
clusters.shape

(7966,)

In [89]:
counts_category["cluster"] = clusters

In [90]:
counts_category["cluster"].value_counts(

6    2726
5    1403
0    1147
4     696
8     515
7     478
1     427
9     209
2     202
3     163
Name: cluster, dtype: int64

In [100]:
px.area(
    counts_category.drop(columns = ["Politique"]).groupby(["cluster"]).mean().reset_index().melt(id_vars = "cluster"),
    x = "cluster",
    y = "value",
    color = "variable",
    groupnorm = "fraction",
)