In [1]:
from datetime import datetime
import os

from datasets import load_dataset
from dotenv import load_dotenv

import pandas as pd
import requests


load_dotenv()

True

In [2]:
dataset = load_dataset("DataForGood/climateguard")

In [3]:
df_train = dataset["train"].to_pandas()
df_test = dataset["test"].to_pandas()
df = pd.concat([df_train, df_test])


In [4]:
display(df_train.channel.value_counts(normalize=True).round(3))
display(df_test.channel.value_counts(normalize=True).round(3))

channel
itele             0.304
europe1           0.137
sud-radio         0.129
franceinfotv      0.083
france-info       0.074
lci               0.060
rmc               0.046
france24          0.040
bfmtv             0.038
tf1               0.020
france2           0.018
france-culture    0.014
france-inter      0.012
rfi               0.010
rtl               0.010
arte              0.006
Name: proportion, dtype: float64

channel
itele             0.359
sud-radio         0.117
rmc               0.102
europe1           0.086
france24          0.062
lci               0.062
bfmtv             0.039
france-inter      0.031
tf1               0.031
franceinfotv      0.023
france2           0.023
arte              0.016
france-culture    0.016
france-info       0.008
rtl               0.008
fr3-idf           0.008
m6                0.008
Name: proportion, dtype: float64

In [5]:
train_counts = df_train.groupby("channel").agg(count=("id", "count")).reset_index()
test_counts = df_test.groupby("channel").agg(count=("id", "count")).reset_index()
counts = pd.merge(
    left=train_counts,
    right=test_counts,
    on="channel",
    how="outer",
    suffixes=["_train", "_test"],
)

In [6]:
counts

Unnamed: 0,channel,count_train,count_test
0,arte,3.0,2.0
1,bfmtv,19.0,5.0
2,europe1,69.0,11.0
3,fr3-idf,,1.0
4,france-culture,7.0,2.0
5,france-info,37.0,1.0
6,france-inter,6.0,4.0
7,france2,9.0,3.0
8,france24,20.0,8.0
9,franceinfotv,42.0,3.0


In [7]:
import plotly.express as px

display(
    px.bar(
        counts, x="channel", y=["count_train", "count_test"], barmode="group", log_y=True
    ).update_xaxes(
        categoryorder="array", categoryarray=sorted(df_train.channel.unique())
    ).update_yaxes(title="Count")
)

In [8]:
train_perc = (
    df_train.groupby("channel").agg(percentage=("id", "count")) / len(df_train) * 100
).round(2).reset_index()
test_perc = (
    df_test.groupby("channel").agg(percentage=("id", "count")) / len(df_test) * 100
).round(2).reset_index()
perc = pd.merge(
    left=train_perc,
    right=test_perc,
    on="channel",
    how="outer",
    suffixes=["_train", "_test"],
)

In [9]:
display(
    px.bar(
        perc, x="channel", y=["percentage_train", "percentage_test"], barmode="group", log_y=True
    ).update_xaxes(
        categoryorder="array", categoryarray=sorted(df_train.channel.unique())
    ).update_yaxes(title="% of Examples (Log)")
)

In [10]:
train_perc = (
    df_train.groupby("misinformation").agg(percentage=("id", "count"))
).round(2).reset_index()
test_perc = (
    df_test.groupby("misinformation").agg(percentage=("id", "count"))
).round(2).reset_index()
perc = pd.merge(
    left=train_perc,
    right=test_perc,
    on="misinformation",
    how="outer",
    suffixes=["_train", "_test"],
)
display(
    px.bar(
        perc, x="misinformation", y=["percentage_train", "percentage_test"], barmode="group"
    ).update_xaxes(
        categoryorder="array", categoryarray=sorted(df_train.channel.unique())
    ).update_yaxes(title="Count")
)

In [11]:
train_perc = (
    df_train.groupby("misinformation").agg(percentage=("id", "count")) / len(df_train) * 100
).round(2).reset_index()
test_perc = (
    df_test.groupby("misinformation").agg(percentage=("id", "count")) / len(df_test) * 100
).round(2).reset_index()
perc = pd.merge(
    left=train_perc,
    right=test_perc,
    on="misinformation",
    how="outer",
    suffixes=["_train", "_test"],
)
display(
    px.bar(
        perc, x="misinformation", y=["percentage_train", "percentage_test"], barmode="group"
    ).update_xaxes(
        categoryorder="array", categoryarray=sorted(df_train.channel.unique())
    ).update_yaxes(title="Percentage of Examples (%)")
)