# Parse the XMLTV file from [xmltv.ch](https://xmltv.ch)

In [1]:
# from io import BytesIO
import pandas as pd
# from PIL import Image
import requests
import xmltodict
import plotly.express as px

# Route pandas' `.plot` accessor through plotly, so calls such as
# `grouped.plot.barh()` further down return a figure that is rendered with `.show()`.
pd.options.plotting.backend = "plotly"


In [2]:
# Get the latest version of the file
headers = {'User-Agent': 'Mozilla'}
URL = "https://xmltv.ch/xmltv/xmltv-tnt.xml"

# A timeout keeps the cell from hanging indefinitely if the server does not answer.
response = requests.get(URL, headers=headers, timeout=30)
# Fail loudly on an HTTP error instead of saving an error page as the XML file,
# which would only surface later as a confusing parse error.
response.raise_for_status()
with open("../data/xmltv-tnt.xml", 'wb') as outfile:
    outfile.write(response.content)

In [3]:
# Load the downloaded XML file and parse it straight into nested dictionaries
with open('../data/xmltv-tnt.xml', 'r', encoding='utf-8') as xml_file:
    data = xmltodict.parse(xml_file.read())

In [4]:
# Build the channel lookup table: one row per channel with its id, name and icon URL
channel_records = pd.DataFrame(data["tv"]["channel"]).to_dict(orient="records")
channel_column_names = {
    "@id": "channel_id",
    "icon.@src": "channel_icon",
    "display-name": "channel_name",
}
df_channels = pd.json_normalize(channel_records).rename(columns=channel_column_names)

df_channels.head()

Unnamed: 0,channel_id,channel_name,channel_icon
0,C192.api.telerama.fr,TF1,https://television.telerama.fr/sites/tr_master...
1,C4.api.telerama.fr,France 2,https://television.telerama.fr/sites/tr_master...
2,C80.api.telerama.fr,France 3,https://television.telerama.fr/sites/tr_master...
3,C34.api.telerama.fr,Canal+,https://television.telerama.fr/sites/tr_master...
4,C111.api.telerama.fr,Arte,https://television.telerama.fr/sites/tr_master...


In [5]:
# programs to dataframe
# XMLTV timestamps are written as "YYYYMMDDhhmmss +hhmm" (e.g. "20221108000500 +0100").
# An explicit format replaces `infer_datetime_format`, which is deprecated since pandas 2.0.
# NOTE(review): assumes every start/stop value uses this full form - confirm against the feed.
XMLTV_DATETIME_FORMAT = "%Y%m%d%H%M%S %z"

programme_records = pd.DataFrame(data["tv"]["programme"]).to_dict(orient="records")
df_programs = pd.json_normalize(programme_records, sep="_")
# Clean column names: drop the "@" / "#" markers and the hyphens
df_programs.columns = [col.replace("@", "").replace("#", "").replace("-", "") for col in df_programs.columns]
df_programs = (
    df_programs
    # Join with df_channels to attach channel_name / channel_icon to every programme
    .join(df_channels.set_index("channel_id"), on="channel")
    # Drop empty columns
    .dropna(axis=1, how="all")
)
# Convert some columns to datetime
for col in ("start", "stop"):
    df_programs[col] = pd.to_datetime(df_programs[col], format=XMLTV_DATETIME_FORMAT)
# Calendar day on which the programme starts (start time floored to midnight)
df_programs["day"] = df_programs["start"].dt.floor("d")

df_programs.head(3)

Unnamed: 0,start,stop,channel,title,subtitle,date,desc_lang,desc_text,category_lang,category_text,...,audio_stereo,rating_system,rating_value,rating_icon_src,starrating_value,subtitles_type,subtitles_language,channel_name,channel_icon,day
0,2022-11-08 00:05:00+01:00,2022-11-08 00:55:00+01:00,C192.api.telerama.fr,New York : crime organisé,Une opération à risques,2021,fr,Saison:1 - Episode:7 - La famille de Bell rend...,fr,série policière,...,bilingual,CSA,-10,http://upload.wikimedia.org/wikipedia/commons/...,,,,TF1,https://television.telerama.fr/sites/tr_master...,2022-11-08 00:00:00+01:00
1,2022-11-08 00:55:00+01:00,2022-11-08 01:40:00+01:00,C192.api.telerama.fr,New York Unité Spéciale,Un moment de faiblesse,2021,fr,Saison:22 - Episode:12 - Olivia entre dans un ...,fr,série policière,...,bilingual,CSA,-10,http://upload.wikimedia.org/wikipedia/commons/...,,,,TF1,https://television.telerama.fr/sites/tr_master...,2022-11-08 00:00:00+01:00
2,2022-11-08 01:40:00+01:00,2022-11-08 02:35:00+01:00,C192.api.telerama.fr,New York : crime organisé,La mort en ligne,2021,fr,Saison:1 - Episode:4 - Alors que la famille Wh...,fr,série policière,...,bilingual,CSA,-10,http://upload.wikimedia.org/wikipedia/commons/...,,,,TF1,https://television.telerama.fr/sites/tr_master...,2022-11-08 00:00:00+01:00


In [6]:
# Compute duration in minutes
# With this we can ignore the "length_units" and "length_text" columns, which are too convoluted
# Vectorised through the `.dt` accessor; the cast truncates fractional minutes like `int()` did.
df_programs["length_minutes"] = (
    (df_programs["stop"] - df_programs["start"]).dt.total_seconds() / 60
).astype("int64")

In [7]:
# Per-column frequency checks, all disabled: this cell currently runs nothing.
# Uncomment a single line to display that column's value counts.
# df_programs["desc_lang"].value_counts(dropna=False)
# df_programs["category_lang"].value_counts(dropna=False)
# df_programs["category_text"].value_counts(dropna=False)
# df_programs["date"].value_counts(dropna=False).sort_index(ascending=False)

In [8]:
# Narrow the programme table down to the columns used by the analysis below
columns = [
    # timing
    'start', 'stop', 'day', 'length_minutes',
    # identification
    'channel_name', 'title', 'subtitle', 'date',
    # description
    'desc_text', 'category_text',
]
df = df_programs.loc[:, columns]

In [9]:
# Summary statistics for every selected column, numeric or not
df.describe(include='all')

  df.describe(include='all')
  df.describe(include='all')
  df.describe(include='all')


Unnamed: 0,start,stop,day,length_minutes,channel_name,title,subtitle,date,desc_text,category_text
count,10066,10066,10066,10066.0,10066,10066,4916,3742.0,9003,10066
unique,5369,5370,14,,26,1416,3421,50.0,4194,184
top,2022-11-11 06:00:00+01:00,2022-11-11 06:00:00+01:00,2022-11-10 00:00:00+01:00,,Gulli,Météo,Prévisions pour le lendemain,2022.0,Un journal télévisé complet qui revient en dét...,série d'animation
freq,15,15,856,,1113,208,145,468.0,171,1314
first,2022-11-08 00:00:00+01:00,2022-11-08 00:22:00+01:00,2022-11-08 00:00:00+01:00,,,,,,,
last,2022-11-21 04:59:00+01:00,2022-11-21 07:30:00+01:00,2022-11-21 00:00:00+01:00,,,,,,,
mean,,,,48.235744,,,,,,
std,,,,55.77673,,,,,,
min,,,,1.0,,,,,,
25%,,,,11.0,,,,,,


In [10]:
import ipywidgets as widgets

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)


# Channel selector; the chart underneath is redrawn whenever the selection changes
a = widgets.Dropdown(
    options=df_channels.channel_name.unique(),
    value='TF1',
    description='Channel:',
    disabled=False,
)


def fn(a):
    """Draw the total minutes per category for one channel.

    Parameters
    ----------
    a : str
        Channel name, compared against ``df.channel_name``.

    Prints the selected channel, then shows a horizontal bar chart of the summed
    ``length_minutes`` per ``category_text``, sorted in descending order.
    """
    print(f' Viewing data for {a}')
    channel_rows = df.loc[df["channel_name"] == a]
    minutes_by_category = channel_rows.groupby("category_text")["length_minutes"].sum()
    minutes_by_category.sort_values(ascending=False).plot.barh().show()


# Bind the dropdown value to the `a` parameter of `fn`
out = widgets.interactive_output(fn, {'a': a})

widgets.VBox([widgets.VBox([a]), out])

VBox(children=(VBox(children=(Dropdown(description='Channel:', options=('TF1', 'France 2', 'France 3', 'Canal+…

In [11]:
# Static fallback in case the widget does not render: the same chart for one fixed channel
selected_channel = "TF1"
df_a = df.loc[df["channel_name"] == selected_channel]
grouped = (
    df_a.groupby("category_text")["length_minutes"]
    .sum()
    .sort_values(ascending=False)
)
fig = grouped.plot.barh()
fig.show()

In [12]:
# Show one day of data
start_date = "2022-11-10"
end_date = "2022-11-11"

# Keep the programmes that START inside [start_date, end_date); a programme that
# begins before midnight and ends after it is therefore still part of the day.
df_day = df_programs[(df_programs.start >= start_date) & (df_programs.start < end_date)]

# Total minutes per (category, channel) pair for that day
df_day.groupby(["category_text", "channel_name"]).length_minutes.sum().reset_index()

Unnamed: 0,category_text,channel_name,length_minutes
0,autre,Arte,19
1,autre,RMC Découverte,354
2,autre,RMC Story,785
3,autre,TF1,50
4,clips,CSTAR,906
...,...,...,...
253,téléfilm sentimental,TF1,110
254,téléfilm sentimental,TMC,300
255,téléfilm sentimental,W9,230
256,téléréalité,TF1,155


In [13]:
# Number of programmes and total minutes per (category, channel) for the selected day
(
    df_day
    .groupby(["category_text", "channel_name"])["length_minutes"]
    .agg(["count", "sum"])
    .reset_index()
)

Unnamed: 0,category_text,channel_name,count,sum
0,autre,Arte,1,19
1,autre,RMC Découverte,1,354
2,autre,RMC Story,2,785
3,autre,TF1,1,50
4,clips,CSTAR,10,906
...,...,...,...,...
253,téléfilm sentimental,TF1,1,110
254,téléfilm sentimental,TMC,3,300
255,téléfilm sentimental,W9,2,230
256,téléréalité,TF1,4,155


In [14]:
# Reference line for the stacked bars: one full day of broadcasting, in minutes
MINUTES_PER_DAY = 24 * 60

# Programme count, total minutes and mean length per (channel, category) for the selected day
df1 = (
    df_day.groupby(["channel_name", "category_text"])
    .length_minutes.agg(["count", "sum", "mean"])
    .reset_index()
)

# Figure 1: one stacked bar per channel, split by category
fig1 = px.bar(df1,
              x="channel_name",
              y="sum",
              color="category_text",
              title=f"Number of minutes for each channel & category for {start_date}",
              hover_data=df1.columns)
fig1.add_hline(
    y=MINUTES_PER_DAY,
    line_dash="dot",
    annotation_text="24 hours of content",
    annotation_position="top left",
    annotation_font_size=12,
    annotation_font_color="black"
)
fig1.update_yaxes(title="")
fig1.update_xaxes(title="")
fig1.show()

# Figure 2: the same totals pivoted, one stacked bar per category, split by channel
fig2 = px.bar(df1,
              x="category_text",
              y="sum",
              color="channel_name",
              title=f"Number of minutes for each category & channel {start_date}",
              hover_data=df1.columns)
fig2.update_yaxes(title="")
fig2.update_xaxes(title="")
fig2.show()

In [15]:
# Try to make sub-categories
# Replace any " : " separator with a single space. `regex=False` makes this a literal
# replacement regardless of the pandas version's default.
df_programs["category_text"] = df_programs["category_text"].str.replace(" : ", " ", regex=False)

# For every candidate main category, split the labels on "main category + space" and
# count the resulting pieces. Results are kept per main category so each one can be
# inspected, instead of being computed and thrown away.
subcategory_counts = {
    main: df_programs["category_text"].str.split(main + " ", expand=True).value_counts()
    for main in ("série", "film", "jeunesse", "divertissement", "sport", "magazine", "documentaire")
}

subcategory_counts["documentaire"]

0  1                    
   téléréalité              271
   société                  210
   découvertes              125
   sciences et technique     64
   animalier                 56
   histoire                  43
   nature                    31
   justice                   30
   culture                   23
   civilisations             19
   aventures                 18
   cinéma                    16
   sport                     15
   environnement             15
   gastronomie               15
   politique                 12
   voyage                    12
   pêche                     10
   musique                   10
   beaux-arts                 6
   santé                      5
   musique classique          3
   education                  3
   art de vivre               2
   rock-pop                   2
   autre                      2
   lettres                    1
   géopolitique               1
   fiction                    1
dtype: int64

In [16]:
# Leading French articles/prepositions that may precede a sub-category label.
# Longest first, so "de la " and "de l'" are tried before the shorter "de ".
_ARTICLE_PREFIXES = ("de la ", "de l'", "de ", "du ", "d'")


def removeprefix(text):
    """Strip one leading French article/preposition from ``text``.

    Parameters
    ----------
    text : str
        Sub-category label, e.g. ``"d'animation"`` or ``"de la nature"``.

    Returns
    -------
    str
        ``text`` without its leading article (``"animation"``, ``"nature"``), or
        ``text`` unchanged when it starts with none of the known prefixes.
    """
    for prefix in _ARTICLE_PREFIXES:
        if text.startswith(prefix):
            return text[len(prefix):]
    # Only give up once every prefix has been tried.
    return text


# Main categories that can be followed by a sub-category in `category_text`
cat_prefixes = [
    "série",
    "film",
    "jeunesse",
    "divertissement",
    "sport",
    "magazine",
    "documentaire",
]


def split_category(cat_text):
    """Split a category label into ``(main_cat, sub_cat)``.

    Parameters
    ----------
    cat_text : str
        Category label such as ``"série policière"`` or ``"journal"``.

    Returns
    -------
    tuple
        ``(main_cat, sub_cat)``. When ``cat_text`` starts with one of
        ``cat_prefixes`` followed by a space, ``main_cat`` is that prefix and
        ``sub_cat`` is the remainder with any leading article removed.
        Otherwise the label is returned whole as ``main_cat`` with ``sub_cat``
        set to ``None``. Non-string input (e.g. a missing value) is returned
        as ``(cat_text, None)``.
    """
    if not isinstance(cat_text, str):
        return cat_text, None

    for cat_prefix in cat_prefixes:
        marker = cat_prefix + " "
        if cat_text.startswith(marker):
            # Slice rather than split: the remainder stays intact even when the
            # prefix text occurs again later in the label.
            return cat_prefix, removeprefix(cat_text[len(marker):])

    return cat_text, None

In [17]:
# Prototype of the main/sub category split, restricted to the "film" categories.
cat_text = df_programs.category_text.tolist()

film_categories = []
# The loop variable is deliberately not called `split_category`: that name belongs
# to the helper function defined earlier and must not be rebound to a string.
for main_cat in ["film"]:
    marker = main_cat + " "
    for label in cat_text:
        # Test each label individually; skip missing values and other main categories.
        if isinstance(label, str) and label.startswith(marker):
            film_categories.append((main_cat, removeprefix(label[len(marker):])))

# How often each film sub-category occurs
pd.DataFrame(film_categories, columns=["main_cat", "sub_cat"]).value_counts()

In [18]:
# Preview the first rows of the selected columns
df.head()

Unnamed: 0,start,stop,day,length_minutes,channel_name,title,subtitle,date,desc_text,category_text
0,2022-11-08 00:05:00+01:00,2022-11-08 00:55:00+01:00,2022-11-08 00:00:00+01:00,50,TF1,New York : crime organisé,Une opération à risques,2021.0,Saison:1 - Episode:7 - La famille de Bell rend...,série policière
1,2022-11-08 00:55:00+01:00,2022-11-08 01:40:00+01:00,2022-11-08 00:00:00+01:00,45,TF1,New York Unité Spéciale,Un moment de faiblesse,2021.0,Saison:22 - Episode:12 - Olivia entre dans un ...,série policière
2,2022-11-08 01:40:00+01:00,2022-11-08 02:35:00+01:00,2022-11-08 00:00:00+01:00,55,TF1,New York : crime organisé,La mort en ligne,2021.0,Saison:1 - Episode:4 - Alors que la famille Wh...,série policière
3,2022-11-08 02:35:00+01:00,2022-11-08 06:25:00+01:00,2022-11-08 00:00:00+01:00,230,TF1,Programmes de la nuit,,,Retrouvez tous vos programmes de nuit.,programme indéterminé
4,2022-11-08 06:25:00+01:00,2022-11-08 08:25:00+01:00,2022-11-08 00:00:00+01:00,120,TF1,TFou,,,MolangBarbapapa en Famille (ST)Thomas et ses a...,magazine jeunesse


In [30]:
# Alternative views, kept for reference (each narrows the frame before plotting):
# fig = px.timeline(df[(df.channel_name == "TF1") & (df_programs.start >= start_date) & (df_programs.stop < end_date)], x_start='start', x_end="stop", y="channel_name", color="category_text")
# fig = px.timeline(df[(df.channel_name == "CSTAR")], x_start='start', x_end="stop", y="channel_name", color="category_text")
# fig = px.timeline(df[df.category_text == "journal"], x_start='start', x_end="stop", y="channel_name", color="category_text")

# Full schedule: one row per channel, one bar per programme, coloured by category
fig = px.timeline(
    df,
    x_start="start",
    x_end="stop",
    y="channel_name",
    color="category_text",
)
fig.show()

In [20]:
# Number of distinct category labels
df["category_text"].nunique()

184

In [21]:
# Preview the first rows of the selected columns
df.head()

Unnamed: 0,start,stop,day,length_minutes,channel_name,title,subtitle,date,desc_text,category_text
0,2022-11-08 00:05:00+01:00,2022-11-08 00:55:00+01:00,2022-11-08 00:00:00+01:00,50,TF1,New York : crime organisé,Une opération à risques,2021.0,Saison:1 - Episode:7 - La famille de Bell rend...,série policière
1,2022-11-08 00:55:00+01:00,2022-11-08 01:40:00+01:00,2022-11-08 00:00:00+01:00,45,TF1,New York Unité Spéciale,Un moment de faiblesse,2021.0,Saison:22 - Episode:12 - Olivia entre dans un ...,série policière
2,2022-11-08 01:40:00+01:00,2022-11-08 02:35:00+01:00,2022-11-08 00:00:00+01:00,55,TF1,New York : crime organisé,La mort en ligne,2021.0,Saison:1 - Episode:4 - Alors que la famille Wh...,série policière
3,2022-11-08 02:35:00+01:00,2022-11-08 06:25:00+01:00,2022-11-08 00:00:00+01:00,230,TF1,Programmes de la nuit,,,Retrouvez tous vos programmes de nuit.,programme indéterminé
4,2022-11-08 06:25:00+01:00,2022-11-08 08:25:00+01:00,2022-11-08 00:00:00+01:00,120,TF1,TFou,,,MolangBarbapapa en Famille (ST)Thomas et ses a...,magazine jeunesse


In [22]:
# Total scheduled minutes per channel (rows) and day (columns).
# A programme counts entirely towards the day it starts on, because `day` is the
# floored start time.
(
    df
    .groupby(["channel_name", "day"])["length_minutes"]
    .sum()
    .unstack("day")
)

day,2022-11-08 00:00:00+01:00,2022-11-09 00:00:00+01:00,2022-11-10 00:00:00+01:00,2022-11-11 00:00:00+01:00,2022-11-12 00:00:00+01:00,2022-11-13 00:00:00+01:00,2022-11-14 00:00:00+01:00,2022-11-15 00:00:00+01:00,2022-11-16 00:00:00+01:00,2022-11-17 00:00:00+01:00,2022-11-18 00:00:00+01:00,2022-11-19 00:00:00+01:00,2022-11-20 00:00:00+01:00,2022-11-21 00:00:00+01:00
channel_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
6ter,1385.0,1470.0,1405.0,1465.0,1435.0,1480.0,1450.0,1415.0,1425.0,1460.0,1410.0,1450.0,1460.0,340.0
Arte,1430.0,1525.0,1385.0,1460.0,1420.0,1435.0,615.0,375.0,,,,,,
BFMTV,1440.0,1440.0,1440.0,1440.0,1440.0,1440.0,1440.0,1440.0,1440.0,1440.0,1440.0,1440.0,1440.0,360.0
C8,1434.0,1438.0,1430.0,1456.0,1394.0,1445.0,1465.0,1460.0,1420.0,1440.0,1470.0,1790.0,1380.0,
CNEWS,1441.0,1439.0,1441.0,1439.0,1440.0,1440.0,1440.0,1441.0,1439.0,1441.0,1439.0,1440.0,1440.0,325.0
CSTAR,1328.0,1485.0,1425.0,1390.0,1460.0,1450.0,1415.0,1450.0,1497.0,1408.0,1420.0,1550.0,1340.0,330.0
Canal+,1394.0,1491.0,1451.0,1397.0,1413.0,1432.0,1469.0,1409.0,1483.0,1439.0,1450.0,1359.0,1445.0,294.0
Chérie 25,1435.0,1450.0,1485.0,1395.0,1460.0,1430.0,1420.0,1435.0,1430.0,1455.0,1470.0,1440.0,1430.0,310.0
France 2,1495.0,1380.0,1430.0,1440.0,1494.0,1381.0,1460.0,1450.0,1410.0,1445.0,1440.0,1505.0,1360.0,310.0
France 3,1415.0,1435.0,1475.0,1466.0,1389.0,1435.0,1480.0,1405.0,1426.0,1474.0,1500.0,1360.0,1435.0,295.0


In [23]:
start_date = "2022-11-08"
end_date = "2022-11-13"

# Keep programmes that start on/after start_date and are over by end_date, then export them
in_window = (df["start"] >= start_date) & (df["stop"] <= end_date)
df_extract = df.loc[in_window]
df_extract.to_csv("../data/20221108_20221113_Programme_TV.csv", index=False)

In [24]:
# Read the export back. CSV stores the timestamps as text, so ask pandas to parse
# the datetime columns again instead of leaving them as plain strings.
df_test = pd.read_csv(
    "../data/20221108_20221113_Programme_TV.csv",
    parse_dates=["start", "stop", "day"],
)

In [25]:
# Preview the first rows of the frame read back from the CSV export
df_test.head()

Unnamed: 0,start,stop,day,length_minutes,channel_name,title,subtitle,date,desc_text,category_text
0,2022-11-08 00:05:00+01:00,2022-11-08 00:55:00+01:00,2022-11-08 00:00:00+01:00,50,TF1,New York : crime organisé,Une opération à risques,2021.0,Saison:1 - Episode:7 - La famille de Bell rend...,série policière
1,2022-11-08 00:55:00+01:00,2022-11-08 01:40:00+01:00,2022-11-08 00:00:00+01:00,45,TF1,New York Unité Spéciale,Un moment de faiblesse,2021.0,Saison:22 - Episode:12 - Olivia entre dans un ...,série policière
2,2022-11-08 01:40:00+01:00,2022-11-08 02:35:00+01:00,2022-11-08 00:00:00+01:00,55,TF1,New York : crime organisé,La mort en ligne,2021.0,Saison:1 - Episode:4 - Alors que la famille Wh...,série policière
3,2022-11-08 02:35:00+01:00,2022-11-08 06:25:00+01:00,2022-11-08 00:00:00+01:00,230,TF1,Programmes de la nuit,,,Retrouvez tous vos programmes de nuit.,programme indéterminé
4,2022-11-08 06:25:00+01:00,2022-11-08 08:25:00+01:00,2022-11-08 00:00:00+01:00,120,TF1,TFou,,,MolangBarbapapa en Famille (ST)Thomas et ses a...,magazine jeunesse


In [26]:
import datetime
from datetime import date

# Current local date, printed in ISO form (YYYY-MM-DD)
today = datetime.date.today()
print(today)

2022-11-08


In [27]:
# Build the [today, today + 4 days] window as "YYYY-MM-DD" strings.
# `today` may be a `date` (first run) or already the ISO string produced by a
# previous run of this cell; normalising through `str()` keeps the cell re-runnable.
today_as_date = datetime.date.fromisoformat(str(today))
date_end = (today_as_date + datetime.timedelta(days=4)).strftime("%Y-%m-%d")
today = today_as_date.strftime("%Y-%m-%d")

In [28]:
# (rows, columns) of the programmes starting on/after `today` and finished by `date_end`
in_next_days = (df["start"] >= today) & (df["stop"] <= date_end)
df.loc[in_next_days].shape

(3354, 10)