# Parse the XMLtv file from [XMLtv.ch](https://xmltv.ch)

In [1]:
from io import BytesIO
import pandas as pd
from PIL import Image
import requests
import xmltodict
import plotly.express as px

pd.options.plotting.backend = "plotly"


In [2]:
# Get the latest version of the file and cache it under ../data
headers = {'User-Agent': 'Mozilla'}
URL = "https://xmltv.ch/xmltv/xmltv-tnt.xml"

# A timeout keeps the cell from hanging forever if the server never answers.
response = requests.get(URL, headers=headers, timeout=30)
# Stop here on a 4xx/5xx answer instead of saving an error page as the XML file.
response.raise_for_status()
with open("../data/xmltv-tnt.xml", 'wb') as outfile:
    _ = outfile.write(response.content)

In [3]:
# Load the cached XML as UTF-8 text and turn it into nested dicts/lists
with open('../data/xmltv-tnt.xml', 'r', encoding='utf-8') as xml_file:
    data = xmltodict.parse(xml_file.read())

In [4]:
# Channel table: flatten the nested channel entries, then give the columns friendly names
channel_records = pd.DataFrame(data["tv"]["channel"]).to_dict(orient="records")
channel_column_names = {
    "@id": "channel_id",
    "icon.@src": "channel_icon",
    "display-name": "channel_name",
}
df_channels = pd.json_normalize(channel_records).rename(columns=channel_column_names)

df_channels.head()

Unnamed: 0,channel_id,channel_name,channel_icon
0,C192.api.telerama.fr,TF1,https://television.telerama.fr/sites/tr_master...
1,C4.api.telerama.fr,France 2,https://television.telerama.fr/sites/tr_master...
2,C80.api.telerama.fr,France 3,https://television.telerama.fr/sites/tr_master...
3,C34.api.telerama.fr,Canal+,https://television.telerama.fr/sites/tr_master...
4,C111.api.telerama.fr,Arte,https://television.telerama.fr/sites/tr_master...


In [5]:
# programs to dataframe
# NOTE(review): assumes every start/stop value looks like "20221103003000 +0100"
# (YYYYMMDDhhmmss, a space, then a UTC offset) -- confirm against the feed.
XMLTV_DATETIME_FORMAT = "%Y%m%d%H%M%S %z"

program_records = pd.DataFrame(data["tv"]["programme"]).to_dict(orient="records")
df_programs = pd.json_normalize(program_records, sep="_")
# Clean column names: strip the "@", "#" and "-" characters left over from the XML
df_programs.columns = [col.replace("@", "").replace("#", "").replace("-", "") for col in df_programs.columns]
# Join with df_channels to attach the channel name and icon to every program
df_programs = df_programs.join(df_channels.set_index("channel_id"), on="channel")
# Drop columns that are empty for every row
df_programs = df_programs.dropna(axis=1, how='all')
# Convert the timestamps with an explicit format; the `infer_datetime_format`
# argument used previously is deprecated in pandas 2.x.
df_programs["start"] = pd.to_datetime(df_programs["start"], format=XMLTV_DATETIME_FORMAT)
df_programs["stop"] = pd.to_datetime(df_programs["stop"], format=XMLTV_DATETIME_FORMAT)

df_programs.head(3)

Unnamed: 0,start,stop,channel,title,subtitle,date,desc_lang,desc_text,category_lang,category_text,...,episodenum_text,audio_stereo,rating_system,rating_value,rating_icon_src,starrating_value,subtitles_type,subtitles_language,channel_name,channel_icon
0,2022-11-03 00:30:00+01:00,2022-11-03 01:15:00+01:00,C192.api.telerama.fr,Debris,Nous ne sommes pas seuls,2021,fr,"Saison:1 - Episode:2 - Finola Jones, agente du...",fr,série dramatique,...,0.1.,bilingual,CSA,-10,http://upload.wikimedia.org/wikipedia/commons/...,4/5,,,TF1,https://television.telerama.fr/sites/tr_master...
1,2022-11-03 01:15:00+01:00,2022-11-03 02:05:00+01:00,C192.api.telerama.fr,Debris,Le rectangle,2021,fr,Saison:1 - Episode:3 - Un mystérieux objet rec...,fr,série dramatique,...,0.2.,bilingual,CSA,-10,http://upload.wikimedia.org/wikipedia/commons/...,4/5,,,TF1,https://television.telerama.fr/sites/tr_master...
2,2022-11-03 02:05:00+01:00,2022-11-03 02:50:00+01:00,C192.api.telerama.fr,Debris,Pluie toxique,2021,fr,"Saison:1 - Episode:4 - Finola Jones, agente du...",fr,série dramatique,...,0.3.,bilingual,CSA,-10,http://upload.wikimedia.org/wikipedia/commons/...,4/5,,,TF1,https://television.telerama.fr/sites/tr_master...


In [6]:
# Show every column name (the head() preview above elides the middle ones)
list(df_programs.columns)

['start',
 'stop',
 'channel',
 'title',
 'subtitle',
 'date',
 'desc_lang',
 'desc_text',
 'category_lang',
 'category_text',
 'length_units',
 'length_text',
 'icon_src',
 'episodenum_system',
 'episodenum_text',
 'audio_stereo',
 'rating_system',
 'rating_value',
 'rating_icon_src',
 'starrating_value',
 'subtitles_type',
 'subtitles_language',
 'channel_name',
 'channel_icon']

In [7]:
# Compute duration in minutes
# With this we can ignore "length_units" and "length_text", which are too convoluted
# to_timedelta leaves a timedelta column untouched and converts an object column
# of timedeltas, so the .dt accessor below is always available.
duration = pd.to_timedelta(df_programs["stop"] - df_programs["start"])
# Vectorized equivalent of int(total_seconds / 60): the cast truncates toward zero.
df_programs["length_minutes"] = (duration.dt.total_seconds() / 60).astype("int64")

In [8]:
# Quick look at the distribution of a few descriptive columns.
# display() is called explicitly so that all four tables are rendered; bare
# expressions only show the last one under Jupyter's default settings.
display(df_programs["desc_lang"].value_counts(dropna=False))
display(df_programs["category_lang"].value_counts(dropna=False))
display(df_programs["category_text"].value_counts(dropna=False))
display(df_programs["date"].value_counts(dropna=False).sort_index(ascending=False))

fr     8906
NaN    1062
Name: desc_lang, dtype: int64

fr    9968
Name: category_lang, dtype: int64

série d'animation                       1342
journal                                  654
jeunesse : dessin animé dessin animé     639
météo                                    583
série humoristique                       511
                                        ... 
film : biographie                          1
documentaire lettres                       1
sport : triathlon                          1
emission spéciale                          1
film : court métrage dramatique            1
Name: category_text, Length: 187, dtype: int64

2022     410
2021     456
2020     337
2019     343
2018     307
2017     307
2016     151
2015     165
2014     114
2013     164
2012      89
2011      84
2010     168
2009      93
2008      28
2007      29
2006      66
2005      44
2004      38
2003       7
2002       2
2001      37
2000       8
1999      27
1997       3
1996      51
1995      85
1993      23
1992      78
1990       3
1989       3
1988       2
1985       2
1984       2
1983      18
1981       1
1980      36
1976       2
1974       1
1972       1
1969       1
1966       1
1957       1
1937       1
1933       1
NaN     6178
Name: date, dtype: int64

In [9]:
# Narrow the program table down to the fields used from here on
columns = [
    # timing
    'start', 'stop', 'length_minutes',
    # channel
    'channel_name',
    # program description
    'title', 'subtitle', 'date', 'desc_text', 'category_text',
]
df = df_programs.loc[:, columns]

In [10]:
# Interactive widgets (dropdown, layout boxes, output area)
import ipywidgets as widgets

import warnings
# Suppress every DeprecationWarning from here on; the filter is global to the
# Python process, not limited to one library.
warnings.filterwarnings("ignore", category=DeprecationWarning) 

In [11]:
# Channel selector. "TF1" stays the default when the feed contains it; otherwise
# fall back to the first channel so that building the widget does not fail on an
# invalid initial value.
channel_options = df_channels["channel_name"].unique().tolist()
a = widgets.Dropdown(
    options=channel_options,
    value='TF1' if 'TF1' in channel_options else next(iter(channel_options), None),
    description='Channel:',
    disabled=False,
)

def fn(a):
    """Show the total scheduled minutes per category for one channel.

    Parameters
    ----------
    a : str
        Channel name, compared against the ``channel_name`` column of the
        notebook-level ``df`` frame.

    Prints which channel is being viewed, then sums ``length_minutes`` per
    ``category_text`` and shows the result as a horizontal bar chart. When the
    channel has no rows, a short message is printed and no chart is drawn.
    """
    print(f' Viewing data for {a}')
    df_a = df[df.channel_name == a]
    if df_a.empty:
        # Nothing to aggregate for this channel; skip the empty chart.
        print(f' No programs found for {a}')
        return
    grouped = df_a.groupby("category_text").length_minutes.sum().sort_values(ascending=False)
    fig = grouped.plot.barh()
    # The plotting backend is plotly, so `fig` is a plotly figure; label it so the
    # chart can be read on its own.
    fig.update_layout(
        title=f"Minutes per category on {a}",
        xaxis_title="Total length (minutes)",
        yaxis_title="Category",
    )
    fig.show()

# Re-run fn whenever the dropdown value changes, capturing what it prints and plots
out = widgets.interactive_output(fn, dict(a=a))

# Stack the selector (in its own box) above the output area
widgets.VBox(children=[widgets.VBox(children=[a]), out])

VBox(children=(VBox(children=(Dropdown(description='Channel:', options=('TF1', 'France 2', 'France 3', 'Canal+…