In [50]:
import pandas as pd
import os
import warnings
warnings.filterwarnings('ignore')

In [51]:
# Load the dataset from ../docs/data/agcomdata.parquet (relative to the
# notebook's working directory). os.path.join inserts the platform separator,
# replacing the manual os.sep string concatenation.
data = pd.read_parquet(os.path.join("..", "docs", "data", "agcomdata.parquet"))

In [52]:
# Show the column labels of the loaded frame.
data.columns

Index(['channel', 'program', 'day', 'lastname', 'name', 'affiliation', 'topic',
       'duration', 'kind'],
      dtype='object')

# Description
<table>
<tr>
<th>columns</th>
<th>description</th>
<th>content</th>
</tr>
<tr>
<td>channel</td><td>channel name</td><td>category</td>
</tr>
<tr>
<td>program</td><td>name of the program</td><td>category</td>
</tr>
<tr>
<td>day</td><td>day of the event</td><td>pandas datetime (datetime64[ns]), displayed as yyyy-mm-dd</td>
</tr>
<tr>
<td>name</td><td>first name of the person concerned, or the literal "political movement"</td><td>text - if the value is "political movement", the row refers to a political movement whose name is stored in the field lastname</td>
</tr>
<tr>
<td>lastname</td><td>surname of the person concerned or political movement</td><td>text, if name=="political movement" this field contains the name of the movement</td>
</tr>
<tr>
<td>topic</td><td>main topic of the discussion</td><td>category</td>
</tr>
<tr>
<td>affiliation</td><td>affiliation</td><td>category</td>
</tr>
<tr>
<td>duration</td><td>duration of the intervention in minutes</td><td>integer</td>
</tr>
<tr>
<td>kind</td><td>type of intervention</td><td>can take on the values "Parola" (talk) or "Notizia" (news)</td>
</tr>
</table>

In [53]:
# Distinct values found in the channel column.
data["channel"].unique()

array(['CANALE 5', 'CIELO', 'ITALIA 1', 'LA7', 'LA7D',
       'Mediaset TgCom 24', 'NOVE', 'RAI 1', 'RAI 2', 'RAI 3', 'RAINEWS',
       'RETE 4', 'SKY TG 24 CAN. 50', 'SKY TG24', 'TV8', 'RTL102.5',
       'Rai Radio1', 'Rai Radio2', 'Rai Radio3'], dtype=object)

In [54]:
# Distinct program titles found in the program column.
data["program"].unique()

array(['TG5', 'TG5 PRIMA PAGINA', 'MATTINO CINQUE - NEWS', ...,
       'MESSAGGIO DI FINE ANNO DEL PRESIDENTE DELLA REPUBBLICA SERGIO MATTARELLA',
       'CLIP: FATTI ED EVENTI INTERNAZIONALI CHE HANNO CARATTERIZZATO IL 2020',
       'CLIP: MESSAGGIO DI FINE ANNO DEL PRESIDENTE DELLA REPUBBLICA SERGIO MATTARELLA'],
      dtype=object)

In [55]:
# First three entries of the day column (also reveals its dtype).
data["day"].head(3)

2   2023-01-01
3   2023-01-01
4   2023-01-01
Name: day, dtype: datetime64[ns]

In [56]:
# Collect the distinct values of the name column and print only the first
# five to keep the output short.
name_unique = data["name"].unique()
print(name_unique[:5])

['political movement' 'Silvio' 'Giuseppe' 'Enrico' 'Matteo']


In [57]:
# Distinct values found in the lastname column.
data["lastname"].unique()

array(['Azione-Italia Viva-Renew Europe', 'Berlusconi', 'Conte', ...,
       'Delli Colli', 'Calearo', 'Gramazio'], dtype=object)

In [58]:
# Rows whose name is "political movement": show the first three lastname
# values. A single .loc[mask, column] selection replaces the chained
# data[mask]['lastname'] lookup, which first built a filtered copy of every
# column before picking one.
data.loc[data["name"] == "political movement", "lastname"].head(3)

2     Azione-Italia Viva-Renew Europe
8     Azione-Italia Viva-Renew Europe
18    Azione-Italia Viva-Renew Europe
Name: lastname, dtype: object

In [59]:
# Rows whose name is anything other than "political movement": show the
# first three lastname values. A single .loc[mask, column] selection replaces
# the chained data[mask]['lastname'] lookup, which first built a filtered
# copy of every column before picking one.
data.loc[data["name"] != "political movement", "lastname"].head(3)

3    Berlusconi
4         Conte
5         Letta
Name: lastname, dtype: object

In [60]:
# Distinct values found in the topic column.
data["topic"].unique()

array(['Politica e attività istituzionali',
       'Religione e questioni religiose', 'Medicina, salute e scienza',
       'Esteri', 'Economia, finanza e lavoro', 'Costume e società',
       'Sport', 'Società', 'Cronaca', 'Cultura e istruzione', 'Ambiente',
       'Giustizia', 'Altro', 'Mass media e spettacolo', 'Programma',
       'Fine', 'Informazioni di servizio', 'Pubblicità'], dtype=object)

In [61]:
# Collect the distinct values of the affiliation column and print only the
# first five to keep the output short.
unique_affiliations = data["affiliation"].unique()
print(unique_affiliations[:5])

['Azione-Italia Viva-Renew Europe' 'PDL - Forza Italia'
 'Movimento 5 Stelle' 'Partito Democratico' 'Lega Nord']


In [62]:
# Distinct values found in the kind column.
data["kind"].unique()

array(['Notizia', 'Parola'], dtype=object)

In [63]:
# First five entries of the duration column (also reveals its dtype).
data["duration"].head(5)

2     7
3    42
4    24
5    25
6    17
Name: duration, dtype: int64