## Load the .parquet file to get all the speakers with political parties

In [14]:
import pandas as pd
import pywikibot
import os

data_dir = '../../data/'
# The speaker attributes are stored as a directory of parquet part-files.
parquet_dir = data_dir + 'speaker_attributes/speaker_attributes.parquet/'
politician_columns = ['id', 'label', 'party']

# Read every part-file and keep only the speakers that have a political party.
# The parts are gathered in a list and concatenated once: `DataFrame.append`
# was removed in pandas 2.0, and appending inside the loop re-copies the
# accumulated frame for every file.
politician_parts = []
for file in os.listdir(parquet_dir):
    if file.endswith('.parquet'):
        df = pd.read_parquet(os.path.join(parquet_dir, file))
        politician_parts.append(df.loc[df.party.notnull(), politician_columns])

if politician_parts:
    politicians = pd.concat(politician_parts)
else:
    # No part-file found: `pd.concat([])` would raise, so fall back to an
    # empty frame that still carries the expected columns.
    politicians = pd.DataFrame(columns=politician_columns)

# Index by speaker id; reassign instead of mutating in place so the cell can
# be re-run safely.
politicians = politicians.set_index('id')

## Get all the different political parties

In [2]:
# Build the party table: one row per distinct party found in the politicians'
# `party` entries (each entry is exploded into one value per row first).
party_ids = politicians['party'].explode().unique()
parties = pd.DataFrame(index=party_ids)

n_politicians = len(politicians)
n_parties = len(parties)
print('Number of politicians:', n_politicians, 'Number of parties:', n_parties)

Number of politicians: 399176 Number of parties: 9632


## Append the politicians to each party

In [50]:
# One row per (politician, party) pair, so that grouping by party yields the
# members of each party.
politicians_expanded = politicians.explode('party')

# Object column that will hold, for every party, the list of member ids.
parties['politicians'] = None
for group_name, df_group in politicians_expanded.groupby('party'):
    # `.at` stores the list as a single cell value. `.loc[...] = list` treats
    # the list as values to align, which only works for some frame layouts;
    # `.at` is also what the Wikidata fetch cell uses.
    parties.at[group_name, 'politicians'] = df_group.index.tolist()

In [3]:
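# NOTE(review): commented-out inline draft of the Wikidata helper. A function
# with the same name is imported from `wikidata_fetch` in the fetch cell below,
# so this copy is presumably superseded - confirm before removing it.
# repo = pywikibot.Site("wikidata", "wikidata").data_repository()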
# def follow_links_to_label(node_id, links):
#     initial_node = pywikibot.ItemPage(repo, node_id)
#     return follow_links_to_label_repo(initial_node, links, repo)

# def follow_links_to_label_repo(node, links, repo):
#     # repo = pywikibot.Site("wikidata", "wikidata").data_repository()
#     item_dict = node.get()
#     if len(links) > 0:
#         labels = []
#         clm_dict = item_dict["claims"]
#         if links[0] in clm_dict:
#             clm_list = clm_dict[links[0]]
#             for clm in clm_list:
#                 # This is the party
#                 if clm.getTarget():
#                     labels.append(follow_links_to_label_repo(clm.getTarget(), links[1:], repo))
#         return labels
#     else:
#         # Get label
#         if 'labels' in item_dict and 'en' in item_dict['labels']:
#             return item_dict['labels']['en']
#         else:
#             return None

## Fetch the political alignment, country and label of each party from Wikidata

In [13]:
from wikidata_fetch import follow_links_to_label

file_out = data_dir + 'politician_quotes_dataset/parties_new.csv.gz'

# Wikidata property ids used in this notebook.
property_mapping = {'political_alignment': 'P1387',
                    'member_of_political_party': 'P102',
                    'country': 'P17'}

# Output column -> list of property ids handed to `follow_links_to_label`.
# 'label' passes an empty list; presumably the helper then returns the label
# of the party item itself (see `wikidata_fetch` to confirm).
columns = {'political_alignment': [property_mapping['political_alignment']],
           'country': [property_mapping['country']],
           'label': []}

# Create the result columns only when they are missing. Resetting them to None
# unconditionally would erase values already fetched in this kernel whenever
# the cell is re-run to resume an interrupted fetch.
for col in columns:
    if col not in parties.columns:
        parties[col] = None

# Position of the first party to fetch. 3698 looks like the resume point of an
# earlier interrupted run; set it to 0 to fetch every party from scratch.
start_row = 3698

# For each of the parties, add the political alignment, country and label.
for index in parties.index[start_row:]:
    for col in columns:
        try:
            parties.at[index, col] = follow_links_to_label(index, columns[col])
        except Exception as e:
            # Best effort: keep going, but report why the lookup failed so a
            # bad id can be told apart from e.g. a network error.
            print('Invalid index:', index, 'Col:', col, 'Error:', repr(e))
parties.to_csv(file_out, compression="gzip")


Invalid index: Q3345151 Col: political_alignment
Invalid index: Q3345151 Col: country
Invalid index: Q3345151 Col: label
Invalid index: Q7287097 Col: political_alignment
Invalid index: Q7287097 Col: country
Invalid index: Q7287097 Col: label




Invalid index: Q6974404 Col: political_alignment
Invalid index: Q6974404 Col: country
Invalid index: Q6974404 Col: label




Invalid index: Q55548162 Col: political_alignment
Invalid index: Q55548162 Col: country
Invalid index: Q55548162 Col: label
