## 5. Analyse van onderscheidende woorden

Ten slotte vergelijken we in dit notebook de woordkeuzes van de rechtse partijen in hun programma’s. Welke woorden karakteriseren bijvoorbeeld de VVD als het gaat over migratie? 

Dat berekenden we aan de hand van een chi-kwadraattoets, waarbij we de frequentie van een woord in een programma afzetten tegen de frequentie van datzelfde woord in de andere programma’s van rechtse partijen van dat jaar. 

We nemen voor deze analyse alleen zelfstandige en bijvoeglijke naamwoorden mee, en woorden die minder dan vijf keer voorkwamen hebben we eruit gefilterd. 

Om het aantal onderwerpen behapbaar te maken, brachten we de 56 categorieën van het Manifesto Project terug tot 12 onderwerpen. Daarvoor gebruikten we de groepering van onderzoekers van de VU.   

In [1]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt

from collections import Counter

from helpers.keyword_analysis import significant_words

# Notebook display set-up: render inline figures in 'retina' format and
# apply the matplotlib style sheet stored one directory above the notebook.
%config InlineBackend.figure_format='retina'
%matplotlib inline
mpl.style.use('../groene.mpl')

In [2]:
# Party labels that count as "right-wing" in this analysis. Both 'FVD' and
# 'FvD' are listed; presumably the source files spell the party differently
# — TODO confirm.
right_wing_parties = ['CDA', 'VVD', 'PVV', 'FVD', 'BBB', 'JA21', 'NSC', 'FvD', 'BVNL']

# Coded manifesto sentences: the 2023 file first, then the 2000-2022 file.
sents = pd.concat([
    pd.read_csv(path)
    for path in ('data/manifestos_nl_2023_coded_sents.csv',
                 'data/manifestos_2000_2022_coded_sents.csv')
])

# Keep only Dutch sentences of the selected parties, with a fresh 0..n-1 index.
is_dutch = sents.country == 'Netherlands'
is_right_wing = sents.party.isin(right_wing_parties)
data = sents[is_dutch & is_right_wing].reset_index(drop=True)

# Topic proportions per manifesto; `year` is the first four characters of `date`.
proportions = pd.read_csv('data/manifestos_2000_2023_proportions.csv')
proportions['year'] = proportions.date.apply(lambda date: int(date[:4]))

In [3]:
import spacy

# Load the spaCy pipeline 'nl_core_news_lg'; it is used further down to
# lemmatise and POS-tag every manifesto sentence.
nlp = spacy.load('nl_core_news_lg')

In [4]:
from tqdm.auto import tqdm

def remove_non_printable(input_string):
    """Return ``input_string`` without non-printable characters.

    Non-printable *whitespace* (newline, tab, carriage return, non-breaking
    space, ...) is replaced by a regular space instead of being deleted.
    Deleting it would glue the words on either side together (``"a\\nb"``
    would become ``"ab"``) and create tokens that never occurred in the text.
    All other non-printable characters (control and format characters) are
    dropped.

    Parameters
    ----------
    input_string : str
        Raw text.

    Returns
    -------
    str
        The cleaned text; an empty input yields an empty string.
    """
    return "".join(
        c if c.isprintable() else " "      # non-printable whitespace -> space
        for c in input_string
        if c.isprintable() or c.isspace()  # drop every other non-printable char
    )

# Clean every sentence before it is parsed.
texts = [remove_non_printable(sentence) for sentence in tqdm(list(data.text))]

# Run the spaCy pipeline over all sentences. NER and the dependency parser
# are disabled; only lemmas and POS tags are read below.
doc_stream = nlp.pipe(texts,
                      n_process=4,  # four worker processes (not threads)
                      disable=["ner", "parser"])
processed_texts = list(tqdm(doc_stream, total=len(texts)))

# Per sentence: lower-cased lemmas of nouns and adjectives that are longer
# than two characters.
tokenized_texts = []
for doc in processed_texts:
    lemmas = (token.lemma_.lower() for token in doc
              if token.pos_ in ['NOUN', 'ADJ'])
    tokenized_texts.append([lemma for lemma in lemmas if len(lemma) > 2])

  0%|          | 0/41350 [00:00<?, ?it/s]

  0%|          | 0/41350 [00:00<?, ?it/s]

In [5]:
# Store the token list of each sentence next to the sentence itself, so the
# tokens can be selected per party and per election further down.
data['tokenized_texts'] = tokenized_texts

In [6]:
# Result container, filled by the cells below:
# party_dict[party][issue][year] -> list of at most five distinctive words.
party_dict = {}

## Eerst: onderscheidende woorden algemeen

In [7]:
issue = 'Algemeen'

# Flattens a list of lists into a single list (not called in this cell).
aflatten = lambda t: [item for sublist in t for item in sublist]

# Election year as an integer: the first four characters of `date`.
data['year'] = data.date.apply(lambda x: int(x[:4]))

# Words that are never reported as distinctive.
filter_words = ['forum', 'democratie', 'ers']

# One comparison per election date (after 2003) and per party in that election.
for election_date, election_df in data[data.year > 2003].groupby('date'):
    for party in set(election_df.party):
        # Ensure party_dict[party][issue] exists, even if nothing is stored below.
        top_words = party_dict.setdefault(party, dict()).setdefault(issue, dict())

        # Token lists of this party versus those of all other selected parties.
        own_rows = election_df.party == party
        source = election_df[own_rows].tokenized_texts.values
        target = election_df[~own_rows].tokenized_texts.values

        results = pd.DataFrame(significant_words(source, target))

        # NOTE(review): the pattern '-|x' removes hyphenated words but also
        # every word that merely contains the letter 'x' — confirm this is intended.
        allowed_word = ~results.word.str.contains('-|x') & ~results.word.isin(filter_words)
        results = results[allowed_word]

        # At least five occurrences for this party and at least one elsewhere,
        # ranked by the chi statistic.
        frequent_enough = (results.freq_c1 >= 5) & (results.freq_c2 > 0)
        results = results[frequent_enough].sort_values('chi', ascending=False)

        # Keep only results significant at the 5% level.
        results = results[results.chi_p < 0.05]

        if len(results):
            # Store the five top-ranked words under the four-digit year.
            top_words[election_date[:4]] = list(results.word.values[:5])

  0%|          | 0/1199 [00:00<?, ?it/s]

  0%|          | 0/3618 [00:00<?, ?it/s]

  0%|          | 0/385 [00:00<?, ?it/s]

  0%|          | 0/2347 [00:00<?, ?it/s]

  0%|          | 0/3557 [00:00<?, ?it/s]

  0%|          | 0/1227 [00:00<?, ?it/s]

  0%|          | 0/2380 [00:00<?, ?it/s]

  0%|          | 0/1957 [00:00<?, ?it/s]

  0%|          | 0/1424 [00:00<?, ?it/s]

  0%|          | 0/3288 [00:00<?, ?it/s]

  0%|          | 0/2510 [00:00<?, ?it/s]

  0%|          | 0/73 [00:00<?, ?it/s]

  0%|          | 0/1082 [00:00<?, ?it/s]

  0%|          | 0/3230 [00:00<?, ?it/s]

  0%|          | 0/1302 [00:00<?, ?it/s]

  0%|          | 0/2889 [00:00<?, ?it/s]

  0%|          | 0/4813 [00:00<?, ?it/s]

  0%|          | 0/1524 [00:00<?, ?it/s]

  0%|          | 0/2231 [00:00<?, ?it/s]

  0%|          | 0/2891 [00:00<?, ?it/s]

  0%|          | 0/1207 [00:00<?, ?it/s]

  0%|          | 0/2169 [00:00<?, ?it/s]

  0%|          | 0/3519 [00:00<?, ?it/s]

  0%|          | 0/3351 [00:00<?, ?it/s]

  0%|          | 0/3961 [00:00<?, ?it/s]

  0%|          | 0/1716 [00:00<?, ?it/s]

  0%|          | 0/2044 [00:00<?, ?it/s]

In [8]:
# General ('Algemeen') distinctive words of the VVD, keyed by election year.
party_dict['VVD']

{'Algemeen': {'2006': ['procent',
   'provincie',
   'euro',
   'burgemeester',
   'regel'],
  '2010': ['overheid', 'krijgsmacht', 'euro', 'ondernemer', 'kind'],
  '2012': ['ondernemer', 'opleiding', 'periode', 'deel', 'belangrijk'],
  '2017': ['mogelijk', 'makkelijk', 'weg', 'techniek', 'klimaat'],
  '2021': ['bedrijf', 'ondernemer', 'jaar', 'europees', 'mogelijkheid'],
  '2023': ['schoon', 'grip', 'fijn', 'veilig', 'vlot']}}

In [9]:
# General ('Algemeen') distinctive words of BBB, keyed by election year.
party_dict['BBB']

{'Algemeen': {'2021': ['platteland',
   'boer',
   'voedselproductie',
   'dier',
   'gratis'],
  '2023': ['platteland', 'hoofdstuk', 'regio', 'agrarisch', 'burger']}}

In [10]:
# General ('Algemeen') distinctive words of NSC, keyed by election year.
party_dict['NSC']

{'Algemeen': {'2023': ['bestaanszekerheid',
   'uitvoering',
   'kamer',
   'verkiezingsprogramma',
   'bestuur']}}

## Dan: onderscheidende woorden per onderwerp

In [11]:
# Table that maps each code (column `cmp`) to a topic label (column `label`).
topic_table = pd.read_csv('https://raw.githubusercontent.com/vanatteveldt/2023-manifestos-nl/main/data/raw/cmp_topics.csv')
code2issue = dict(zip(topic_table['cmp'], topic_table['label']))

# Label every sentence with its topic; codes missing from the table get ''.
data['issue'] = data.code.apply(lambda code: code2issue.get(code, ''))

In [12]:
# Topics for which distinctive words are computed.
issue2include = ['Migratie', 'Bestaanszekerheid', 'Veiligheid', 'Milieu', 'Economie', 'Normen']

# Flattens a list of lists into a single list (not called in this cell).
# Defined once here instead of on every loop iteration.
aflatten = lambda t: [item for sublist in t for item in sublist]

# Election year as an integer (first four characters of `date`). It does not
# depend on the issue, so it is computed once instead of once per issue.
data['year'] = data.date.apply(lambda x: int(x[:4]))

for issue in issue2include:
    # Sentences on this issue from elections after 2003, one group per election date.
    issue_df = data[(data.year > 2003) & (data.issue == issue)]
    for year, year_df in issue_df.groupby('date'):
        for party in set(year_df.party):
            # Ensure party_dict[party][issue] exists, even if nothing is stored below.
            if party not in party_dict:
                party_dict[party] = dict()
            if issue not in party_dict[party]:
                party_dict[party][issue] = dict()

            # Token lists of this party versus those of all other selected parties.
            source = year_df[year_df.party == party].tokenized_texts.values
            target = year_df[year_df.party != party].tokenized_texts.values
            results = pd.DataFrame(significant_words(source, target, tests=('llr', 'chi')))
            if results.empty:
                # Nothing to rank (e.g. no comparison material); an empty
                # table may also lack the columns used below.
                continue

            # `filter_words` is defined in the 'Algemeen' cell above.
            # NOTE(review): the pattern '-|x' removes hyphenated words but also
            # every word that merely contains the letter 'x' — confirm this is intended.
            results = results[(~results.word.str.contains('-|x')) & (~results.word.isin(filter_words))]

            # At least five occurrences for this party and at least one
            # elsewhere, ranked by the chi statistic.
            # NOTE(review): ranking on `chi` alone does not separate over-use
            # from under-use unless significant_words already does — TODO confirm.
            results = results[(results.freq_c1 >= 5) & (results.freq_c2 > 0)].sort_values('chi', ascending=False)

            # Keep only results significant at the 5% level.
            results = results[results.chi_p < 0.05]
            if len(results) > 0:
                # Store the five top-ranked words under the four-digit year.
                party_dict[party][issue][year[:4]] = list(results.word.values[:5])

  0%|          | 0/73 [00:00<?, ?it/s]

  0%|          | 0/427 [00:00<?, ?it/s]

  0%|          | 0/110 [00:00<?, ?it/s]

  0%|          | 0/226 [00:00<?, ?it/s]

  0%|          | 0/344 [00:00<?, ?it/s]

  0%|          | 0/337 [00:00<?, ?it/s]

  0%|          | 0/189 [00:00<?, ?it/s]

  0%|          | 0/180 [00:00<?, ?it/s]

  0%|          | 0/334 [00:00<?, ?it/s]

  0%|          | 0/572 [00:00<?, ?it/s]

  0%|          | 0/256 [00:00<?, ?it/s]

  0%|          | 0/35 [00:00<?, ?it/s]

  0%|          | 0/137 [00:00<?, ?it/s]

  0%|          | 0/519 [00:00<?, ?it/s]

  0%|          | 0/417 [00:00<?, ?it/s]

  0%|          | 0/354 [00:00<?, ?it/s]

  0%|          | 0/624 [00:00<?, ?it/s]

  0%|          | 0/98 [00:00<?, ?it/s]

  0%|          | 0/307 [00:00<?, ?it/s]

  0%|          | 0/416 [00:00<?, ?it/s]

  0%|          | 0/311 [00:00<?, ?it/s]

  0%|          | 0/260 [00:00<?, ?it/s]

  0%|          | 0/414 [00:00<?, ?it/s]

  0%|          | 0/474 [00:00<?, ?it/s]

  0%|          | 0/405 [00:00<?, ?it/s]

  0%|          | 0/164 [00:00<?, ?it/s]

  0%|          | 0/205 [00:00<?, ?it/s]

  0%|          | 0/177 [00:00<?, ?it/s]

  0%|          | 0/926 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/417 [00:00<?, ?it/s]

  0%|          | 0/385 [00:00<?, ?it/s]

  0%|          | 0/115 [00:00<?, ?it/s]

  0%|          | 0/540 [00:00<?, ?it/s]

  0%|          | 0/455 [00:00<?, ?it/s]

  0%|          | 0/145 [00:00<?, ?it/s]

  0%|          | 0/514 [00:00<?, ?it/s]

  0%|          | 0/620 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

  0%|          | 0/95 [00:00<?, ?it/s]

  0%|          | 0/744 [00:00<?, ?it/s]

  0%|          | 0/275 [00:00<?, ?it/s]

  0%|          | 0/566 [00:00<?, ?it/s]

  0%|          | 0/1069 [00:00<?, ?it/s]

  0%|          | 0/249 [00:00<?, ?it/s]

  0%|          | 0/440 [00:00<?, ?it/s]

  0%|          | 0/634 [00:00<?, ?it/s]

  0%|          | 0/224 [00:00<?, ?it/s]

  0%|          | 0/469 [00:00<?, ?it/s]

  0%|          | 0/744 [00:00<?, ?it/s]

  0%|          | 0/659 [00:00<?, ?it/s]

  0%|          | 0/677 [00:00<?, ?it/s]

  0%|          | 0/269 [00:00<?, ?it/s]

  0%|          | 0/381 [00:00<?, ?it/s]

  0%|          | 0/247 [00:00<?, ?it/s]

  0%|          | 0/486 [00:00<?, ?it/s]

  0%|          | 0/96 [00:00<?, ?it/s]

  0%|          | 0/517 [00:00<?, ?it/s]

  0%|          | 0/563 [00:00<?, ?it/s]

  0%|          | 0/231 [00:00<?, ?it/s]

  0%|          | 0/533 [00:00<?, ?it/s]

  0%|          | 0/269 [00:00<?, ?it/s]

  0%|          | 0/222 [00:00<?, ?it/s]

  0%|          | 0/755 [00:00<?, ?it/s]

  0%|          | 0/521 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/152 [00:00<?, ?it/s]

  0%|          | 0/528 [00:00<?, ?it/s]

  0%|          | 0/213 [00:00<?, ?it/s]

  0%|          | 0/476 [00:00<?, ?it/s]

  0%|          | 0/1149 [00:00<?, ?it/s]

  0%|          | 0/76 [00:00<?, ?it/s]

  0%|          | 0/266 [00:00<?, ?it/s]

  0%|          | 0/656 [00:00<?, ?it/s]

  0%|          | 0/211 [00:00<?, ?it/s]

  0%|          | 0/311 [00:00<?, ?it/s]

  0%|          | 0/606 [00:00<?, ?it/s]

  0%|          | 0/969 [00:00<?, ?it/s]

  0%|          | 0/623 [00:00<?, ?it/s]

  0%|          | 0/252 [00:00<?, ?it/s]

  0%|          | 0/197 [00:00<?, ?it/s]

  0%|          | 0/98 [00:00<?, ?it/s]

  0%|          | 0/646 [00:00<?, ?it/s]

  0%|          | 0/14 [00:00<?, ?it/s]

  0%|          | 0/215 [00:00<?, ?it/s]

  0%|          | 0/717 [00:00<?, ?it/s]

  0%|          | 0/105 [00:00<?, ?it/s]

  0%|          | 0/163 [00:00<?, ?it/s]

  0%|          | 0/294 [00:00<?, ?it/s]

  0%|          | 0/44 [00:00<?, ?it/s]

  0%|          | 0/764 [00:00<?, ?it/s]

  0%|          | 0/387 [00:00<?, ?it/s]

  0%|          | 0/73 [00:00<?, ?it/s]

  0%|          | 0/396 [00:00<?, ?it/s]

  0%|          | 0/113 [00:00<?, ?it/s]

  0%|          | 0/531 [00:00<?, ?it/s]

  0%|          | 0/826 [00:00<?, ?it/s]

  0%|          | 0/670 [00:00<?, ?it/s]

  0%|          | 0/171 [00:00<?, ?it/s]

  0%|          | 0/624 [00:00<?, ?it/s]

  0%|          | 0/189 [00:00<?, ?it/s]

  0%|          | 0/472 [00:00<?, ?it/s]

  0%|          | 0/736 [00:00<?, ?it/s]

  0%|          | 0/769 [00:00<?, ?it/s]

  0%|          | 0/1151 [00:00<?, ?it/s]

  0%|          | 0/331 [00:00<?, ?it/s]

  0%|          | 0/236 [00:00<?, ?it/s]

  0%|          | 0/257 [00:00<?, ?it/s]

  0%|          | 0/680 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/583 [00:00<?, ?it/s]

  0%|          | 0/973 [00:00<?, ?it/s]

  0%|          | 0/257 [00:00<?, ?it/s]

  0%|          | 0/745 [00:00<?, ?it/s]

  0%|          | 0/464 [00:00<?, ?it/s]

  0%|          | 0/202 [00:00<?, ?it/s]

  0%|          | 0/865 [00:00<?, ?it/s]

  0%|          | 0/556 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/177 [00:00<?, ?it/s]

  0%|          | 0/570 [00:00<?, ?it/s]

  0%|          | 0/132 [00:00<?, ?it/s]

  0%|          | 0/626 [00:00<?, ?it/s]

  0%|          | 0/1113 [00:00<?, ?it/s]

  0%|          | 0/176 [00:00<?, ?it/s]

  0%|          | 0/398 [00:00<?, ?it/s]

  0%|          | 0/371 [00:00<?, ?it/s]

  0%|          | 0/107 [00:00<?, ?it/s]

  0%|          | 0/368 [00:00<?, ?it/s]

  0%|          | 0/717 [00:00<?, ?it/s]

  0%|          | 0/512 [00:00<?, ?it/s]

  0%|          | 0/558 [00:00<?, ?it/s]

  0%|          | 0/334 [00:00<?, ?it/s]

  0%|          | 0/339 [00:00<?, ?it/s]

  0%|          | 0/257 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/78 [00:00<?, ?it/s]

  0%|          | 0/566 [00:00<?, ?it/s]

  0%|          | 0/894 [00:00<?, ?it/s]

  0%|          | 0/121 [00:00<?, ?it/s]

  0%|          | 0/484 [00:00<?, ?it/s]

  0%|          | 0/618 [00:00<?, ?it/s]

  0%|          | 0/127 [00:00<?, ?it/s]

  0%|          | 0/503 [00:00<?, ?it/s]

  0%|          | 0/706 [00:00<?, ?it/s]

  0%|          | 0/80 [00:00<?, ?it/s]

  0%|          | 0/604 [00:00<?, ?it/s]

  0%|          | 0/92 [00:00<?, ?it/s]

  0%|          | 0/930 [00:00<?, ?it/s]

  0%|          | 0/944 [00:00<?, ?it/s]

  0%|          | 0/297 [00:00<?, ?it/s]

  0%|          | 0/517 [00:00<?, ?it/s]

  0%|          | 0/397 [00:00<?, ?it/s]

  0%|          | 0/177 [00:00<?, ?it/s]

  0%|          | 0/657 [00:00<?, ?it/s]

  0%|          | 0/918 [00:00<?, ?it/s]

  0%|          | 0/673 [00:00<?, ?it/s]

  0%|          | 0/1129 [00:00<?, ?it/s]

  0%|          | 0/416 [00:00<?, ?it/s]

  0%|          | 0/451 [00:00<?, ?it/s]

In [14]:
# Distinctive words of the VVD on the topic 'Migratie', keyed by election year.
party_dict['VVD']['Migratie']

{'2010': ['verblijfsvergunning', 'immigratie'],
 '2012': ['nederlands', 'migrant'],
 '2017': ['leven'],
 '2021': ['europees', 'veilig', 'bijdrage', 'vrij', 'grens'],
 '2023': ['nodig', 'grip', 'instroom', 'talent', 'arbeidsmigrant']}

In [15]:
# Distinctive words of BBB on the topic 'Migratie', keyed by election year.
party_dict['BBB']['Migratie']

{'2021': ['mens'], '2023': ['verhaal', 'eerlijk', 'mens', 'politiek', 'trots']}

In [16]:
# Distinctive words of FvD on the topic 'Migratie', keyed by election year.
party_dict['FvD']['Migratie']

{'2017': ['immigrant'], '2021': ['model', 'immigratie', 'nederlands', 'land']}

In [17]:
# Distinctive words of NSC on the topic 'Migratie', keyed by election year.
party_dict['NSC']['Migratie']

{'2023': ['migratiesaldo', 'regionaal', 'taal', 'beleid', 'afspraak']}

In [19]:
# Distinctive words of the VVD on the topic 'Milieu', keyed by election year.
party_dict['VVD']['Milieu']

{'2010': ['schoon', 'brandstof', 'natuur', 'fossiel', 'kans'],
 '2012': ['mens'],
 '2017': ['duurzaam', 'economie'],
 '2021': ['europees', 'uitstoot', 'kernenergie', 'auto', 'bedrijf'],
 '2023': ['schoon', 'huis', 'manier', 'wijk', 'innovatief']}