# Imports

In [1]:
import xml
import xml.etree.ElementTree as ET

import string
import unidecode
import re

import spacy
from spacy.lang.nl.stop_words import STOP_WORDS

from sklearn.preprocessing import MultiLabelBinarizer

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

from sklearn.model_selection import train_test_split

from sklearn.metrics import precision_score, recall_score, f1_score, hamming_loss

from skmultilearn.problem_transform import BinaryRelevance, ClassifierChain, LabelPowerset

In [2]:
nlp = spacy.load('nl_core_news_lg')
stops_words = STOP_WORDS

## Exploring Data

#### Extracting Text Data

In [60]:
document = xml.dom.minidom.parse('data/Political_election_manifestos/VP_1986.party-topicnr-content.xml')

In [61]:
def apply_text_preprocessing(text: str):
    # Strip + Lower case
    text = text.strip().lower()
    # Remove digits
    text = unidecode.unidecode(text)
    # Remove Punctuation
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    #  Remove stop words and Lemmatisation
    text = ' '.join([token.lemma_ for token in nlp(text) if token.is_stop == False and token.text != ' '])

    return text

In [62]:
def create_dictionary(xml_document):
    chapters = {}
    for chapter in document.firstChild.getElementsByTagName('chapter'):
        party = chapter.getAttribute('party')
        # print(chapter.getAttribute('party'))
        par = {}
        for paragraph in chapter.getElementsByTagName('p'):
            id = paragraph.getAttribute('id')
            # print(paragraph.getAttribute('id'))
            
            paragraph_value = paragraph.childNodes[2].nodeValue
            paragraph_value = apply_text_preprocessing(paragraph_value)
            # print(paragraph_value)

            paragraph_themes = [theme.getAttribute('id').lower() for theme in paragraph.getElementsByTagName('theme')]
            # print(paragraph_themes)

            par[id] = {'p': paragraph_value, 'themes': paragraph_themes}
        chapters[party] = par
    return chapters

In [63]:
def create_X_y(chapters: dict):
    X, y = [], []
    for chapter in chapters.keys():
        for text in chapters[chapter].keys():
            # if chapters[chapter][text]['p'] != '':
            X.append(chapters[chapter][text]['p']) 
            y.append(chapters[chapter][text]['themes'])
    return X, y

In [64]:
X, y = create_X_y(create_dictionary(document))

#### Extracting All Themes

In [65]:
# taxonomy_xml = ET.parse(f'data/Political_election_manifestos/taxonomy.1986.xml')
taxonomy_xml = xml.dom.minidom.parse(f'data/Political_election_manifestos/taxonomy.1986.xml')

In [66]:
def extract_themes(xml_document, y_labels: list):
    themes_set = set()
    for theme in xml_document.getElementsByTagName('theme'):
        themes_set.add(theme.getAttribute('id').lower())

    for theme in y_labels:
        for elem in theme:
            themes_set.add(elem)

    return themes_set

In [67]:
themes_set = extract_themes(taxonomy_xml, y)

### Label Binarizer

In [68]:
multilabel_bin = MultiLabelBinarizer()
multilabel_bin.fit_transform([themes_set])
y_bin = multilabel_bin.transform(y)

### TF_IDF

In [69]:
# play around with n_gram and max_features
vectorizer = TfidfVectorizer(strip_accents='unicode', analyzer='word', ngram_range=(1,3), norm='l2', max_features = 10000)

X_tfidf = vectorizer.fit_transform(X)

# Models

### Train-Test Split

In [70]:
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y_bin, test_size=0.33, random_state=42)

## BinaryRelevance

In [71]:
def binary_relevance(model, X_train, y_train, X_test, y_test):
    bn_model = BinaryRelevance(model)
    bn_model.fit(X_train, y_train)
    preds = bn_model.predict(X_test)
    print('Precision score:', precision_score(y_test, preds, average='micro'))
    print('Recall score:', recall_score(y_test, preds, average='micro'))
    print('F1 score:', f1_score(y_test, preds, average='micro'))
    print('Hamming distance:', hamming_loss(y_test, preds))

#### MultinomialNB

In [72]:
binary_relevance(MultinomialNB(), X_train, y_train, X_test, y_test)

Precision score: 0.8106508875739645
Recall score: 0.04928057553956835
F1 score: 0.0929128518141743
Hamming distance: 0.0433016058016058


#### DecisionTreeClassifier

In [18]:
binary_relevance(DecisionTreeClassifier(), X_train, y_train, X_test, y_test)

Precision score: 0.5929961089494163
Recall score: 0.5482014388489208
F1 score: 0.5697196261682242
Hamming distance: 0.03726366226366226


#### ExtraTreesClassifier

In [19]:
binary_relevance(ExtraTreesClassifier(), X_train, y_train, X_test, y_test)

Precision score: 0.8558692421991084
Recall score: 0.20719424460431654
F1 score: 0.33362293657688963
Hamming distance: 0.037247474747474744


#### RandomForestClassifier

In [20]:
binary_relevance(RandomForestClassifier(), X_train, y_train, X_test, y_test)

Precision score: 0.9011627906976745
Recall score: 0.1672661870503597
F1 score: 0.28216019417475724
Hamming distance: 0.0382996632996633


#### KNeighborsClassifier

In [21]:
binary_relevance(KNeighborsClassifier(), X_train, y_train, X_test, y_test)

Precision score: 0.6
Recall score: 0.002158273381294964
F1 score: 0.004301075268817205
Hamming distance: 0.04496891996891997


#### MLPClassifier

In [32]:
binary_relevance(MLPClassifier(hidden_layer_sizes=(50,)), X_train, y_train, X_test, y_test)

Precision score: 0.7338877338877339
Recall score: 0.2539568345323741
F1 score: 0.37733832175307325
Hamming distance: 0.037716912716912715


## Classifier Chaining

In [16]:
def classifier_chaining(model, X_train, y_train, X_test, y_test):
    cc_model = ClassifierChain(model)
    cc_model.fit(X_train, y_train)
    preds = cc_model.predict(X_test)
    print('Precision score:', precision_score(y_test, preds, average='micro'))
    print('Recall score:', recall_score(y_test, preds, average='micro'))
    print('F1 score:', f1_score(y_test, preds, average='micro'))
    print('Hamming distance:', hamming_loss(y_test, preds))

#### MultinomialNB

In [17]:
classifier_chaining(MultinomialNB(), X_train, y_train, X_test, y_test)

Precision score: 0.875
Recall score: 0.05539568345323741
F1 score: 0.10419485791610283
Hamming distance: 0.042864542864542866


#### DecisionTreeClassifier

In [18]:
classifier_chaining(DecisionTreeClassifier(), X_train, y_train, X_test, y_test)

Precision score: 0.5770810003732736
Recall score: 0.556115107913669
F1 score: 0.5664041033156255
Hamming distance: 0.038315850815850816


#### ExtraTreesClassifier

In [19]:
classifier_chaining(ExtraTreesClassifier(), X_train, y_train, X_test, y_test)

Precision score: 0.8615384615384616
Recall score: 0.16115107913669063
F1 score: 0.27151515151515154
Hamming distance: 0.03891478891478892


#### RandomForestClassifier

In [20]:
classifier_chaining(RandomForestClassifier(), X_train, y_train, X_test, y_test)

Precision score: 0.8930232558139535
Recall score: 0.1381294964028777
F1 score: 0.23925233644859814
Hamming distance: 0.03952991452991453


#### KNeighborsClassifier

In [21]:
classifier_chaining(KNeighborsClassifier(), X_train, y_train, X_test, y_test)

Precision score: 0.2959770114942529
Recall score: 0.03705035971223022
F1 score: 0.06585677749360615
Hamming distance: 0.047299922299922297


#### MLPClassifier

In [29]:
classifier_chaining(MLPClassifier(hidden_layer_sizes=(50,)), X_train, y_train, X_test, y_test)

Precision score: 0.7320061255742726
Recall score: 0.17194244604316547
F1 score: 0.27847363821730264
Hamming distance: 0.040096477596477595


## Label Powerset

In [22]:
def label_powerset(model, X_train, y_train, X_test, y_test):
    cc_model = LabelPowerset(model)
    cc_model.fit(X_train, y_train)
    preds = cc_model.predict(X_test)
    print('Precision score:', precision_score(y_test, preds, average='micro'))
    print('Recall score:', recall_score(y_test, preds, average='micro'))
    print('F1 score:', f1_score(y_test, preds, average='micro'))
    print('Hamming distance:', hamming_loss(y_test, preds))

#### MultinomialNB

In [23]:
label_powerset(MultinomialNB(), X_train, y_train, X_test, y_test)

Precision score: 0.2971014492753623
Recall score: 0.029496402877697843
F1 score: 0.05366492146596859
Hamming distance: 0.046814296814296816


#### DecisionTreeClassifier

In [24]:
label_powerset(DecisionTreeClassifier(), X_train, y_train, X_test, y_test)

Precision score: 0.24855491329479767
Recall score: 0.1856115107913669
F1 score: 0.21252059308072485
Hamming distance: 0.0619010619010619


#### ExtraTreesClassifier

In [25]:
label_powerset(ExtraTreesClassifier(), X_train, y_train, X_test, y_test)

Precision score: 0.37398879900435594
Recall score: 0.21618705035971222
F1 score: 0.27399133804422154
Hamming distance: 0.05155723905723906


#### RandomForestClassifier

In [26]:
label_powerset(RandomForestClassifier(), X_train, y_train, X_test, y_test)

Precision score: 0.40870724122612173
Recall score: 0.33093525179856115
F1 score: 0.3657324587557146
Hamming distance: 0.051654364154364155


#### KNeighborsClassifier

In [27]:
label_powerset(KNeighborsClassifier(), X_train, y_train, X_test, y_test)

Precision score: 0.15272727272727274
Recall score: 0.015107913669064749
F1 score: 0.02749590834697218
Hamming distance: 0.04809311059311059


#### MLPClassifier

In [28]:
label_powerset(MLPClassifier(), X_train, y_train, X_test, y_test)

Precision score: 0.35537190082644626
Recall score: 0.07733812949640288
F1 score: 0.12703101920236337
Hamming distance: 0.047834110334110336




## Testing the other manifestos

### 1994

In [77]:
document_94 = xml.dom.minidom.parse('data/Political_election_manifestos/VP_1994.party-topicnr-content.xml')
X_94, y_94 = create_X_y(create_dictionary(document_94))
taxonomy_94 = xml.dom.minidom.parse('data/Political_election_manifestos/taxonomy.1994.xml')
themes_set_94 = extract_themes(taxonomy_94, y_94)

In [78]:
multilabel_bin = MultiLabelBinarizer()
multilabel_bin.fit_transform([themes_set_94])
y_94 = multilabel_bin.transform(y_94)

In [79]:
X_94 = TfidfVectorizer(strip_accents='unicode', analyzer='word', ngram_range=(1,3), norm='l2', max_features = 10000).fit_transform(X_94)

In [80]:
X_train_94, X_test_94, y_train_94, y_test_94 = train_test_split(X_94, y_94, test_size=0.33, random_state=42)

In [81]:
binary_relevance(DecisionTreeClassifier(), X_train_94, y_train_94, X_test_94, y_test_94)

Precision score: 0.5945098039215686
Recall score: 0.5453237410071943
F1 score: 0.5688555347091933
Hamming distance: 0.023589554077358955


In [82]:
classifier_chaining(DecisionTreeClassifier(), X_train_94, y_train_94, X_test_94, y_test_94)

Precision score: 0.5652332361516035
Recall score: 0.5579136690647482
F1 score: 0.561549601737871
Hamming distance: 0.02486244559415291


In [91]:
binary_relevance(RandomForestClassifier(), X_train_94, y_train_94, X_test_94, y_test_94)

Precision score: 0.8862275449101796
Recall score: 0.15971223021582734
F1 score: 0.2706491923194148
Hamming distance: 0.02456475322328981


In [92]:
classifier_chaining(RandomForestClassifier(), X_train_94, y_train_94, X_test_94, y_test_94)

Precision score: 0.9
Recall score: 0.13597122302158274
F1 score: 0.23625000000000002
Hamming distance: 0.02508828118584216


### 1998

In [85]:
document_98 = xml.dom.minidom.parse('data/Political_election_manifestos/VP_1998.party-topicnr-content.xml')
X_98, y_98 = create_X_y(create_dictionary(document_98))
taxonomy_98 = xml.dom.minidom.parse('data/Political_election_manifestos/taxonomy.1998.xml')
themes_set_98 = extract_themes(taxonomy_98, y_98)

In [86]:
multilabel_bin = MultiLabelBinarizer()
multilabel_bin.fit_transform([themes_set_98])
y_98 = multilabel_bin.transform(y_98)

In [87]:
X_98 = TfidfVectorizer(strip_accents='unicode', analyzer='word', ngram_range=(1,3), norm='l2', max_features = 10000).fit_transform(X_98)

In [88]:
X_train_98, X_test_98, y_train_98, y_test_98 = train_test_split(X_98, y_98, test_size=0.33, random_state=42)

In [89]:
binary_relevance(DecisionTreeClassifier(), X_train_98, y_train_98, X_test_98, y_test_98)

Precision score: 0.6054097056483692
Recall score: 0.5474820143884892
F1 score: 0.5749905553456742
Hamming distance: 0.0219093246085534


In [90]:
classifier_chaining(DecisionTreeClassifier(), X_train_98, y_train_98, X_test_98, y_test_98)

Precision score: 0.5657414170927685
Recall score: 0.5571942446043165
F1 score: 0.5614353026458861
Hamming distance: 0.02356469580119966


In [93]:
binary_relevance(RandomForestClassifier(), X_train_98, y_train_98, X_test_98, y_test_98)

Precision score: 0.8794466403162056
Recall score: 0.16007194244604317
F1 score: 0.27084601339014
Hamming distance: 0.023330996338708422


In [94]:
classifier_chaining(RandomForestClassifier(), X_train_98, y_train_98, X_test_98, y_test_98)

Precision score: 0.8947368421052632
Recall score: 0.14064748201438848
F1 score: 0.24308361827789865
Hamming distance: 0.02371075796525668
