# Imports

In [36]:
import xml
import xml.etree.ElementTree as ET

import string
import unidecode
import re

import spacy
from spacy.lang.nl.stop_words import STOP_WORDS

from sklearn.preprocessing import MultiLabelBinarizer

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

from sklearn.model_selection import train_test_split

from sklearn.metrics import precision_score, recall_score, f1_score, hamming_loss

from skmultilearn.problem_transform import BinaryRelevance, ClassifierChain, LabelPowerset

In [2]:
# spaCy pipeline 'nl_core_news_lg'; apply_text_preprocessing uses it for
# tokenisation, stop-word flags and lemmas.
nlp = spacy.load('nl_core_news_lg')
# Alias for the stop-word collection imported from spacy.lang.nl
# (not referenced by any other cell visible in this notebook).
stops_words = STOP_WORDS

## Exploring Data

In [3]:
# Years for which data files exist; the selected entry is interpolated into
# the manifesto and taxonomy file names loaded further down.
year = ['1986', '1994', '1998']
# Index into `year` that picks which year's files are loaded.
SELECTED_YEAR = 0 # for now

Simple theme taxonomy extraction method: take all themes and ignore the relationships between them.

More complex list that captures the relation between themes (not complete yet).

In [4]:
# taxonomy = {}
# for theme in taxonomy_xml.getroot():
#     tax = {}
#     # print(theme)
#     for sub in theme:
#         for subsub in sub:
#             if sub.tag == 'related_themes':
#                 print('RELATED', subsub.get('id'))
#             else:
#                 print('REFERRES', subsub.text)
#             # print(subsub.text)

#### Extracting Text Data

In [5]:
# `import xml` (and `import xml.etree.ElementTree`) does not load the
# `xml.dom.minidom` submodule, so import it explicitly instead of relying on
# some other library having imported it as a side effect.
import xml.dom.minidom

# Parse the manifesto file of the selected year into a DOM document.
document = xml.dom.minidom.parse(f'data/Political_election_manifestos/VP_{year[SELECTED_YEAR]}.party-topicnr-content.xml')

In [6]:
def apply_text_preprocessing(text: str) -> str:
    """Normalise a paragraph into a space-separated string of lemmas.

    Steps, in order:
      1. strip surrounding whitespace and lower-case the text;
      2. transliterate to plain ASCII with ``unidecode`` (digits are kept);
      3. delete every character listed in ``string.punctuation``;
      4. run the module-level spaCy pipeline ``nlp`` and keep the lemma of
         each token that is neither a stop word nor whitespace.

    Args:
        text: Raw paragraph text.

    Returns:
        The kept lemmas joined by single spaces.
    """
    # Strip + lower case
    text = text.strip().lower()
    # Transliterate non-ASCII characters (e.g. accented letters) to ASCII
    text = unidecode.unidecode(text)
    # Remove punctuation
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    # Remove stop words and lemmatise. `is_space` drops every whitespace-only
    # token (runs of spaces, newlines), not just a single ' ', so no stray
    # whitespace "lemmas" end up in the joined result.
    lemmas = [
        token.lemma_
        for token in nlp(text)
        if not (token.is_stop or token.is_space)
    ]

    return ' '.join(lemmas)

In [7]:
# Build {party: {paragraph id: {'p': text, 'themes': [theme ids]}}} from the
# parsed manifesto document.
chapters = {}
# `documentElement` is always the root element, whereas `firstChild` can be a
# comment or doctype node when the file starts with one.
for chapter in document.documentElement.getElementsByTagName('chapter'):
    party = chapter.getAttribute('party')
    par = {}
    for paragraph in chapter.getElementsByTagName('p'):
        # Named `paragraph_id` so the builtin `id` is not shadowed for the
        # rest of the notebook session.
        paragraph_id = paragraph.getAttribute('id')

        # Assumes the paragraph text is the third child node of <p>
        # -- TODO confirm against the XML layout.
        paragraph_value = paragraph.childNodes[2].nodeValue
        # paragraph_value = apply_text_preprocessing(paragraph_value)

        # Theme ids are lower-cased so they compare equal regardless of case.
        paragraph_themes = [theme.getAttribute('id').lower() for theme in paragraph.getElementsByTagName('theme')]

        par[paragraph_id] = {'p': paragraph_value, 'themes': paragraph_themes}
    # NOTE(review): a later chapter with the same party attribute replaces the
    # earlier one here -- confirm each party has exactly one chapter.
    chapters[party] = par

In [8]:
# Flatten the per-party paragraph dictionaries into two parallel lists:
# X holds each paragraph's text, y the list of theme ids of that paragraph.
# Paragraphs are kept even when their text is empty.
X, y = [], []
for paragraphs in chapters.values():
    for entry in paragraphs.values():
        X.append(entry['p'])
        y.append(entry['themes'])

In [9]:
# Sanity check: texts and label lists must have the same length.
print(len(X), len(y), sep='\n')

797
797


In [10]:
# Show the container types of the texts and the label lists.
print(type(X), type(y), sep='\n')

<class 'list'>
<class 'list'>


#### Extracting All Themes

In [11]:
# `import xml` (and `import xml.etree.ElementTree`) does not load the
# `xml.dom.minidom` submodule, so import it explicitly instead of relying on
# some other library having imported it as a side effect.
import xml.dom.minidom

# Parse the theme taxonomy of the selected year into a DOM document.
# taxonomy_xml = ET.parse(f'data/Political_election_manifestos/taxonomy.{year[SELECTED_YEAR]}.xml')
taxonomy_xml = xml.dom.minidom.parse(f'data/Political_election_manifestos/taxonomy.{year[SELECTED_YEAR]}.xml')

In [12]:
# SIMPLE THEME COLLECTION
# Start from every theme id found in the taxonomy document, lower-cased.
themes_set = {
    node.getAttribute('id').lower()
    for node in taxonomy_xml.getElementsByTagName('theme')
}
# Then combine this set with the themes actually attached to manifesto
# paragraphs, so a label missing from the taxonomy is still included.
for labels in y:
    themes_set.update(labels)

### Label Binarizer

In [13]:
# Learn the label vocabulary from the complete theme set, so every known
# theme gets a column even if no paragraph of this year carries it.
multilabel_bin = MultiLabelBinarizer()
# Only the learned classes are needed from this call, so `fit` is used
# rather than `fit_transform` (whose output would be thrown away).
multilabel_bin.fit([themes_set])
# Binary indicator matrix: one row per paragraph, one column per theme.
y_bin = multilabel_bin.transform(y)

### TF_IDF

In [15]:
# Word-level TF-IDF over 1- to 3-grams, limited to 10000 features, with
# unicode accent stripping and L2 normalisation.
# play around with n_gram and max_features
vectorizer = TfidfVectorizer(
    analyzer='word',
    ngram_range=(1, 3),
    max_features=10000,
    norm='l2',
    strip_accents='unicode',
)

# NOTE(review): the vectorizer is fitted on all paragraphs before the
# train/test split below, so held-out texts contribute to the vocabulary
# and IDF weights.
X_tfidf = vectorizer.fit_transform(X)

# Models

### Train-Test Split

In [16]:
# Hold out 33% of the paragraphs for evaluation; the fixed random_state
# makes the split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_tfidf,
    y_bin,
    test_size=0.33,
    random_state=42,
)

## BinaryRelevance

In [39]:
def binary_relevance(model, X_train, y_train, X_test, y_test):
    """Train a Binary Relevance classifier and print its test metrics.

    ``model`` is wrapped in ``BinaryRelevance``, fitted on
    ``X_train``/``y_train`` and used to predict labels for ``X_test``.
    Micro-averaged precision, recall and F1 plus the Hamming loss against
    ``y_test`` are printed; nothing is returned.
    """
    wrapper = BinaryRelevance(model)
    wrapper.fit(X_train, y_train)
    predicted = wrapper.predict(X_test)

    micro_metrics = (
        ('Precision score:', precision_score),
        ('Recall score:', recall_score),
        ('F1 score:', f1_score),
    )
    for label, metric in micro_metrics:
        print(label, metric(y_test, predicted, average='micro'))
    # hamming_loss is called without an averaging mode, so it is reported
    # outside the loop.
    print('Hamming distance:', hamming_loss(y_test, predicted))

#### MultinomialNB

In [40]:
# Binary Relevance with a MultinomialNB base estimator.
binary_relevance(
    model=MultinomialNB(),
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)

Precision score: 0.9305555555555556
Recall score: 0.02410071942446043
F1 score: 0.046984572230014024
Hamming distance: 0.043997668997669


#### DecisionTreeClassifier

In [41]:
# Binary Relevance with a DecisionTreeClassifier base estimator.
binary_relevance(
    model=DecisionTreeClassifier(),
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)

Precision score: 0.5421127765881513
Recall score: 0.5464028776978417
F1 score: 0.5442493729845933
Hamming distance: 0.041181041181041184


#### ExtraTreesClassifier

In [42]:
# Binary Relevance with an ExtraTreesClassifier base estimator.
binary_relevance(
    model=ExtraTreesClassifier(),
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)

Precision score: 0.8700564971751412
Recall score: 0.22158273381294963
F1 score: 0.35321100917431186
Hamming distance: 0.03651903651903652


#### RandomForestClassifier

In [43]:
# Binary Relevance with a RandomForestClassifier base estimator.
binary_relevance(
    model=RandomForestClassifier(),
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)

Precision score: 0.8721934369602763
Recall score: 0.18165467625899281
F1 score: 0.30068472759749926
Hamming distance: 0.03802447552447553


#### KNeighborsClassifier

In [44]:
# Binary Relevance with a KNeighborsClassifier base estimator.
binary_relevance(
    model=KNeighborsClassifier(),
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)

Precision score: 0.7948717948717948
Recall score: 0.05575539568345324
F1 score: 0.10420168067226891
Hamming distance: 0.04313973063973064


#### MLPClassifier

In [45]:
# Binary Relevance with an MLPClassifier base estimator.
binary_relevance(
    model=MLPClassifier(),
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)

## Classifier Chaining

In [None]:
def classifier_chaining(model, X_train, y_train, X_test, y_test):
    """Train a Classifier Chain and print its test metrics.

    ``model`` is wrapped in ``ClassifierChain``, fitted on
    ``X_train``/``y_train`` and used to predict labels for ``X_test``.
    Micro-averaged precision, recall and F1 plus the Hamming loss against
    ``y_test`` are printed; nothing is returned.
    """
    chain = ClassifierChain(model)
    chain.fit(X_train, y_train)
    predicted = chain.predict(X_test)

    micro_metrics = (
        ('Precision score:', precision_score),
        ('Recall score:', recall_score),
        ('F1 score:', f1_score),
    )
    for label, metric in micro_metrics:
        print(label, metric(y_test, predicted, average='micro'))
    # hamming_loss is called without an averaging mode, so it is reported
    # outside the loop.
    print('Hamming distance:', hamming_loss(y_test, predicted))

#### MultinomialNB

In [None]:
# Classifier Chain with a MultinomialNB base estimator.
classifier_chaining(
    model=MultinomialNB(),
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)

#### DecisionTreeClassifier

In [None]:
# Classifier Chain with a DecisionTreeClassifier base estimator.
classifier_chaining(
    model=DecisionTreeClassifier(),
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)

#### ExtraTreesClassifier

In [None]:
# Classifier Chain with an ExtraTreesClassifier base estimator.
classifier_chaining(
    model=ExtraTreesClassifier(),
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)

#### RandomForestClassifier

In [None]:
# Classifier Chain with a RandomForestClassifier base estimator.
classifier_chaining(
    model=RandomForestClassifier(),
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)

#### KNeighborsClassifier

In [None]:
# Classifier Chain with a KNeighborsClassifier base estimator.
classifier_chaining(
    model=KNeighborsClassifier(),
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)

#### MLPClassifier

In [None]:
# Classifier Chain with an MLPClassifier base estimator.
classifier_chaining(
    model=MLPClassifier(),
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)

## Label Powerset

In [None]:
def label_powerset(model, X_train, y_train, X_test, y_test):
    """Train a Label Powerset classifier and print its test metrics.

    ``model`` is wrapped in ``LabelPowerset``, fitted on
    ``X_train``/``y_train`` and used to predict labels for ``X_test``.
    Micro-averaged precision, recall and F1 plus the Hamming loss against
    ``y_test`` are printed; nothing is returned.
    """
    powerset = LabelPowerset(model)
    powerset.fit(X_train, y_train)
    predicted = powerset.predict(X_test)

    micro_metrics = (
        ('Precision score:', precision_score),
        ('Recall score:', recall_score),
        ('F1 score:', f1_score),
    )
    for label, metric in micro_metrics:
        print(label, metric(y_test, predicted, average='micro'))
    # hamming_loss is called without an averaging mode, so it is reported
    # outside the loop.
    print('Hamming distance:', hamming_loss(y_test, predicted))

#### MultinomialNB

In [None]:
# Label Powerset with a MultinomialNB base estimator.
label_powerset(
    model=MultinomialNB(),
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)

#### DecisionTreeClassifier

In [None]:
# Label Powerset with a DecisionTreeClassifier base estimator.
label_powerset(
    model=DecisionTreeClassifier(),
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)

#### ExtraTreesClassifier

In [None]:
# Label Powerset with an ExtraTreesClassifier base estimator.
label_powerset(
    model=ExtraTreesClassifier(),
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)

#### RandomForestClassifier

In [None]:
# Label Powerset with a RandomForestClassifier base estimator.
label_powerset(
    model=RandomForestClassifier(),
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)

#### KNeighborsClassifier

In [None]:
# Label Powerset with a KNeighborsClassifier base estimator.
label_powerset(
    model=KNeighborsClassifier(),
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)

#### MLPClassifier

In [None]:
# Label Powerset with an MLPClassifier base estimator.
label_powerset(
    model=MLPClassifier(),
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)