# Imports

In [336]:
import xml
import xml.etree.ElementTree as ET

import string
import unidecode
import re

import spacy
from spacy.lang.nl.stop_words import STOP_WORDS

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB

from sklearn.metrics import precision_score, recall_score, f1_score

from skmultilearn.problem_transform import BinaryRelevance, ClassifierChain, LabelPowerset
from skmultilearn.adapt import MLkNN

from scipy.sparse import lil_matrix

In [303]:
nlp = spacy.load('nl_core_news_lg')
stops_words = STOP_WORDS

## Exploring Data

In [304]:
year = ['1986', '1994', '1998']
SELECTED_YEAR = 0 # for now

Simple theme taxonomy exrtaction method: take all themes and ignore the relationship between themes.

More complex list that captures the relation between themes (not complete yet).

In [305]:
# taxonomy = {}
# for theme in taxonomy_xml.getroot():
#     tax = {}
#     # print(theme)
#     for sub in theme:
#         for subsub in sub:
#             if sub.tag == 'related_themes':
#                 print('RELATED', subsub.get('id'))
#             else:
#                 print('REFERRES', subsub.text)
#             # print(subsub.text)

#### Extracting Text Data

In [306]:
document = xml.dom.minidom.parse(f'data/Political_election_manifestos/VP_{year[SELECTED_YEAR]}.party-topicnr-content.xml')

In [307]:
def apply_text_preprocessing(text: str):
    # Strip + Lower case
    text = text.strip().lower()
    # Remove digits
    text = unidecode.unidecode(text)
    # Remove Punctuation
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    #  Remove stop words and Lemmatisation
    text = ' '.join([token.lemma_ for token in nlp(text) if token.is_stop == False and token.text != ' '])

    return text

In [308]:
chapters = {}
for chapter in document.firstChild.getElementsByTagName('chapter'):
    party = chapter.getAttribute('party')
    # print(chapter.getAttribute('party'))
    par = {}
    for paragraph in chapter.getElementsByTagName('p'):
        id = paragraph.getAttribute('id')
        # print(paragraph.getAttribute('id'))
        
        paragraph_value = paragraph.childNodes[2].nodeValue
        # paragraph_value = apply_text_preprocessing(paragraph_value)
        # print(paragraph_value)

        paragraph_themes = [theme.getAttribute('id') for theme in paragraph.getElementsByTagName('theme')]
        # print(paragraph_themes)

        par[id] = {'p': paragraph_value, 'themes': paragraph_themes}
    chapters[party] = par

In [309]:
X, y = [], []
for chapter in chapters.keys():
    for text in chapters[chapter].keys():
        # if chapters[chapter][text]['p'] != '':
        X.append(chapters[chapter][text]['p']) 
        y.append(chapters[chapter][text]['themes'])

In [310]:
print(len(X))
print(len(y))

797
797


In [311]:
print(type(X))
print(type(y))

<class 'list'>
<class 'list'>


#### Extracting All Themes

In [312]:
# taxonomy_xml = ET.parse(f'data/Political_election_manifestos/taxonomy.{year[SELECTED_YEAR]}.xml')
taxonomy_xml = xml.dom.minidom.parse(f'data/Political_election_manifestos/taxonomy.{year[SELECTED_YEAR]}.xml')

In [313]:
# SIMPLE THEME COLLECTION
themes_set = set()
for theme in taxonomy_xml.getElementsByTagName('theme'):
    themes_set.add(theme.getAttribute('id'))
# them we want to combine this themes list with the themes present in the manifesto
for elem in y:
    for sub_elem in elem:
        themes_set.add(sub_elem)

### Label Binarizer

In [314]:
multilabel_bin = MultiLabelBinarizer()
multilabel_bin.fit_transform([themes_set])
y_bin = multilabel_bin.transform(y)

### TF_IDF

In [315]:
# play around with n_gram and max_features
vectorizer = TfidfVectorizer(strip_accents='unicode', analyzer='word', ngram_range=(1,3), norm='l2', max_features = 10000)

X_tfidf = vectorizer.fit_transform(X)

# Models

## BinaryRelevance

### Train-Test Split

In [316]:
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y_bin, test_size=0.33, random_state=42)

In [317]:
print(type(X_train))
print(type(y_train))

<class 'scipy.sparse.csr.csr_matrix'>
<class 'numpy.ndarray'>


In [330]:
lp_classifier = LabelPowerset(MultinomialNB())

In [331]:
lp_classifier.fit(X_train, y_train)

LabelPowerset(classifier=MultinomialNB(), require_dense=[True, True])

In [332]:
lp_preds = lp_classifier.predict(X_test)

In [334]:
print(precision_score(y_test, lp_preds, average='micro'))
print(recall_score(y_test, lp_preds, average='micro'))
print(f1_score(y_test, lp_preds, average='micro'))

0.27238805970149255
0.026258992805755395
0.047900262467191604


In [373]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier, RadiusNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import RidgeClassifier, RidgeClassifierCV

In [345]:
dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)
dt_preds = dt.predict(X_test)
print('PRECISION', precision_score(y_test, dt_preds, average='micro'))
print('RECALL', recall_score(y_test, dt_preds, average='micro'))
print('F1', f1_score(y_test, dt_preds, average='micro'))

PRECISION 0.37753510140405616
RECALL 0.26115107913669067
F1 0.308739102700404


In [346]:
exts = ExtraTreesClassifier()
exts.fit(X_train, y_train)
exts_preds = exts.predict(X_test)
print('PRECISION', precision_score(y_test, exts_preds, average='micro'))
print('RECALL', recall_score(y_test, exts_preds, average='micro'))
print('F1', f1_score(y_test, exts_preds, average='micro'))

PRECISION 0.8773946360153256
RECALL 0.08237410071942446
F1 0.15060835251561985


In [355]:
knn = KNeighborsClassifier(10)
knn.fit(X_train, y_train)
knn_preds = knn.predict(X_test)
print('PRECISION', precision_score(y_test, knn_preds, average='micro'))
print('RECALL', recall_score(y_test, knn_preds, average='micro'))
print('F1', f1_score(y_test, knn_preds, average='micro'))

PRECISION 0.9111111111111111
RECALL 0.014748201438848921
F1 0.029026548672566373


In [359]:
mlp = MLPClassifier()
mlp.fit(X_train, y_train)
mlp_preds = mlp.predict(X_test)
print('PRECISION', precision_score(y_test, mlp_preds, average='micro'))
print('RECALL', recall_score(y_test, mlp_preds, average='micro'))
print('F1', f1_score(y_test, mlp_preds, average='micro'))

PRECISION 0.7726737338044759
RECALL 0.23597122302158274
F1 0.36153210250757783




In [368]:
rnn = RadiusNeighborsClassifier(radius=1.5)
rnn.fit(X_train, y_train)
print(rnn.score(X_test, y_test))
rnn_preds = rnn.predict(X_test)
print('PRECISION', precision_score(y_test, rnn_preds, average='micro'))
print('RECALL', recall_score(y_test, rnn_preds, average='micro'))
print('F1', f1_score(y_test, rnn_preds, average='micro'))

0.0
PRECISION 0.0
RECALL 0.0
F1 0.0


  _warn_prf(average, modifier, msg_start, len(result))


In [371]:
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
rf_preds = rf.predict(X_test)
print('PRECISION', precision_score(y_test, rf_preds, average='micro'))
print('RECALL', recall_score(y_test, rf_preds, average='micro'))
print('F1', f1_score(y_test, rf_preds, average='micro'))

PRECISION 0.874439461883408
RECALL 0.07014388489208633
F1 0.12987012987012986


In [375]:
ridge = GradientBoostingClassifier()
ridge.fit(X_train, y_train)
ridge_preds = ridge.predict(X_test)
print('PRECISION', precision_score(y_test, ridge_preds, average='micro'))
print('RECALL', recall_score(y_test, ridge_preds, average='micro'))
print('F1', f1_score(y_test, ridge_preds, average='micro'))

ValueError: y should be a 1d array, got an array of shape (533, 234) instead.

In [382]:
from sklearn.multioutput import MultiOutputClassifier
from sklearn.multioutput import ClassifierChain
from sklearn.linear_model import LogisticRegression

multi_forest = ClassifierChain(RandomForestClassifier(), order='random')
preds = multi_forest.fit(X_train, y_train).predict(X_test)
print('PRECISION', precision_score(y_test, preds, average='micro'))
print('RECALL', recall_score(y_test, preds, average='micro'))
print('F1', f1_score(y_test, preds, average='micro'))

PRECISION 0.8836206896551724
RECALL 0.1474820143884892
F1 0.25277435265104814
