In [1]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
import pandas as pd
import time

In [2]:
# Research questions for this notebook:
#   - Which party does a speaker belong to?
#   - Cluster with k-means and check whether speakers end up in their own groups.
#   - Do speakers change their views when they are in the coalition?
#   - Build a spectrum of speakers (left/right or other scales).

import sqlite3
import pandas as pd

# Connection to the local SQLite file; it is left open after this cell.
db = sqlite3.connect('reports.sqlite')

# Load the three tables in full into DataFrames. The reads are independent
# of each other, so their order does not matter.
speakers = pd.read_sql_query(sql="SELECT * FROM speakers", con=db)
speaker_items = pd.read_sql_query(sql="SELECT * FROM speaker_turns", con=db)
agenda_items = pd.read_sql_query(sql="SELECT * FROM agenda_items", con=db)

In [3]:
# Preview the first rows of the agenda_items table loaded from the database.
agenda_items.head()

Unnamed: 0,id,filename,item_date,agenda_no,title
0,1,h-tk-20162017-59-2.xml,2017-03-23 00:00:00,2,Mededelingen
1,2,h-tk-20162017-59-3.xml,2017-03-23 00:00:00,3,Beëdiging tijdelijke voorzitter
2,3,h-tk-20162017-59-4.xml,2017-03-23 00:00:00,4,Beëdiging leden
3,4,h-tk-20162017-59-5.xml,2017-03-23 00:00:00,5,Vaststelling profielschets nieuwe Voorzitter
4,5,h-tk-20162017-59-6.xml,2017-03-23 00:00:00,6,Regeling van werkzaamheden


In [4]:
# Preview the first rows of the speaker_turns table (held in speaker_items).
speaker_items.head()

Unnamed: 0,id,agenda_item,turn_no,speaker,txt
0,1,1,1,1,Detijdelijke voorzitter:Ik deel aan de Kamer m...
1,2,5,1,2,Voorzitter. Ik zal het idee van de heer Pechto...
2,3,5,2,2,Zo mogelijk in aanwezigheid van de minister-pr...
3,4,5,3,3,Ik kan mij die behoefte heel goed voorstellen....
4,5,5,4,4,Steun namens D66 voor het verzoek van de heer ...


In [5]:
# Per value of `political`, count the non-null entries in every other column
# of the speakers table.
speakers.groupby('political').count()

Unnamed: 0_level_0,id,prefix,name
political,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
50PLUS,5,5,5
CDA,22,22,22
ChristenUnie,6,6,6
D66,23,23,23
DENK,3,3,3
EP/CDA,1,1,1
EP/CU/SGP,1,1,1
EP/D66,1,1,1
EP/GroenLinks,1,1,1
EP/PvdA,1,1,1


In [6]:
# Join every speaking turn to its speaker's record (turn `speaker` == speaker
# `id`) and keep only the turn text and the speaker's party.
merged = pd.merge(
    speaker_items,
    speakers,
    how='inner',
    left_on='speaker',
    right_on='id',
)[['txt', 'political']]

In [7]:
merged.groupby('political').count()
# There are too few turns for the EP/* groups, so we leave them out.
# It might be interesting to check whether their speaking turns are longer
# than those of the members of the House.

Unnamed: 0_level_0,txt
political,Unnamed: 1_level_1
50PLUS,3477
CDA,9360
ChristenUnie,3568
D66,9504
DENK,3773
EP/CDA,3
EP/CU/SGP,1
EP/D66,1
EP/GroenLinks,1
EP/PvdA,1


In [8]:
# EP/* groups have only a handful of turns each, so they are excluded.
EXCLUDED_PARTIES = [
    'EP/CDA', 'EP/CU/SGP', 'EP/D66', 'EP/GroenLinks',
    'EP/PvdA', 'EP/PvdD', 'EP/SP', 'EP/VVD',
]
is_excluded = merged['political'].isin(EXCLUDED_PARTIES)

# Drop the excluded groups, then rows with any missing value, and finally
# shorten the long party name to its abbreviation.
merged = (
    merged.loc[~is_excluded]
    .dropna()
    .replace('Forum voor Democratie', 'FvD')
)
print(merged.groupby('political').count())

                txt
political          
50PLUS         3477
CDA            9360
ChristenUnie   3568
D66            9504
DENK           3773
FvD            2095
GroenLinks    10093
PVV            8442
PvdA           6795
PvdD           4049
SGP            2720
SP            11911
VVD            9611


In [19]:
# Balance the classes: every party is down-sampled to the size of the
# smallest party so that no party dominates the training data.
N = merged.groupby('political').size().min()

# One sample per party, each drawn with the same seed. Iterating the groupby
# covers every party present in `merged` instead of a hand-maintained list,
# so a party cannot be silently left out. Groups are yielded in sorted key
# order, which fixes the row order of `data`.
samples_by_party = {
    party: turns.sample(n=N, random_state=1)
    for party, turns in merged.groupby('political')
}
data = pd.concat(list(samples_by_party.values()))

print(data.groupby('political').count())

# Replace the party names by integer category codes. `party_names[code]`
# gives back the party name, so predictions can still be decoded after the
# column has been overwritten.
party_categories = data['political'].astype('category')
party_names = list(party_categories.cat.categories)
data['political'] = party_categories.values.codes

               txt
political         
50PLUS        2095
CDA           2095
ChristenUnie  2095
D66           2095
DENK          2095
FvD           2095
GroenLinks    2095
PVV           2095
PvdA          2095
PvdD          2095
SGP           2095
SP            2095
VVD           2095


In [20]:
# Read the stop-word list, one word per line. The `with` block closes the
# file again, and rstrip('\n') removes only the line terminator, so the last
# word stays intact even when the file does not end with a newline.
with open("stopwoordenlijst.txt", "r") as text_file:
    stopwoorden = [line.rstrip('\n') for line in text_file]
# The first entry is overwritten with 'a'.
# NOTE(review): presumably this works around a damaged first line in the
# file (e.g. a byte-order mark) - confirm against the file.
stopwoorden[0] = 'a'

# Text classification pipeline: counts of 1- to 3-grams (stop words removed),
# re-weighted with TF-IDF, fed to a multinomial naive Bayes classifier.
nb_pipeline = Pipeline([
    ('vect', CountVectorizer(stop_words=stopwoorden, ngram_range=(1,3))),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB())])

# Features are the raw turn texts; targets are the party category codes.
X = data['txt']
Y = data.political.astype('category').values.codes

In [21]:
# Hold out 20% of the turns for evaluation; the fixed seed makes the split
# reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=1,
)

In [22]:
# Fit the pipeline, score it on the held-out turns and report how long the
# fit plus prediction took. perf_counter() is a monotonic clock, so the
# elapsed time is not affected by system clock adjustments.
t = time.perf_counter()
nb_pipeline.fit(X_train, Y_train)
predicted = nb_pipeline.predict(X_test)
# accuracy_score expects (y_true, y_pred).
print(accuracy_score(Y_test, predicted))
print(time.perf_counter() - t)

  'stop_words.' % sorted(inconsistent))


0.3392693225628787
12.361681938171387


In [23]:
# Show the shape of the training split (number of training texts).
X_train.shape

(21788,)