
# Sentiment analysis

Gebruik is gemaakt van `pattern`, dat een submodule voor het Nederlands heeft. Het algoritme filtert uit zinnen eerst woorden die iets zeggen over emotie, zoals 'goed' of 'spannend'. Ook zoekt het versterkers zoals 'echt' of '!'. Het waardeert ontkennende woorden samen met de daaropvolgende woorden, dus 'niet achterhaald' scoort anders dan 'achterhaald'. Woorden die iets zeggen over het sentiment krijgen een score; die scores worden gebruikt voor een uiteindelijke zinscore, het gemiddelde van alle woordbrokjes in een zin. Gescoord wordt met een (polarity, subjectivity)-tuple, met polarity tussen -1.0 en 1.0 en subjectivity tussen 0.0 en 1.0.

Voorbeeld: de zin `text = 'het beginselprogramma is niet achterhaald'` wordt positief gescoord, daar waar `text = 'het beginselprogramma is achterhaald'` negatief gescoord wordt.




In [None]:
from pattern.nl import sentiment, positive

In [None]:
# Score the negated and the plain variant of the same sentence so the effect
# of the negation on the outcome is visible side by side.
for text in (
    'het beginselprogramma is niet achterhaald',
    'het beginselprogramma is achterhaald',
):
    print(sentiment(text).assessments, positive(text))

## Benodigde libraries

In [None]:
import pandas as pd
import numpy as np
from bokeh.plotting import figure, show
from bokeh.io import output_notebook
from bokeh.models import Text
import nltk
import re
from nltk.tokenize import RegexpTokenizer

# Configure Bokeh so that show() renders plots inline in the notebook.
output_notebook()

# Tokenizer built on the pattern \w+ (runs of word characters).
tokenizer = RegexpTokenizer(r'\w+')

## Inladen data 

In [None]:
# Column labels applied to the survey export. The numbers behind the last
# four entries are their zero-based positions in this list.
column_names = [
    'Serienummer', 'SID', 'Submitted Time', 'Verwerkingstijd',
    'Modified Time', 'Kladversie', 'IP-adres', 'UID', 'Gebruikersnaam',
    'Naam', 'Email', 'Afdeling',
    'indruk',          # 12
    'ontwikkelingen',  # 13
    'redenen',         # 14
    'meegeven',        # 15
]

# Read the sheet, then turn empty / whitespace-only cells into NaN so they
# count as missing values further on.
df = pd.read_excel(
    'ledenenquete_over_heel_de_mens.xlsx',
    skiprows=2,
    names=column_names,
).replace(r'^\s*$', np.nan, regex=True)

# Keep only the rows whose Kladversie value equals 0.
df = df[df['Kladversie'] == 0]

## Benodigde functies

In [None]:
def assess_sentiment(text):
    """
    Classify the sentiment of a text with ``positive`` from pattern.nl.

    The result of ``positive(text)`` is returned unchanged; the rest of this
    notebook uses that value as a flag (positive vs. not positive). This
    function does NOT return a (polarity, subjectivity) tuple.
    """
    return positive(text)


def get_clean_sentences(text, language='english'):
    """
    Split a text into a list of sentences.

    Before tokenizing, every full stop is replaced by a full stop plus a
    space ("text.text" becomes "text. text") so that sentences glued together
    without whitespace are still separated. Full stops directly preceded by
    a word-initial capital letter (such as initials) or directly followed by
    a digit (such as decimals) are left alone.

    Args:
        text: the raw text to split.
        language: name of the Punkt model handed to ``nltk.sent_tokenize``.
            Defaults to 'english', which is what this function always used;
            pass e.g. 'dutch' to use the Dutch model (requires the matching
            NLTK Punkt data to be installed).

    Returns:
        The sentences that are longer than two characters, in order.
    """
    # NOTE: an earlier version also called text.replace(r"/\s/g", ""). That
    # is a JavaScript regex literal; str.replace treats it as plain text, so
    # it never matched anything in practice and has been dropped.
    spaced = re.sub(r"(?<!\b[A-Z])\.(?!\d)", ". ", text)
    sentences = nltk.sent_tokenize(spaced, language=language)
    # Drop fragments that are too short to be a sentence, e.g. a lone ".".
    return [sentence for sentence in sentences if len(sentence) > 2]


def sentiment_counts(flags):
    """
    Count the positive and negative entries in a Series of sentiment flags.

    Missing values are ignored and every remaining value is judged by its
    truthiness. The counts are computed explicitly rather than by unpacking
    ``value_counts()``: that method orders its result by frequency, so
    unpacking it swaps the two numbers whenever negatives outnumber
    positives, and fails when only one of the two classes occurs.

    Returns:
        A ``(pos, neg)`` tuple of ints.
    """
    present = flags.dropna().astype(bool)
    pos = int(present.sum())
    return pos, len(present) - pos


def percentage_label(part, total):
    """
    Format ``part`` as a rounded percentage of ``total``, e.g. '33 %'.

    Returns '0 %' when ``total`` is zero instead of dividing by zero.
    """
    share = round(part / total * 100) if total else 0
    return f'{share} %'


def _draw_sentiment_bars(pos, neg, title, headroom_divisor):
    """
    Draw the positive/negative bar chart and print the totals.

    Args:
        pos: number of positive items (green bar).
        neg: number of negative items (red bar).
        title: title shown above the plot.
        headroom_divisor: the y-axis ends at the tallest bar plus
            ``(pos + neg) / headroom_divisor``, leaving room for the
            percentage labels.
    """
    sentimenten = ['positief', 'negatief']
    total = pos + neg

    # Fall back to 1 for an empty selection so the y-range does not collapse.
    y_top = (max(pos, neg) + total / headroom_divisor) or 1

    # NOTE(review): plot_height / plot_width are kept from the original code;
    # newer Bokeh releases expect height / width -- verify against the
    # installed version.
    p = figure(
        x_range=sentimenten,
        y_range=[0, y_top],
        plot_height=300,
        plot_width=300,
        title=title,
    )
    p.vbar(x=sentimenten, top=[pos, neg], width=0.9, alpha=0.6,
           color=['green', 'red'])

    # Strip the chart down to bars and a light axis line.
    p.outline_line_color = None
    p.grid.grid_line_color = None
    p.axis.axis_line_color = 'lightgrey'
    p.axis.major_tick_line_color = None
    p.axis.minor_tick_line_color = None
    p.title.align = 'center'

    # Percentage labels, placed just above each bar.
    for x_pos, count in ((0.4, pos), (1.4, neg)):
        label = Text(x=x_pos, y=count + 3,
                     text=[percentage_label(count, total)],
                     text_font_size='11pt')
        p.add_glyph(label)

    p.axis.major_label_text_font_size = '11pt'
    p.title.text_font_size = '11pt'
    show(p)
    print('aantal reacties: ', total)
    print('aantal positieve sentimenten: ', pos)
    print('aantal negatieve sentimenten : ', neg)


def plot_sentiment(filtered_df, title, column):
    """
    Plot the positive/negative split of a sentiment column with Bokeh.

    Args:
        filtered_df: DataFrame holding the rows to include.
        title: title shown above the plot.
        column: name of the column with the sentiment flags.
    """
    pos, neg = sentiment_counts(filtered_df[column])
    _draw_sentiment_bars(pos, neg, title, headroom_divisor=10)


def plot_sentiment_nondf(pos, neg, title):
    """
    Plot a positive/negative bar chart with Bokeh from two given counts.

    Args:
        pos: number of positive items.
        neg: number of negative items.
        title: title shown above the plot.
    """
    _draw_sentiment_bars(pos, neg, title, headroom_divisor=5)
    

def coun_sentiment(neg=0, pos=0, sentences=None, file='out.txt', title=''):
    """
    Tally positive and negative sentences and log them to a text file.

    The title and then every sentence are appended to ``file``. Each
    sentence is classified with ``positive`` and added to the matching
    running total.

    Args:
        neg: running count of negative sentences to continue from.
        pos: running count of positive sentences to continue from.
        sentences: iterable of sentences to classify; ``None`` means none.
        file: path of the log file, opened in append mode.
        title: heading written before the sentences.

    Returns:
        The updated ``(neg, pos)`` tuple -- note the order.
    """
    if sentences is None:
        sentences = []
    # Explicit UTF-8 so accented characters are written the same way on
    # every platform.
    with open(file, 'a', encoding='utf-8') as out:
        out.write(title + '\n')
        for sentence in sentences:
            out.write(f'{sentence} \n')
            if positive(sentence):
                pos += 1
            else:
                neg += 1
    return neg, pos

## Analyse

In [None]:
# Wat is uw algemene indruk van Heel de mens? 
# Is ons beginselprogramma achterhaald, of nog altijd actueel? 
# Wat vindt u van de taal en toon?

# Score every answer, then plot only the rows where the question was answered.
df['indruk_sentiment'] = df['indruk'].apply(assess_sentiment)
filtered_df = df.loc[df['indruk'].notna()]
plot_sentiment(
    filtered_df,
    title='Algemene indruk heel de mens',
    column='indruk_sentiment',
)

In [None]:
# Redenen om lid te worden? 
df['redenen_sentiment'] = df['redenen'].apply(assess_sentiment)
# Select on the column that is being scored: filtering on 'indruk' here would
# include rows without a 'redenen' answer and drop rows that do have one.
filtered_df = df[df['redenen'].notnull()]
plot_sentiment(filtered_df, 'Reden om lid te worden', 'redenen_sentiment')

In [None]:
#  Hoe ziet u de toekomst van de SP?# 

df['redenen_sentiment'] = df['redenen'].apply(assess_sentiment)
# Narrow down in two steps: answered rows first, then those mentioning
# 'toekomst'.
answered = df.loc[df['redenen'].notna()]
filtered_df = answered.loc[answered['redenen'].str.contains('toekomst')]
plot_sentiment(
    filtered_df,
    title='Hoe ziet u de toekomst van de SP?',
    column='redenen_sentiment',
)

In [None]:
#  Navo?

def get_navo(column):
    """
    Collect the sentences in a survey column that mention the NAVO.

    Rows are pre-selected when the answer contains 'navo', 'NAVO' or 'Navo'
    followed by whitespace or a full stop; missing answers are skipped. The
    selected answers are joined, split into sentences, and only sentences
    containing 'navo' (case-insensitive) are returned.

    Args:
        column: name of the column in the global ``df`` to search.

    Returns:
        List of matching sentences.
    """
    # Raw string and non-capturing groups: same matches as before, without
    # invalid-escape warnings or the pandas "match groups" warning.
    mentions_navo = df[column].str.contains(
        r'(?:navo|NAVO|Navo)(?:\s|\.)', regex=True, na=False)
    text = "\n".join(df.loc[mentions_navo, column].tolist())
    sentences = get_clean_sentences(text)
    return [sentence for sentence in sentences if 'navo' in sentence.lower()]


# Walk through the four free-text questions, carrying the running totals
# from one question to the next.
neg, pos = 0, 0
for column, heading in (
    ('indruk', '\n-----reacties vraag indruk----\n'),
    ('redenen', '\n----reacties redenen om lid te worden----\n'),
    ('ontwikkelingen', '\n----reacties vraag ontwikkeling----\n'),
    ('meegeven', '\n----meegeven aan de commissie----\n'),
):
    navo_sentences = get_navo(column)
    neg, pos = coun_sentiment(neg, pos, sentences=navo_sentences, title=heading)

# The automatic totals are replaced by fixed numbers before plotting.
pos = 14
neg = 91 - 14

plot_sentiment_nondf(pos=pos, neg=neg, title='Reacties over NAVO standpunt')  # counted by hand in navo.txt
    

In [None]:
#  Trots?

def get_trots(column):
    """
    Collect the sentences in a survey column that mention 'trots'.

    Rows are pre-selected when the answer contains 'trots' or 'Trots';
    missing answers are skipped. The selected answers are joined, split into
    sentences, and only sentences containing 'trots' (case-insensitive) are
    returned.

    Args:
        column: name of the column in the global ``df`` to search.

    Returns:
        List of matching sentences.
    """
    # One pattern with na=False (as in get_navo) instead of OR-ing two masks
    # that still contain NaN for unanswered rows.
    mentions_trots = df[column].str.contains(r'[Tt]rots', regex=True, na=False)
    text = "\n".join(df.loc[mentions_trots, column].tolist())
    sentences = get_clean_sentences(text)
    return [sentence for sentence in sentences if 'trots' in sentence.lower()]


# Accumulate the tallies over the four free-text questions.
neg, pos = 0, 0
for column in ('indruk', 'redenen', 'ontwikkelingen', 'meegeven'):
    _sentences = get_trots(column)
    neg, pos = coun_sentiment(neg, pos, sentences=_sentences)
print(neg, pos)


In [None]:
# Plot the totals gathered for the 'trots' sentences.
plot_sentiment_nondf(pos=pos, neg=neg, title='Trots sentiment')

In [None]:
#  Samenwerken?

def get_query(column):
    """
    Collect the sentences in a survey column that mention the EU or the Euro.

    Rows are pre-selected when the answer contains one of the keywords
    (EU, eu, Europa, Euro, euro, Europese) followed by a hyphen, whitespace
    or a full stop; missing answers are skipped. The selected answers are
    joined, split into sentences, and only sentences containing 'EU' or
    'Euro' (case-sensitive) are returned.

    Args:
        column: name of the column in the global ``df`` to search.

    Returns:
        List of unique matching sentences, in order of first appearance.
    """
    # The suffix used to be (-\s|\.), i.e. "hyphen followed by whitespace" or
    # a full stop, which skipped ordinary mentions such as "de EU is ...".
    # Hyphen, whitespace and full stop are now separate alternatives.
    query = r'(?:EU|eu|Europa|Euro|euro|Europese)(?:-|\s|\.)'
    mentions = df[column].str.contains(query, regex=True, na=False)
    text = "\n".join(df.loc[mentions, column].tolist())
    sentences = get_clean_sentences(text)
    hits = (s for s in sentences if 'EU' in s or 'Euro' in s)
    # dict.fromkeys removes duplicates while keeping a stable order, so the
    # log file written from this list is reproducible between runs.
    return list(dict.fromkeys(hits))


# Accumulate the tallies over the four free-text questions, writing each
# question's sentences under its own heading.
neg, pos = 0, 0
for column, heading in (
    ('indruk', '\n-----reacties vraag indruk----\n'),
    ('redenen', '\n----reacties redenen om lid te worden----\n'),
    ('ontwikkelingen', '\n----reacties vraag ontwikkeling----\n'),
    ('meegeven', '\n----meegeven aan de commissie----\n'),
):
    _sentences = get_query(column)
    neg, pos = coun_sentiment(neg, pos, sentences=_sentences, title=heading)
print(neg, pos)


In [None]:
# Plot the totals gathered for the EU / Euro sentences.
plot_sentiment_nondf(pos=pos, neg=neg, title='Reacties mbt de EU en de Euro')