# Notebook voor het analyseren van woordfrequenties

Met dit notebook kunnen de volgende kolommen geanalyseerd worden:
- indruk
- ontwikkelingen
- redenen
- meegeven

De tekst kan in zijn geheel weggeschreven worden, of er kan op woordsoort geselecteerd worden, waarna de frequenties geanalyseerd worden. Hiervoor wordt gebruikgemaakt van de libraries nltk en spaCy.


In [None]:
# import libraries
import pandas as pd
import numpy as np
import nltk
import re

from nltk.corpus import stopwords          
from nltk.corpus import words
from nltk.tokenize import sent_tokenize, word_tokenize

import spacy
from spacy import displacy
import nl_core_news_sm
from nltk.stem.snowball import SnowballStemmer

In [None]:
# Load the member-survey export. The first two spreadsheet rows are skipped
# and the columns get the names listed below.
df = pd.read_excel(
    'ledenenquete_over_heel_de_mens.xlsx',
    skiprows=2,
    names=[
        'Serienummer', 'SID', 'Submitted Time', 'Verwerkingstijd',
        'Modified Time', 'Kladversie', 'IP-adres', 'UID', 'Gebruikersnaam',
        'Naam', 'Email', 'Afdeling',
        'indruk',          # column position 12
        'ontwikkelingen',  # column position 13
        'redenen',         # column position 14
        'meegeven',        # column position 15
    ],
)

# Cells that are empty or contain only whitespace become NaN.
df = df.replace(r'^\s*$', np.nan, regex=True)

# Keep only the rows that have a value in 'Verwerkingstijd'.
df_complete = df.loc[df['Verwerkingstijd'].notnull()]

In [None]:
# configuration

# Dutch stemmer and Dutch stop-word list from nltk.
stemmer = SnowballStemmer('dutch')
stopwords_dutch = list(stopwords.words('dutch'))

# Dutch dictionary: the contents of wordlist.txt, split into tokens.
with open('wordlist.txt') as f:
    s = f.read()
DUTCH = word_tokenize(s)

# Load the Dutch spaCy pipeline and set the maximum text length it accepts.
nlp = nl_core_news_sm.load()
nlp.max_length = 1500000

In [None]:
# functions
def clean_line(line):
    """Normalize one free-text answer to a single lower-case line.

    The value is converted to ``str``. Every character that is not a word
    character, whitespace, or one of ``. / - : '`` is removed. Newlines and
    tabs become a single space so that the words on either side stay
    separate tokens. The result is lower-cased, stripped of surrounding
    whitespace, and always ends with a full stop.

    Parameters
    ----------
    line : any
        The raw cell value; it is passed through ``str()`` first.

    Returns
    -------
    str
        The cleaned line, terminated by ``'.'``.
    """
    line = str(line)
    # The hyphen is escaped on purpose: unescaped, ``/-:`` is read as the
    # character range from '/' to ':' and hyphens would be stripped from
    # hyphenated words.
    line = re.sub(r"[^\w\s./\-:']", '', line)
    # Replace line breaks and tabs by a space instead of deleting them, so
    # the last word of one line is not glued to the first word of the next.
    line = re.sub(r'[\n\t]', ' ', line)
    line = line.lower().strip()
    if not line.endswith('.'):
        line += '.'
    return line


# spell checker
def edit_distance(entry='validatrie', wordlist = DUTCH):
    """Return the word in ``wordlist`` that is closest to ``entry``.

    Only entries longer than six characters are corrected. Candidates are
    the words in ``wordlist`` that share the first five characters with
    ``entry``; the candidate with the smallest ``nltk.edit_distance`` is
    returned, ties being broken alphabetically.

    Parameters
    ----------
    entry : str
        The (possibly misspelled) word.
    wordlist : iterable of str
        The dictionary to search; defaults to the module-level ``DUTCH``.

    Returns
    -------
    The closest candidate, or ``entry`` unchanged when it is too short,
    when no candidate shares its five-character prefix, or when ``entry``
    or a dictionary item does not support ``len()`` / slicing.
    """
    try:
        if len(entry) <= 6:
            return entry
        prefix = entry[:5]
        candidates = [word for word in wordlist if word[:5] == prefix]
    except TypeError:
        # Best effort: values that cannot be measured or sliced (for
        # example NaN) are handed back untouched.
        return entry

    if not candidates:
        return entry

    # Smallest (distance, word) pair, i.e. the first element of the sorted
    # list of such pairs.
    return min(candidates, key=lambda word: (nltk.edit_distance(entry, word), word))
    
    
def write_subject_to_file(df, subject, column, nr):
    """Write the answers that mention ``subject`` to ``<subject>.txt``.

    Rows are selected where ``df[column]`` contains ``subject``. The match
    is done with ``Series.str.contains``, so ``subject`` is interpreted as
    a regular expression. For every selected row the value at column
    position ``nr`` is written to the file, one entry per line.

    Parameters
    ----------
    df : pandas.DataFrame
        The frame to search.
    subject : str
        The keyword (regular expression) to look for; also the stem of the
        output file name.
    column : str
        Label of the column that is searched.
    nr : int
        Zero-based position of the column whose text is written.
    """
    # na=False treats missing or non-text cells as "no match"; without it
    # the mask contains NaN and the boolean selection raises.
    matches = df[df[column].str.contains(subject, na=False)]
    file = subject + '.txt'
    with open(file, 'w') as o:
        # Explicit positional access; indexing a label-indexed row with a
        # bare integer is deprecated in pandas.
        for text in matches.iloc[:, nr]:
            # One entry per line so consecutive answers do not run together.
            o.write(text + '\n')
            
            
# schrijf bijvoorbeeld alles over visie in meegeven column naar een bestand
# write_subject_to_file(df_complete, 'visie', 'meegeven', 15)

# Analyse 

- 'indruk' #12
- 'ontwikkelingen' #13
- 'redenen' #14
- 'meegeven' #15


In [None]:
# Write every open-answer column to its own text file for the summary.
#
# The answers are read by column name instead of by the integer positions
# 12-15: indexing a label-indexed row with a bare integer is deprecated in
# pandas. These are the same columns under the names given when the sheet is
# loaded.
#
# NOTE(review): the filter is cumulative, exactly as before -- each step
# narrows df_complete further, so e.g. 'meegeven_txt' only holds rows that
# also answered the three earlier questions. Confirm that this is intended.
for column in ('indruk', 'ontwikkelingen', 'redenen', 'meegeven'):
    df_complete = df_complete[df_complete[column].notnull()]

    with open(column + '_txt', 'w') as out:
        for answer in df_complete[column]:
            out.write('\n' + clean_line(answer))

# Woord frequenties

In [None]:
# Read the text to analyse; point this at another file to analyse a
# different column. The context manager closes the file handle again.
with open('meegeven_txt', "r") as file:
    text = file.read()

In [None]:
# Run the spaCy pipeline over the lower-cased text with the 'ner' and
# 'parser' components disabled, and collect every token together with its
# tag_ and pos_ attributes.
doc = nlp(text.lower(), disable=['ner', 'parser'])

spacy_pos_tagged = []
for token in doc:
    spacy_pos_tagged.append((token, token.tag_, token.pos_))

df = pd.DataFrame(spacy_pos_tagged, columns=['word', 'pos_tag', 'tag_type'])
df.head()

In [None]:
# Print the distinct values that occur in the 'tag_type' column.
print(set(df['tag_type']))

In [None]:
# One sub-frame per word class, selected on the 'tag_type' column.
# Adverbs additionally require the 'pos_tag' value 'BW'.
advs = df[df['pos_tag'].eq('BW') & df['tag_type'].eq('ADV')]
vreemd = df.loc[df['tag_type'].eq('X')]
nouns = df.loc[df['tag_type'].eq('NOUN')]
verbs = df.loc[df['tag_type'].eq('VERB')]
intjs = df.loc[df['tag_type'].eq('INTJ')]
props = df.loc[df['tag_type'].eq('PROPN')]
adj = df.loc[df['tag_type'].eq('ADJ')]

In [None]:
# Build the word list from the selected word classes (here: nouns plus the
# tokens tagged 'X'), as plain strings and without Dutch stop words.
sub = pd.concat([nouns, vreemd])
words = [
    token_text
    for token_text in sub['word'].astype(str).tolist()
    if token_text not in stopwords_dutch
]
# Optional extra steps, disabled by default:
#words = [stemmer.stem(word) for word in words]
#words = [edit_distance(word, wordlist = DUTCH) for word in words if word not in DUTCH] 

In [None]:
# Count how often each entry of `words` occurs.
from nltk.probability import FreqDist
fdist = FreqDist(words)

In [None]:
# Show the 25 most frequent words together with their counts.
fdist.most_common(25)

In [None]:
# Look up the count of one specific word.
fdist['gelijkwaardigheid']

# Wordclouds

In [None]:
# make wordcloud
from PIL import Image
import matplotlib.pyplot as plt
from wordcloud import WordCloud

%matplotlib notebook
def show_wordcloud(dictionary):
    """Draw a word cloud for a word -> frequency mapping.

    The cloud is generated with ``WordCloud.generate_from_frequencies`` and
    shown in a new matplotlib figure with the axes hidden.

    Parameters
    ----------
    dictionary : mapping
        Words mapped to their frequencies (for example an nltk FreqDist).
    """
    settings = {
        'background_color': "white",
        'width': 1000,
        'height': 1000,
        'min_word_length': 3,
        'include_numbers': False,
        'colormap': 'tab20',
        'collocations': True,
        'normalize_plurals': False,
    }
    cloud = WordCloud(**settings)
    cloud.generate_from_frequencies(dictionary)

    plt.figure()
    plt.imshow(cloud)
    plt.axis("off")
    plt.show()
    
# Draw the word cloud for the frequency distribution, then save the current
# matplotlib figure as an image.
# NOTE(review): savefig runs after show_wordcloud has already called
# plt.show(); presumably fine with the `notebook` backend selected above, but
# other backends may write an empty figure -- verify the saved image.
show_wordcloud(fdist)
plt.savefig('wordcloud.jpg')