# Analysis

In [141]:
import time
import tqdm
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
import seaborn as sns
import re
import lemmy # For lemmatization
import nltk
from nltk.stem import SnowballStemmer
import itertools
import os
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

## Load datasets

In [111]:
ft_sygeplej2x = pd.read_csv('ft_sygeplej2x.csv')
dr_sygeplej2x = pd.read_csv('dr_sygeplej2x.csv')
tv2_sygeplej2x = pd.read_csv('tv2_sygeplej2x.csv')

ft_2 = ft_sygeplej2x.copy() 
dr_2 = dr_sygeplej2x.copy() 
tv2_2 = tv2_sygeplej2x.copy()

# Preprocessing

## Remove non-alphanumerical characters

In [112]:
for df in [ft_2, dr_2, tv2_2]:
    df['content'] = df['content'].str.replace(r'\W', ' ')\
                                 .str.replace('  ', ' ')

  df['content'] = df['content'].str.replace(r'\W', ' ')\


## Tokenization

In [113]:
# Download tokenizer
nltk.download("punkt")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jgb569\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [114]:
dr_2['tokenized'] = dr_2.apply(lambda row: nltk.tokenize.word_tokenize(row["content"]), axis = 1)
tv2_2['tokenized'] = tv2_2.apply(lambda row: nltk.tokenize.word_tokenize(row["content"]), axis = 1)
ft_2['tokenized'] = ft_2.apply(lambda row: nltk.tokenize.word_tokenize(row["content"]), axis = 1)

### Stemming of entire wordlist

In [115]:
stemmer = SnowballStemmer("danish")

In [119]:
def stem_sentences(document):
    non_alpha = re.sub(r'[^\w\s]', '', document)
    tokens = non_alpha.split()
    stemmed_tokens=[stemmer.stem(word) for word in tokens] # all words in a list
    no_stop = [word for word in stemmed_tokens if not word in stopwords]
    stems=' '.join(no_stop)
    stems_no_num = re.sub(r'[^\D+]', '', stems)
    return stems_no_num

#### Bag  of words

In [169]:
def add_stem_col(df):
    df_stemmed_list=[]
    for row in tqdm.tqdm(df['content']):
        stems=stem_sentences(row)
        df_stemmed_list.append(stems)
    df['stems'] = df_stemmed_list
    return df

In [170]:
dr_analysis = add_stem_col(dr_2)
tv2_analysis = add_stem_col(tv2_2)
ft_analysis = add_stem_col(ft_2)

100%|███████████████████████████████████████████████████████████████████████████████| 528/528 [00:03<00:00, 147.00it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 3607/3607 [00:32<00:00, 111.42it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 296/296 [01:37<00:00,  3.03it/s]


In [171]:
def BoW(df): 
    count = CountVectorizer()
    df_array = df['stems']
    bag = count.fit_transform(df_array)
    
    count_array = bag.toarray() #Make the bag to an array
    matrix = pd.DataFrame(data=count_array,columns = count.get_feature_names())
    matrix_sum = matrix.sum().transpose()
    matrix_sum.sort_values(ascending = False, inplace = True)
    return matrix_sum

In [172]:
dr_bow = BoW(dr_analysis)
tv2_bow = BoW(tv2_analysis)
ft_bow = BoW(ft_analysis)

In [179]:
dr_bow[200:260]

hvilk               112
lær                 110
mindr               110
virk                109
stud                109
nyhed               108
stig                107
praktis             106
stilling            106
dår                 105
faktisk             104
nødt                103
netop               103
job                 102
selvfølg            102
ansæt               102
faggrup             101
nej                 101
vurd                101
sker                101
følg                101
gjort               101
altså               100
måsk                 99
stad                 98
forsøg               98
milliard             96
arbejdsmiljø         95
klart                95
handl                95
vej                  95
privat               95
uger                 95
mennesk              94
ansvar               94
stil                 94
haft                 93
begynd               91
understreg           91
enkelt               91
tænk                 91
fokus           

In [158]:
test = BoW(dr_2)

In [159]:
test

sygeplejersk            2969
ikk                     2533
kan                     1905
så                      1531
patient                 1328
                        ... 
næstsidst                  1
diskussionsspørgsmål       1
dispensation               1
nærpolitistation           1
lægevagtplanlægning        1
Length: 9194, dtype: int64

In [136]:

matrix_sum.to_csv("dr_word_frequency.csv")

#### TV2

In [146]:
stemmed_list=[]
for i in tv2_2['content']:
    stems=stem_sentences(i)
    stemmed_list.append(stems)

tv2_2['stems']=stemmed_list

KeyboardInterrupt: 

In [147]:
count = CountVectorizer() #Store the class in 'count' to ease coding
count_array = tv2_2['stems'] #Take the first two reviews and store them in an array
bag = count.fit_transform(count_array) #fit_

In [148]:
array = bag.toarray() #Make the bag to an array
matrix = pd.DataFrame(data=array,columns = count.get_feature_names()) #Input the bag and the words into a dataframe

In [140]:
matrix_sum = matrix.sum().transpose()
matrix_sum.sort_values(ascending = False)
matrix_sum.to_csv("tv2_word_frequency.csv")

#### Folketinget

In [129]:
stemmed_list=[]
for i in tqdm.tqdm(ft_2['content']):
    stems=stem_sentences(i)
    stemmed_list.append(stems)

ft_2['stems']=stemmed_list

100%|████████████████████████████████████████████████████████████████████████████████| 296/296 [01:50<00:00,  2.69it/s]


In [130]:
count = CountVectorizer() #Store the class in 'count' to ease coding
count_array = ft_2['stems'] #Take the first two reviews and store them in an array
bag = count.fit_transform(count_array) #fit_

In [131]:
array = bag.toarray() #Make the bag to an array
matrix = pd.DataFrame(data=array,columns = count.get_feature_names()) #Input the bag and the words into a dataframe

In [132]:
matrix_sum = matrix.sum().transpose()
matrix_sum.sort_values(ascending = False)

ikk                               98174
så                                82428
kan                               63835
vær                               45419
kom                               30076
                                  ...  
landedokumentationskontor             1
landdistriktsvækstpilotordning        1
landdistriktsudspil                   1
landdistriktssegment                  1
øvsag                                 1
Length: 45808, dtype: int64

### tf-idf

In [150]:
def tfidf(df):
    ############################## bag #################################
    count = CountVectorizer() #Choose only 2-grams
    
    df_array = df['stems']
    bag = count.fit_transform(df_array)
    ############################## bag #################################
    
    tfidf = TfidfTransformer()
    bag_tfidf = tfidf.fit_transform(bag) 

    tfidf_array = bag_tfidf.toarray() #Make the bag to an array
    matrix_tfidf = pd.DataFrame(data=tfidf_array,columns = count.get_feature_names())
    return matrix_tfidf

In [144]:
tf_idf_matrix = tfidf(dr_2)



Unnamed: 0,aabenraa,aag,aagaard,aahaug,aaholm,aalborg,aarhus,aarhusiansk,aaskov,abc,...,østjylland,østjysk,østkyst,østr,øve,øvels,øver,øverst,øvet,øvr
0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
523,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
524,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
525,0.0,0.0,0.0,0.0,0.0,0.034848,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
526,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Remove stopwords

In [7]:
# Get stopwords list
nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('danish')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jgb569\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Tokenized content for three datasets

In [8]:
# Remove stopwords from token-list
dr_nostop = [word for word in dr_2_tokens if not word in stopwords]
tv2_nostop = [word for word in tv2_2_tokens if not word in stopwords]
ft_nostop = [word for word in ft_2_tokens if not word in stopwords]

### Create set of unique words

In [9]:
wordlist_complete = dr_nostop + tv2_nostop + ft_nostop

['antallet',
 'danske',
 'sygeplejersker',
 'fået',
 'autorisation',
 'norge',
 'næsten',
 'tredoblet',
 'år',
 'krise',
 'fyringer',
 'ansættelsesstop',
 'får',
 'sygeplejerskerne',
 'tage',
 'norge',
 'arbejde',
 'sygeplejersker',
 'brænder',
 'fag',
 'helt',
 'naturligt',
 'søger',
 'derhen',
 'arbejde',
 'desværre',
 'situation',
 'danmark',
 'arbejdsgiverne',
 'valgt',
 'ansætte',
 'kompetente',
 'sygeplejersker',
 'bekymrende',
 'siger',
 'grete',
 'christensen',
 'formand',
 'dansk',
 'sygeplejeråd',
 'seneste',
 'tal',
 'viser',
 'antallet',
 'danske',
 'sygeplejersker',
 'fået',
 'autorisation',
 'norge',
 'steget',
 '154',
 'januar',
 '2010',
 '434',
 'januar',
 '2011',
 'helt',
 'afgørende',
 'herhjemme',
 'får',
 'gjort',
 'muligt',
 'tilbyde',
 'vores',
 'sygeplejersker',
 'job',
 'kommuner',
 'regionerne',
 'siger',
 'grete',
 'christensen',
 'trods',
 'fyringsrunder',
 'sygehusene',
 'både',
 '2010',
 '2011',
 'nedlæggelse',
 'sygehuse',
 'faxe',
 'nakskov',
 'kalundborg

In [10]:
len(wordlist_complete)

5356096

The total number of words across our three datasets is 5356096.

Our unique wordset contains 115189 words.

## Stemming

### Stemming of content by source

In [15]:
def sourcestemmer(wordlist):
    wordlist_stemmed = [stemmer.stem(word) for word in wordlist]
    return wordlist_stemmed

In [16]:
dr_stemmed = sourcestemmer(dr_nostop)
tv2_stemmed = sourcestemmer(tv2_nostop)
ft_stemmed = sourcestemmer(ft_nostop)

### Wordcount for complete content of datasets

In [17]:
# Putting lists together to one large string as preprocessing for wordcount
dr_string = " ".join(dr_stemmed)
tv2_string = " ".join(tv2_stemmed)
ft_string = " ".join(ft_stemmed)

In [18]:
count = CountVectorizer() 

#### DR

In [20]:
#Store the class in 'count' to ease coding
dr_bag = count.fit_transform(dr_stemmed) #fit_transform takes an array as input and outputs the bag of words

dr_count_array = dr_bag.toarray() #Make the bag to an array
dr_matrix = pd.DataFrame(data=dr_count_array,columns = count.get_feature_names_out()) #Input the bag and the words into a dataframe
dr_matrix_sum = dr_matrix.sum().transpose()
dr_matrix_sum.sort_values(ascending = False)

MemoryError: Unable to allocate 8.96 GiB for an array with shape (125596, 9574) and data type int64

In [26]:
os.listdir()

AttributeError: module 'os' has no attribute 'list_dir'

#### TV2

In [None]:
#Store the class in 'count' to ease coding
tv2_bag = count.fit_transform(tv2_stemmed) #fit_transform takes an array as input and outputs the bag of words

tv2_array = tv2_bag.toarray() #Make the bag to an array
tv2_matrix = pd.DataFrame(data=tv2_array,columns = count.get_feature_names_out()) #Input the bag and the words into a dataframe
tv2_matrix_sum = tv2_matrix.sum().transpose()
tv2_matrix_sum.sort_values(ascending = False)

#### Folketinget

In [None]:
#Store the class in 'count' to ease coding
ft_bag = count.fit_transform(ft_stemmed) #fit_transform takes an array as input and outputs the bag of words

ft_count_array = ft_bag.toarray() #Make the bag to an array
ft_matrix = pd.DataFrame(data=ft_count_array,columns = count.get_feature_names_out()) #Input the bag and the words into a dataframe
ft_matrix_sum = ft_matrix.sum().transpose()
ft_matrix_sum.sort_values(ascending = False)

## Lemmatization

In [None]:
# Load Danish lemmatizer
lem = lemmy.load("da")

In [None]:
wordlist_lem = [lem.lemmatize("", word) for word in wordset]

In [None]:
# Create a list instead of list of list
wordlist_lem = [word for sublist in wordlist_lem for word in sublist]

In [None]:
wordlist_lem_2 = wordlist_lem_2 

Comment: The lemmatization returns a list of lists that also contains more than two words which could lead to problems.
    

## Stemming and bag of words for each article

# Playground

In [None]:
"sygeplej"+r"

In [None]:
patter = re.compile

In [None]:
findallcan be anywhere

In [None]:
prog = re.compile(pattern)
result = prog.match(string)