# Analysis

In [365]:
import time
import tqdm
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
import seaborn as sns
import re
import lemmy # For lemmatization
import nltk
from nltk.stem import SnowballStemmer
import itertools
import os
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

## Load datasets

In [366]:
ft_sygeplej2x = pd.read_csv('ft_sygeplej2x.csv')
dr_sygeplej2x = pd.read_csv('dr_sygeplej2x.csv')
tv2_sygeplej2x = pd.read_csv('tv2_sygeplej2x.csv')

ft_2 = ft_sygeplej2x.copy() 
dr_2 = dr_sygeplej2x.copy() 
tv2_2 = tv2_sygeplej2x.copy()

# Analysis cleaning

In [367]:
def analysis_cleaner(document):
    document = re.sub(r'[^\w\s]', '', document) #Remove non-alphanumeric characters
    document = re.sub(r'[^\D+]', '', document) #Remove non-numberic characters
    document = document.replace(r'\W', ' ')\
                .replace('  ', ' ')
    return document

In [368]:
dr_analysis = dr_2.copy()
tv2_analysis = tv2_2.copy()
ft_analysis = ft_2.copy()

dr_analysis["content_cleaned"] = dr_2["content"].apply(analysis_cleaner)
tv2_analysis["content_cleaned"] = tv2_2["content"].apply(analysis_cleaner)
ft_analysis["content_cleaned"] = ft_2["content"].apply(analysis_cleaner)

## Preprocessing

### Add tokenized column

In [369]:
# Download tokenizer
nltk.download("punkt")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jgb569\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [370]:
dr_analysis['tokenized'] = dr_analysis.apply(lambda row: nltk.tokenize.word_tokenize(row["content_cleaned"]), axis = 1)
tv2_analysis['tokenized'] = tv2_analysis.apply(lambda row: nltk.tokenize.word_tokenize(row["content_cleaned"]), axis = 1)
ft_analysis['tokenized'] = ft_analysis.apply(lambda row: nltk.tokenize.word_tokenize(row["content_cleaned"]), axis = 1)

### Add preprocess column

In [371]:
def pre_process(document):
    stemmer = SnowballStemmer("danish")
    
    document = nltk.tokenize.word_tokenize(document) # tokenize
    document = [word for word in document if not word in stopwords] # delete stopwords
    document = [stemmer.stem(word) for word in document] # all stemmed words in a list
    document =' '.join(document) # joining stemmed words
    return document

In [372]:
dr_analysis["content_prepr"] = dr_analysis["content_cleaned"].apply(pre_process)
tv2_analysis["content_prepr"] = tv2_analysis["content_cleaned"].apply(pre_process)
ft_analysis["content_prepr"] = ft_analysis["content_cleaned"].apply(pre_process)

### Bag  of words - Wordcount

In [373]:
def BoW(df): 
    count = CountVectorizer()
    df_array = df["content_prepr"]
    bag = count.fit_transform(df_array)
    
    count_array = bag.toarray() #Make the bag to an array
    matrix = pd.DataFrame(data=count_array,columns = count.get_feature_names())
    matrix_sum = matrix.sum().transpose()
    matrix_sum.sort_values(ascending = False, inplace = True)
    return matrix_sum

In [374]:
def BoW_relevant(df): 
    count = CountVectorizer()
    df_array = df['relevant']
    bag = count.fit_transform(df_array)
    
    count_array = bag.toarray() #Make the bag to an array
    matrix = pd.DataFrame(data=count_array,columns = count.get_feature_names())
    matrix_sum = matrix.sum().transpose()
    matrix_sum.sort_values(ascending = False, inplace = True)
    return matrix_sum

In [375]:
dr_bow = BoW(dr_analysis)
tv2_bow = BoW(tv2_analysis)
ft_bow = BoW(ft_analysis)

### Wordcount with relevant words

In [307]:
def extract_surround_words2(text, keyword, n):
    '''
    text : input text
    keyword : the search keyword we are looking
    n : number of words around the keyword
    '''
    surround_words=[]
    #extracting all the words from text
    words = words = re.findall(r'\w+', text)
    
    #iterate through all the words
    for index, word in enumerate(words):

        #check if search keyword matches
        if word == keyword:
            #fetch left side words
            left_side_words = words[index-n : index]
            
            #fetch right side words
            right_side_words = words[index+1 : index + n + 1]
            
            surround_words.append(left_side_words)
            surround_words.append(right_side_words)
    return surround_words

In [308]:
def find_relevant(df, word, n):
    r=[]
    for row in df["content_prepr"]:
        temp=extract_surround_words2(row, word, n)
        temp_list=[]
        for i in temp:
            stems=' '.join(i)
            temp_list.append(stems)
            s=" ".join(temp_list)
        r.append(s)
    df['relevant']=r
    return df

#### Create list of most relevant words
##### DR

In [381]:
# With three words at each side of "sygeplejsk*"
dr_relevant_3 = find_relevant(dr_analysis, "sygeplejersk", 3)
dr_relevant_3_BoW  = BoW_relevant(dr_relevant_3)

# With three words at each side of "sygeplejsk*"
dr_relevant_5 = find_relevant(dr_analysis, "sygeplejersk", 5)
dr_relevant_5_BoW  = BoW_relevant(dr_relevant_5)

# With three words at each side of "sygeplejsk*"
dr_relevant_15 = find_relevant(dr_analysis, "sygeplejersk", 15)
dr_relevant_15_BoW  = BoW_relevant(dr_relevant_15)

In [382]:
dr_relevant_3.to_csv("dr_relevant_3.csv")
display(dr_relevant_3_BoW .head(30))

kan             274
læg             266
fler            262
sig             223
arbejd          206
så              195
dansk           155
strejk          146
sygeplejersk    122
kom             119
patient         118
region          112
men             105
år               97
bland            96
tid              92
land             91
tag              89
få               87
får              86
dag              84
ansat            84
and              83
sygehus          81
andr             80
uddan            78
mer              74
gør              71
mangl            70
ved              69
dtype: int64

In [383]:
display(dr_relevant_5_BoW .head(30))

kan             444
sig             412
sygeplejersk    352
fler            347
så              337
læg             326
arbejd          287
patient         270
dansk           265
strejk          228
kom             209
region          203
år              174
tid             156
få              153
men             151
ved             142
bland           135
tag             134
får             133
sygehus         133
dag             132
sygeplejeråd    127
and             122
mer             120
andr            116
land            116
løn             115
gør             115
ansat           112
dtype: int64

##### TV2

In [384]:
# With three words at each side of "sygeplejsk*"
tv2_relevant_3 = find_relevant(tv2_analysis, "sygeplejersk", 3)
tv2_relevant_3_BoW = BoW_relevant(tv2_relevant_3)

# With three words at each side of "sygeplejsk*"
tv2_relevant_5 = find_relevant(tv2_analysis, "sygeplejersk", 5)
tv2_relevant_5_BoW  = BoW_relevant(tv2_relevant_5)

# With three words at each side of "sygeplejsk*"
tv2_relevant_15 = find_relevant(tv2_analysis, "sygeplejersk", 15)
tv2_relevant_15_BoW  = BoW_relevant(tv2_relevant_15)

In [385]:
tv2_relevant_3.to_csv("tv2_relevant_3.csv")
display(tv2_relevant_3_BoW .head(30))

læg             1174
arbejd           517
sig              470
strejk           412
fler             376
kan              369
så               304
kom              290
hospital         289
årig             258
men              238
dansk            235
bland            235
to               229
ved              228
andr             218
dag              210
patient          209
sid              204
sygeplejersk     204
tag              202
tid              199
and              195
uddan            194
land             191
pædagog          178
blev             178
region           177
år               177
løn              150
dtype: int64

In [386]:
display(tv2_relevant_5_BoW .head(30))

læg             1282
sig              789
arbejd           706
kan              643
sygeplejersk     564
så               551
strejk           534
fler             527
kom              482
patient          456
hospital         448
dansk            432
ved              393
dag              390
men              368
årig             346
år               341
bland            338
tid              333
andr             317
ifølg            316
tag              315
to               307
region           307
land             291
sid              290
blev             283
and              277
få               259
mer              233
dtype: int64

##### FT

In [387]:
# With three words at each side of "sygeplejsk*"
ft_relevant_3 = find_relevant(ft_analysis, "sygeplejersk", 3)
ft_relevant_3_BoW  = BoW_relevant(ft_relevant_3)

# With three words at each side of "sygeplejsk*"
ft_relevant_5 = find_relevant(ft_analysis, "sygeplejersk", 5)
ft_relevant_5_BoW  = BoW_relevant(ft_relevant_5)

# With three words at each side of "sygeplejsk*"
ft_relevant_15 = find_relevant(ft_analysis, "sygeplejersk", 15)
ft_relevant_15_BoW  = BoW_relevant(ft_relevant_15)

In [388]:
ft_relevant_3.to_csv("ft_relevant_3.csv")
display(ft_relevant_3_BoW .head(30))

læg              308
fler             246
så               208
kan              204
pædagog          131
andr             102
sig               96
kom               80
uddan             79
vor               77
sygeplejersk      76
fek               74
lær               70
sosu              67
sosuassistent     64
ansæt             60
jordemødr         59
arbejd            57
ved               54
mer               53
ansat             52
altså             51
få                51
får               49
tag               49
gør               47
social            47
sam               46
politibetjent     45
lig               45
dtype: int64

In [389]:
display(ft_relevant_5_BoW .head(30))

så              364
læg             351
kan             342
fler            315
pædagog         151
sygeplejersk    150
andr            139
sig             139
kom             134
vor             119
arbejd          115
uddan           114
ved             103
fek              96
tag              87
mer              86
lær              86
dag              84
få               83
lig              83
altså            82
ansat            80
peng             77
gør              76
regering         76
sosu             75
syn              75
brug             74
dansk            73
sam              73
dtype: int64

### tf-idf

In [380]:
tv2_relevant_3

læg               1174
arbejd             517
sig                470
strejk             412
fler               376
                  ... 
situationerog        1
situationenpå        1
charmeoffensiv       1
sindssyg             1
prisfald             1
Length: 5549, dtype: int64

In [377]:
def tfidf(df):
    ############################## bag #################################
    count = CountVectorizer() #Choose only 2-grams
    
    df_array = df['relevant']
    bag = count.fit_transform(df_array)
    ############################## bag #################################
    
    tfidf = TfidfTransformer()
    bag_tfidf = tfidf.fit_transform(bag) 

    tfidf_array = bag_tfidf.toarray() #Make the bag to an array
    matrix_tfidf = pd.DataFrame(data=tfidf_array,columns = count.get_feature_names())
    return matrix_tfidf

In [378]:
tf_idf_matrix = tfidf(dr_analysis)

KeyError: 'relevant'

## Remove stopwords

In [7]:
# Get stopwords list
nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('danish')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jgb569\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Tokenized content for three datasets

In [8]:
# Remove stopwords from token-list
dr_nostop = [word for word in dr_2_tokens if not word in stopwords]
tv2_nostop = [word for word in tv2_2_tokens if not word in stopwords]
ft_nostop = [word for word in ft_2_tokens if not word in stopwords]

### Create set of unique words

In [9]:
wordlist_complete = dr_nostop + tv2_nostop + ft_nostop

['antallet',
 'danske',
 'sygeplejersker',
 'fået',
 'autorisation',
 'norge',
 'næsten',
 'tredoblet',
 'år',
 'krise',
 'fyringer',
 'ansættelsesstop',
 'får',
 'sygeplejerskerne',
 'tage',
 'norge',
 'arbejde',
 'sygeplejersker',
 'brænder',
 'fag',
 'helt',
 'naturligt',
 'søger',
 'derhen',
 'arbejde',
 'desværre',
 'situation',
 'danmark',
 'arbejdsgiverne',
 'valgt',
 'ansætte',
 'kompetente',
 'sygeplejersker',
 'bekymrende',
 'siger',
 'grete',
 'christensen',
 'formand',
 'dansk',
 'sygeplejeråd',
 'seneste',
 'tal',
 'viser',
 'antallet',
 'danske',
 'sygeplejersker',
 'fået',
 'autorisation',
 'norge',
 'steget',
 '154',
 'januar',
 '2010',
 '434',
 'januar',
 '2011',
 'helt',
 'afgørende',
 'herhjemme',
 'får',
 'gjort',
 'muligt',
 'tilbyde',
 'vores',
 'sygeplejersker',
 'job',
 'kommuner',
 'regionerne',
 'siger',
 'grete',
 'christensen',
 'trods',
 'fyringsrunder',
 'sygehusene',
 'både',
 '2010',
 '2011',
 'nedlæggelse',
 'sygehuse',
 'faxe',
 'nakskov',
 'kalundborg

In [10]:
len(wordlist_complete)

5356096

The total number of words across our three datasets is 5356096.

Our unique wordset contains 115189 words.

## Stemming

### Stemming of content by source

In [15]:
def sourcestemmer(wordlist):
    wordlist_stemmed = [stemmer.stem(word) for word in wordlist]
    return wordlist_stemmed

In [16]:
dr_stemmed = sourcestemmer(dr_nostop)
tv2_stemmed = sourcestemmer(tv2_nostop)
ft_stemmed = sourcestemmer(ft_nostop)

### Wordcount for complete content of datasets

In [17]:
# Putting lists together to one large string as preprocessing for wordcount
dr_string = " ".join(dr_stemmed)
tv2_string = " ".join(tv2_stemmed)
ft_string = " ".join(ft_stemmed)

In [18]:
count = CountVectorizer() 

#### DR

In [20]:
#Store the class in 'count' to ease coding
dr_bag = count.fit_transform(dr_stemmed) #fit_transform takes an array as input and outputs the bag of words

dr_count_array = dr_bag.toarray() #Make the bag to an array
dr_matrix = pd.DataFrame(data=dr_count_array,columns = count.get_feature_names_out()) #Input the bag and the words into a dataframe
dr_matrix_sum = dr_matrix.sum().transpose()
dr_matrix_sum.sort_values(ascending = False)

MemoryError: Unable to allocate 8.96 GiB for an array with shape (125596, 9574) and data type int64

In [26]:
os.listdir()

AttributeError: module 'os' has no attribute 'list_dir'

#### TV2

In [None]:
#Store the class in 'count' to ease coding
tv2_bag = count.fit_transform(tv2_stemmed) #fit_transform takes an array as input and outputs the bag of words

tv2_array = tv2_bag.toarray() #Make the bag to an array
tv2_matrix = pd.DataFrame(data=tv2_array,columns = count.get_feature_names_out()) #Input the bag and the words into a dataframe
tv2_matrix_sum = tv2_matrix.sum().transpose()
tv2_matrix_sum.sort_values(ascending = False)

#### Folketinget

In [None]:
#Store the class in 'count' to ease coding
ft_bag = count.fit_transform(ft_stemmed) #fit_transform takes an array as input and outputs the bag of words

ft_count_array = ft_bag.toarray() #Make the bag to an array
ft_matrix = pd.DataFrame(data=ft_count_array,columns = count.get_feature_names_out()) #Input the bag and the words into a dataframe
ft_matrix_sum = ft_matrix.sum().transpose()
ft_matrix_sum.sort_values(ascending = False)

## Lemmatization

In [None]:
# Load Danish lemmatizer
lem = lemmy.load("da")

In [None]:
wordlist_lem = [lem.lemmatize("", word) for word in wordset]

In [None]:
# Create a list instead of list of list
wordlist_lem = [word for sublist in wordlist_lem for word in sublist]

In [None]:
wordlist_lem_2 = wordlist_lem_2 

Comment: The lemmatization returns a list of lists that also contains more than two words which could lead to problems.
    

## Stemming and bag of words for each article

# Playground