# Analysis

In [25]:
import time
import tqdm
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
import seaborn as sns
import re
import lemmy # For lemmatization
import nltk
from nltk.stem import SnowballStemmer
import itertools
import os
from sklearn.feature_extraction.text import CountVectorizer

## Load datasets

In [2]:
ft_sygeplej2x = pd.read_csv('ft_sygeplej2x.csv')
dr_sygeplej2x = pd.read_csv('dr_sygeplej2x.csv')
tv2_sygeplej2x = pd.read_csv('tv2_sygeplej2x.csv')

ft_2 = ft_sygeplej2x.copy() 
dr_2 = dr_sygeplej2x.copy() 
tv2_2 = tv2_sygeplej2x.copy()

# Preprocessing

## Remove non-alphanumerical characters

In [3]:
for df in [ft_2, dr_2, tv2_2]:
    df['content'] = df['content'].str.replace(r'\W', ' ')\
                                 .str.replace('  ', ' ')

  df['content'] = df['content'].str.replace(r'\W', ' ')\


## Tokenization

In [4]:
# Download tokenizer
nltk.download("punkt")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jgb569\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [5]:
# Define a tokenizer function
def tokenizer(df):
    df_tokenslist = []
    for document in tqdm.tqdm(df['content']):
        tokens = nltk.tokenize.word_tokenize(document, language = 'danish')
        df_tokenslist.append(tokens)
    df_tokens = list(itertools.chain(*df_tokenslist))
    return df_tokens

In [6]:
dr_2_tokens = tokenizer(dr_2)
print(len(dr_2_tokens))
tv2_2_tokens = tokenizer(tv2_2)
print(len(tv2_2_tokens))
ft_2_tokens = tokenizer(ft_2)
print(len(ft_2_tokens))

100%|███████████████████████████████████████████████████████████████████████████████| 528/528 [00:01<00:00, 327.22it/s]


232998


100%|█████████████████████████████████████████████████████████████████████████████| 3607/3607 [00:14<00:00, 243.34it/s]


2562567


100%|████████████████████████████████████████████████████████████████████████████████| 296/296 [00:30<00:00,  9.60it/s]


7843399


## Remove stopwords and create word dictionary

In [7]:
# Get stopwords list
nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('danish')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jgb569\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Tokenized content for three datasets

In [8]:
# Remove stopwords from token-list
dr_nostop = [word for word in dr_2_tokens if not word in stopwords]
tv2_nostop = [word for word in tv2_2_tokens if not word in stopwords]
ft_nostop = [word for word in ft_2_tokens if not word in stopwords]

### Create set of unique words

In [9]:
wordlist_complete = dr_nostop + tv2_nostop + ft_nostop

In [10]:
len(wordlist_complete)

5356096

The total number of words across our three datasets is 5356096.

Our unique wordset contains 115189 words.

## Stemming

### Stemming of entire wordlist

In [11]:
stemmer = SnowballStemmer("danish")

In [12]:
wordlist_stemmed_with_num = [stemmer.stem(word) for word in wordlist_complete]

In [13]:
# Delete duplicates
wordlist_stemmed_with_num = list(set(wordlist_stemmed_with_num))
len(wordlist_stemmed_with_num)

75802

In [14]:
# Remove numbers
wordlist_stemmed = [word for word in wordlist_stemmed_with_num if not word.isdigit()]            

### Stemming of content by source

In [15]:
def sourcestemmer(wordlist):
    wordlist_stemmed = [stemmer.stem(word) for word in wordlist]
    return wordlist_stemmed

In [16]:
dr_stemmed = sourcestemmer(dr_nostop)
tv2_stemmed = sourcestemmer(tv2_nostop)
ft_stemmed = sourcestemmer(ft_nostop)

### Wordcount for complete content of datasets

In [17]:
# Putting lists together to one large string as preprocessing for wordcount
dr_string = " ".join(dr_stemmed)
tv2_string = " ".join(tv2_stemmed)
ft_string = " ".join(ft_stemmed)

In [18]:
count = CountVectorizer() 

#### DR

In [20]:
#Store the class in 'count' to ease coding
dr_bag = count.fit_transform(dr_stemmed) #fit_transform takes an array as input and outputs the bag of words

dr_count_array = dr_bag.toarray() #Make the bag to an array
dr_matrix = pd.DataFrame(data=dr_count_array,columns = count.get_feature_names_out()) #Input the bag and the words into a dataframe
dr_matrix_sum = dr_matrix.sum().transpose()
dr_matrix_sum.sort_values(ascending = False)

MemoryError: Unable to allocate 8.96 GiB for an array with shape (125596, 9574) and data type int64

In [26]:
os.listdir()

AttributeError: module 'os' has no attribute 'list_dir'

#### TV2

In [None]:
#Store the class in 'count' to ease coding
tv2_bag = count.fit_transform(tv2_stemmed) #fit_transform takes an array as input and outputs the bag of words

tv2_array = tv2_bag.toarray() #Make the bag to an array
tv2_matrix = pd.DataFrame(data=tv2_array,columns = count.get_feature_names_out()) #Input the bag and the words into a dataframe
tv2_matrix_sum = tv2_matrix.sum().transpose()
tv2_matrix_sum.sort_values(ascending = False)

#### Folketinget

In [None]:
#Store the class in 'count' to ease coding
ft_bag = count.fit_transform(ft_stemmed) #fit_transform takes an array as input and outputs the bag of words

ft_count_array = ft_bag.toarray() #Make the bag to an array
ft_matrix = pd.DataFrame(data=ft_count_array,columns = count.get_feature_names_out()) #Input the bag and the words into a dataframe
ft_matrix_sum = ft_matrix.sum().transpose()
ft_matrix_sum.sort_values(ascending = False)

## Lemmatization

In [None]:
# Load Danish lemmatizer
lem = lemmy.load("da")

In [None]:
wordlist_lem = [lem.lemmatize("", word) for word in wordset]

In [None]:
# Create a list instead of list of list
wordlist_lem = [word for sublist in wordlist_lem for word in sublist]

In [None]:
wordlist_lem_2 = wordlist_lem_2 

Comment: The lemmatization returns a list of lists that also contains more than two words which could lead to problems.
    

## Stemming and bag of words for each article