# Analysis

In [198]:
import time
import tqdm
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
import seaborn as sns
import re
import lemmy # For lemmatization
import nltk
from nltk.stem import SnowballStemmer
import itertools

from sklearn.feature_extraction.text import CountVectorizer

## Load datasets

In [14]:
ft_sygeplej2x = pd.read_csv('ft_sygeplej2x.csv')
dr_sygeplej2x = pd.read_csv('dr_sygeplej2x.csv')
tv2_sygeplej2x = pd.read_csv('tv2_sygeplej2x.csv')

ft_2 = ft_sygeplej2x.copy() 
dr_2 = dr_sygeplej2x.copy() 
tv2_2 = tv2_sygeplej2x.copy()

# Preprocessing

## Remove non-alphanumerical characters

In [22]:
for df in [ft_2, dr_2, tv2_2]:
    df['content'] = df['content'].str.replace(r'\W', ' ')\
                                 .str.replace('  ', ' ')

  df['content'] = df['content'].str.replace(r'\W', ' ')\


## Tokenization

In [92]:
# Download tokenizer
nltk.download("punkt")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jgb569\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [124]:
# Define a tokenizer function
def tokenizer(df):
    df_tokenslist = []
    for document in tqdm.tqdm(df['content']):
        tokens = nltk.tokenize.word_tokenize(document, language = 'danish')
        df_tokenslist.append(tokens)
    df_tokens = list(itertools.chain(*df_tokenslist))
    return df_tokens

In [125]:
dr_2_tokens = tokenizer(dr_2)
print(len(dr_2_tokens))
tv2_2_tokens = tokenizer(tv2_2)
print(len(tv2_2_tokens))
ft_2_tokens = tokenizer(ft_2)
print(len(ft_2_tokens))

100%|███████████████████████████████████████████████████████████████████████████████| 528/528 [00:01<00:00, 338.61it/s]


232998


100%|█████████████████████████████████████████████████████████████████████████████| 3607/3607 [00:16<00:00, 216.11it/s]


2562567


100%|████████████████████████████████████████████████████████████████████████████████| 296/296 [00:33<00:00,  8.94it/s]


NameError: name 'ft_2_tokens_tokens' is not defined

## Remove stopwords and create word dictionary

In [126]:
# Get stopwords list
nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('danish')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jgb569\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Tokenized content for three datasets

In [156]:
# Remove stopwords from token-list
dr_nostop = [word for word in dr_2_tokens if not word in stopwords]
tv2_nostop = [word for word in tv2_2_tokens if not word in stopwords]
ft_nostop = [word for word in ft_2_tokens if not word in stopwords]

### Create set of unique words

In [157]:
wordlist_complete = dr_nostop + tv2_nostop + ft_nostop

In [147]:
len(wordlist_complete)

5356096

The total number of words across our three datasets is 5356096.

Our unique wordset contains 115189 words.

## Stemming

### Stemming of entire wordlist

In [169]:
stemmer = SnowballStemmer("danish")

In [178]:
wordlist_stemmed_with_num = [stemmer.stem(word) for word in wordlist_complete]

In [185]:
# Delete duplicates
wordlist_stemmed_with_num = list(set(wordlist_stemmed_with_num))
len(wordlist_stemmed_with_num)

75802

In [187]:
# Remove numbers
wordlist_stemmed = [word for word in wordlist_stemmed_with_num if not word.isdigit()]            

### Stemming of content by source

In [193]:
def sourcestemmer(wordlist):
    wordlist_stemmed = [stemmer.stem(word) for word in wordlist]
    return wordlist_stemmed

In [194]:
dr_stemmed = sourcestemmer(dr_nostop)
tv2_stemmed = sourcestemmer(tv2_nostop)
ft_stemmed = sourcestemmer(ft_nostop)

### Wordcount for complete content of datasets

In [197]:
# Putting lists together to one large string as preprocessing for wordcount
dr_string = " ".join(dr_stemmed)
tv2_string = " ".join(tv2_stemmed)
ft_string = " ".join(ft_stemmed)

In [203]:
vectorizer = CountVectorizer() #Store the class in 'count' to ease coding

df_bag = vectorizer.fit_transform(dr_stemmed)
df_resutl = vectorizer.get_feature_names()
print(df_bag.toarray())

MemoryError: Unable to allocate 8.96 GiB for an array with shape (125596, 9574) and data type int64

In [None]:
bag = co

## Lemmatization

In [28]:
# Load Danish lemmatizer
lem = lemmy.load("da")

In [166]:
wordlist_lem = [lem.lemmatize("", word) for word in wordset]

In [None]:
# Create a list instead of list of list
wordlist_lem = [word for sublist in wordlist_lem for word in sublist]

In [164]:
wordlist_lem_2 = wordlist_lem_2 

list

Comment: The lemmatization returns a list of lists that also contains more than two words which could lead to problems.
    

## Bag of words

In [None]:
count = CountVectorizer() #Store the class in 'count' to ease coding

review_array = df['review'].values[0:2] #Take the first two reviews and store them in an array
bag = count.fit_transform(review_array) #fit_transform takes an array as input and outputs the bag of words