# Analysis

In [3]:
import time
import tqdm
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
import seaborn as sns
import re
import lemmy # For lemmatization
import nltk
import itertools

## Load datasets

In [4]:
# Read each source's CSV file into its own raw DataFrame.
ft_sygeplej2x, dr_sygeplej2x, tv2_sygeplej2x = (
    pd.read_csv(csv_name)
    for csv_name in ('ft_sygeplej2x.csv', 'dr_sygeplej2x.csv', 'tv2_sygeplej2x.csv')
)

# Work on copies so the frames as loaded from disk stay available unmodified.
ft_2, dr_2, tv2_2 = (
    raw_frame.copy()
    for raw_frame in (ft_sygeplej2x, dr_sygeplej2x, tv2_sygeplej2x)
)

# Preprocessing

## Remove non-alphanumeric characters

In [5]:
# Normalise the `content` column of every working frame in place:
#   1. replace each non-word character (regex \W, which also covers newlines
#      and tabs) with a single space;
#   2. collapse every run of two or more spaces into one space.
# `regex=True` is passed explicitly: pandas deprecated relying on the implicit
# default and treats patterns as literal text from pandas 2.0 on, which would
# make r'\W' match nothing useful.
# A plain '  ' -> ' ' replacement only halves runs of spaces (three spaces
# still leave two behind), hence the ' {2,}' pattern.
for df in [ft_2, dr_2, tv2_2]:
    df['content'] = (
        df['content']
        .str.replace(r'\W', ' ', regex=True)
        .str.replace(r' {2,}', ' ', regex=True)
    )

  df['content'] = df['content'].str.replace(r'\W', ' ')\


## Tokenization

In [6]:
# Download NLTK's "punkt" tokenizer data before nltk.tokenize.word_tokenize
# is called in the tokenizer cell below.
nltk.download("punkt")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\kim\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [7]:
# Define a tokenizer function
def tokenizer(df):
    """Tokenize every document in ``df['content']`` into one flat token list.

    Each document is split with NLTK's word tokenizer using the Danish
    language setting, while a tqdm progress bar tracks the documents.

    Parameters
    ----------
    df : mapping with a ``'content'`` entry (e.g. a DataFrame)
        ``df['content']`` must be an iterable of document strings.

    Returns
    -------
    list
        All tokens of all documents, in document order. Document
        boundaries are not preserved.
    """
    per_document = [
        nltk.tokenize.word_tokenize(text, language='danish')
        for text in tqdm.tqdm(df['content'])
    ]
    # Flatten the list of per-document token lists into a single list.
    return list(itertools.chain.from_iterable(per_document))

In [8]:
def _tokenize_and_report(frame):
    """Tokenize ``frame`` with ``tokenizer``, print the token count, return the tokens."""
    tokens = tokenizer(frame)
    print(len(tokens))
    return tokens


# One flat token list per source; each call prints that source's token count.
dr_2_tokens = _tokenize_and_report(dr_2)
tv2_2_tokens = _tokenize_and_report(tv2_2)
ft_2_tokens = _tokenize_and_report(ft_2)

100%|███████████████████████████████████████████████████████████████████████████████| 528/528 [00:00<00:00, 700.69it/s]


232998


100%|█████████████████████████████████████████████████████████████████████████████| 3607/3607 [00:08<00:00, 447.34it/s]


2562567


100%|████████████████████████████████████████████████████████████████████████████████| 296/296 [00:20<00:00, 14.15it/s]


7843399


## Remove stopwords

In [9]:
# Download NLTK's stopwords package, then load its Danish word list into
# `stopwords` for the stopword-removal step below.
nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('danish')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kim\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [10]:
# Inspect which container type stopwords.words() returned.
type(stopwords)

list

In [13]:
# Drop stopwords from each source's token list.
# `stopwords` is a list, so `word in stopwords` scans it linearly for every
# token; with millions of tokens that cost adds up. A frozenset gives
# constant-time lookups and yields exactly the same filtered lists.
stopword_set = frozenset(stopwords)

# NOTE(review): matching is case-sensitive, so a capitalised stopword (e.g. at
# the start of a sentence) is kept if the list only holds lowercase entries;
# confirm whether tokens should be lowercased before filtering.
dr_word_stop = [word for word in dr_2_tokens if word not in stopword_set]
ft_word_stop = [word for word in ft_2_tokens if word not in stopword_set]
tv2_word_stop = [word for word in tv2_2_tokens if word not in stopword_set]

In [14]:
# Pool the stopword-filtered tokens of all three sources (DR, FT, TV2 order).
full = [*dr_word_stop, *ft_word_stop, *tv2_word_stop]

In [15]:
# Total number of tokens across the three sources after stopword removal.
len(full)

5356096

In [18]:
# Vocabulary (distinct tokens) of each source after stopword removal ...
ft_unique, dr_unique, tv2_unique = (
    set(tokens) for tokens in (ft_word_stop, dr_word_stop, tv2_word_stop)
)
# ... and of the pooled token list.
unique = set(full)

In [20]:
# Vocabulary sizes, in the order: pooled, FT, DR, TV2.
tuple(map(len, (unique, ft_unique, dr_unique, tv2_unique)))

(115189, 76460, 14498, 66591)

## Lemmatization

In [28]:
# Load lemmy's Danish ("da") lemmatizer into `lem`.
lem = lemmy.load("da")

['sygeplejerske']

In [None]:
def _lemmatize_text(text):
    """Return ``text`` with every whitespace-separated token replaced by its lemma.

    ``lem.lemmatize`` is called with an empty POS tag and returns a list of
    candidate lemmas; the first candidate is used, and the token is kept
    unchanged if the list comes back empty.
    NOTE(review): confirm that taking the first candidate is acceptable when
    lemmy returns several.
    """
    lemmas = []
    for token in text.split():
        candidates = lem.lemmatize("", token)
        lemmas.append(candidates[0] if candidates else token)
    return ' '.join(lemmas)


# Add a lemmatized version of `content` to every working frame.
# na_action='ignore' passes missing values through instead of calling
# the helper on them.
for df in [ft_2, dr_2, tv2_2]:
    df['content_lem'] = df['content'].map(_lemmatize_text, na_action='ignore')
    

Collecting lemmyNote: you may need to restart the kernel to use updated packages.
  Downloading lemmy-2.1.0-py2.py3-none-any.whl (1.1 MB)
Installing collected packages: lemmy
Successfully installed lemmy-2.1.0

