# Analysis

In [3]:
import time
import tqdm
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
import seaborn as sns
import re
import lemmy # For lemmatization
import nltk
from nltk.stem import SnowballStemmer
import itertools

from sklearn.feature_extraction.text import CountVectorizer

## Load datasets

In [199]:
ft_sygeplej2x = pd.read_csv('ft_sygeplej2x.csv')
dr_sygeplej2x = pd.read_csv('dr_sygeplej2x.csv')
tv2_sygeplej2x = pd.read_csv('tv2_sygeplej2x.csv')

ft = ft_sygeplej2x.copy() 
dr = dr_sygeplej2x.copy() 
tv2 = tv2_sygeplej2x.copy()

# Sample

In [180]:
sample = tv2[0:3].copy()

In [181]:
#sample = tv2

# Function

### Cleaning
- lower case (already done)
- remove non-alphanumeric characters
- remove numbers

In [182]:
def cleaner(document):
    document = document.lower() #To lower case
    document = re.sub(r'[^\w\s]','', document) #Remove non-alphanumeric characters
    document = re.sub(r'[^\D+]','', document) #Remove numeric characters
    return document

In [197]:
# apply cleaner
sample['content'] = sample['content'].apply(cleaner)

### Pre-processing
- Tokenize
- Remove stopwords
- Stemming

In [210]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stopwords = stopwords.words('danish')
stemmer = SnowballStemmer("danish")

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/dana_tiger/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [215]:
def pre_processing(df): 
    tokens = [nltk.tokenize.word_tokenize(df['content'][i]) for i in range(0, len(df))]
    tokens = list(itertools.chain(*tokens)) # list of lists to list
    
    nostop = [i for i in sample_tokens if i not in stopwords]

    stemmed = [stemmer.stem(word) for word in nostop]

In [216]:
pre_processing(sample)

In [184]:
##### tokenize #####
import nltk
sample_tokens = [nltk.tokenize.word_tokenize(sample['content'][i]) for i in range(0, len(sample))]
sample_tokens = list(itertools.chain(*sample_tokens)) # list of lists to list

In [185]:
##### stopwords #####
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop = stopwords.words('danish')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/dana_tiger/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [186]:
sample_nostop = [i for i in sample_tokens if i not in stop]

In [194]:
###### stemming ######
stemmer = SnowballStemmer("danish")
sample_stemmed = [stemmer.stem(word) for word in sample_nostop]

In [195]:
print(len(sample_tokens))
print(len(sample_nostop))
print(len(sample_stemmed))

903
508
508


# Bag of words

In [188]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
def BoW(df): 
    count = CountVectorizer()
    df_array = df['content'].values[0:2] 
    bag = count.fit_transform(df_array)
    
    count_array = bag.toarray() #Make the bag to an array
    matrix = pd.DataFrame(data=count_array,columns = count.get_feature_names()) #Input the bag and the words into a dataframe
    
    return matrix
    

In [189]:
count = CountVectorizer() #Store the class in 'count' to ease coding

sample_array = sample['content'].values[0:2] #Take the first two rows and store them in an array
bag = count.fit_transform(sample_array) #fit_transform takes an array as input and outputs the bag of words

In [190]:
count_array = bag.toarray() #Make the bag to an array
matrix = pd.DataFrame(data=count_array,columns = count.get_feature_names()) #Input the bag and the words into a dataframe
matrix

Unnamed: 0,af,afgørende,afholder,alle,alligevel,alt,altså,anderledes,andet,anni,...,virkelig,virksomheder,viser,vist,vores,waste,what,år,økonomiske,ønsker
0,6,0,1,0,1,2,1,1,1,5,...,1,1,1,1,2,1,1,2,1,1
1,2,1,0,1,0,1,0,0,0,0,...,0,0,1,0,1,0,0,1,0,0


# 2-gram

In [219]:
def two_gram(df):
    count = CountVectorizer(ngram_range=(2,2)) #Choose only 2-grams

    df_array = df['content'].values[0:2]
    bag = count.fit_transform(df_array)

    count_array = bag.toarray() #Make the bag to an array
    matrix = pd.DataFrame(data=count_array,columns = count.get_feature_names()) #Input the bag and the words into a dataframe
    
    return matrix

In [191]:
count = CountVectorizer(ngram_range=(2,2)) #Choose only 2-grams

sample_array = df['content'].values[0:2]
bag = count.fit_transform(sample_array)

count_array = bag.toarray() #Make the bag to an array
matrix = pd.DataFrame(data=count_array,columns = count.get_feature_names()) #Input the bag and the words into a dataframe
matrix

Unnamed: 0,100 jobs,11 arbejdsgivere,154 januar,2009 til,2010 til,2011 det,2011 således,300 sygeplejersker,434 januar,af danske,...,vores dygtige,vores nordiske,vores sygeplejersker,waste for,what waste,år er,år har,år krise,økonomiske politik,ønsker et
0,1,1,0,1,0,0,1,1,0,0,...,1,1,0,1,1,1,1,0,1,1
1,0,0,1,0,1,1,0,0,1,2,...,0,0,1,0,0,0,0,1,0,0


# tf-idf

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer

def tfidf(df):
    tfidf = TfidfTransformer()
    bag_tfidf = tfidf.fit_transform(bag) 

    tfidf_array = bag_tfidf.toarray() #Make the bag to an array
    matrix_tfidf = pd.DataFrame(data=tfidf_array,columns = count.get_feature_names()) #Input the bag and the words into a dataframe
    matrix_tfidf



In [192]:
from sklearn.feature_extraction.text import TfidfTransformer
tfidf = TfidfTransformer() #Ease coding
bag_tfidf = tfidf.fit_transform(bag) #Compute the tf-idf score from the bag of words from before ('bag')

In [193]:
tfidf_array = bag_tfidf.toarray() #Make the bag to an array
matrix_tfidf = pd.DataFrame(data=tfidf_array,columns = count.get_feature_names()) #Input the bag and the words into a dataframe
matrix_tfidf

Unnamed: 0,100 jobs,11 arbejdsgivere,154 januar,2009 til,2010 til,2011 det,2011 således,300 sygeplejersker,434 januar,af danske,...,vores dygtige,vores nordiske,vores sygeplejersker,waste for,what waste,år er,år har,år krise,økonomiske politik,ønsker et
0,0.041818,0.041818,0.0,0.041818,0.0,0.0,0.041818,0.041818,0.0,0.0,...,0.041818,0.041818,0.0,0.041818,0.041818,0.041818,0.041818,0.0,0.041818,0.041818
1,0.0,0.0,0.082953,0.0,0.082953,0.082953,0.0,0.0,0.082953,0.165905,...,0.0,0.0,0.082953,0.0,0.0,0.0,0.0,0.082953,0.0,0.0
