# Analysis

In [50]:
import time
import tqdm
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
import seaborn as sns
import re
import lemmy # For lemmatization
import nltk
from nltk.stem import SnowballStemmer
import itertools

from sklearn.feature_extraction.text import CountVectorizer

In [13]:
import nltk

# Make sure the stop-word corpus is available locally (no-op when already cached).
nltk.download('stopwords')

# Danish stop-word list and stemmer used by the pre-processing step.
# The corpus reader is accessed through the package path so that the name
# `stopwords` is only ever bound to the final list of words.
stopwords = nltk.corpus.stopwords.words('danish')
stemmer = SnowballStemmer("danish")

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/dana_tiger/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Load datasets

In [14]:
# Raw frames exactly as read from disk; these are left untouched.
ft_sygeplej2x, dr_sygeplej2x, tv2_sygeplej2x = (
    pd.read_csv('ft_sygeplej2x.csv'),
    pd.read_csv('dr_sygeplej2x.csv'),
    pd.read_csv('tv2_sygeplej2x.csv'),
)

# Independent working copies, so later steps never modify the raw frames.
ft, dr, tv2 = ft_sygeplej2x.copy(), dr_sygeplej2x.copy(), tv2_sygeplej2x.copy()

### Cleaning
- convert to lower case (the data is already lower-cased; `cleaner` does it again so the step is safe either way)
- remove non-alphanumeric characters
- remove numbers

In [16]:
def cleaner(document):
    """Normalize one document to lower-case letters and whitespace only.

    Steps, in order:
      1. lower-case the text,
      2. delete every character that is neither a word character nor
         whitespace, and also delete underscores,
      3. delete all digits.

    Characters are deleted, not replaced by a space, so ``"a,b"`` becomes
    ``"ab"`` while existing whitespace is kept as-is.

    Parameters
    ----------
    document : str or missing value
        The raw text. ``None`` and float ``NaN`` (how pandas represents an
        empty CSV cell) are treated as an empty document; any other
        non-string value is converted with ``str()`` first.

    Returns
    -------
    str
        The cleaned text.
    """
    if not isinstance(document, str):
        # NaN is the only float that is not equal to itself.
        if document is None or (isinstance(document, float) and document != document):
            return ''
        document = str(document)

    document = document.lower() #To lower case
    # \w counts '_' as a word character, so it has to be removed explicitly.
    document = re.sub(r'[^\w\s]|_', '', document) #Remove non-alphanumeric characters
    document = re.sub(r'\d', '', document) #Remove numeric characters
    return document

In [17]:
# apply cleaner
# NOTE(review): in the visible notebook `df` is first assigned in the
# "Apply functions" section further down (`df = sample1`), where this exact
# statement is repeated. Run top-to-bottom on a fresh kernel, this cell raises
# NameError; consider removing it or moving it below that assignment.
df['content'] = df['content'].apply(cleaner)

### Pre-processing
- Tokenize
- Remove stopwords
- Stemming

In [18]:
def pre_processing(df):
    """Tokenize, stop-word-filter and stem all text in ``df['content']``.

    The documents are tokenized one by one and the tokens are flattened
    into a single list (document boundaries are not kept). Tokens found in
    the module-level ``stopwords`` collection are dropped and the remaining
    tokens are stemmed with the module-level ``stemmer``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``content`` column of strings. The rows are read in
        order regardless of the index labels, so slices such as
        ``tv2[5:8]`` work too.

    Returns
    -------
    list of str
        Stemmed tokens of all documents, in document order.
    """
    # Iterate over the values themselves: df['content'][i] is a label lookup
    # and fails whenever the index is not exactly 0..len(df)-1.
    # NOTE(review): word_tokenize is called without a `language` argument, so
    # NLTK's default tokenizer model is used — confirm whether 'danish' is wanted.
    tokens = list(itertools.chain.from_iterable(
        nltk.tokenize.word_tokenize(document) for document in df['content']
    ))

    # A set gives constant-time membership tests for the stop-word filter.
    stop_set = set(stopwords)
    nostop = [token for token in tokens if token not in stop_set]

    stemmed = [stemmer.stem(word) for word in nostop]
    return stemmed

# Bag of words

In [20]:
def BoW(df):
    """Build a bag-of-words count matrix from ``df['content']``.

    A ``CountVectorizer`` with default settings is fitted on the column and
    the resulting sparse counts are converted to a dense DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``content`` column of strings.

    Returns
    -------
    pandas.DataFrame
        One row per document (default integer index), one column per
        vocabulary term, values are term counts.
    """
    count = CountVectorizer()
    bag = count.fit_transform(df['content'])

    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2; prefer get_feature_names_out() and fall back only on old releases.
    get_names = getattr(count, 'get_feature_names_out', None) or count.get_feature_names

    count_array = bag.toarray() #Make the bag to an array
    matrix = pd.DataFrame(data=count_array, columns=get_names())

    return matrix


# 2-gram

In [21]:
def two_gram(df):
    """Build a 2-gram count matrix from ``df['content']``.

    A ``CountVectorizer`` restricted to ``ngram_range=(2, 2)`` is fitted on
    the column and the resulting sparse counts are converted to a dense
    DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``content`` column of strings.

    Returns
    -------
    pandas.DataFrame
        One row per document (default integer index), one column per
        2-gram, values are counts.
    """
    count = CountVectorizer(ngram_range=(2,2)) #Choose only 2-grams
    bag = count.fit_transform(df['content'])

    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2; prefer get_feature_names_out() and fall back only on old releases.
    get_names = getattr(count, 'get_feature_names_out', None) or count.get_feature_names

    count_array = bag.toarray() #Make the bag to an array
    matrix = pd.DataFrame(data=count_array, columns=get_names()) #Input the bag and the words into a dataframe

    return matrix


# tf-idf

In [43]:
from sklearn.feature_extraction.text import TfidfTransformer

def tfidf(df):
    """Build a tf-idf matrix over the 2-grams of ``df['content']``.

    The column is first turned into 2-gram counts with a
    ``CountVectorizer(ngram_range=(2, 2))``; those counts are then
    re-weighted by a ``TfidfTransformer`` with default settings and
    converted to a dense DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``content`` column of strings.

    Returns
    -------
    pandas.DataFrame
        One row per document (default integer index), one column per
        2-gram, values are tf-idf weights.
    """
    ############################## bag #################################
    count = CountVectorizer(ngram_range=(2,2)) #Choose only 2-grams
    bag = count.fit_transform(df['content'])
    ############################## bag #################################

    # Named `transformer` so the local does not shadow this function's name.
    transformer = TfidfTransformer()
    bag_tfidf = transformer.fit_transform(bag)

    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2; prefer get_feature_names_out() and fall back only on old releases.
    get_names = getattr(count, 'get_feature_names_out', None) or count.get_feature_names

    tfidf_array = bag_tfidf.toarray() #Make the bag to an array
    matrix_tfidf = pd.DataFrame(data=tfidf_array, columns=get_names())
    return matrix_tfidf

# Apply functions

In [None]:
# Independent copy of the first three TV2 rows, used to try out the functions.
sample1 = tv2.iloc[:3].copy()

In [44]:
# `df` is a second name for the same object as `sample1` (no copy is made),
# so in-place changes made through `df` are visible through `sample1` too.
df = sample1

In [45]:
# Clean the text column in place; this overwrites the original content of `df`.
df['content'] = df['content'].apply(cleaner)

In [46]:
# Tokenize, drop Danish stop words and stem the cleaned sample text.
pre_processing(df)

In [51]:
# Bag-of-words count matrix for the sample (one row per document).
BoW(df)

Unnamed: 0,af,afgørende,afholder,alle,alligevel,alt,altså,anderledes,andet,anni,...,virkelig,virksomheder,viser,vist,vores,waste,what,år,økonomiske,ønsker
0,6,0,1,0,1,2,1,1,1,5,...,1,1,1,1,2,1,1,2,1,1
1,2,1,0,1,0,1,0,0,0,0,...,0,0,1,0,1,0,0,1,0,0


In [52]:
# 2-gram count matrix for the sample (one row per document).
two_gram(df)

Unnamed: 0,af danske,af de,af dem,af den,af sygeplejersker,af vores,afgørende at,afholder messen,alle de,alligevel låner,...,vores dygtige,vores nordiske,vores sygeplejersker,waste for,what waste,år er,år har,år krise,økonomiske politik,ønsker et
0,0,1,1,2,1,1,0,1,0,1,...,1,1,0,1,1,1,1,0,1,1
1,2,0,0,0,0,0,1,0,1,0,...,0,0,1,0,0,0,0,1,0,0


In [53]:
# tf-idf weights over the sample's 2-grams (one row per document).
tfidf(df)

Unnamed: 0,af danske,af de,af dem,af den,af koreansk,af skræmte,af sygeplejersker,af vores,affyre mange,affyret omkring,...,what waste,åbnede ild,år er,år han,år har,år krise,øjenvidne fortæller,øjenvidne til,økonomiske politik,ønsker et
0,0.0,0.042007,0.042007,0.084014,0.0,0.0,0.042007,0.042007,0.0,0.0,...,0.042007,0.0,0.042007,0.0,0.042007,0.0,0.0,0.0,0.042007,0.042007
1,0.166289,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.083144,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.058964,0.058964,0.0,0.0,0.058964,0.058964,...,0.0,0.058964,0.0,0.058964,0.0,0.0,0.058964,0.058964,0.0,0.0
