# Analysis

In [11]:
import time
import tqdm
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
import seaborn as sns
import re
import lemmy # For lemmatization
import nltk
from nltk.stem import SnowballStemmer
import itertools
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer

In [12]:
import nltk
nltk.download('stopwords')

from nltk.corpus import stopwords
stopwords = stopwords.words('danish')
stemmer = SnowballStemmer("danish")

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/dana_tiger/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Load datasets

In [13]:
ft_sygeplej2x = pd.read_csv('ft_sygeplej2x.csv')
dr_sygeplej2x = pd.read_csv('dr_sygeplej2x.csv')
tv2_sygeplej2x = pd.read_csv('tv2_sygeplej2x.csv')

ft = ft_sygeplej2x.copy() 
dr = dr_sygeplej2x.copy() 
tv2 = tv2_sygeplej2x.copy()

### Cleaning
- lower case (already done)
- remove non-alphanumeric characters
- remove numbers

In [14]:
def cleaner(document):
    document = document.lower() #To lower case
    document = re.sub(r'[^\w\s]','', document) #Remove non-alphanumeric characters
    document = re.sub(r'[^\D+]','', document) #Remove numeric characters
    return document

### Pre-processing
- Tokenize
- Remove stopwords
- Stemming

In [114]:
def pre_process(df): 
    tokens = [nltk.tokenize.word_tokenize(df['content'][i]) for i in range(0, len(df))]
    tokens = list(itertools.chain(*tokens)) # list of lists to list
    
    nostop = [i for i in tokens if i not in stopwords]

    stemmed = [stemmer.stem(word) for word in nostop]
    
    return stemmed

# Bag of words

In [16]:
def BoW(df): 
    count = CountVectorizer()
    df_array = df['content']
    bag = count.fit_transform(df_array)
    
    count_array = bag.toarray() #Make the bag to an array
    matrix = pd.DataFrame(data=count_array,columns = count.get_feature_names())
    
    return matrix


# 2-gram

In [17]:
def two_gram(df):
    count = CountVectorizer(ngram_range=(2,2)) #Choose only 2-grams

    df_array = df['content']
    bag = count.fit_transform(df_array)

    count_array = bag.toarray() #Make the bag to an array
    matrix = pd.DataFrame(data=count_array,columns = count.get_feature_names()) #Input the bag and the words into a dataframe
    
    return matrix


# tf-idf

In [18]:
from sklearn.feature_extraction.text import TfidfTransformer

def tfidf(df):
    ############################## bag #################################
    count = CountVectorizer(ngram_range=(2,2)) #Choose only 2-grams
    
    df_array = df['content']
    bag = count.fit_transform(df_array)
    ############################## bag #################################
    
    tfidf = TfidfTransformer()
    bag_tfidf = tfidf.fit_transform(bag) 

    tfidf_array = bag_tfidf.toarray() #Make the bag to an array
    matrix_tfidf = pd.DataFrame(data=tfidf_array,columns = count.get_feature_names())
    return matrix_tfidf

# Apply functions

In [19]:
sample1 = tv2[0:3].copy()

In [20]:
df = dr

In [147]:
df['content_cleaned'] = df['content'].apply(cleaner)

In [148]:
df

Unnamed: 0.1,Unnamed: 0,titles,sub_header,h2,content,author,tag,date,link,source,content_cleaned
0,0,danske sygeplejersker får job i norge,"krise, fyringer og ansættelsesstop får sygeple...",markant flere,antallet af danske sygeplejersker der har fået...,ritzau/,Penge,2012-01-19 13:27:00+00:00,https://www.dr.dk/nyheder/penge/danske-sygeple...,DR,antallet af danske sygeplejersker der har fået...
1,1,næsten ingen ledige sygeplejersker,trods fyringsrunder i både 2010 og 2011 er arb...,,trods fyringsrunder på sygehusene i både og ...,henny mortensen,Sjælland,2012-01-28 07:42:00+00:00,https://www.dr.dk/nyheder/regionale/sjaelland/...,DR,trods fyringsrunder på sygehusene i både og ...
2,5,regionsformand: jeg har ikke noget at undskylde,"steen bach nielsen forstår ikke, at sygeplejer...",,der er ikke noget at undskylde eller beklage f...,jørgen hansen,Sjælland,2012-01-24 15:39:00+00:00,https://www.dr.dk/nyheder/regionale/sjaelland/...,DR,der er ikke noget at undskylde eller beklage f...
3,14,sygeplejersker vil skære i nattevagter,risiko for at natarbejde er kræftfremkaldende ...,færre nattevagter fast døgnrytme er vigtig,dansk sygeplejeråd der repræsenterer landets o...,ritzau,Indland,2012-02-21 10:48:00+00:00,https://www.dr.dk/nyheder/indland/sygeplejersk...,DR,dansk sygeplejeråd der repræsenterer landets o...
4,15,udenlandske sygeplejersker er taget hjem,det er slut med sygeplejerudtryk på gebrokkent...,,det er slut med sygeplejerudtryk på gebrokkent...,mikkel from nielsen,Nordjylland,2012-02-27 15:26:00+00:00,https://www.dr.dk/nyheder/regionale/nordjyllan...,DR,det er slut med sygeplejerudtryk på gebrokkent...
...,...,...,...,...,...,...,...,...,...,...,...
523,1495,læger og sygeplejersker siger stop: 'kan ikke ...,"ifølge ansatte på holbæk sygehus, så er forhol...",besøg fra arbejdstilsynet klare krav,det er ikke kun sygeplejersker der flygter fra...,trine warrer juul,Sjælland,2021-12-06 04:55:00+00:00,https://www.dr.dk/nyheder/regionale/sjaelland/...,DR,det er ikke kun sygeplejersker der flygter fra...
524,1496,detektor: talsperson erkender - har ikke tal p...,talsperson for opsigelser blandt sygeplejerske...,debatindlæg som dokumenation sundhedsøkonom: '...,patienter ligger på gangene og dør fordi de i...,august stenbroen,Detektor,2021-12-22 19:39:00+00:00,https://www.dr.dk/nyheder/detektor/detektor-ta...,DR,patienter ligger på gangene og dør fordi de i...
525,1497,flere sygeplejersker skifter til det private: ...,der skal gøres noget ved løn og vilkår i det o...,fik hjertebanken sygeplejerskerne siger fra ’i...,højere løn og bedre arbejdstider det var det s...,allan nisgaard,Indland,2021-12-29 04:46:00+00:00,https://www.dr.dk/nyheder/indland/flere-sygepl...,DR,højere løn og bedre arbejdstider det var det s...
526,1498,nu skal sygeplejersker og læger fra hospitaler...,i både region midtjylland og region syddanmark...,frygter længere ventelister ansætter mange,meldingen fra sundhedsmyndighederne har været ...,emil eller,Indland,2021-12-15 04:50:00+00:00,https://www.dr.dk/nyheder/indland/nu-skal-syge...,DR,meldingen fra sundhedsmyndighederne har været ...


In [92]:
BoW(df)

Unnamed: 0,aabenraa,aagaard,aage,aahauge,aaholm,aalborg,aarhus,aarhusianske,aarhusrådmanden,aaskov,...,østre,øve,øvelse,øvelser,øver,øverst,øverste,øvet,øvrige,øvrigt
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
523,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
524,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
525,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
526,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [24]:
two_gram(df)

Unnamed: 0,aabenraa de,aabenraa er,aabenraa esbjerg,aabenraa har,aabenraa kommune,aabenraa sygehus,aagaard poulsen,aage grinderslev,aage madsen,aahauge fra,...,øvrige støttepartier,øvrige sundhedspersonale,øvrige sygeplejersker,øvrigt fremgik,øvrigt har,øvrigt ikke,øvrigt kommuner,øvrigt midlertidige,øvrigt prøver,øvrigt sker
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
523,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
524,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
525,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
526,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [25]:
tfidf(df)

Unnamed: 0,aabenraa de,aabenraa er,aabenraa esbjerg,aabenraa har,aabenraa kommune,aabenraa sygehus,aagaard poulsen,aage grinderslev,aage madsen,aahauge fra,...,øvrige støttepartier,øvrige sundhedspersonale,øvrige sygeplejersker,øvrigt fremgik,øvrigt har,øvrigt ikke,øvrigt kommuner,øvrigt midlertidige,øvrigt prøver,øvrigt sker
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
523,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
524,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
525,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
526,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Sentiment analysis


In [129]:
df = dr

In [130]:
df.content

0      antallet af danske sygeplejersker der har fået...
1      trods fyringsrunder på sygehusene i både  og  ...
2      der er ikke noget at undskylde eller beklage f...
3      dansk sygeplejeråd der repræsenterer landets o...
4      det er slut med sygeplejerudtryk på gebrokkent...
                             ...                        
523    det er ikke kun sygeplejersker der flygter fra...
524     patienter ligger på gangene og dør fordi de i...
525    højere løn og bedre arbejdstider det var det s...
526    meldingen fra sundhedsmyndighederne har været ...
527    det var et ønske om at se sine børn noget mere...
Name: content, Length: 528, dtype: object

### AFINN

In [131]:
from afinn import Afinn

In [132]:
afn = Afinn(emoticons=False)

for i, row in df.iterrows():
    print('Predicted Sentiment polarity: ', afn.score(row.content)) # AFINN polarity score

Predicted Sentiment polarity:  -3.0
Predicted Sentiment polarity:  0.0
Predicted Sentiment polarity:  0.0
Predicted Sentiment polarity:  0.0
Predicted Sentiment polarity:  -5.0
Predicted Sentiment polarity:  0.0
Predicted Sentiment polarity:  1.0
Predicted Sentiment polarity:  2.0
Predicted Sentiment polarity:  2.0
Predicted Sentiment polarity:  0.0
Predicted Sentiment polarity:  -3.0
Predicted Sentiment polarity:  -3.0
Predicted Sentiment polarity:  1.0
Predicted Sentiment polarity:  0.0
Predicted Sentiment polarity:  0.0
Predicted Sentiment polarity:  0.0
Predicted Sentiment polarity:  0.0
Predicted Sentiment polarity:  -2.0
Predicted Sentiment polarity:  0.0
Predicted Sentiment polarity:  0.0
Predicted Sentiment polarity:  0.0
Predicted Sentiment polarity:  0.0
Predicted Sentiment polarity:  -2.0
Predicted Sentiment polarity:  0.0
Predicted Sentiment polarity:  0.0
Predicted Sentiment polarity:  0.0
Predicted Sentiment polarity:  2.0
Predicted Sentiment polarity:  0.0
Predicted Sent

Predicted Sentiment polarity:  -1.0
Predicted Sentiment polarity:  -6.0
Predicted Sentiment polarity:  -3.0
Predicted Sentiment polarity:  -2.0
Predicted Sentiment polarity:  -5.0
Predicted Sentiment polarity:  0.0
Predicted Sentiment polarity:  -2.0
Predicted Sentiment polarity:  -4.0
Predicted Sentiment polarity:  0.0
Predicted Sentiment polarity:  -4.0
Predicted Sentiment polarity:  0.0
Predicted Sentiment polarity:  3.0
Predicted Sentiment polarity:  3.0
Predicted Sentiment polarity:  0.0
Predicted Sentiment polarity:  -8.0
Predicted Sentiment polarity:  -2.0
Predicted Sentiment polarity:  -4.0
Predicted Sentiment polarity:  2.0
Predicted Sentiment polarity:  -4.0
Predicted Sentiment polarity:  -4.0
Predicted Sentiment polarity:  0.0
Predicted Sentiment polarity:  -4.0
Predicted Sentiment polarity:  0.0
Predicted Sentiment polarity:  0.0
Predicted Sentiment polarity:  -4.0
Predicted Sentiment polarity:  1.0
Predicted Sentiment polarity:  4.0
Predicted Sentiment polarity:  -8.0
Pred

Predicted Sentiment polarity:  -2.0
Predicted Sentiment polarity:  2.0
Predicted Sentiment polarity:  -18.0
Predicted Sentiment polarity:  1.0
Predicted Sentiment polarity:  0.0
Predicted Sentiment polarity:  1.0
Predicted Sentiment polarity:  0.0
Predicted Sentiment polarity:  -4.0
Predicted Sentiment polarity:  -6.0
Predicted Sentiment polarity:  -7.0
Predicted Sentiment polarity:  -12.0
Predicted Sentiment polarity:  -8.0
Predicted Sentiment polarity:  -3.0
Predicted Sentiment polarity:  -2.0
Predicted Sentiment polarity:  -6.0
Predicted Sentiment polarity:  -3.0
Predicted Sentiment polarity:  -10.0
Predicted Sentiment polarity:  -7.0
Predicted Sentiment polarity:  -2.0
Predicted Sentiment polarity:  0.0
Predicted Sentiment polarity:  -2.0
Predicted Sentiment polarity:  -7.0
Predicted Sentiment polarity:  -2.0
Predicted Sentiment polarity:  -16.0
Predicted Sentiment polarity:  0.0
Predicted Sentiment polarity:  -2.0
Predicted Sentiment polarity:  0.0
Predicted Sentiment polarity:  -

In [133]:
afinn_preds = []
for i in df['content'].values:
    score = afn.score(i)
    if score <= 0:
        afinn_preds.append(0)
    else:
        afinn_preds.append(1)

In [134]:
afinn_preds # +/- afinn sentiment predictions

[0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,


### VADER

In [135]:
# neutral sentiment: (compound score > -0.05) and (compound score < 0.05)
# NOTE: The compound score is the one most commonly used for sentiment analysis by most researchers, including the authors.
# compound = Calling it a 'normalized, weighted composite score' is accurate.

In [136]:
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/dana_tiger/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [137]:
analyzer = SentimentIntensityAnalyzer()

for i, row in df.iterrows():
    # print("Content: ", row.review)
    print('Predicted Sentiment polarity: ', analyzer.polarity_scores(row.content))

Predicted Sentiment polarity:  {'neg': 0.025, 'neu': 0.975, 'pos': 0.0, 'compound': -0.4767}
Predicted Sentiment polarity:  {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}
Predicted Sentiment polarity:  {'neg': 0.0, 'neu': 0.984, 'pos': 0.016, 'compound': 0.3612}
Predicted Sentiment polarity:  {'neg': 0.0, 'neu': 0.99, 'pos': 0.01, 'compound': 0.34}
Predicted Sentiment polarity:  {'neg': 0.022, 'neu': 0.978, 'pos': 0.0, 'compound': -0.5859}
Predicted Sentiment polarity:  {'neg': 0.0, 'neu': 0.989, 'pos': 0.011, 'compound': 0.34}
Predicted Sentiment polarity:  {'neg': 0.0, 'neu': 0.985, 'pos': 0.015, 'compound': 0.2732}
Predicted Sentiment polarity:  {'neg': 0.0, 'neu': 0.987, 'pos': 0.013, 'compound': 0.25}
Predicted Sentiment polarity:  {'neg': 0.0, 'neu': 0.986, 'pos': 0.014, 'compound': 0.4767}
Predicted Sentiment polarity:  {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}
Predicted Sentiment polarity:  {'neg': 0.028, 'neu': 0.972, 'pos': 0.0, 'compound': -0.4767}
Predict

Predicted Sentiment polarity:  {'neg': 0.006, 'neu': 0.994, 'pos': 0.0, 'compound': -0.2263}
Predicted Sentiment polarity:  {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}
Predicted Sentiment polarity:  {'neg': 0.01, 'neu': 0.99, 'pos': 0.0, 'compound': -0.7184}
Predicted Sentiment polarity:  {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}
Predicted Sentiment polarity:  {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}
Predicted Sentiment polarity:  {'neg': 0.012, 'neu': 0.988, 'pos': 0.0, 'compound': -0.5574}
Predicted Sentiment polarity:  {'neg': 0.013, 'neu': 0.976, 'pos': 0.011, 'compound': -0.0772}
Predicted Sentiment polarity:  {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}
Predicted Sentiment polarity:  {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}
Predicted Sentiment polarity:  {'neg': 0.0, 'neu': 0.993, 'pos': 0.007, 'compound': 0.2732}
Predicted Sentiment polarity:  {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}
Predicted Sentiment polari

Predicted Sentiment polarity:  {'neg': 0.012, 'neu': 0.98, 'pos': 0.009, 'compound': -0.2263}
Predicted Sentiment polarity:  {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}
Predicted Sentiment polarity:  {'neg': 0.008, 'neu': 0.989, 'pos': 0.003, 'compound': -0.7003}
Predicted Sentiment polarity:  {'neg': 0.005, 'neu': 0.98, 'pos': 0.015, 'compound': 0.6808}
Predicted Sentiment polarity:  {'neg': 0.0, 'neu': 0.995, 'pos': 0.005, 'compound': 0.34}
Predicted Sentiment polarity:  {'neg': 0.004, 'neu': 0.993, 'pos': 0.003, 'compound': -0.0516}
Predicted Sentiment polarity:  {'neg': 0.005, 'neu': 0.995, 'pos': 0.0, 'compound': -0.5859}
Predicted Sentiment polarity:  {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}
Predicted Sentiment polarity:  {'neg': 0.025, 'neu': 0.969, 'pos': 0.006, 'compound': -0.802}
Predicted Sentiment polarity:  {'neg': 0.005, 'neu': 0.982, 'pos': 0.013, 'compound': 0.5106}
Predicted Sentiment polarity:  {'neg': 0.013, 'neu': 0.981, 'pos': 0.006, 'compoun

Predicted Sentiment polarity:  {'neg': 0.008, 'neu': 0.983, 'pos': 0.01, 'compound': 0.4588}
Predicted Sentiment polarity:  {'neg': 0.004, 'neu': 0.996, 'pos': 0.0, 'compound': -0.5574}
Predicted Sentiment polarity:  {'neg': 0.008, 'neu': 0.977, 'pos': 0.014, 'compound': 0.5106}
Predicted Sentiment polarity:  {'neg': 0.01, 'neu': 0.988, 'pos': 0.002, 'compound': -0.8176}
Predicted Sentiment polarity:  {'neg': 0.004, 'neu': 0.984, 'pos': 0.011, 'compound': 0.5719}
Predicted Sentiment polarity:  {'neg': 0.007, 'neu': 0.993, 'pos': 0.0, 'compound': -0.5859}
Predicted Sentiment polarity:  {'neg': 0.008, 'neu': 0.992, 'pos': 0.0, 'compound': -0.5574}
Predicted Sentiment polarity:  {'neg': 0.005, 'neu': 0.978, 'pos': 0.017, 'compound': 0.7964}
Predicted Sentiment polarity:  {'neg': 0.01, 'neu': 0.979, 'pos': 0.01, 'compound': -0.2263}
Predicted Sentiment polarity:  {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}
Predicted Sentiment polarity:  {'neg': 0.015, 'neu': 0.98, 'pos': 0.004, '

Predicted Sentiment polarity:  {'neg': 0.005, 'neu': 0.995, 'pos': 0.0, 'compound': -0.4019}
Predicted Sentiment polarity:  {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}
Predicted Sentiment polarity:  {'neg': 0.018, 'neu': 0.982, 'pos': 0.0, 'compound': -0.8885}
Predicted Sentiment polarity:  {'neg': 0.012, 'neu': 0.988, 'pos': 0.0, 'compound': -0.9217}
Predicted Sentiment polarity:  {'neg': 0.01, 'neu': 0.99, 'pos': 0.0, 'compound': -0.743}
Predicted Sentiment polarity:  {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}
Predicted Sentiment polarity:  {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}


In [142]:
vader_preds = []

for i in df['content'].values: #For each review compute the polarity score, and classify it as positive or negative
    score = analyzer.polarity_scores(i)["compound"]
    
    if score<=0:
        vader_preds.append(0)
    else:
        vader_preds.append(1)

In [139]:
vader_preds # +/- vader sentiment predictions

[0,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,


### Sentida

In [65]:
# %pip install sentida
# output = ["mean", "total", "by_sentence_mean", "by_sentence_total"]

In [72]:
from sentida import Sentida

In [152]:
analyzer = SentimentIntensityAnalyzer()
SV = Sentida()

for i, row in df.iterrows():
    print('Predicted Sentiment polarity: ', SV.sentida(row.content, 
                                            output = 'mean', 
                                            normal = True))

Predicted Sentiment polarity:  0.05740740740740732
Predicted Sentiment polarity:  0.01352920227920226
Predicted Sentiment polarity:  -0.01055124892334197
Predicted Sentiment polarity:  -0.006787148594377511
Predicted Sentiment polarity:  0.08029661016949143
Predicted Sentiment polarity:  0.06152777777777776
Predicted Sentiment polarity:  0.2614197530864197
Predicted Sentiment polarity:  0.12710826210826204
Predicted Sentiment polarity:  0.019557438794726878
Predicted Sentiment polarity:  0.07249999999999991
Predicted Sentiment polarity:  0.03697916666666668
Predicted Sentiment polarity:  0.0268719806763285
Predicted Sentiment polarity:  0.03653549382716046
Predicted Sentiment polarity:  0.005047619047619048
Predicted Sentiment polarity:  -0.07022471910112361
Predicted Sentiment polarity:  0.0964814814814815
Predicted Sentiment polarity:  0.036995192307692305
Predicted Sentiment polarity:  0.03639846743295018
Predicted Sentiment polarity:  0.07457264957264954
Predicted Sentiment polarit

Predicted Sentiment polarity:  0.07984477124183016
Predicted Sentiment polarity:  0.010207156308851174
Predicted Sentiment polarity:  0.08516071428571421
Predicted Sentiment polarity:  -0.023728813559322118
Predicted Sentiment polarity:  0.07706597222222225
Predicted Sentiment polarity:  0.13166829745596864
Predicted Sentiment polarity:  -0.013793572984749466
Predicted Sentiment polarity:  -0.06436056998557001
Predicted Sentiment polarity:  -0.000580374188621681
Predicted Sentiment polarity:  0.04269005847953207
Predicted Sentiment polarity:  -0.06226711560044886
Predicted Sentiment polarity:  -0.07425925925925925
Predicted Sentiment polarity:  0.04499884138570269
Predicted Sentiment polarity:  -0.06165532879818594
Predicted Sentiment polarity:  0.10096213066999592
Predicted Sentiment polarity:  0.12511530398322845
Predicted Sentiment polarity:  0.0966222222222222
Predicted Sentiment polarity:  -0.060414462081128786
Predicted Sentiment polarity:  0.04693316624895567
Predicted Sentiment

Predicted Sentiment polarity:  0.08878959276018095
Predicted Sentiment polarity:  0.054232195071010805
Predicted Sentiment polarity:  0.06110204610204603
Predicted Sentiment polarity:  0.0754400352733685
Predicted Sentiment polarity:  0.13173371647509582
Predicted Sentiment polarity:  0.04646666666666665
Predicted Sentiment polarity:  0.03666666666666657
Predicted Sentiment polarity:  0.04754320987654314
Predicted Sentiment polarity:  0.11501133786848072
Predicted Sentiment polarity:  0.18124388539482883
Predicted Sentiment polarity:  0.024661134163208778
Predicted Sentiment polarity:  -0.0053589743589743805
Predicted Sentiment polarity:  0.10345403439153454
Predicted Sentiment polarity:  0.04605363984674327
Predicted Sentiment polarity:  -0.08502583979328147
Predicted Sentiment polarity:  0.1044363341443633
Predicted Sentiment polarity:  0.10368356180856177
Predicted Sentiment polarity:  0.10184422657952073
Predicted Sentiment polarity:  0.020844444444444513
Predicted Sentiment polari

Predicted Sentiment polarity:  0.004567222900556185
Predicted Sentiment polarity:  0.11608653265227677
Predicted Sentiment polarity:  0.05636746031746023
Predicted Sentiment polarity:  -0.00788562091503273
Predicted Sentiment polarity:  0.010270231880970088
Predicted Sentiment polarity:  -0.01694810629838882
Predicted Sentiment polarity:  0.037083211607751375
Predicted Sentiment polarity:  0.07532025844525836
Predicted Sentiment polarity:  0.028956390116183874
Predicted Sentiment polarity:  -0.05694251810756671
Predicted Sentiment polarity:  0.04319767441860459
Predicted Sentiment polarity:  0.026329761904761825
Predicted Sentiment polarity:  -0.019666125979994665
Predicted Sentiment polarity:  -0.02450454840805727
Predicted Sentiment polarity:  -0.01107615268329556
Predicted Sentiment polarity:  0.11194773825208605
Predicted Sentiment polarity:  -0.004283580815838905
Predicted Sentiment polarity:  0.06655073461891639
Predicted Sentiment polarity:  -0.013153825995807148
Predicted Senti

In [149]:
sentida_preds = []

for i in df['content'].values:    #For each review compute the polarity score
    score = SV.sentida(i, output = 'mean')
    
    if score<=0:                  #classify it as positive or negative
        sentida_preds.append(0)
    else:
        sentida_preds.append(1)

In [None]:
sentida_preds

##### Print sentiment results

In [150]:
#print(afinn_preds)
#print(vader_preds)
#print(sentida_preds)

##### Average function

In [85]:
def average(lst):
    return sum(lst) / len(lst)

In [86]:
average(sentida_preds)

0.7518939393939394

# Topic modelling

In [106]:
from sklearn.feature_extraction.text import CountVectorizer
count = CountVectorizer(stop_words='english', max_df=0.1, max_features=5000)
bag = count.fit_transform(df['content'].values)

In [107]:
from sklearn.decomposition import LatentDirichletAllocation
lda = LatentDirichletAllocation(n_components=5,random_state=123) 
review_topics = lda.fit_transform(bag)

In [108]:
n_top_words = 5
word_names = count.get_feature_names()
for topic_idx, topic in enumerate(lda.components_): 
    print("Topic %d:" % (topic_idx + 1))
    print(" ".join([word_names[i]
    for i in topic.argsort()\
        [:-n_top_words - 1:-1]]))

Topic 1:
regeringens møller milliarder psykiatrien mette
Topic 2:
hjem lægen aalborg sygeplejersken sagen
Topic 3:
pristed luca arbejdsretten steenberg dorte
Topic 4:
strejken konflikten konflikt nej arbejdsgiverne
Topic 5:
norge vikarer min afsnit nyuddannede


### Sentida function

In [140]:
def sentida(df): 
    sentida_preds = []

    analyzer = SentimentIntensityAnalyzer()
    SV = Sentida()

    for i, row in df.iterrows():
        sentida_preds.append(SV.sentida(row.content, output = 'mean', normal = True))
        
    return sentida_preds

In [145]:
len(sentida(dr))

528

In [153]:
sentida_preds

[0.05740740740740732,
 0.01352920227920226,
 -0.01055124892334197,
 -0.006787148594377511,
 0.08029661016949143,
 0.06152777777777776,
 0.2614197530864197,
 0.12710826210826204,
 0.019557438794726878,
 0.07249999999999991,
 0.03697916666666668,
 0.0268719806763285,
 0.03653549382716046,
 0.005047619047619048,
 -0.07022471910112361,
 0.0964814814814815,
 0.036995192307692305,
 0.03639846743295018,
 0.07457264957264954,
 0.0397515527950309,
 -0.01803418803418807,
 0.03227124183006524,
 0.15990740740740733,
 0.12623306233062329,
 0.11091194968553453,
 -0.023533853069251306,
 -0.04389558232931724,
 0.1590543735224587,
 0.0412191358024692,
 0.13618518518518513,
 0.021195652173913126,
 0.19254458161865567,
 -0.09031914893617014,
 -0.07017094017094026,
 0.018898809523809502,
 0.14179020664869732,
 0.06599206349206346,
 0.0868484848484849,
 -0.019263285024154624,
 0.034536019536019515,
 0.013194444444444424,
 0.040812547241118595,
 0.02525252525252522,
 0.08749999999999993,
 0.1628472222222222

In [211]:
dr['sentiment'] = sentida_preds # create sentiment column

In [256]:
dr.sort_value['sentiment']

AttributeError: 'DataFrame' object has no attribute 'sort_value'

# Groupby

In [242]:
dr.date[0]

Timestamp('2012-01-19 13:27:00+0000', tz='UTC')

In [259]:
sentiment_grouped = dr.groupby([dr.date.dt.year, dr.date.dt.month])['sentiment'].mean()

In [260]:
sentiment_grouped

date  date
2012  1       0.020128
      2       0.104713
      3       0.033792
      4       0.010859
      5       0.106137
                ...   
2021  8       0.026101
      9       0.023068
      10      0.032652
      11      0.011349
      12      0.063392
Name: sentiment, Length: 115, dtype: float64

In [239]:
dr['date'] = pd.to_datetime(dr['date'])

In [240]:
type(dr.date[0])

pandas._libs.tslibs.timestamps.Timestamp

In [None]:
df.groupby(df.your_date_column.dt.month)['values_column'].sum()