# Presprocessing Speeches

## Loading packages

In [None]:
import pandas as pd
import altair as alt
import numpy as np
import datetime as dt

## Importing data

In [None]:
# Reading pickle
speeches = pd.read_pickle('/Users/frederiklange/Documents/Statskundskab /Kandidat/Speciale/kode/data/ft_speeches.pkl')

## Checking data

In [None]:
# First five rows
speeches

In [None]:
# Overview
speeches.info()

In [None]:
speeches.loc[speeches['name']=='Uffe Elbæk']

## Speeches per year: Before preprocessing

In [None]:
# Grouping data by year
speeches_yearly = speeches.groupby(speeches['date'].dt.year)['text'].agg(['count'])

# Checking grouped data
speeches_yearly

# Adding year column
speeches_yearly['year'] = speeches_yearly.index

# Dropping wrong year
speeches_yearly = speeches_yearly[speeches_yearly['year'] < 2021]

# Turning into interger and then string
speeches_yearly['year'] = speeches_yearly['year'].astype(int).astype(str)

In [None]:
# Pltotting the speeches
alt.Chart(speeches_yearly).mark_bar().encode(
    x=alt.X('year', title=''),
    y=alt.Y('count', title='Antal taler'),
    tooltip = 'count'
).interactive().configure_mark(opacity=0.8,color='#00BFA5')



## Average words per speech per year: Before preprocessing

In [None]:
# Creating column with number of words per speech
speeches['no_words'] = speeches['text'].str.count(' ') + 1

In [None]:
# Grouping data by year
speeches_yearly_no_words = speeches.groupby(speeches['date'].dt.year)['no_words'].agg(['mean'])

# Checking grouped data
speeches_yearly_no_words

# Adding year column
speeches_yearly_no_words['year'] = speeches_yearly_no_words.index

# Dropping wrong year
speeches_yearly_no_words = speeches_yearly_no_words[speeches_yearly_no_words['year'] < 2021]

# Turning into interger and then string
speeches_yearly_no_words['year'] = speeches_yearly_no_words['year'].astype(int).astype(str)

In [None]:
# Pltotting the speeches
alt.Chart(speeches_yearly_no_words).mark_bar().encode(
    x=alt.X('year', title=''),
    y=alt.Y('mean', title='Antal ord'),
    tooltip = 'mean'
).interactive().configure_mark(opacity=0.8,color='#00BFA5')



## Preprocesseing speeches

Steps:
1. Drop taler af formænd og næstformænd, som er formelle og faste (X)
2. Hvilke partier skal indgå? Dem som har eksisteret i tidsperioden (X)
3. Ekskludere partiuafhængige (X)
4. Drop alt tekst med navne og ministertitler (X)
5. Stemme tekst
6. Smid taler ud, der ikke har et normalt år (X)
7. Fjern punctuation mm.
8. Fjern stopord
9. Behold ord, der sagt mindst x antal gange
10. Fjern procedurale ord (X)
11. uni, bi and trigrams
12. TF-IDF

### Removing non-party members and ft-presidents

Inspecting parties and roles

In [None]:
# Parties
speeches_parties_yearly = speeches['date'].groupby([speeches.date.dt.year, speeches.party]).agg('count')

# Unstacking parties
speeches_parties_yearly_table = speeches_parties_yearly.unstack('party')

# Printing
speeches_parties_yearly_table

In [None]:
# Speeches per party
speeches['party'].value_counts()

In [None]:
# Number of speeches before removal of small parties
speeches.shape

Renaming Kristeligt Folkeparti til Kristendemokraterne

In [None]:
# Renaming kristendemokraterne
speeches['party'] = speeches['party'].replace({'KRF': 'KD'})

Removing smaller parties and non-party-members

In [None]:
# Removing parties
parties_to_remove = ['TS',
                     'PØU',
                     'DMI',
                     '4/10 05',
                     'RY',
                     'FF',
                     'FÆR',
                     'NQ',
                     'SP',
                     'T',
                     'TP',
                     'TF',
                     'JF',
                     'SIU',
                     'Pause',
                     'UFG',
                     'UP',
                     'IA',
                     'REG',
                     'MødeSlut',
                     'FRI',
                    ]

In [None]:
# Looping over roles
for party in parties_to_remove:
    speeches =  speeches[speeches['party']!=party]

In [None]:
# Number of speeches before removal of small parties
speeches.shape

Removing formal speakers

In [None]:
# Inspecting roles
speeches[speeches['role'].str.contains('formand', na = False)]

In [None]:
# Removing speakers
roles_to_remove = ['formand',
                   'aldersformanden',
                   'MødeSlut',
                   'Pause',
                   'Formanden ■',
                   '.. Formanden',
                   'Formanden v:v,',
                   'Formanden,',
                   'Formanden ^',
                   "Formanden: v>'<",
                   "Formanden i' -k: ■;; .'.vvv.y",
                   '", Formanden',
                   'Formanden v',
                   'Formanden l-',
                   'Formanden " s',
                   'Formanden I.',
                   'Formanden ;',
                   'Formanden —',
                   'Formanden >■> •:',
                   'Formanden 7',
                   'Formanden :',
                   'Formanden ^ ^',
                   'Formanden, ..--r-',
                   'Formanden i/i--:',
                   'Formanden r',
                   'Formanden -',
                   'Formanden i,: m',
                   "Formanden'",
                   'Formanden; ^',
                   'Formanden .ri/.r.-!-',
                   'v Formanden',
                   'Formanden -r.i, ..r.-?;■■■',
                   'Formanden .',
                   'Formanden „ _..s'
                  ]

In [None]:
# Looping over roles
for role in roles_to_remove:
    speeches =  speeches[speeches['role']!=role] 

In [None]:
# Number of speeches before removal of formal speekers
speeches.shape

Removing formal text bites 

In [None]:
# Nan texts
nan_texts = ['Talen er under udarbejdelse)',
             'Ordfører (Talen er under udarbejdelse)',
             '(Spørgsmålet er udgået, da det er taget tilbage af spørgeren).',
             'Finansministeren (Talen er under udarbejdelse)',
             '(Spørgsmålet er udgået af dagsordenen).',
             '(Punktet er udgået af dagsordenen).',
             'Ordfører (Talen er under udarbejdelse) (Talen er under udarbejdelse)',
             'Ja.',
             '(Spørgsmålet er udgået under henvisning til Folketingets forretningsordens § 20, stk. 5).',
             '(Spørgsmålet er overgået til skriftlig besvarelse).',
             '(Spørgsmålet er udgået efter aftale mellem ministeren og spørgeren).',
             'Nej.',
             'Jo.',
             'Ordfører for forslagsstillerne (Talen er under udarbejdelse)',
             '(Talen er under udarbejdelse) (Talen er under udarbejdelse) (Talen er under udarbejdelse)',
             'Selv tak.',
             'Jeg har ikke flere spørgsmål.',
             'Ordfører Liberal Alliance støtter også lovforslaget.',
             '(Spørgsmålet er udgået på grund af lydtekniske problemer).',
             '(Talen er under udarbejdelse) (Talen er under udarbejdelse)',
             'Tak for det.',
             'Ja, det kan jeg bekræfte.',
             'Det er noteret.',
             '(Talen er under udarbejdelse)',
             'Privatist (Talen er under udarbejdelse)',
             'Ordfører for forespørgerne (Talen er under udarbejdelse)',
             'Ordfører for forespørgerne (Talen er under udarbejdelse) (Talen er under udarbejdelse)',
             'Finansministeren (Talen er under udarbejdelse) (Talen er under udarbejdelse) (Talen er under udarbejdelse) (Talen er under udarbejdelse) (Talen er under udarbejdelse) (Talen er under udarbejdelse)'
            ]

In [None]:
# Making speeches to strings
speeches['text'] = speeches['text'].astype(str)


In [None]:
# Looping over texts
for nan_text in nan_texts:
    speeches = speeches[speeches['text']!=nan_text]

In [None]:
# Number of speeches before removal of formal text bites
speeches.shape

Removing NA dates

In [None]:
# Checking Nas for date
speeches['date'].isna().value_counts()

In [None]:
# Creating nan values
speeches['date'].replace(["NaN", 'NaT'], np.nan, inplace = True)

# Subsetting to remove NA's
speeches = speeches[speeches['date'].isna()==0]

In [None]:
#Speeches after NA date
speeches.shape

Removing NA parties

In [None]:
# Inspecting parties
na_parties_df = speeches[speeches['party'].isna()]

In [None]:
# Checking unique names
na_parties_df['name'].unique()

In [None]:
na_parties_df['name'].shape

In [None]:
# Create list with no parties
no_party = {'Lars Løkke Rasmussen': 'V',
 'Eva Kjer Hansen': 'V',
 'Claus Hjort Frederiksen': 'V',
 'Troels Lund Poulsen': 'V',
 'Karen Ellemann': 'V',
 'Inger Støjberg': 'V',
 'Birthe Rønn Hornbech': 'V',
 'Helge Sander': 'V',
 'Brian Mikkelsen': 'KF',
 'Lars Barfoed': 'KF',
 'Kristian Jensen': 'V',
 'Søren Gade': 'V',
 'Carina Christensen': 'KF',
 'Bertel Haarder': 'V',
 'Lene Espersen': 'KF',
 'Per Stig Møller': 'KF',
 'Connie Hedegaard': 'KF',
 'Jakob Axel Nielsen': 'KF',
 'Lykke Friis': 'V', 
 'Ulla Tørnæs': 'V',
 'Gitte Lillelund Bech': 'V',
 'Benedikte Kiær': 'KF',
 'Tina Nedergaard': 'V',
 'Charlotte Sahl-Madsen': 'KF',
 'Henrik Høegh': 'V',
 'Hans Christian Schmidt': 'V',
 'Søren Pind': 'V',
 'Peter Christensen': 'V',
 'Thor Möger Pedersen': 'S',
 'Martin Lidegaard': 'RV',
 'Karen Hækkerup': 'S',
 'Bjarne Corydon': 'S',
 'Henrik Dam Kristensen': 'S',
 'Mette Frederiksen': 'S',
 'Astrid Krag': 'S',
 'Manu Sareen': 'RV',
 'Christian Friis Bach': 'S',
 'Uffe Elbæk': 'RV',
 'Ole Sohn': 'S',
 'Morten Bødskov': 'S',
 'Helle Thorning-Schmidt': 'S',
 'Margrethe Vestager': 'RV',
 'Villy Søvndal': 'SF',
 'Morten Østergaard': 'RV',
 'Carsten Hansen': 'S',
 'Christine Antorini': 'S',
 'Pia Olsen Dyhr': 'SF',
 'Ida Auken': 'RV', 
 'Mette Gjerskov': 'S',
 'Nicolai Wammen': 'S',
 'Nick Hækkerup': 'S',
 'Annette Vilhelmsen': 'SF',
 'Holger K. Nielsen': 'SF',
 'Marianne Jelved': 'RV',
 'Henrik Sass Larsen': 'S',
 'Rasmus Helveg Petersen': 'RV',
 'Jonas Dahl': 'SF',
 'Dan Jørgensen': 'S',
 'Sofie Carsten Nielsen': 'RV',
 'Magnus Heunicke': 'S',
 'Kirsten Brosbøl': 'S',
 'Mogens Jensen': 'S',
 'Benny Engelbrecht': 'S',
 'Karsten Lauritzen': 'V',
 'Jørn Neergaard Larsen': 'V',
 'Ellen Trane Nørby': 'V',
 'Sophie Løhde': 'V',
 'Esben Lunde Larsen': 'V',
 'Lars Christian Lilleholt': 'V',
 'Mai Mercado': 'KF',
 'Simon Emil Ammitzbøll': 'KF',
 'Søren Pape Poulsen': 'KF',
 'Merete Riisager': 'LA',
 'Ole Birk Olesen': 'LA',
 'Thyra Frank': 'LA',
 'Anders Samuelsen': 'LA',
 'Mette Bock': 'LA',
 'Simon Emil Ammitzbøll-Bille': 'LA',
 'Jakob Ellemann-Jensen': 'V',
 'Tommy Ahlers': 'V',
 'Rasmus Jarlov': 'KF',
 'Kaare Dybvad': 'S',
 'Peter Hummelgaard Thomsen': 'S',
 'Jeppe Kofod': 'S',
 'Pernille Rosenkrantz-Theil': 'S',
 'Ane Halsboe-Jørgensen': 'S',
 'Simon Kollerup': 'S',
 'Mattias Tesfaye': 'S',
 'Lea Wermelin': 'S',
 'Trine Bramsen': 'S',
 'Rasmus Prehn': 'S',
 'Peter Hummelgaard': 'S',
 'Kaare Dybvad Bek': 'S',
 'Joy Mogensen': 'S',
 'Flemming Møller Mortensen': 'S'}

In [None]:
# Replace party values
for key, value in no_party.items():
    speeches.loc[speeches.name == key, 'party'] = value

In [None]:
# Efter erstatning partier
speeches.shape

NA Role

In [None]:
# Inspecting parties
na_role_df = speeches[speeches['role'].isna()]
na_role_df

In [None]:
# Dropping NA dates
speeches = speeches[speeches['role'].isna()==0]

In [None]:
# Number of speeches after role is removes
speeches.shape

NA navne

In [None]:
speeches[speeches.name.isna()]

Creating full text version of speech

In [None]:
speeches_temp = speeches

In [None]:
speeches_temp['full_text'] = speeches['text']

Remove names and roles from speeches

In [None]:
# Creating df of roles
roles = speeches.role.value_counts().rename_axis('unique_values').reset_index(name='counts')

# Creating list of roles
roles = roles['unique_values']

# Creating list of names
names = speeches['name'].unique()

In [None]:
# Concatenating to one list
roles_names = list(roles) + list(names)

In [None]:
roles_names = list(map(str, roles_names))

In [None]:
roles_names_lower = [x.lower() for x in roles_names]

In [None]:
speeches

In [None]:
# adding a space in front of each word
roles_names = [f' {x} ' for x in roles_names]

In [None]:
x = 0
# Loop over words and remove from all speeches
for name_role in roles_names:
    x += 1
    try:
        speeches['text'] = speeches['text'].str.replace(name_role, '  ')
    except:
        pass
    print(f'done with {x}/2500')

In [None]:
speeches.shape

#### Lower case

In [None]:
speeches["text"] = speeches["text"].str.lower()

Remove parties and shorts

In [None]:
speeches['party'].value_counts()

In [None]:
parties_short = [' socialedemokratiet ',
                 ' socialedemokraterne ',
                 ' socialedemokraternes ',
                 ' socialdemokratiets ',
                 ' s ',
                 ' socialistiske folkeparti ',
                 ' sf ',
                 ' sfs ',
                 ' det socialistiske folkeparti ',
                 ' socialistisk folkeparti ',
                 ' socialtisk folkepartis ',
                 ' enhedslisten ',
                 ' enhedslistens ',
                 ' el ',
                 ' els ',
                 ' e ',
                 ' det radikale venstre ',
                 ' radikale venstre ',
                 ' radikale venstres ',
                 ' det radikale ventres ',
                 ' rv ',
                 ' rvs ',
                 ' venstre ',
                 ' v ',
                 ' vs ',
                 ' venstres ',
                 ' df ',
                 ' dfs ',
                 ' dansk folkeparti ',
                 ' dansk folkepartis ',
                 ' konservative ',
                 ' det konservative folkeparti ',
                 ' konsevatives ',
                 ' det konservative folkepartis ',
                 ' k ',
                 ' ks ',
                 ' la ',
                 ' las ',
                 ' liberal alliance ',
                 ' liberal alliances ',
                 ' fremskridtspartiet ',
                 ' fremskridtspartiets',
                 ' fp ',
                 ' fps ',
                 ' kristendemokraterne ',
                 ' kristendermokraternes ',
                 ' kristeligt folkeparti',
                 ' kristeligt folkepartis ',
                 ' alternativet ',
                 ' alternativets ',
                 ' centrumdemokraterne ',
                 ' ventrumdemokraternes ',
                 ' nye borgerlige ',
                 ' nye borgerliges ',
                 ' nb ',
                 ' nbs ',
                 ' rød blok ',
                 ' blå blok ',
                 ' oppositionen ',
                 ' regeringen ',
                 ' de blå ',
                 ' de røde '
                ]

In [None]:
speeches['text'].str.contains(' det radikale venstre ').value_counts()

In [None]:
test = speeches
test['text'] = speeches['text'].str.replace(' det radikale venstre ', ' ')
test['text'].str.contains(' det radikale venstre ').value_counts()

In [None]:
speeches

In [None]:
x = 0
# Loop over words and remove from all speeches
for party in parties_short:
    x += 1
    try:
        speeches['text'] = speeches['text'].str.replace(party, '  ')
    except:
        pass
    print(f'done with {x}')

In [None]:
speeches

### Checking Dataframe before text as data

In [None]:
# Checking entire df
speeches

In [None]:
# Missing data for navne, som i stedet er angivet som rolle
speeches[speeches['name'].isna()==True]

## Text as data: presprocessing

#### Packages

In [1]:
# NLTK
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

# for vectorization 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
import scipy.sparse

#other packages
import re
import string
from nltk.corpus import stopwords          # module for stop words that come with NLTK
from nltk.stem import SnowballStemmer  
import pandas as pd
import numpy as np

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/frederiklange/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/frederiklange/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/frederiklange/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
%pip install --upgrade pandas
import pandas as pd

In [2]:
speeches = pd.read_pickle("all_speeches.pkl")

#### Punctuation

In [4]:
# Punctuation that needs to be removed
PUNCT_TO_REMOVE = string.punctuation

In [5]:
# Creating function to remove punctuation
def remove_punctuation(text):
    """custom function to remove the punctuation"""
    return text.translate(str.maketrans('', '', PUNCT_TO_REMOVE))

In [6]:
# Removing punctuation
speeches["text"] = speeches["text"].apply(lambda text: remove_punctuation(text))

#### Remove parties

In [None]:
party_names_roles = list(roles_names_lower) + list(parties_short)

In [None]:
party_names_roles = [word.strip() for word in party_names_roles]

In [None]:
# Creating function to remove stopwords
def remove_parties_names_roles(text):
    """custom function to remove the stopwords"""
    return " ".join([word for word in str(text).split() if word not in party_names_roles])

In [None]:
# Removing stopwords
speeches["text"] = speeches["text"].apply(lambda text: remove_parties_names_roles(text))

#### Stop words

In [7]:
STOPWORDS = ['ad',
'af',
'aldrig',
'alene',
'alle',
'allerede',
'alligevel',
'alt',
'altid',
'anden',
'andet',
'andre',
'at',
'bag',
'bare',
'begge',
'bl.a.',
'blandt',
'blev',
'blive',
'bliver',
'burde',
'bør',
'ca.',
'da',
'de',
'dem',
'den',
'denne',
'dens',
'der',
'derefter',
'deres',
'derfor',
'derfra',
'deri',
'dermed',
'derpå',
'derved',
'det',
'dette',
'dig',
'din',
'dine',
'disse',
'dit',
'dog',
'du',
'efter',
'egen',
'ej',
'eller',
'ellers',
'en',
'end',
'endnu',
'ene',
'eneste',
'enhver',
'ens',
'enten',
'er',
'et',
'f.eks.',
'far',
'fem',
'fik',
'fire',
'flere',
'flest',
'fleste',
'for',
'foran',
'fordi',
'forrige',
'fra',
'fx',
'få',
'får',
'før',
'først',
'gennem',
'gjorde',
'gjort',
'god',
'godt',
'gør',
'gøre',
'gørende',
'ham',
'han',
'hans',
'har',
'havde',
'have',
'hej',
'hel',
'heller',
'helt',
'hen',
'hende',
'hendes',
'henover',
'her',
'herefter',
'heri',
'hermed',
'herpå',
'hos',
'hun',
'hvad',
'hvem',
'hver',
'hvilke',
'hvilken',
'hvilkes',
'hvis',
'hvor',
'hvordan',
'hvorefter',
'hvorfor',
'hvorfra',
'hvorhen',
'hvori',
'hvorimod',
'hvornår',
'hvorved',
'i',
'igen',
'igennem',
'ikke',
'imellem',
'imens',
'imod',
'ind',
'indtil',
'ingen',
'intet',
'ja',
'jeg',
'jer',
'jeres',
'jo',
'kan',
'kom',
'komme',
'kommer',
'kun',
'kunne',
'lad',
'langs',
'lav',
'lave',
'lavet',
'lidt',
'lige',
'ligesom',
'lille',
'længere',
'man',
'mand',
'mange',
'med',
'meget',
'mellem',
'men',
'mens',
'mere',
'mest',
'mig',
'min',
'mindre',
'mindst',
'mine',
'mit',
'mod',
'må',
'måske',
'ned',
'nej',
'nemlig',
'ni',
'nogen',
'nogensinde',
'noget',
'nogle',
'nok',
'nu',
'ny',
'nyt',
'når',
'nær',
'næste',
'næsten',
'og',
'også',
'okay',
'om',
'omkring',
'op',
'ordfører',
'ordføreren',             
'os',
'otte',
'over',
'overalt',
'pga.',
'på',
'samme',
'sammen',
'se',
'seks',
'selv',
'selvom',
'senere',
'ser',
'ses',
'siden',
'sig',
'sige',
'sin',
'sine',
'sit',
'skal',
'skulle',
'som',
'stadig',
'stor',
'store',
'synes',
'syntes',
'syv',
'så',
'sådan',
'således',
'tag',
'tage',
'tak',
'temmelig',
'thi',
'ti',
'tidligere',
'til',
'tilbage',
'tit',
'to',
'tre',
'ud',
'uden',
'udover',
'under',
'undtagen',
'var',
'ved',
'vi',
'via',
'vil',
'ville',
'vor',
'vore',
'vores',
'vær',
'være',
'været',
'øvrig',
'hr',
'fru',
'minist',
'altså',
'gerne',
'men',
'sig',
'tror',
'giv',
'går',
'tag',
'find',
'sid',
'brug',
'selvfølgelig',
'rigtig',
'tror',
'spørgsmål',
'forslag',
'år',
'muligvis',
'kr',
'ind',
'ønske',
'spørge'
]

In [8]:
# Creating function to remove stopwords
def remove_stopwords(text):
    """custom function to remove the stopwords"""
    return " ".join([word for word in str(text).split() if word not in STOPWORDS])

In [9]:
# Removing stopwords
speeches["text"] = speeches["text"].apply(lambda text: remove_stopwords(text))

In [None]:
speeches

#### Stemming

In [10]:
# Defining Stemmer
stemmer = SnowballStemmer("danish")

In [11]:
# Creating stemming function
def stem_words(text):
    return " ".join([stemmer.stem(word) for word in text.split()])

In [12]:
# stemming words
speeches["text"] = speeches["text"].apply(lambda text: stem_words(text))

#### Removing numbers

In [13]:
# Removing numbers
speeches['text'] = speeches['text'].str.replace('\d+', '')

  speeches['text'] = speeches['text'].str.replace('\d+', '')


In [16]:
speeches['text']

0          verd fjor europa anderled folketingsår indled ...
1          står midt betydningsfuld forjæt forandring øst...
2          men høj grad land tag medansvar verdensudvikli...
3          beundringsværd energi viljestyrk nye regering ...
4          afgør europa fremtid lyk nye tid baltikumpolit...
                                 ...                        
1819742    forkert del stem person grundlov spæd barndom ...
1819743    spørgsmål giv anledning oplev stort problem in...
1819744    retssikkerhedsmæs korrek sæt grundlov kraft si...
1819745    følg mærk idé sæt grundlov kraft begynd regul ...
1819746    interessant diskussion rejs står grundlov skat...
Name: text, Length: 1819747, dtype: object

In [None]:
speeches.to_pickle('', protocol=4)

In [23]:
speeches['no_words_after']

0          17
1          34
2          29
3          32
4          23
           ..
1819742    17
1819743    35
1819744    20
1819745    10
1819746    23
Name: no_words_after, Length: 1819747, dtype: int64

In [24]:
speeches_2 = pd.read_pickle('all_speeches.pkl')

In [29]:
speeches['full_text'] = speeches_2['text']


#### Creating group variables

Creating groups

In [None]:
# Possible parties
parties = speeches['party'].unique()

# Print parties
parties

Blokke

In [None]:
# Blokke
red = ['S', 'RV', 'SF', 'EL', 'ALT']
blue = ['KF', 'V', 'CD', 'FP', 'KD', 'DF', 'LA', 'NB']

In [None]:
# Creating bloc variable
speeches.loc[speeches['party'].isin(red), "blok"] = 0
speeches.loc[speeches['party'].isin(blue), "blok"] = 1

Parti mod anden blok

In [None]:
# Making seperate dataset for each bloc
red_speeches = speeches.loc[speeches['blok'] == 0]
blue_speeches = speeches.loc[speeches['blok'] == 1]

In [None]:
# Printing dfs
print(red_speeches.shape)
print(blue_speeches.shape)

Rød med dummies

In [None]:
# Mod egen blok
red_dummies = pd.get_dummies(red_speeches["party"])

# Adding dummies to df
red_speeches = red_speeches.join(red_dummies)

# Checking new DF
red_speeches

Blå med dummies

Parti mod egen blok

In [None]:
# Mod egen blok
blue_dummies = pd.get_dummies(blue_speeches["party"])

# Adding dummies to df
blue_speeches = blue_speeches.join(blue_dummies)

# Checking new DF
blue_speeches

Parti mod resten

In [None]:
# Mod egen resten
dummies = pd.get_dummies(speeches["party"])

# Adding dummies to df
speeches = speeches.join(dummies)

# Checking new DF
speeches

Calculating number words after preprocessing

In [14]:
speeches['no_words_after'] = speeches['text'].str.count(' ') + 1

#### TF-IDF

All speeches

In [None]:
# Initialize
vectorizer = TfidfVectorizer(ngram_range = (1,2), min_df=200, max_df=0.50)

# Transform
doc_vec = vectorizer.fit_transform(speeches["text"])

# Get feature names
feature_names = vectorizer.get_feature_names()

In [None]:
# Creating tfidf
tfidf = pd.DataFrame.sparse.from_spmatrix(doc_vec, columns=feature_names)

In [None]:
# Checking tdidf
tfidf

In [None]:
# Reset the index
speeches.reset_index(inplace=True)

# Reset index
tfidf.reset_index(inplace=True)

In [None]:
# Merging the two datasets
speeches_tfidf = pd.concat([speeches, tfidf], axis=1)

# Dropping index column
del speeches_tfidf['index']

In [None]:
# Check df
speeches_tfidf

In [None]:
# Pickle
speeches_tfidf.to_pickle("all_speeches.pkl")

Red speeches

In [None]:
# Initialize
vectorizer = TfidfVectorizer(ngram_range = (1,2), min_df=200, max_df=0.50)

# Transform
doc_vec = vectorizer.fit_transform(red_speeches["text"])

# Get feature names
feature_names = vectorizer.get_feature_names()

In [None]:
# Creating tfidf
red_tfidf = pd.DataFrame.sparse.from_spmatrix(doc_vec, columns=feature_names)

In [None]:
# Checking tdidf
red_tfidf

In [None]:
# Reset the index
red_speeches.reset_index(inplace=True)

# Reset index
red_tfidf.reset_index(inplace=True)

In [None]:
# Merging the two datasets
red_speeches_tfidf = pd.concat([red_speeches, red_tfidf], axis=1)

# Dropping index column
del red_speeches_tfidf['index']

In [None]:
# Check df
red_speeches_tfidf

In [None]:
# Pickle
red_speeches_tfidf.to_pickle("red_speeches.pkl")

Blue speeches

In [None]:
# Initialize
vectorizer = TfidfVectorizer(ngram_range = (1,2), min_df=200, max_df=0.50)

# Transform
doc_vec = vectorizer.fit_transform(blue_speeches["text"])

# Get feature names
feature_names = vectorizer.get_feature_names()

In [None]:
# Creating tfidf
blue_tfidf = pd.DataFrame.sparse.from_spmatrix(doc_vec, columns=feature_names)

In [None]:
# Checking tdidf
blue_tfidf

In [None]:
# Reset the index
blue_speeches.reset_index(inplace=True)

# Reset index
blue_tfidf.reset_index(inplace=True)

In [None]:
# Merging the two datasets
blue_speeches_tfidf = pd.concat([blue_speeches, blue_tfidf], axis=1)

# Dropping index column
del blue_speeches_tfidf['index']

In [None]:
# Check df
blue_speeches_tfidf

In [None]:
# Pickle
blue_speeches_tfidf.to_pickle("blue_speeches.pkl")

## Final plots

In [None]:
# Grouping data by year
speeches_yearly = speeches.groupby(speeches['date'].dt.year)['text'].agg(['count'])

# Checking grouped data
speeches_yearly

# Adding year column
speeches_yearly['year'] = speeches_yearly.index

# Dropping wrong year
speeches_yearly = speeches_yearly[speeches_yearly['year'] < 2021]

# Turning into interger and then string
speeches_yearly['year'] = speeches_yearly['year'].astype(int).astype(str)

In [None]:
# Pltotting the speeches
alt.Chart(speeches_yearly).mark_bar().encode(
    x=alt.X('year', title=''),
    y=alt.Y('count', title='Antal taler'),
    tooltip = 'count'
).interactive().configure_mark(opacity=0.8,color='#00BFA5')

## Average words per speech per year: Before preprocessing

In [None]:
# Creating column with number of words per speech
speeches['no_words'] = speeches['text'].str.count(' ') + 1

In [None]:
# Grouping data by year
speeches_yearly_no_words = speeches.groupby(speeches['date'].dt.year)['no_words_after'].agg(['mean'])

# Checking grouped data
speeches_yearly_no_words

# Adding year column
speeches_yearly_no_words['year'] = speeches_yearly_no_words.index

# Dropping wrong year
speeches_yearly_no_words = speeches_yearly_no_words[speeches_yearly_no_words['year'] < 2021]

# Turning into interger and then string
speeches_yearly_no_words['year'] = speeches_yearly_no_words['year'].astype(int).astype(str)

In [None]:
# Pltotting the speeches
alt.Chart(speeches_yearly_no_words).mark_bar().encode(
    x=alt.X('year', title=''),
    y=alt.Y('mean', title='Antal ord'),
    tooltip = 'mean'
).interactive().configure_mark(opacity=0.8,color='#00BFA5')



### Red Parties

In [None]:
# Grouping data by year
red_speeches_yearly = red_speeches.groupby(red_speeches['date'].dt.year)['text'].agg(['count'])

# Checking grouped data
red_speeches_yearly

# Adding year column
red_speeches_yearly['year'] = red_speeches_yearly.index

# Dropping wrong year
red_speeches_yearly = red_speeches_yearly[red_speeches_yearly['year'] < 2021]

# Turning into interger and then string
red_speeches_yearly['year'] = red_speeches_yearly['year'].astype(int).astype(str)

In [None]:
# Pltotting the speeches
alt.Chart(red_speeches_yearly).mark_bar().encode(
    x=alt.X('year', title=''),
    y=alt.Y('count', title='Antal taler'),
    tooltip = 'count'
).interactive().configure_mark(opacity=0.8,color='red')

## Average words per speech per year: Before preprocessing

In [None]:
# Creating column with number of words per speech
red_speeches['no_words'] = red_speeches['text'].str.count(' ') + 1

In [None]:
# Grouping data by year
red_speeches_yearly_no_words = red_speeches.groupby(red_speeches['date'].dt.year)['no_words'].agg(['mean'])

# Checking grouped data
red_speeches_yearly_no_words

# Adding year column
red_speeches_yearly_no_words['year'] = red_speeches_yearly_no_words.index

# Dropping wrong year
red_speeches_yearly_no_words = red_speeches_yearly_no_words[red_speeches_yearly_no_words['year'] < 2021]

# Turning into interger and then string
red_speeches_yearly_no_words['year'] = red_speeches_yearly_no_words['year'].astype(int).astype(str)

In [None]:
# Pltotting the speeches
alt.Chart(red_speeches_yearly_no_words).mark_bar().encode(
    x=alt.X('year', title=''),
    y=alt.Y('mean', title='Antal ord'),
    tooltip = 'mean'
).interactive().configure_mark(opacity=0.8,color='red')

### Blue Parties

In [None]:
# Grouping data by year
blue_speeches_yearly = blue_speeches.groupby(blue_speeches['date'].dt.year)['text'].agg(['count'])

# Checking grouped data
blue_speeches_yearly

# Adding year column
blue_speeches_yearly['year'] = blue_speeches_yearly.index

# Dropping wrong year
blue_speeches_yearly = blue_speeches_yearly[blue_speeches_yearly['year'] < 2021]

# Turning into interger and then string
blue_speeches_yearly['year'] = blue_speeches_yearly['year'].astype(int).astype(str)

In [None]:
# Pltotting the speeches
alt.Chart(blue_speeches_yearly).mark_bar().encode(
    x=alt.X('year', title=''),
    y=alt.Y('count', title='Antal taler'),
    tooltip = 'count'
).interactive().configure_mark(opacity=0.8,color='blue')

## Average words per speech per year: Before preprocessing

In [None]:
# Creating column with number of words per speech
blue_speeches['no_words'] = blue_speeches['text'].str.count(' ') + 1

In [None]:
# Grouping data by year
blue_speeches_yearly_no_words = blue_speeches.groupby(blue_speeches['date'].dt.year)['no_words'].agg(['mean'])

# Checking grouped data
blue_speeches_yearly_no_words

# Adding year column
blue_speeches_yearly_no_words['year'] = blue_speeches_yearly_no_words.index

# Dropping wrong year
blue_speeches_yearly_no_words = blue_speeches_yearly_no_words[blue_speeches_yearly_no_words['year'] < 2021]

# Turning into interger and then string
blue_speeches_yearly_no_words['year'] = blue_speeches_yearly_no_words['year'].astype(int).astype(str)

In [None]:
# Pltotting the speeches
alt.Chart(blue_speeches_yearly_no_words).mark_bar().encode(
    x=alt.X('year', title=''),
    y=alt.Y('mean', title='Antal ord'),
    tooltip = 'mean'
).interactive().configure_mark(opacity=0.8,color='blue')

## Creating data sets for different policy areas

1. Udvælger de relevante policy emner. Læs op på dette. Gerne artikel om værdi- og fordelingspolitik. Tjek Flemming Christiansen artikel.
2. Kig 1000 taler igennem og fordel ord, som passer ind i de forskellige kategorier, så der er ene ordbog for alle emner.
3. Subset data og lav pickles for alle data filer. (4x3 - rød_4, blå_4, alle_4)

In [None]:
#Tjek denne tale i datasættet:
337476

#### Selecting 1000 random speeches

In [None]:
# Loading in data
speeches = pd.read_pickle("all_speeches.pkl")

In [None]:
# Inspecting DF
speeches

In [None]:
# Subsetting 1000 random speeches
random_speeches = speeches.sample(1000)

In [None]:
# Inspecting random speeches
random_speeches

In [None]:
# Reading each speeches
print(random_speeches.iloc[499,6])

### Udlændingepolitik

In [30]:
udlændingpolitisk_ordbog = ['statsborgerskab',
                            'opholdstilladelse',
                            'integration',
                            'flygtnininge',
                            'indvandrere',
                            'tosprogede',
                            'udlændinge',
                            'islam',
                            'imamer',
                            'parallelsamfund',
                            'asylpolitik',
                            'integrationspolitik',
                            'udlændingepolitik',
                            'flygtningepolitik',
                            'asylpolitisk',
                            'integrationspolitisk',
                            'udlændingepolitisk',
                            'flygtningepolitisk',
                            'flygtningenævnet',
                            'ghetto',
                            'ghettoriseringseffekten',
                            'ghettosering',
                            'udlændingestyrelsen',
                            'flygtningekonvention',
                            'muslimer',
                            'racehygiejne',
                            'starthjælp',
                            'sprogbonus',
                            'indfødsret',
                            'indfødsretsprøve',
                            'udlændingepakken',
                            'danskundervisning',
                            'integrationsprogram',
                            'integrationsindsats',
                            'integrationsbarometer',
                            'udlændingepolitikken',
                            'migration',
                            'muslimske',
                            'asylpladser',
                            'asylsøgere',
                            'asylbørn',
                            'integrationsplaner',
                            'ikkevestlige',
                            'grænsekontrol',
                            'maskeringsforbuddet',
                            'maskeringsforbud',
                            'udlændingedebat',
                            'udlændingeområdet',
                            'flygtningestrømme',
                            'integration',
                            'udlændinge',
                            'etnisk',
                            'anden etnisk',
                            'sharia',
                            'masseindvandring' 
                           ]

### Klimapolitik

In [31]:
klimapolitisk_ordbog = ['forurene',
                        'klima',
                        'klimaforandringer',
                        'forurening',
                        'miljøeffekt',
                        'giftstoffer',
                        'co2',
                        'co2-reduktion',
                        'grøn omstiling',
                        'grønne omstilling',
                        'biodiversitet',
                        'PTX',
                        'IPCEI',
                        'vindmøller',
                        'vindmølle',
                        'grøn energi',
                        'klimaplan',
                        'kulbrint',
                        'kulbrintebeskatning',
                        'vandmiljø',
                        'vandmiljøhandlingsplan',
                        'gødning',
                        'miljøskadelig',
                        'klimaindsatsen',
                        'vindmølle',
                        'energi',
                        'klimaændringer',
                        'luftforandringer',
                        'vandressourcer', 
                        'skovressourcer', 
                        'C02-miljøvenlig',
                        'benzinforurening',
                        'miljøoptimering',
                        'miljøøkonomi',
                        'fossilt brændstof',
                        'fossilt brændsel',
                        'fossile brændstoffer',
                        'fossile brændsler',
                        'solceller',
                        'vindmøller',
                        'miljøpolitik',
                        'klimapolitik',
                        'miljøbelastning',
                        'miljøvenlig',
                        'klimatiltag',
                        'miljøpolitisk',
                        'klimapolitisk',
                        'klimamål',
                        'klimakommission',
                        'atomkraft',
                        'fossil energi',
                        'klimamålsætning',
                        'klimaregulering',
                        'miljøafgift'
                       ]


### Skattepolitik

In [32]:
skattepolitisk_ordbog = ['skattestop',
                         'skattehuller',
                         'brugerbetaling',
                         'skattesystemet',
                         'skatte',
                         'skat',
                         'skattetrykket',
                         'skattetryk',
                         'beskatning',
                         'skattevæsenet',
                         'moms',
                         'skattereform',
                         'skatterabat',
                         'topskatten',
                         'fradrag',
                         'skatteyder',
                         'topskatteyder',
                         'jobfradrag',
                         'skattenedsættelse',
                         'skattekontrolloven',
                         'SKAT',
                         'skatteudvalget',
                         'kommuneskat',
                         'skatteprocent',
                         'skattekort',
                         'trækprocent',
                         'beskatningsgrundlaget',
                         'skatterådet',
                         'skattebelagt',
                         'skatteministeren',
                         'ubeskattet',
                         'afgifter',
                         'gebyrer',
                         'udligningsskatten',
                         'skattelettelse',
                         'skatteudspil',
                         'skattefri',
                         'skatteforslag',
                         'skatteaftalen',
                         'topskattelettelser',
                         'sambeskatningsordningen',
                         'skattestoppet',
                         'momsregistrerede',
                         'momsregnskab',
                         'skatteværdi',
                         'skattepligtige',
                         'skattepolitik',
                         'skattepolitisk',
                         'skatteløsninger',
                         'skattely'
                        ]

### Velfærdspolitik

In [33]:
velfærdpolitisk_ordbog = ['velfærdsreformer',
                          'efterløn',
                          'dagpasning',
                          'dagpenge',
                          'mindsteløn',
                          'ulighed',
                          'arbejdsløs',
                          'aktivering'
                          'pension',
                          'offentlige ydelser',
                          'førtidspension',
                          'kontanthjælp',
                          'kontanthjælpsloft',
                          'kontanthjælpsniveau',
                          'velfærd',
                          'plejehjem',
                          'børnehaver',
                          'børnehave'
                          'folkeskoler',
                          'folkeskolen',
                          'velfærdssamfund',
                          'overenskomst',
                          'dagpengesystem',
                          'velfærdssystem',
                          'hjemmehjælp',
                          'forsøgelse',
                          'forsøgelsesgrundlag',
                          'folkepension',
                          'arbejdsmarkedet',
                          'ældre',
                          'beskæftigelse',
                          'arbejdsmarkedskøen',
                          'fattigdom',
                          'beskæftigelse',
                          'flexicurity',
                          'velfærdsydelser',
                          'børnecheck',
                          'ældrechecken',
                          'kernevelfærden',
                          'kernevelfærd',
                          'servicelovens',
                          'kerneydelser',
                          'velfærdspolitik',
                          'velfærdspolitisk',
                          'bistandshjælp',
                          'dagpengeniveau',
                          'dagpengeperiode'
                         ]

In [34]:
# Removing duplicates
udlændingpolitisk_ordbog_nodup = list(dict.fromkeys(udlændingpolitisk_ordbog))
klimapolitisk_ordbog_nodup = list(dict.fromkeys(klimapolitisk_ordbog))
skattepolitisk_ordbog_nodup = list(dict.fromkeys(skattepolitisk_ordbog))
velfærdpolitisk_ordbog_nodup = list(dict.fromkeys(velfærdpolitisk_ordbog))

In [None]:
# Reading in data sets
all_speeches = pd.read_pickle('all_speeches_notfidf.pkl')
#blue_speeches = pd.read_pickle('blue_speeches.pkl')
#red_speeches = pd.read_pickle('red_speeches.pkl')

In [None]:
# Checking dfs
all_speeches.head()

In [None]:
# Lower case that speeches
all_speeches['full_text'] = all_speeches['full_text'].str.lower()
#blue_speeches['full_text'] = blue_speeches['full_text'].str.lower()
#red_speeches['full_text'] = red_speeches['full_text'].str.lower()

In [35]:
# Creating regex for word serach
pattern_udlændinge = '|'.join(udlændingpolitisk_ordbog_nodup)
pattern_klima = '|'.join(klimapolitisk_ordbog_nodup)
pattern_skatte = '|'.join(skattepolitisk_ordbog_nodup)
pattern_velfærd = '|'.join(velfærdpolitisk_ordbog_nodup)

In [37]:
# Creating columns
speeches['udlændinge_pol'] = speeches['full_text'].str.contains(pattern_udlændinge)
#blue_speeches['udlændinge_pol'] = blue_speeches['full_text'].str.contains(pattern_udlændinge)
#red_speeches['udlændinge_pol'] = red_speeches['full_text'].str.contains(pattern_udlændinge)

speeches['klima_pol'] = speeches['full_text'].str.contains(pattern_klima)
#blue_speeches['klima_pol'] = blue_speeches['full_text'].str.contains(pattern_klima)
#red_speeches['klima_pol'] = red_speeches['full_text'].str.contains(pattern_klima)

speeches['skatte_pol'] = speeches['full_text'].str.contains(pattern_skatte)
#blue_speeches['skatte_pol'] = blue_speeches['full_text'].str.contains(pattern_skatte)
#red_speeches['skatte_pol'] = red_speeches['full_text'].str.contains(pattern_skatte)

speeches['velfærd_pol'] = speeches['full_text'].str.contains(pattern_velfærd)
#blue_speeches['velfærd_pol'] = blue_speeches['full_text'].str.contains(pattern_velfærd)
#red_speeches['velfærd_pol'] = red_speeches['full_text'].str.contains(pattern_velfærd)

In [None]:
# Creating list of col
all_cols = all_speeches.columns.tolist()
blue_cols = blue_speeches.columns.tolist()
red_cols = red_speeches.columns.tolist()

# Setting up list of cols in right order
all_cols = all_cols[-4:] + all_cols[:-4]
blue_cols = blue_cols[-4:] + blue_cols[:-4]
red_cols = red_cols[-4:] + red_cols[:-4]

# Creating df with right order
all_speeches = all_speeches[all_cols]
blue_speeches = blue_speeches[blue_cols]
red_speeches = red_speeches[red_cols]

In [38]:
print('udlændinge_pol')
print(speeches['udlændinge_pol'].value_counts())
#print(blue_speeches['udlændinge_pol'].value_counts())
#print(red_speeches['udlændinge_pol'].value_counts())

print('\n klima_pol')
print(speeches['klima_pol'].value_counts())
#print(blue_speeches['klima_pol'].value_counts())
#print(red_speeches['klima_pol'].value_counts())

print('\n skatte_pol')
print(speeches['skatte_pol'].value_counts())
#print(blue_speeches['skatte_pol'].value_counts())
#print(red_speeches['skatte_pol'].value_counts())

print('\n velfærd_pol')
print(speeches['velfærd_pol'].value_counts())
#print(blue_speeches['velfærd_pol'].value_counts())
#print(red_speeches['velfærd_pol'].value_counts())

udlændinge_pol
False    1740307
True       79440
Name: udlændinge_pol, dtype: int64

 klima_pol
False    1751863
True       67884
Name: klima_pol, dtype: int64

 skatte_pol
False    1669430
True      150317
Name: skatte_pol, dtype: int64

 velfærd_pol
False    1477113
True      342634
Name: velfærd_pol, dtype: int64


In [39]:
speeches.to_pickle("all_speeches.pkl", protocol=4)

In [None]:
test = all_speeches[all_speeches['klima_pol']==True]

In [None]:
test.iloc[402,24]

In [None]:
# Creating pickles
all_speeches.to_pickle("all_speeches_notfidf.pkl")

#blue_speeches.to_pickle("blue_speeches.pkl")

#red_speeches.to_pickle("red_speeches.pkl")

In [None]:
cols = []
count = 1
for column in red_speeches.columns:
    if column == 'blok':
        cols.append(f'blok{count}')
        count+=1
        continue
    cols.append(column)
red_speeches.columns = cols



In [None]:
red_speeches.drop(['blok2'], axis=1, inplace=True)

In [None]:
red_speeches.rename(columns={'blok1': 'blok'}, inplace=True)

In [None]:
red_speeches['blok']