In [None]:
import pandas as pd
import locale
from preprocessing_functions import *
from sklearn.model_selection import train_test_split

# Switch every locale category (LC_ALL) of the running process to Danish, UTF-8.
# NOTE(review): setlocale raises locale.Error when 'da_DK.UTF-8' is not available on
# the host. Presumably the helpers imported above depend on Danish locale settings —
# TODO confirm which of them actually need it.
locale.setlocale(locale.LC_ALL, 'da_DK.UTF-8')

In [None]:
# The labelled articles are loaded here: two feather files are read and stacked
# into a single frame.
# pd.concat is used because DataFrame.append was deprecated in pandas 1.4 and removed
# in pandas 2.0; like append, concat keeps each file's original row labels.
labelled_paths = [
    'data/articles_df_daniel_label.feather',
    'data/articles_df_esben_label.feather',
]
articles_df = pd.concat([pd.read_feather(path) for path in labelled_paths])

# Keep only the labelled articles: drop every row whose articles_score is NaN,
# then renumber the remaining rows from 0.
articles_df = articles_df[articles_df['articles_score'].notna()].reset_index(drop=True)

# Danish stopword list. `stopwords` is not imported explicitly in this notebook, so it
# presumably arrives through the star import of preprocessing_functions.
stop = stopwords.words('danish')

In [None]:
# Preview the first five labelled rows (five is the default for head()).
articles_df.head()

In [None]:
# Replace missing values in the four text columns with empty strings, so that string
# concatenation of these columns does not yield NaN.
# The filled columns are assigned back to the frame instead of calling
# fillna(inplace=True) on a column selection: that form is chained assignment, which
# pandas 2.x warns about and which no longer updates the frame under Copy-on-Write.
text_columns = ['article_title', 'article_sub_header', 'article_body', 'article_textbox']
articles_df[text_columns] = articles_df[text_columns].fillna("")

In [None]:
# Preview the last five rows (five is the default for tail()).
articles_df.tail(5)

In [None]:
# Combine title, sub-header, body and textbox into one text column.
# Every part is separated by a single space, including the textbox, so that the last
# word of the body and the first word of the textbox are not fused into one token.
articles_df['article'] = (
    articles_df['article_title']
    + ' ' + articles_df['article_sub_header']
    + ' ' + articles_df['article_body']
    + ' ' + articles_df['article_textbox']
)

# There must NOT be any missing combined texts. The invariant is enforced, and the
# (empty) selection is still shown as the cell output.
missing_articles = articles_df[articles_df['article'].isna()]
assert missing_articles.empty, "combined 'article' column contains missing values"
missing_articles

In [None]:
# Display the combined 'article' column.
articles_df.loc[:, 'article']

In [None]:
# Text preprocessing and feature extraction on the combined 'article' column.
# The helper functions called here are not defined in this notebook; they presumably
# come from the star import of preprocessing_functions.

# Strip HTML elements, then clean up extra spaces.
articles_df['article'] = articles_df['article'].apply(remove_html_elements)
articles_df['article'] = articles_df['article'].apply(remove_extra_spaces)

# 'article_bert' is built from the cleaned text at this point, i.e. before the
# lowercasing, punctuation removal and stopword removal applied to 'article' below.
# A single assignment is sufficient: the input is already HTML-free.
articles_df['article_bert'] = articles_df['article'].apply(remove_extra_spaces)

# Per-article features computed on the cleaned (still cased and punctuated) text.
articles_df['stopword_count'] = articles_df['article'].apply(stopword_counter)
articles_df['period_count'] = articles_df['article'].apply(punctuation_counter)
articles_df['word_count'] = articles_df['article'].apply(word_counter)
articles_df['character_count'] = articles_df['article'].apply(character_counter)
articles_df['sentiment_analysis'] = articles_df['article'].apply(sentiment_analysis)

# Further normalisation: lowercase, then remove punctuation.
articles_df['article'] = articles_df['article'].apply(lowercasing)
articles_df['article'] = articles_df['article'].apply(remove_punctuation)

# Number of long words (>6 characters) in each article.
articles_df['long_words_count'] = articles_df['article'].apply(long_words_counter)

# LIX score: words per period plus the percentage of long words.
# NOTE(review): assuming the count columns are numeric, a period_count or word_count of
# 0 produces inf/NaN here instead of raising — confirm whether such rows can occur.
articles_df['lix'] = (
    articles_df['word_count'] / articles_df['period_count']
    + articles_df['long_words_count'] * 100 / articles_df['word_count']
)

# Number of ci words in each article.
articles_df['ci_words_count'] = articles_df['article'].apply(count_ci_words)

# Remove stopwords. The stopword list is turned into a set once so each membership
# test is a hash lookup instead of a scan through the whole list.
stop_set = set(stop)
articles_df['article'] = articles_df['article'].apply(
    lambda text: " ".join(word for word in text.split() if word not in stop_set)
)

In [None]:
# Inspect the 20 most frequent remaining words to decide whether any of them should be
# added to the stopword list.
all_words = ' '.join(articles_df['article']).split()
word_frequencies = pd.Series(all_words).value_counts()
common_words = word_frequencies.head(20)
common_words

In [None]:
# Strip the additional, hand-picked stopwords and then persist the complete labelled
# frame as a feather file.
extra_stopwords = ['så', 'kan']  # words chosen for removal on top of the standard list


def _without_extra_stopwords(text):
    """Return `text` with every token listed in `extra_stopwords` dropped."""
    kept_tokens = [token for token in text.split() if token not in extra_stopwords]
    return " ".join(kept_tokens)


articles_df['article'] = articles_df['article'].apply(_without_extra_stopwords)

# Renumber the rows from 0 before writing the frame to disk.
articles_df.reset_index(drop=True, inplace=True)
articles_df.to_feather('data/articles_df_LABELLED_preprocessed.feather')

In [None]:
articles_df.head()

In [None]:
# Build a class-balanced subset and split it into train / validation / test sets.
N_PER_CLASS = 114     # rows drawn, without replacement, from each class
SEED = 42             # one seed for the sampling and both splits, for reproducibility
TEST_FRACTION = 0.21  # share of the balanced frame held out as the test set
VAL_FRACTION = 0.23   # share of the remaining (non-test) rows used for validation

# Draw N_PER_CLASS rows with articles_constructive == False, then the same number
# with articles_constructive == True. sample() raises ValueError if a class has fewer
# than N_PER_CLASS rows.
class_samples = [
    articles_df[articles_df['articles_constructive'] == label].sample(
        n=N_PER_CLASS, replace=False, random_state=SEED
    )
    for label in (False, True)
]

# ignore_index=True gives the stacked frame unique row labels instead of repeating
# the same labels once per class.
articles_df_balanced = pd.concat(class_samples, ignore_index=True)

# First hold out the test set, then carve the validation set out of what is left.
# Both splits are stratified on the label so every set stays balanced.
articles_df_train, articles_df_test = train_test_split(
    articles_df_balanced,
    test_size=TEST_FRACTION,
    random_state=SEED,
    stratify=articles_df_balanced['articles_constructive'],
)
articles_df_train, articles_df_val = train_test_split(
    articles_df_train,
    test_size=VAL_FRACTION,
    random_state=SEED,
    stratify=articles_df_train['articles_constructive'],
)

In [None]:
# Label distribution of the training set.
articles_df_train.loc[:, 'articles_constructive'].value_counts()


In [None]:
# Label distribution of the validation set.
articles_df_val.loc[:, 'articles_constructive'].value_counts()


In [None]:
# Label distribution of the test set.
articles_df_test.loc[:, 'articles_constructive'].value_counts()

In [None]:
# Write each split to its own CSV file, without the row index.
for split_frame, csv_path in (
    (articles_df_train, 'data/train_new.csv'),
    (articles_df_val, 'data/val_new.csv'),
    (articles_df_test, 'data/test_new.csv'),
):
    split_frame.to_csv(csv_path, index=False)