In [None]:
import os
import re
import pandas as pd
from datetime import datetime
import pickle

### Setting paths

In [None]:
# Step up one level from the initial working directory exactly once.
# A bare os.chdir("..") climbs one more directory every time the cell is
# re-run, silently pointing both paths at the wrong location. The resolved
# root is therefore remembered in the kernel namespace and reused.
if 'project_root' not in globals():
    os.chdir("..")
    project_root = os.path.abspath(os.curdir)

# absolute location of the prepared corpus (input of this notebook)
data_path = os.path.join(project_root, 'corpus', 'prepared', 'corpus.csv')
# absolute folder below which the pre-processed artefacts are written
result_path = os.path.join(project_root, 'corpus', 'preprocessed')

### Loading prepared corpus


In [None]:
# read the prepared corpus from disk
df = pd.read_csv(data_path)

# discard the first CSV column
# NOTE(review): presumably the index written by an earlier to_csv call - confirm
df = df.drop(columns=df.columns[0])

# order the rows by ascending id and renumber them 0..n-1
df = df.sort_values(by='id').reset_index(drop=True)

### Data pre-processing

In [None]:
def _clean_speech(text):
    '''
    Normalise a single speech to lowercased, letter-only tokens.

    :param text: raw speech; any non-string value (e.g. the NaN that pandas uses for an empty CSV cell) is treated as an empty speech
    :return: remaining tokens joined by single spaces, "" if nothing is left
    '''
    if not isinstance(text, str):
        # missing speeches carry no text; returning "" lets the caller filter them out
        return ""

    # every run of characters outside a-z / A-Z / \x7f-\xff becomes one separator;
    # this removes ASCII digits and ASCII punctuation and keeps Umlaute and "ß"
    letters_only = re.sub("[^a-zA-Z\x7f-\xff]+", " ", text.lower())

    # ASCII digits are already gone at this point; isnumeric() additionally drops
    # standalone Latin-1 numerics such as "²" or "½" that lie inside \x7f-\xff
    return ' '.join(token for token in letters_only.split() if not token.isnumeric())


def pre_processing (dataframe):
    '''
    Clean the plenary speeches stored in the 'speechContent' column.

    Each speech is lowercased, stripped of everything except ASCII letters and the
    characters \\x7f-\\xff (which covers the German Umlaute and "ß"), and freed of
    purely numeric tokens. Missing speeches are returned as "" instead of raising.

    NOTE(review): the range \\x7f-\\xff also keeps non-letter Latin-1 symbols such as
    "§" or "«", so the result is not strictly punctuation-free.

    :param dataframe: dataframe prepared during the first preprocessing step; must contain a 'speechContent' column
    :return: series (same index as dataframe) holding the cleaned speeches
    '''
    return dataframe['speechContent'].apply(_clean_speech)

# attach the cleaned text as an additional column next to the raw speech
df['speechContent_cleaned'] = pre_processing(dataframe=df)

# drop every speech that is empty after cleaning
non_empty = df['speechContent_cleaned'] != ""
df = df[non_empty]

# two parallel lists: one electoral term and one cleaned speech per remaining row
time_steps = df['electoralTerm'].tolist()
speeches = df['speechContent_cleaned'].tolist()

### Saving pre-processed files to disk

In [None]:
# Each artefact is pickled into its own sub-folder of result_path.
outputs = (
    (os.path.join(result_path,'electoralTerms', 'BERTopic_time_steps.pkl'), time_steps),
    (os.path.join(result_path,'corpus', 'BERTopic_corpus_preprocessed.pkl'), speeches),
)

for file_name, payload in outputs:
    # open(..., 'wb') does not create missing folders, so make sure the
    # target directory exists before writing
    os.makedirs(os.path.dirname(file_name), exist_ok=True)
    with open(file_name, 'wb') as handle:
        pickle.dump(payload, handle)
