In [216]:
import pandas as pd
import numpy as np
import string
import spacy
import re
from collections import Counter
from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore")
tqdm.pandas()

In [189]:
df = pd.read_csv('tovima.csv', encoding = 'utf-32',sep='\t',na_values="NaN")

In [190]:
nlp = spacy.load("el_core_news_lg")

In [191]:
def clean_and_tokenize(text):
    doc = nlp(text)
    tokens = [token.text for token in doc]
    return tokens

In [192]:
def filter_tokens(tokens):
    filtered_tokens = [
        token for token in tokens
        if not any(char.isalpha() and ord(char) < 128 for char in token)
        and not all(char in string.punctuation for char in token)
        and not token.isspace() and not token == '\n'
    ]
    return filtered_tokens

In [193]:
nlp.Defaults.stop_words.add("«")
nlp.Defaults.stop_words.add("»")
nlp.Defaults.stop_words.add("*")
nlp.Defaults.stop_words.add("”")
nlp.Defaults.stop_words.add("“")

In [194]:
def remove_stopwords(tokens):
    tokens_without_stopwords = [token for token in tokens if token not in nlp.Defaults.stop_words]
    return tokens_without_stopwords

In [195]:
def replace_key_words(tokens):
    for i in range(len(tokens)):
        word=tokens[i]
        if word.lower() == "eclass" or word.lower() == "zoom" or word.lower == "online" or word.lower == "on-line":
            tokens[i]="ψηφιακός"
    return tokens

In [196]:
def replace_dates_and_times(tokens):
    text = ' '.join(tokens)

    # Replace dates with 'ημερομηνία'
    text = re.sub(r'\b\d{1,2}/\d{1,2}/\d{2,4}\b', 'ημερομηνία', text)
    text = re.sub(r'\b\d{1,2}-\d{1,2}-\d{2,4}\b', 'ημερομηνία', text)
    text = re.sub(r'\b\d{1,2}\s(?:Ιανουαρίου|Φεβρουαρίου|Μαρτίου|Απριλίου|Μαΐου|Ιουνίου|Ιουλίου|Αυγούστου|Σεπτεμβρίου|Οκτωβρίου|Νοεμβρίου|Δεκεμβρίου)\s\d{2,4}\b', 'ημερομηνία', text)
    
    # Replace times with 'ωρα'
    text = re.sub(r'\b\d{1,2}:\d{2}\b', 'ώρα', text)
    text = re.sub(r'\b\d{1,2}\s(?:πμ|μμ)\b', 'ώρα', text)
    
    return text.split()

In [197]:
def remove_numerics(text):
    return ''.join(char for char in text if not char.isdigit())

In [198]:
def lemmatize(tokens):
    lemmatized_tokens = [token.lemma_ for token in tokens]
    return lemmatized_tokens

In [199]:
def convert_to_lowercase(tokens):
    lowercase_tokens = [token.lower() for token in tokens]
    return lowercase_tokens

In [200]:
def join_tokens(tokens):
        return ' '.join(tokens)

In [201]:
def print_most_common_words(texts, num_common_words=10):
    all_texts = " ".join(texts)
    words = all_texts.split()
    word_counts = Counter(words)
    most_common_words = word_counts.most_common(num_common_words)
    return set([w for (w,wc) in most_common_words])

In [202]:
def remove_freq(text):
    return " ".join([word for word in text.split() if word not in freq_words])

In [203]:
def remove_single_letter_words(sentence):
    words = sentence.split()
    return ' '.join(word for word in words if len(word) > 1)

In [204]:
df['text']=df['Author']+ ' ' + df['Subject']+ ' ' +df['Message']

In [205]:
df=df.dropna().reset_index(drop=True)

In [206]:
df['tokens'] = df['text'].progress_apply(clean_and_tokenize)

100%|██████████| 14264/14264 [14:19<00:00, 16.59it/s]


In [207]:
df['tokens'] = df['tokens'].apply(replace_dates_and_times)

In [208]:
df['tokens'] = df['tokens'].apply(replace_key_words)

In [209]:
df['filtered_tokens'] = df['tokens'].apply(filter_tokens)

In [210]:
df['tokens_without_stopwords'] = df['filtered_tokens'].apply(remove_stopwords)

In [211]:
df['lemmatized_tokens'] = df['tokens_without_stopwords'].progress_apply(lambda x: lemmatize(nlp(" ".join(x))))

100%|██████████| 14264/14264 [06:24<00:00, 37.11it/s]


In [212]:
df['lemmatized_tokens']=df['lemmatized_tokens'].apply(convert_to_lowercase)

In [213]:
df = df[df['lemmatized_tokens'].apply(lambda x: len(x) > 20)]

In [214]:
df['lemmatized_sentences']=df['lemmatized_tokens'].apply(join_tokens)

In [217]:
freq_words=print_most_common_words(df['lemmatized_sentences'], num_common_words=20)

In [218]:
df['final_sentences']=df['lemmatized_sentences'].apply(remove_freq)

In [219]:
df['final_sentences']=df['final_sentences'].apply(remove_numerics)

In [220]:
df['final_sentences']=df['final_sentences'].apply(remove_single_letter_words)

In [225]:
df.reset_index(inplace=True)

In [229]:
df.to_csv('preprocessed.csv', sep='\t', encoding='utf-32')