In [1]:
import numpy as np
import pandas as pd
import nltk
import emoji
import string
from textblob import TextBlob
from tqdm import tqdm
# from symspellpy.symspellpy import SymSpell 
# nltk.download('stopwords')
# nltk.download('punkt')

In [2]:
class Preprocessor:
    """Configurable text-cleaning pipeline applied to a list of strings.

    Each constructor option switches one cleaning step on or off. ``process``
    runs the enabled steps in a fixed order: emoji handling, ``[URL]``/``[USER]``
    placeholder removal, lowercasing, spelling correction, punctuation removal,
    stop-word removal, and finally word transformation.

    Parameters
    ----------
    emoji : str or None
        ``None`` (skip), ``'rm'`` (strip emoji), ``'translate_en'`` /
        ``'translate_alias'`` (replace emoji by their name, underscores turned
        into spaces), ``':translate_en:'`` / ``':translate_alias:'`` (replace
        emoji by ``:name:`` codes).
    urls_users_remover : bool
        Remove the literal ``[URL]`` and ``[USER]`` placeholders.
    lowercaser : bool
        Lowercase the text.
    punctuation_remover : bool
        Remove every character listed in ``string.punctuation``.
    spell_checker : bool
        Correct spelling with TextBlob.
    stopwords_remover : bool
        Drop English stop words (NLTK list).
    word_transformer : str or None
        ``None`` (skip), ``'lemmatization'`` or ``'stemming'``.

    Raises
    ------
    ValueError
        If ``emoji`` or ``word_transformer`` is set to an unsupported value.
    """

    # mode -> (delimiters, language, replace underscores by spaces?)
    _DEMOJIZE_OPTIONS = {
        'translate_en': (('', ''), 'en', True),
        'translate_alias': (('', ''), 'alias', True),
        ':translate_en:': ((':', ':'), 'en', False),
        ':translate_alias:': ((':', ':'), 'alias', False),
    }
    _EMOJI_MODES = ('rm',) + tuple(_DEMOJIZE_OPTIONS)
    _WORD_TRANSFORMERS = ('lemmatization', 'stemming')

    # English stop-word set, loaded on first use and then cached on the instance.
    _stop_words = None

    def __init__(self,
                 emoji = None, #None, rm, translate_en, translate_alias, :translate_en:, :translate_alias:
                 urls_users_remover = False, #False, True
                 lowercaser = False, #False, True
                 punctuation_remover = False, #False, True
                 spell_checker = False, #False, True
                 stopwords_remover = False, #False, True
                 word_transformer = None, #None, lemmatization, stemming
                 ):
        # NOTE: inside this method `emoji` is the option string, not the
        # `emoji` module (the parameter shadows it here only).
        # Fail fast on typos instead of crashing midway through `process`.
        if emoji and emoji not in self._EMOJI_MODES:
            raise ValueError(
                f"Unknown emoji mode {emoji!r}; expected None or one of {self._EMOJI_MODES}"
            )
        if word_transformer and word_transformer not in self._WORD_TRANSFORMERS:
            raise ValueError(
                f"Unknown word_transformer {word_transformer!r}; "
                f"expected None or one of {self._WORD_TRANSFORMERS}"
            )

        self.emoji = emoji
        self.urls_users_remover = urls_users_remover
        self.lowercaser = lowercaser
        self.spell_checker = spell_checker
        self.punctuation_remover = punctuation_remover
        self.stopwords_remover = stopwords_remover
        self.word_transformer = word_transformer

    def process(self, data: list) -> list:
        """Run every enabled step over ``data`` and return the cleaned texts.

        Steps run one after another over the whole list, each with its own
        tqdm progress bar. When no step is enabled, ``data`` itself is returned.
        """
        # (enabled?, step) pairs; the order here is the order of application.
        steps = (
            (self.emoji, self.de_emoji),
            (self.urls_users_remover, self.remove_urls_users),
            (self.lowercaser, self.lowercase),
            (self.spell_checker, self.correct_spelling),
            (self.punctuation_remover, self.remove_punctuations),
            (self.stopwords_remover, self.remove_stopwords),
            (self.word_transformer, self.word_transform),
        )
        for enabled, step in steps:
            if enabled:
                data = [step(text) for text in tqdm(data)]
        return data

    def de_emoji(self, text: str) -> str:
        """Remove or translate emoji in ``text`` according to ``self.emoji``.

        Returns ``text`` unchanged when no emoji mode is configured and raises
        ``ValueError`` for an unsupported mode.
        """
        mode = self.emoji
        if not mode:
            return text
        if mode == 'rm':
            return emoji.replace_emoji(text, '')

        options = self._DEMOJIZE_OPTIONS.get(mode)
        if options is None:
            raise ValueError(
                f"Unknown emoji mode {mode!r}; expected one of {self._EMOJI_MODES}"
            )
        delimiters, language, underscores_to_spaces = options
        cleaned_text = emoji.demojize(text, delimiters=delimiters, language=language)
        if underscores_to_spaces:
            # Replaces every underscore in the text, not only those that come
            # from emoji names.
            cleaned_text = cleaned_text.replace('_', ' ')
        return cleaned_text

    def remove_urls_users(self, text: str) -> str:
        """Delete the literal ``[URL]`` and ``[USER]`` placeholders from ``text``."""
        return text.replace('[URL]', '').replace('[USER]', '')

    def remove_punctuations(self, text: str) -> str:
        """Delete every character of ``string.punctuation`` from ``text``."""
        return text.translate(str.maketrans("", "", string.punctuation))

    def lowercase(self, text: str) -> str:
        """Return ``text`` lowercased."""
        return text.lower()

    def correct_spelling(self, text: str) -> str:
        """Return ``text`` with spelling corrected by TextBlob."""
        return TextBlob(text).correct().string

    def remove_stopwords(self, text: str) -> str:
        """Tokenize ``text`` and drop English stop words (case-insensitive).

        The remaining tokens are joined with single spaces.
        """
        if self._stop_words is None:
            # Read the corpus once per instance instead of once per text.
            self._stop_words = frozenset(nltk.corpus.stopwords.words('english'))
        stop_words = self._stop_words
        tokens = nltk.tokenize.word_tokenize(text)
        return ' '.join(word for word in tokens if word.lower() not in stop_words)

    def word_transform(self, text: str) -> str:
        """Lemmatize or stem the words of ``text`` per ``self.word_transformer``.

        Lemmatization works on NLTK word tokens; stemming works on
        whitespace-separated words. Returns ``text`` unchanged when no
        transformer is configured and raises ``ValueError`` for an
        unsupported one.
        """
        mode = self.word_transformer
        if not mode:
            return text
        if mode == 'lemmatization':
            lemmatizer = nltk.stem.WordNetLemmatizer()
            return " ".join(
                lemmatizer.lemmatize(word) for word in nltk.tokenize.word_tokenize(text)
            )
        if mode == 'stemming':
            stemmer = nltk.stem.PorterStemmer()
            return " ".join(stemmer.stem(word) for word in text.split())
        raise ValueError(
            f"Unknown word_transformer {mode!r}; expected one of {self._WORD_TRANSFORMERS}"
        )

In [3]:
# Load the data in df
df = pd.read_csv('data/edos_labelled_aggregated.csv')

# Create one preprocessor per emoji-handling strategy.
# Supported emoji modes: None, rm, translate_en, translate_alias,
# :translate_en:, :translate_alias:. All other steps stay at their defaults (off).
preprocessor_rm_emoji = Preprocessor(emoji = 'rm')
preprocessor_en_emoji = Preprocessor(emoji = 'translate_en')
preprocessor_alias_emoji = Preprocessor(emoji = 'translate_alias')
preprocessor_een_emoji = Preprocessor(emoji = ':translate_en:')
# Fixed copy-paste slip: this one used ':translate_en:', which made the
# "aalias" columns exact duplicates of the "een" columns.
preprocessor_aalias_emoji = Preprocessor(emoji = ':translate_alias:')

preprocessor_rm_urls = Preprocessor(urls_users_remover= True)

preprocessor_check_spelling = Preprocessor(spell_checker=True)

# add preprocessed data into df
text = list(df['text'])

# emoji: one new column per emoji strategy, all derived from the raw text
text_rm_emoji = preprocessor_rm_emoji.process(text)
text_en_emoji = preprocessor_en_emoji.process(text)
text_een_emoji = preprocessor_een_emoji.process(text)
text_alias_emoji = preprocessor_alias_emoji.process(text)
text_aalias_emoji = preprocessor_aalias_emoji.process(text)

df['text_rm_emoji'] = text_rm_emoji
df['text_en_emoji'] = text_en_emoji
df['text_een_emoji'] = text_een_emoji
df['text_alias_emoji'] = text_alias_emoji
df['text_aalias_emoji'] = text_aalias_emoji

# rm_urls_users: strip the [URL] / [USER] placeholders from each emoji variant
text_rm_emoji_rm_url = preprocessor_rm_urls.process(text_rm_emoji)
text_en_emoji_rm_url = preprocessor_rm_urls.process(text_en_emoji)
text_een_emoji_rm_url = preprocessor_rm_urls.process(text_een_emoji)
text_alias_emoji_rm_url = preprocessor_rm_urls.process(text_alias_emoji)
text_aalias_emoji_rm_url = preprocessor_rm_urls.process(text_aalias_emoji)

df['text_rm_emoji_rm_url'] = text_rm_emoji_rm_url
df['text_en_emoji_rm_url'] = text_en_emoji_rm_url
df['text_een_emoji_rm_url'] = text_een_emoji_rm_url
df['text_alias_emoji_rm_url'] = text_alias_emoji_rm_url
df['text_aalias_emoji_rm_url'] = text_aalias_emoji_rm_url

# cspl: spelling correction on top of the URL-stripped variants.
# Currently disabled (NOTE(review): presumably because TextBlob correction is
# slow on the full dataset -- confirm before re-enabling).
# text_rm_emoji_rm_url_cspl = preprocessor_check_spelling.process(text_rm_emoji_rm_url)
# text_en_emoji_rm_url_cspl = preprocessor_check_spelling.process(text_en_emoji_rm_url)
# text_een_emoji_rm_url_cspl = preprocessor_check_spelling.process(text_een_emoji_rm_url)
# text_alias_emoji_rm_url_cspl = preprocessor_check_spelling.process(text_alias_emoji_rm_url)
# text_aalias_emoji_rm_url_cspl = preprocessor_check_spelling.process(text_aalias_emoji_rm_url)

# df['text_rm_emoji_rm_url_cspl'] = text_rm_emoji_rm_url_cspl
# df['text_en_emoji_rm_url_cspl'] = text_en_emoji_rm_url_cspl
# df['text_een_emoji_rm_url_cspl'] = text_een_emoji_rm_url_cspl
# df['text_alias_emoji_rm_url_cspl'] = text_alias_emoji_rm_url_cspl
# df['text_aalias_emoji_rm_url_cspl'] = text_aalias_emoji_rm_url_cspl

100%|██████████| 20000/20000 [00:00<00:00, 21456.42it/s]
100%|██████████| 20000/20000 [00:00<00:00, 24403.55it/s]
100%|██████████| 20000/20000 [00:00<00:00, 24729.04it/s]
100%|██████████| 20000/20000 [00:00<00:00, 24613.38it/s]
100%|██████████| 20000/20000 [00:00<00:00, 25225.14it/s]
100%|██████████| 20000/20000 [00:00<00:00, 3067020.58it/s]
100%|██████████| 20000/20000 [00:00<00:00, 3139684.11it/s]
100%|██████████| 20000/20000 [00:00<00:00, 3331192.12it/s]
100%|██████████| 20000/20000 [00:00<00:00, 3070275.97it/s]
100%|██████████| 20000/20000 [00:00<00:00, 2974367.27it/s]


In [4]:
# Persist the frame with all cleaned-text columns. The row index is written as
# the first (unnamed) CSV column, which is pandas' default behaviour.
df.to_csv('data/edos_cleaned.csv', index=True)