In [4]:
import re
import pandas as pd
import numpy as np
import unicodedata
from enelvo.normaliser import Normaliser
import spacy
from pandarallel import pandarallel

ModuleNotFoundError: No module named 'spacy'

## Preprocessing functions

In [2]:
def remove_tt_username(text):
    """Strip Twitter-style mentions from the text.

    The input is coerced to ``str``; every ``@`` followed by one or more
    non-whitespace characters is deleted (surrounding spaces are kept).
    """
    mention_pattern = r'\@\S+'
    return re.sub(mention_pattern, '', str(text))

def identify_emoticons(text):
    """Replace common ASCII emoticons with Portuguese textual tags.

    The input is coerced to ``str``. Replacements (each padded with spaces):

    * ``>:(`` / ``>:-(``            -> ``cara brava``
    * ``:)`` / ``:-)``              -> ``cara feliz``
    * ``:D`` / ``:P`` (any case)    -> ``cara feliz``
    * ``:(`` / ``:-(`` / ``:'(``    -> ``cara triste``

    Returns the text with the emoticons substituted.
    """
    text = str(text)
    # The angry face must be handled first: ">:(" contains ":(", so the
    # sad-face rule would otherwise consume it and leave a stray ">".
    text = re.sub(r'\>\:\-?\(+', ' cara brava ', text)
    text = re.sub(r'\:\-?\)+', ' cara feliz ', text)
    text = re.sub(r'\:\-?[dDpP]+', ' cara feliz ', text)
    text = re.sub(r'\:\-?\'?\(+', ' cara triste ', text)
    return text

def remove_hashtags(text):
    """Delete hashtags from the text.

    The input is coerced to ``str``; every ``#`` followed by one or more
    non-whitespace characters is removed (surrounding spaces are kept).
    """
    hashtag_pattern = r'\#\S+'
    return re.sub(hashtag_pattern, '', str(text))

def remove_phone(text):
    """Replace phone-number-like digit groups with a single space.

    The input is coerced to ``str``. The pattern accepts an optional 2-3
    digit area code (optionally parenthesised), an optional space, 4-5
    digits, an optional hyphen and/or space, then 4 digits.
    """
    phone_pattern = r'(\(?(\d{2,3})\)?)?\ ?\d{4,5}\-?\ ?\d{4}'
    cleaned = re.sub(phone_pattern, ' ', str(text))
    return cleaned

def remove_url(text):
    """Remove or tag URLs found in the text.

    The input is coerced to ``str`` and three substitutions run in order:

    1. ``http://`` / ``https://`` URLs are replaced by a space.
    2. Tokens starting with ``www.`` are deleted.
    3. Remaining ``<name>.com`` / ``<name>.com.br`` tokens become ``link``
       padded with spaces. Note the name class also accepts a literal ``|``.
    """
    substitutions = (
        (r'https?\:\/\/\S+', ' '),
        (r'www\.\S+', ''),
        (r'[a-zA-Z|.]+\.com(\.br)?', ' link '),
    )
    result = str(text)
    for pattern, replacement in substitutions:
        result = re.sub(pattern, replacement, result)
    return result

def remove_date(text):
    """Replace date-like expressions with a single space.

    The input is coerced to ``str`` and three patterns are applied in order:

    1. Slash-separated dates such as ``10/05`` or ``10/05/2020``.
    2. Hyphen-separated dates such as ``10-05`` or ``10-05-2020``.
    3. Written-out dates such as ``10 de maio`` or ``10 de maio de 2020``.
       NOTE: the word class ``[aA-zZ|ç|Ç]`` spans the ASCII range ``A``-``z``
       (which includes a few punctuation characters) plus a literal ``|``.
    """
    date_patterns = (
        r'((\d{1,2}\/)(\d{1,2}\/?)(\d{2,4})?)',
        r'((\d{1,2}\-)(\d{1,2}\-?)(\d{2,4})?)',
        r'((\d+(\s+[deDE]+\s+)[aA-zZ|ç|Ç]+((\s+[deDE]+\s+)\d+)?))',
    )
    result = str(text)
    for pattern in date_patterns:
        result = re.sub(pattern, ' ', result)
    return result

def remove_hour(text):
    """Remove or tag time-of-day expressions.

    The input is coerced to ``str``. Colon-separated times that carry an
    ``r``/``s`` style suffix (e.g. ``12:30hrs``) are replaced by a space;
    afterwards ``<digits>h<digits>`` forms (e.g. ``12h30``) are replaced by
    the token ``< hora >`` padded with spaces.
    """
    without_colon_times = re.sub(
        r'(\d+)\:(\d+)[hH]?(\:\d+)?[hH]?[rsRS]\w?', ' ', str(text)
    )
    return re.sub(r'(\d+)[hH](\d+)', ' < hora > ', without_colon_times)

def remove_number(text):
    """Delete every ASCII digit (0-9) from the text.

    The input is coerced to ``str`` first, so numeric inputs yield ``''``.
    """
    digit_pattern = r'[0-9]'
    return re.sub(digit_pattern, '', str(text))

def lowercase(text):
    """Return the ``str`` form of the input converted to lower case."""
    return str(text).lower()

def remove_oneword(text):
    """Keep only texts that contain more than one whitespace-separated word.

    The input is coerced to ``str``. Returns that string when it has at
    least two words; otherwise returns ``None``.
    """
    as_string = str(text)
    word_count = len(as_string.split())
    return as_string if word_count > 1 else None

def load_stopword(text):
    """Load the stopword list from a file into the module-level ``stopwords``.

    Args:
        text: Path of a text file containing one stopword per line.

    Side effects:
        Rebinds the global ``stopwords`` to a list with each line of the
        file stripped of surrounding whitespace. Returns ``None``.

    Raises:
        OSError: If the file cannot be opened.
    """
    global stopwords
    # Read as UTF-8 explicitly: the platform default encoding varies and
    # would garble accented stopwords on non-UTF-8 locales.
    with open(text, 'r', encoding='utf-8') as file:
        stopwords = [line.strip() for line in file]

def remove_stopword(text):
    """Drop every word that appears in the module-level ``stopwords``.

    The input is coerced to ``str`` and split on whitespace; the surviving
    words are re-joined with single spaces. ``stopwords`` must already be
    defined (it is set by ``load_stopword``).
    """
    kept_words = (word for word in str(text).split() if word not in stopwords)
    return ' '.join(kept_words)

def remove_accent(text):
    """Strip diacritics and any other non-ASCII characters.

    The input is coerced to ``str`` and NFKD-normalised so accented letters
    decompose into a base letter plus combining marks; encoding to ASCII
    with ``'ignore'`` then discards everything that is not plain ASCII.
    """
    decomposed = unicodedata.normalize('NFKD', str(text))
    ascii_bytes = decomposed.encode('ASCII', 'ignore')
    return ascii_bytes.decode("utf-8")

def remove_emoji(text):
    """Delete characters that fall inside the listed Unicode ranges.

    The input is coerced to ``str``; every run of characters belonging to
    any of the code-point ranges below is removed.
    """
    code_point_ranges = (
        u"\U0001F600-\U000E007F",
        u"\U0001F300-\U0001F5FF",
        u"\U0001F680-\U0001F6FF",
        u"\U0001F1E0-\U0001F1FF",
        u"\U00002702-\U000027B0",
        u"\U000024C2-\U0001F251",
    )
    # Assemble a single character class covering all ranges.
    emoji_pattern = re.compile(
        "[" + "".join(code_point_ranges) + "]+", flags=re.UNICODE
    )
    return emoji_pattern.sub(r'', str(text))

def remove_laugh(text):
    """Replace laugh-like tokens (e.g. ``kkkk``, ``haha``) with a space.

    The input is coerced to ``str``. Any whole word made up solely of the
    letters ``h a u e i r s k j`` is replaced by a space. NOTE: because the
    group is starred, the pattern also matches the empty string at word
    boundaries (adding extra spaces there), and it removes ordinary words
    built only from those letters.
    """
    laugh_pattern = r'\b([haueirskj])*\b'
    return re.sub(laugh_pattern, ' ', str(text))

def remove_punction(text):
    """Replace each punctuation character with a single space.

    The input is coerced to ``str``. The character class covers the usual
    ASCII punctuation plus ``º``; the backslash is not part of the class and
    is therefore left untouched.
    """
    punctuation_pattern = r'[!"#$%&\'()*+,-.º<>/:;=?@[/\/\]^_`{|}~]'
    cleaned = re.sub(punctuation_pattern, ' ', str(text))
    return cleaned

# Single Enelvo normaliser instance, created once and reused for every call.
norm = Normaliser(tokenizer='readable')
def enelvo_corrector(text):
    """Run the text through the shared Enelvo ``Normaliser``.

    The input is coerced to ``str`` and passed to ``norm.normalise``; that
    method's result is returned unchanged.
    """
    return norm.normalise(str(text))

def preprocessing(data, stopwords_dir = 'stopwords.txt'):
    """Apply the full text-cleaning pipeline to a collection of texts.

    Args:
        data: Anything accepted by ``pd.Series`` (one text per element).
        stopwords_dir: Path of the stopword file handed to ``load_stopword``.

    Returns:
        A ``pd.Series`` with the cleaned texts. The last step is
        ``remove_oneword``, which returns ``None`` for texts left with fewer
        than two words.
    """
    series = pd.Series(data)

    # Cleaning steps that do not need the stopword list, in pipeline order.
    steps_before_stopwords = (
        remove_tt_username,
        remove_hashtags,
        identify_emoticons,
        remove_url,
        remove_phone,
        remove_hour,
        remove_date,
        remove_number,
        remove_emoji,
        lowercase,
        remove_laugh,
    )
    for step in steps_before_stopwords:
        series = series.apply(step)

    # The stopword list is loaded right before it is first needed.
    load_stopword(stopwords_dir)
    for step in (remove_stopword, remove_accent, remove_punction):
        series = series.apply(step)

    # The Enelvo correction is the only step run through pandarallel.
    pandarallel.initialize()
    series = series.parallel_apply(enelvo_corrector)

    return series.apply(remove_oneword)