In [20]:
#Packages, install the necessary ones

import pandas as pd
import numpy as np
import spacy
import nltk

from nltk.corpus import stopwords
from unicodedata import normalize

import re
import scipy as sc

from nltk.tokenize import word_tokenize, sent_tokenize
from keras.preprocessing.text import text_to_word_sequence
from sentiment_analysis_spanish import sentiment_analysis
from spacy.lang.es import Spanish

# Load the model:
nlp = spacy.load('es_core_news_lg')

In [21]:
# Functions to clean data:

def remove_accents(text):
    """
    Strips accents from text
    :param text: Text to clean
    :return:
    """
    try: 
        text_wo_accents = text.replace('á', 'a').replace('é', 'e').replace('í', 'i').replace('ó', 'o') \
            .replace('ú', 'u').replace('ü', 'u')
            
    except:
        pass

    return text_wo_accents


def remove_stopwords(sentence):
    """Remove stopwords"""
    try: 
        sentence = ' '.join(
        [word for word in sentence.split() if word not in stopwords.words('spanish')]
        )
        
    except:
        pass
    
    return(sentence)


def lemmatize_text(sentence, nlp):
    """
    Lemmatize text using spacy lemmatizer for spanish
    :param text:
    :param nlp: spanish language model from spacy
    :return:
    """
    return remove_accents(" ".join([token.lemma_ for token in nlp(sentence, disable = ['ner', 'parser'])]))


def clean_text(sentence, *nlp, remove_stopwords=False, lemmatize=False):
    """Clean string

    * Removes:
    ** html
    ** http links
    ** accents
    ** hasthtags
    ** mentions
    ** punctuation
    ** numbers
    """
    try: 
        tag_re = re.compile(
    
        r'<[^>]+>'
    
        )
        sentence = tag_re.sub(
    
        '', sentence
    
        )
        sentence = re.sub(
        r'https?:\/\/.[^\n ]*',
        '',
        sentence
        )
        sentence = re.sub(
            r"([^n\u0300-\u036f]|n(?!\u0303(?![\u0300-\u036f])))[\u0300-\u036f]+", r"\1",
            normalize( "NFD", sentence), 0, re.I)
    
        sentence = sentence.lower()
        sentence = re.sub(
        r'\@\S+',
        ' ',
        sentence
        )
        sentence = re.sub(r'\#\S+', ' ', sentence)
        sentence = re.sub('[^a-zA-Z]', ' ', sentence)
    
        if remove_stopwords:
            sentence = remove_stopwords(sentence)
    
        sentence = re.sub(r"\s+[a-zA-Z]\s+", ' ', sentence)
        sentence = re.sub(r"^[a-zA-Z]\s+", ' ', sentence)
        sentence = re.sub(r'\s+', ' ', sentence)
    
        if lemmatize:
            sentence = lemmatize_text(sentence, *nlp)
            #sentence = remove_stopwords(sentence)
        sentence = sentence.strip()
    except:
        pass
    return sentence

def clean_data(df, text_column):
    df['text_clean'] = df[text_column].apply(clean_text)
    df['text_normalize'] = df.apply(
        lambda x: clean_text(x[text_column], nlp, 
                             remove_stopwords=False, 
                             lemmatize=True),axis = 1)
    df['text_normalize'] = df['text_normalize'].apply(remove_stopwords)
    df['label'], df['augmented'] = np.nan, np.nan
    df = df.reset_index(drop=True)
    
    return(df)

In [13]:
# Functions to do the sentiment analysis:

def sentiment_metrics(text, sentiments, sentiment_score=True):
    """Computes sentiment score.

    Parameters
    ----------
    text : str
        String containing text to analyse
    sentiments : object
        Instantiate sentiment object
    sentiment_score : bol
        Defines if sentiment score will be computed (defaults to True)
    Returns
    -------
    sentimiento : float
        Sentiment score
    """
    try:
        if sentiment_score:
            sentimiento = sentiments.sentiment(text)
        else:
            sentimiento = np.nan
    except:
        sentimiento = np.nan, np.nan


    return sentimiento

from sentiment_analysis_spanish import sentiment_analysis

def compute_sentiment(df, text):
    sentiments = sentiment_analysis.SentimentAnalysisSpanish()
    df["sentiment_score"] = df.apply(
        lambda x: sentiment_metrics(x[text], sentiments),
        axis=1,
    )
    df_sentiment = pd.DataFrame(
        df["sentiment_score"].values.tolist(), index=df.index
    )
    df_sentiment.rename(columns={0: "sentiment_score"}, inplace=True)
    df_sentiment = pd.concat(
        [df, df_sentiment["sentiment_score"]], axis=1
    )
    return(df_sentiment)


In [22]:
df = pd.read_parquet('../data/hate_speech.parquet')
df

Unnamed: 0,author_id,text
100,203579995,"RT @Lady_Chiyome: Femenina, nunca #FEMINAZI 🤮\..."
101,1358301364384890880,@PabloEchenique @IreneMontero Prefiero escucha...
102,232758195,"RT @danielalozanocu: Todo el año: feminazi, lo..."
103,1325368543614013440,Feminismo≠Feminazi\nFeminismo es igualdad\nUn ...
104,551967420,RT @jmrivas6911: RADFEM\n\nHay cavada una trin...
105,1353562862720069632,"RT @ig_robertss_hdz: todo el año: feminazi, lo..."
106,1352777031700926464,"RT @danielalozanocu: Todo el año: feminazi, lo..."
107,80946750,"RT @danielalozanocu: Todo el año: feminazi, lo..."
108,750296926264946688,@salvameoficial La celebridad será Perico Guti...
109,940062787,RT @onlinemami_: Soy bisexual porque me podria...


In [23]:
df = clean_data(df, 'text')
df = df[~((df['text_clean'].isna()) | (df['text_clean']==''))].reset_index(drop=True)
df[['author_id', 'text','text_clean']].head()

Unnamed: 0,author_id,text,text_clean
0,203579995,"RT @Lady_Chiyome: Femenina, nunca #FEMINAZI 🤮\...",rt femenina nunca
1,1358301364384890880,@PabloEchenique @IreneMontero Prefiero escucha...,prefiero escuchar personas con mas neuronas qu...
2,232758195,"RT @danielalozanocu: Todo el año: feminazi, lo...",rt todo el an feminazi loca abortera deberia m...
3,1325368543614013440,Feminismo≠Feminazi\nFeminismo es igualdad\nUn ...,feminismo feminazi feminismo es igualdad un ho...
4,551967420,RT @jmrivas6911: RADFEM\n\nHay cavada una trin...,rt radfem hay cavada una trinchera entre el od...


In [24]:
df = compute_sentiment(df, 'text_clean')
df[['author_id', 'text','text_clean', 'sentiment_score']].head()

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


Unnamed: 0,author_id,text,text_clean,sentiment_score,sentiment_score.1
0,203579995,"RT @Lady_Chiyome: Femenina, nunca #FEMINAZI 🤮\...",rt femenina nunca,0.15214,0.15214
1,1358301364384890880,@PabloEchenique @IreneMontero Prefiero escucha...,prefiero escuchar personas con mas neuronas qu...,0.006189,0.006189
2,232758195,"RT @danielalozanocu: Todo el año: feminazi, lo...",rt todo el an feminazi loca abortera deberia m...,0.069884,0.069884
3,1325368543614013440,Feminismo≠Feminazi\nFeminismo es igualdad\nUn ...,feminismo feminazi feminismo es igualdad un ho...,6e-06,6e-06
4,551967420,RT @jmrivas6911: RADFEM\n\nHay cavada una trin...,rt radfem hay cavada una trinchera entre el od...,0.004116,0.004116
