In [18]:
import pandas as pd
# from pandas_profiling import ProfileReport

from num2words import num2words
import re, unicodedata
# import contractions
from nltk import word_tokenize
from nltk.corpus import stopwords

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

import nltk

In [19]:
# %conda install -c conda-forge contractions

# Preparacion de ambiente

In [20]:
# Punkt permite separar un texto en frases.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ernes\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [21]:
# Descarga todas las palabras vacias, es decir, aquellas que no aportan nada al significado del texto
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ernes\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [22]:
# Descarga de paquete WordNetLemmatizer, este es usado para encontrar el lema de cada palabra
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ernes\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# Exploracion de datos

In [23]:
df_raw = pd.read_excel('cat_6716.xlsx')

In [24]:
# Contar valores nulos
df_raw.isnull().sum()

Textos_espanol    0
sdg               0
dtype: int64

In [25]:
# Contar duplicados
df_raw.duplicated().sum()

0

In [26]:
df_raw.head()

Unnamed: 0,Textos_espanol,sdg
0,"Es importante destacar que, en un año de sequí...",6
1,Hay una gran cantidad de literatura sobre Aust...,6
2,"Los procesos de descentralización, emprendidos...",6
3,Esto puede tener consecuencias sustanciales pa...,6
4,La función de beneficio también incorpora pará...,6


In [27]:
textos = df_raw.copy()
textos['Conteo'] = [len(x) for x in textos['Textos_espanol']]
textos['Moda'] = [max(set(x.split(' ')), key = x.split(' ').count) for x in textos['Textos_espanol']]
textos['Max'] = [[max([len(x) for x in i.split(' ')])][0] for i in textos['Textos_espanol']]
textos['Min'] = [[min([len(x) for x in i.split(' ')])][0] for i in textos['Textos_espanol']]

# Se realiza un perfilamiento de los datos con la libre pandas profiling
# ProfileReport(textos)

# Preparacion de datos

### Separar en datos de entrenamiento y prueba

In [28]:
X, Y = df_raw['Textos_espanol'], df_raw['sdg']
Y = Y.astype(int)
Y

0        6
1        6
2        6
3        6
4        6
        ..
2995    16
2996    16
2997    16
2998    16
2999    16
Name: sdg, Length: 3000, dtype: int32

In [29]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

In [30]:
X_train.shape, Y_train.shape


((2400,), (2400,))

In [31]:
X_test.shape, Y_test.shape

((600,), (600,))

In [32]:
X_train.head()

2370    Lograr el respeto de los derechos humanos por ...
1774    Diseño de proyectos sostenibles de electrifica...
731     En las partes españolas de las cuencas del Due...
271     Garantizar que los arreglos de gobernanza ayud...
1077    Además, a nivel de plantas y unidades, no hay ...
Name: Textos_espanol, dtype: object

#### **3.1 Limpieza de los datos**

https://medium.com/datos-y-ciencia/preprocesamiento-de-datos-de-texto-un-tutorial-en-python-5db5620f1767

In [33]:
# Pasar todos los caracteres Ã³ a o.
X_train = X_train.str.replace('Ã³', 'o')  

In [34]:
def remove_non_ascii(words):
    """Remove non-ASCII characters from list of tokenized words"""
    new_words = []
    for word in words:
        new_word = unicodedata.normalize('NFKD', word).encode('ascii', 'ignore').decode('utf-8', 'ignore')
        new_words.append(new_word)
    return new_words

def to_lowercase(words):
    """Convert all characters to lowercase from list of tokenized words"""
    new_words = []
    for word in words:
        new_word = word.lower()
        new_words.append(new_word)
    return new_words

def remove_punctuation(words):
    """Remove punctuation from list of tokenized words"""
    new_words = []
    for word in words:
        new_word = re.sub(r'[^\w\s]', '', word)
        if new_word != '':
            new_words.append(new_word)
    return new_words


def replace_numbers(words):
    """Replace all integer occurrences in list of tokenized words with textual representation in Spanish"""
    new_words = []
    for word in words:
        if word.isdigit():
            new_word = num2words(word, lang='es')
            new_words.append(new_word)
        else:
            new_words.append(word)
    return new_words



def remove_stopwords(words):
    """Remove stop words from list of tokenized words"""
    new_words = []
    for word in words:
        if word not in stopwords.words('spanish'):
            new_words.append(word)
    return new_words

def preprocessing(words):
    words = to_lowercase(words)
    words = replace_numbers(words)
    words = remove_punctuation(words)
    words = remove_non_ascii(words)
    words = remove_stopwords(words)
    return words

#### **3.2 Tokenización**

In [35]:
# df_raw['Message'] = df_raw['Message'].apply(contractions.fix) #Aplica la corrección de las contracciones

In [36]:
#Aplica la eliminación del ruido
X_train = X_train.apply(word_tokenize).apply(preprocessing) 
X_train.head()

2370    [lograr, respeto, derechos, humanos, parte, em...
1774    [diseno, proyectos, sostenibles, electrificaci...
731     [partes, espanolas, cuencas, duero, guadiana, ...
271     [garantizar, arreglos, gobernanza, ayuden, mov...
1077    [ademas, nivel, plantas, unidades, pruebas, di...
Name: Textos_espanol, dtype: object

#### **3.3. Normalización**

In [37]:
# install spacy
# TODO: Instalar spacy y correr la funcion de lematizacion
# %conda install -c conda-forge spacy

In [38]:
from nltk.stem.snowball import SpanishStemmer
# import spacy
#TODO: Probar stemming vs lematizacion y ver cual funciona mejor
def stem_words_spanish(words):
    # Stem words in list of tokenized words (Spanish)
    stemmer = SpanishStemmer()
    stems = []
    for word in words:
        stem = stemmer.stem(word)
        stems.append(stem)
    return stems



def lemmatize_verbs_spanish(text):
    """Lemmatize verbs in a text (Spanish)"""
    nlp = spacy.load("es_core_news_sm")
    doc = nlp(text)
    lemmas = [token.lemma_ if token.pos_ == "VERB" else token.text for token in doc]
    return lemmas

X_train_stemmed = X_train.copy()
X_train_lematized = X_train.copy()
X_train_stemmed = X_train.apply(stem_words_spanish) #Aplica steming
X_train_lematized = X_train.apply(stem_words_spanish) #Aplica lematización 


In [39]:
X_train_stemmed.head()

2370    [logr, respet, derech, human, part, empres, re...
1774    [disen, proyect, sosten, electrif, rural, cone...
731     [part, espanol, cuenc, duer, guadian, respect,...
271     [garantiz, arregl, gobern, ayud, moviliz, fina...
1077    [adem, nivel, plant, unidad, prueb, diferent, ...
Name: Textos_espanol, dtype: object

In [40]:
X_train_lematized.head()

2370    [logr, respet, derech, human, part, empres, re...
1774    [disen, proyect, sosten, electrif, rural, cone...
731     [part, espanol, cuenc, duer, guadian, respect,...
271     [garantiz, arregl, gobern, ayud, moviliz, fina...
1077    [adem, nivel, plant, unidad, prueb, diferent, ...
Name: Textos_espanol, dtype: object

##### **3.4 Selección de campos**

Primero, se separa la variable predictora y los textos que se van a utilizar.

In [None]:
# Transformación Term-frecuency times inverse Document-frecuency.
tf_idf = TfidfVectorizer()
X_tf_idf = tf_idf.fit_transform(X_train_lematized)
print(X_tf_idf.shape)
X_tf_idf.toarray()[0]

(3000, 11841)


array([0., 0., 0., ..., 0., 0., 0.])