In [85]:
import pandas as pd
import numpy as np

In [2]:
df_train = pd.read_csv("dataset_final_train_flat.csv")

In [3]:
df_test = pd.read_csv("dataset_final_test_flat.csv")

In [4]:
X_train, y_train= df_train['Texto'], df_train['Emocion']

In [5]:
X_test, y_test= df_test['Texto'], df_test['Emocion']

In [51]:
from nltk.corpus import stopwords
import unicodedata

In [52]:
def decontract(sentence):
    sentence = re.sub(r"n\'t", " not", sentence)
    sentence = re.sub(r"\'re", " are", sentence)
    sentence = re.sub(r"\'s", " is", sentence)
    sentence = re.sub(r"\'d", " would", sentence)
    sentence = re.sub(r"\'ll", " will", sentence)
    sentence = re.sub(r"\'t", " not", sentence)
    sentence = re.sub(r"\'ve", " have", sentence)
    sentence = re.sub(r"\'m", " am", sentence)
    return sentence

def removePunctuation(sentence):
    sentence = re.sub(r'[¿|?|!|\'|"|#]',r'',sentence)
    sentence = re.sub(r'[.|,|)|(|\|/]',r' ',sentence)
    sentence = sentence.strip()
    sentence = sentence.replace("\n"," ")
    return sentence

def removeNumber(sentence):
    alpha_sent = ""
    for word in sentence.split():
        alpha_word = re.sub('[^a-z A-Z]+', '', word)
        alpha_sent += alpha_word
        alpha_sent += " "
    alpha_sent = alpha_sent.strip()
    return alpha_sent

def removeAccents(sentence):
    alpha_sent = ""
    for word in sentence.split():
        word_no_accents = unicodedata.normalize('NFD', word)
        word_no_accents = ''.join(char for char in word_no_accents if unicodedata.category(char) != 'Mn')        
        alpha_word = re.sub('[^a-z A-Z]+', '', word_no_accents)
        alpha_sent += alpha_word
        alpha_sent += " "
    alpha_sent = alpha_sent.strip()
    return alpha_sent

def removeStopWords(sentence):
    stop_words_english = set(stopwords.words('english'))
    stop_words_spanish = set(stopwords.words('spanish'))
    stop_words = stop_words_english.union(stop_words_spanish)
    filtered_sentence = [w for w in sentence.split() if not w.lower() in stop_words]
    return " ".join(filtered_sentence)

def stemming(sentence):
    stemmer = SnowballStemmer("english")
    stemmedSentence = ""
    for word in sentence.split():
        stem = stemmer.stem(word)
        stemmedSentence += stem
        stemmedSentence += " "
    stemmedSentence = stemmedSentence.strip()
    return stemmedSentence

In [43]:
from sklearn.pipeline import FeatureUnion
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import FunctionTransformer
import spacy

In [73]:
!python -m spacy download xx_ent_wiki_sm

Collecting xx-ent-wiki-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/xx_ent_wiki_sm-3.8.0/xx_ent_wiki_sm-3.8.0-py3-none-any.whl (11.1 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.1/11.1 MB[0m [31m20.1 MB/s[0m eta [36m0:00:00[0mMB/s[0m eta [36m0:00:01[0m
[?25h
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('xx_ent_wiki_sm')


In [82]:
nlp = spacy.load("xx_ent_wiki_sm", disable=["parser", "ner"])

In [83]:
from sklearn.preprocessing import normalize

def spacy_embeddings(texts):
    texts_processed = [decontract(t) for t in texts]
    docs = list(nlp.pipe(texts_processed, batch_size=50))
    vectors = np.array([doc.vector for doc in docs])
    return normalize(vectors, norm='l2')

In [76]:
def custom_preprocess(text):
    print(f'{text}')
    text = decontract(text)
    print(f'{text}')
    text = removePunctuation(text)
    print(f'{text}')
    text = removeNumber(text)
    print(f'{text}')
    text = removeStopWords(text)
    print(f'{text}')
    text = removeAccents(text)
    print(f'{text}')
    doc = nlp(text)
    print(f'{text}')
    lemmas = [token.lemma_ for token in doc if token.lemma_ != '-PRON-']
    print(f'{' '.join(lemmas)}')
    return ' '.join(lemmas)

In [77]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import re

In [78]:
feature_union = FeatureUnion([
    ("tfidf", TfidfVectorizer(
        preprocessor=custom_preprocess,
        ngram_range=(1, 2),
        max_features=8000,
        norm="l2"
    )),
    ("embeddings", FunctionTransformer(
        spacy_embeddings,
        validate=False
    ))
])

In [65]:
from sklearn.preprocessing import LabelEncoder

In [66]:
le = LabelEncoder()

In [67]:
y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)

In [18]:
df_train.Emocion

0        sadness
1        sadness
2          anger
3           love
4          anger
          ...   
31966    sadness
31967    sadness
31968        joy
31969      anger
31970    sadness
Name: Emocion, Length: 31971, dtype: object

In [19]:
y_train

array([4, 4, 0, ..., 2, 0, 4], shape=(31971,))

In [20]:
y_test

array([4, 4, 4, ..., 2, 2, 1], shape=(3998,))

In [21]:
temp = df_train['Texto'].sample(15)

In [22]:
temp

4912     i remember sitting in my family room in dallas...
7057         i go to bed feeling very distraught otherwise
3710                       im not feeling too keen on that
14809    i feel thankful happy and blessed and these ar...
279      i feel like i almost convinced myself this is ...
26887    Sólo quiero que vea cómo se siente cuando hace...
29702    Odio sentirme tan indeciso sobre las cosas por...
13999                               i feel like not caring
13230    the day i received the key of my apartment and...
16556    im el tipo que no utiliza una crema hidratante...
6380     im starting to feel and think as if i dont wan...
26334    Supongo que he oído suficiente durante los dos...
3990     i hope you all make the time to play along i h...
2541     i love the liz earle moisturizer it does reall...
9409     i feel terribly unkind to say it span style fo...
Name: Texto, dtype: object

In [80]:
temp2 = feature_union.fit_transform(['Supongo que he oído suficiente durante los dos'])

Supongo que he oído suficiente durante los dos
Supongo que he oído suficiente durante los dos
Supongo que he oído suficiente durante los dos
Supongo que he odo suficiente durante los dos
Supongo odo suficiente dos
Supongo odo suficiente dos
Supongo odo suficiente dos
   


ValueError: empty vocabulary; perhaps the documents only contain stop words

In [92]:
spacy_embeddings(temp)

ValueError: Found array with 0 feature(s) (shape=(15, 0)) while a minimum of 1 is required by the normalize function.