In [1]:
import gensim
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.model_selection import train_test_split

In [None]:
import nltk
nltk.download('all') # para evitar futuras dependências
from nltk.corpus import stopwords

Lendo o dataset e fazendo o pré processamento

In [4]:
df_text = pd.read_csv('dataset.csv', sep=";")

In [5]:
df_text.head()

Unnamed: 0,phrase,emotion
0,im feeling rather rotten so im not very ambiti...,sadness
1,im updating my blog because i feel shitty,sadness
2,i never make her separate from me because i do...,sadness
3,i left with my bouquet of red and yellow tulip...,joy
4,i was feeling a little vain when i did this one,sadness


In [6]:
english_stopwords = stopwords.words("english")
# no dataset tem im ao invés de I'm, então fez-se esse tipo de exclusão também.
more_words = ['im', 'ive', 'youve', 'youll', 'youd', 'wasnt', 'shouldve', 'hes', 'shes', 'dont', 'doesnt', 'didnt', 'couldnt',
'wouldnt', 'werent', 'hasnt', 'arent','wont']
for word in more_words:
  english_stopwords.append(word)

In [7]:
# faz a tokenização usando como separador o espaço em branco
whitespace_tokenizer = nltk.tokenize.WhitespaceTokenizer()

Remoção das stopwords

In [8]:
# retirando as palavras que estão contidas na lista de stopwords
df_text_nostop = df_text['phrase'].apply(lambda x: ' '.join(x for x in x.split() if x not in english_stopwords))

Lemmatização

In [9]:
lemmatizer = nltk.stem.WordNetLemmatizer()
def lemmatize_text(text):
    lemma_words = [lemmatizer.lemmatize(w) for w in whitespace_tokenizer.tokenize(text)]
    return ' '.join(lemma_words)
df_text_nostop = df_text_nostop.apply(lemmatize_text)
df_text_nostop.head()

0                feeling rather rotten ambitious right
1                            updating blog feel shitty
2      never make separate ever want feel like ashamed
3    left bouquet red yellow tulip arm feeling slig...
4                              feeling little vain one
Name: phrase, dtype: object

In [10]:
def read_file(df):
  for i, phrase in enumerate(df):
    if i % 1000 == 0:
      print("li {0} frases".format(i))

    yield gensim.utils.simple_preprocess(phrase) # faz a tokenização (talvez desnecessário)


Pré processamento feito

In [11]:
documents = list(read_file(df_text_nostop))
print(documents)

li 0 frases
li 1000 frases
li 2000 frases
li 3000 frases
li 4000 frases
li 5000 frases
li 6000 frases
li 7000 frases
li 8000 frases
li 9000 frases
li 10000 frases
li 11000 frases
li 12000 frases
li 13000 frases
li 14000 frases
li 15000 frases
li 16000 frases
li 17000 frases
li 18000 frases
li 19000 frases


Divisão do dataset tratado

In [12]:
X_train, X_test, y_train, y_test = train_test_split(documents, df_text["emotion"], test_size=0.3)

# Word2Vec

Treinamento do word2vec e vetorização das palavras

In [13]:
model_w2v = gensim.models.Word2Vec(X_train, vector_size=200, window= 10, workers= 6)

In [14]:
model_w2v.train(X_train, total_examples = len(X_train), epochs= 20)



(1839163, 2526760)

Observação: A taxa de aprendizado, quando se treina o word2vec, tava dando alta a cada época (o que indica que tá tendo problema de aprendizado do word2vec no dataset) (possível sugestão: trocar o dataset?)

In [15]:
w2v_words = list(model_w2v.wv.index_to_key) # pegando todas as palavras dentro do Word2Vec

In [None]:
w2v_words

Vetorizando

In [16]:
def vectorize(sentence):
  words_vecs = [model_w2v.wv[word] for word in sentence if word in w2v_words]
  if len(words_vecs) == 0:
    return np.zeros(200)
  words_vecs = np.array(words_vecs, dtype= float)
  return words_vecs.mean(axis=0)

In [17]:
X_train = np.array([vectorize(sentence) for sentence in X_train])
X_test = np.array([vectorize(sentence) for sentence in X_test])

Usando classificadores

In [18]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.multiclass import OneVsRestClassifier

Como o word2vec gera embeddings com números negativos, é preferivel fazer uma normalização com o MinMaxScaler

In [19]:
scaler = MinMaxScaler()

X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)

In [None]:
log_regression = LogisticRegression()
ovr = OneVsRestClassifier(log_regression)
ovr.fit(X_train, y_train)
y_pred = ovr.predict(X_test)

In [None]:
# cnb = ComplementNB()

# # Como o word2vec tá retornando valores negativos, fiz um min max pra arrumar a
# # escala pro classificador.
# scaler = MinMaxScaler()
# X_train = scaler.fit_transform(X_train)
# X_test = scaler.fit_transform(X_test)
# cnb.fit(X_train, y_train)
# y_pred = cnb.predict(X_test)

In [None]:
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted', labels=np.unique(y_pred))
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)

Accuracy: 0.41683333333333333
Precision: 0.5033670124366019
Recall: 0.41683333333333333
F1-score: 0.37713140087266184


Performance mto ruim

Possiveis melhorias:
Usar outros classificadores, Logistic Regression e SVM (Usar o one-vs-rest ou one-vs-one para gerar mais classificadores)
Talvez trocar o dataset

# Fast Text

In [None]:
X_train, X_test, y_train, y_test = train_test_split(documents, df_text["emotion"], test_size=0.3)

In [None]:
from gensim.models.fasttext import FastText

In [None]:
model_ft = FastText(X_train, vector_size = 150, window= 10, min_count = 2, workers= 4)



In [None]:
model_ft.train(X_train, total_examples = len(X_train), epochs= 20)



(2317756, 2887340)

Vetorização

In [None]:
ft_words = model_ft.wv.index_to_key # pegando todas as palavras dentro do FastText

In [None]:
ft_words

In [None]:
def vectorize(sentence):
  words_vecs = [model_ft.wv[word] for word in sentence if word in ft_words]
  if len(words_vecs) == 0:
    return np.zeros(150)
  words_vecs = np.array(words_vecs)
  return words_vecs.mean(axis=0)

In [None]:
X_train = np.array([vectorize(sentence) for sentence in X_train])
X_test = np.array([vectorize(sentence) for sentence in X_test])

Classificadores

In [None]:
# cnb = ComplementNB()

# # Como o FastText tbm tá retornando valores negativos, fiz um min max pra arrumar a
# # escala pro classificador.
# scaler = MinMaxScaler()
# X_train = scaler.fit_transform(X_train)
# X_test = scaler.fit_transform(X_test)
# cnb.fit(X_train, y_train)
# y_pred = cnb.predict(X_test)

In [None]:
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted', labels=np.unique(y_pred))
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)

Accuracy: 0.38975
Precision: 0.33238201282076113
Recall: 0.38975
F1-score: 0.3286917622022275


Deu a mesma performance ruim

Me pergunto se a vetorização está funcionando conforme o esperado (eu imagino que sim)