In [1]:
!pip install gensim



In [2]:
import re  
import pandas as pd 
from time import time  
from collections import defaultdict 
import string 
import multiprocessing
import os
import gensim
import sklearn
from sklearn import linear_model
from collections import Counter
import numpy as np
import scipy
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, cohen_kappa_score, classification_report
from nltk.tokenize import word_tokenize
import pickle

# word2vec
from gensim.models import Word2Vec, KeyedVectors, FastText
from gensim.models.phrases import Phrases, Phraser
from sklearn.model_selection import train_test_split
import logging

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
logger = logging.getLogger(__name__)

In [3]:
from collections import Counter

punctuation = string.punctuation + "«»“”‘’…—"

stopwords_spanish = pd.read_csv(
    'https://raw.githubusercontent.com/Alir3z4/stop-words/master/spanish.txt'
).values
stopwords_spanish = Counter(stopwords_spanish.flatten().tolist())

stopwords_english = pd.read_csv(
    'https://raw.githubusercontent.com/Alir3z4/stop-words/master/english.txt'
).values
stopwords_english = Counter(stopwords_english.flatten().tolist())

def simple_tokenizer(doc, stopwords, lower=False):
    if lower:
        tokenized_doc = doc.translate(str.maketrans(
            '', '', punctuation)).lower().split()

    tokenized_doc = doc.translate(str.maketrans('', '', punctuation)).split()

    tokenized_doc = [
        token for token in tokenized_doc if token.lower() not in stopwords
    ]
    return tokenized_doc

In [4]:
def word2vec_from_df(df, stopwords, path):
    emoji_w2v = Word2Vec(min_count=200,
                          window=4,
                          vector_size=200,
                          sample=6e-5,
                          alpha=0.03,
                          min_alpha=0.0007,
                          negative=20,
                          workers=multiprocessing.cpu_count())

    content = df["text"]
    cleaned_content = [simple_tokenizer(doc, stopwords=stopwords) for doc in content.values]
    phrases = Phrases(cleaned_content, min_count=150, progress_per=5000) 
    bigram = Phraser(phrases)
    sentences = bigram[cleaned_content]

    emoji_w2v.build_vocab(sentences, progress_per=10000)

    t = time()
    emoji_w2v.train(sentences, total_examples=emoji_w2v.corpus_count, epochs=15, report_delay=10)
    print('Time to train the model: {} mins'.format(round((time() - t) / 60, 2)))

    emoji_w2v.init_sims(replace=True)

    emoji_w2v.save(path)
    return emoji_w2v

In [5]:
path =  "../../Data/train/df_us_train.pickle"
df_us_train = pickle.load(open(path, "rb"))

path =  "../../Data/train/df_es_train.pickle"
df_es_train = pickle.load(open(path, "rb"))

In [6]:
%%time
us_emoji_w2v = word2vec_from_df(df_us_train, stopwords_english, "us_emoji_w2v.model")

2022-06-30 12:32:26,482 : INFO : Word2Vec lifecycle event {'params': 'Word2Vec<vocab=0, vector_size=200, alpha=0.03>', 'datetime': '2022-06-30T12:32:26.467519', 'gensim': '4.2.0', 'python': '3.8.12 (default, Oct 12 2021, 03:01:40) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19044-SP0', 'event': 'created'}
2022-06-30 12:32:31,705 : INFO : collecting all words and their counts
2022-06-30 12:32:31,705 : INFO : PROGRESS: at sentence #0, processed 0 words and 0 word types
2022-06-30 12:32:31,750 : INFO : PROGRESS: at sentence #5000, processed 29821 words and 36340 word types
2022-06-30 12:32:31,799 : INFO : PROGRESS: at sentence #10000, processed 59723 words and 67133 word types
2022-06-30 12:32:31,851 : INFO : PROGRESS: at sentence #15000, processed 89629 words and 96385 word types
2022-06-30 12:32:31,901 : INFO : PROGRESS: at sentence #20000, processed 119889 words and 124696 word types
2022-06-30 12:32:31,951 : INFO : PROGRESS: at sentence #25000, processed 149690 words an

2022-06-30 12:32:35,569 : INFO : PROGRESS: at sentence #360000, processed 2153424 words and 1545014 word types
2022-06-30 12:32:35,626 : INFO : PROGRESS: at sentence #365000, processed 2183479 words and 1563543 word types
2022-06-30 12:32:35,687 : INFO : PROGRESS: at sentence #370000, processed 2213300 words and 1581910 word types
2022-06-30 12:32:35,741 : INFO : PROGRESS: at sentence #375000, processed 2243347 words and 1600485 word types
2022-06-30 12:32:35,797 : INFO : PROGRESS: at sentence #380000, processed 2273268 words and 1619180 word types
2022-06-30 12:32:35,852 : INFO : PROGRESS: at sentence #385000, processed 2303285 words and 1637550 word types
2022-06-30 12:32:35,876 : INFO : collected 1645907 token types (unigram + bigrams) from a corpus of 2317037 words and 387292 sentences
2022-06-30 12:32:35,877 : INFO : merged Phrases<1645907 vocab, min_count=150, threshold=10.0, max_vocab_size=40000000>
2022-06-30 12:32:35,878 : INFO : Phrases lifecycle event {'msg': 'built Phrases<

2022-06-30 12:32:45,064 : INFO : Word2Vec lifecycle event {'msg': 'training model with 8 workers on 1509 vocabulary and 200 features, using sg=0 hs=0 sample=6e-05 negative=20 window=4 shrink_windows=True', 'datetime': '2022-06-30T12:32:45.064509', 'gensim': '4.2.0', 'python': '3.8.12 (default, Oct 12 2021, 03:01:40) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19044-SP0', 'event': 'train'}
2022-06-30 12:32:46,095 : INFO : EPOCH 0 - PROGRESS: at 6.17% examples, 23719 words/s, in_qsize 14, out_qsize 1
2022-06-30 12:32:50,471 : INFO : EPOCH 0: training on 2262016 raw words (387020 effective words) took 5.4s, 71810 effective words/s
2022-06-30 12:32:51,759 : INFO : EPOCH 1 - PROGRESS: at 13.24% examples, 41055 words/s, in_qsize 15, out_qsize 0
2022-06-30 12:32:55,915 : INFO : EPOCH 1: training on 2262016 raw words (387033 effective words) took 5.4s, 71616 effective words/s
2022-06-30 12:32:56,945 : INFO : EPOCH 2 - PROGRESS: at 10.59% examples, 40048 words/s, in_qsize 15, out

Time to train the model: 1.3 mins
Wall time: 1min 37s


In [7]:
%%time
es_emoji_w2v = word2vec_from_df(df_es_train, stopwords_spanish, "es_emoji_w2v.model")

2022-06-30 12:34:03,484 : INFO : Word2Vec lifecycle event {'params': 'Word2Vec<vocab=0, vector_size=200, alpha=0.03>', 'datetime': '2022-06-30T12:34:03.484815', 'gensim': '4.2.0', 'python': '3.8.12 (default, Oct 12 2021, 03:01:40) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19044-SP0', 'event': 'created'}
2022-06-30 12:34:04,387 : INFO : collecting all words and their counts
2022-06-30 12:34:04,388 : INFO : PROGRESS: at sentence #0, processed 0 words and 0 word types
2022-06-30 12:34:04,434 : INFO : PROGRESS: at sentence #5000, processed 28478 words and 33175 word types
2022-06-30 12:34:04,481 : INFO : PROGRESS: at sentence #10000, processed 57196 words and 61335 word types
2022-06-30 12:34:04,517 : INFO : PROGRESS: at sentence #15000, processed 85826 words and 86676 word types
2022-06-30 12:34:04,561 : INFO : PROGRESS: at sentence #20000, processed 114762 words and 111512 word types
2022-06-30 12:34:04,601 : INFO : PROGRESS: at sentence #25000, processed 143174 words an

2022-06-30 12:34:17,864 : INFO : EPOCH 11: training on 458686 raw words (15687 effective words) took 0.8s, 19312 effective words/s
2022-06-30 12:34:18,824 : INFO : EPOCH 12: training on 458686 raw words (15695 effective words) took 0.9s, 16642 effective words/s
2022-06-30 12:34:19,735 : INFO : EPOCH 13: training on 458686 raw words (15813 effective words) took 0.9s, 17650 effective words/s
2022-06-30 12:34:20,636 : INFO : EPOCH 14: training on 458686 raw words (15496 effective words) took 0.9s, 17548 effective words/s
2022-06-30 12:34:20,637 : INFO : Word2Vec lifecycle event {'msg': 'training on 6880290 raw words (234669 effective words) took 13.8s, 16995 effective words/s', 'datetime': '2022-06-30T12:34:20.637925', 'gensim': '4.2.0', 'python': '3.8.12 (default, Oct 12 2021, 03:01:40) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19044-SP0', 'event': 'train'}
  emoji_w2v.init_sims(replace=True)
2022-06-30 12:34:20,641 : INFO : Word2Vec lifecycle event {'fname_or_handle': '

Time to train the model: 0.23 mins
Wall time: 17.2 s


In [8]:
def sentence_embedding_from_w2v(doc, w2v):
    pre_tokens = simple_tokenizer(doc, stopwords_spanish, lower=False)
    sentence_vect = np.zeros(w2v.vector_size)
    m = 0
    for token in pre_tokens:
        if token in w2v.wv:
            sentence_vect += w2v.wv[token]
            m += 1
    sentence_vect *= 1/m if m>0 else 1
    return sentence_vect

In [9]:
%%time
us_train_w2v_sentence_embedding = pd.DataFrame(np.array(
    [ sentence_embedding_from_w2v(doc, us_emoji_w2v) for doc in df_us_train["text"]]
))
us_train_w2v_sentence_embedding["id"] = df_us_train["id"]
us_train_w2v_sentence_embedding["label"] = df_us_train["label"]

Wall time: 17.6 s


In [10]:
%%time
es_train_w2v_sentence_embedding = pd.DataFrame(np.array(
    [ sentence_embedding_from_w2v(doc, es_emoji_w2v) for doc in df_es_train["text"]]
))
es_train_w2v_sentence_embedding["id"] = df_es_train["id"]
es_train_w2v_sentence_embedding["label"] = df_es_train["label"]

Wall time: 2.95 s
