In [17]:
from gensim.models import Word2Vec
import pandas as pd
import numpy as np
import re
import os
import gensim
import gensim.downloader as gensim_api
from nltk import word_tokenize
import pickle
from sklearn.model_selection import train_test_split

## load custom stop words
lst_stopwords = pd.read_csv("./lst_custom_stopwords.txt")
lst_stopwords = lst_stopwords["header"].to_list()

In [2]:
dataset = pd.read_csv("./data/posts2.csv")

In [3]:
dataset

Unnamed: 0,doc,label
0,Filipino clients may not take pain medication ...,Present
1,I miss my lil Percocet phase 😂😂 I c y niggas t...,Past
2,I can not sleep so maybe I will use tramadol,Future
3,I’ve got opiate-withdrawal leg muscle spasms t...,Present
4,there is no better taste than oxycodone in the...,Present
...,...,...
643,Nope 10 year oxycodone run. But they tested me...,Past
644,I appreciate it a lot. Im 19 been addicted for...,Present
645,I gotta yeast overgrowth from kratom and it ru...,Present
646,Kratom gave me a yeast overgrowth and ruin my ...,Past


In [8]:
def preprocess_text(text, use_stem=False, use_lemm=True, lst_stopwords=None):
    
    ## de-construct contractions
    text = re.sub(r"won\'t", "will not", text)
    text = re.sub(r"can\'t", "can not", text)
    text = re.sub(r"gonna", "going to", text)
    text = re.sub(r"n\'t", " not", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"\'s", " is", text)
    text = re.sub(r"\'d", " would", text)
    text = re.sub(r"\'ll", " will", text)
    text = re.sub(r"\'t", " not", text)
    text = re.sub(r"\'ve", " have", text)
    text = re.sub(r"\'m", " am", text)
    
    ## clean (convert \n or \t to " ", lower case, remove punctuations, strip, remove hyperlinks)
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r'\t', ' ', text)
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'http\S+', '', str(text).lower().strip())
    
    ## tokenize (make a list of text)
    lst_text = text.split()
    
    ## remove stopwords
    if lst_stopwords is not None:
        lst_text = [word for word in lst_text if word not in lst_stopwords]
        
    ## stemming (remove -ing, -ly, ...)
    if use_stem == True:
        ps = nltk.stem.porter.PorterStemmer()
        lst_text = [ps.stem(word) for word in lst_text]
    
    ## lemmatization (convert word into base form)
    if use_lemm == True:
        lem = nltk.stem.wordnet.WordNetLemmatizer()
        lst_text = [lem.lemmatize(word) for word in lst_text]
        
    ## join lst back to string
    
    text = " ".join(lst_text)
    return text

In [9]:
dataset["clean"] = [preprocess_text(x, use_stem=False, use_lemm=False, lst_stopwords=lst_stopwords) for x in dataset["doc"]]

In [11]:
X = dataset.drop(["label"], axis=1)
y = dataset["label"]

In [14]:
X_train_w, X_test_w, y_train_w, y_test_w = train_test_split(X, y, test_size = 0.33, shuffle=True, random_state=42)

X_train_w.reset_index(inplace=True, drop=True)
X_test_w.reset_index(inplace=True, drop=True)
y_train_w.reset_index(inplace=True, drop=True)
y_test_w.reset_index(inplace=True, drop=True)

In [33]:
corpus = X["clean"]

## create list of lists of unigrams
lst_corpus = []
for string in corpus:
    lst_words = string.split()
    lst_grams = [" ".join(lst_words[i:i+1]) for i in range(0, len(lst_words), 1)]
    lst_corpus.append(lst_grams)

## detect bigrams and trigrams
bigrams_detector = gensim.models.phrases.Phrases(lst_corpus, min_count=5, threshold=10)
bigrams_detector = gensim.models.phrases.Phraser(bigrams_detector)
trigrams_detector = gensim.models.phrases.Phrases(bigrams_detector[lst_corpus], min_count=5, threshold=10)
trigrams_detector = gensim.models.phrases.Phraser(trigrams_detector)

model = gensim.models.word2vec.Word2Vec(lst_corpus, vector_size=300, window=8, min_count=1, sg=1, epochs=30)
model.wv["opioid"].shape

(300,)

In [38]:
model1_dict = dict(zip(model.wv.index_to_key, model.wv.vectors))
model.save("embeddings_model1.pkl")

In [35]:
## Train-only w2v
corpus = X_train_w["clean"]

## create list of lists of unigrams
lst_corpus = []
for string in corpus:
    lst_words = string.split()
    lst_grams = [" ".join(lst_words[i:i+1]) for i in range(0, len(lst_words), 1)]
    lst_corpus.append(lst_grams)

## detect bigrams and trigrams
bigrams_detector = gensim.models.phrases.Phrases(lst_corpus, min_count=5, threshold=10)
bigrams_detector = gensim.models.phrases.Phraser(bigrams_detector)
trigrams_detector = gensim.models.phrases.Phrases(bigrams_detector[lst_corpus], min_count=5, threshold=10)
trigrams_detector = gensim.models.phrases.Phraser(trigrams_detector)

model2 = gensim.models.word2vec.Word2Vec(lst_corpus, vector_size=300, window=8, min_count=1, sg=1, epochs=30)
model2.wv["opioid"].shape

(300,)

In [36]:
model2_dict = dict(zip(model2.wv.index_to_key, model2.wv.vectors))
model2.save("embeddings_model2.pkl")

In [39]:
print(f'length of word embeddings {len(model.wv.key_to_index.keys())}')
print(f'length of word embeddings {len(model2.wv.key_to_index.keys())}')

length of word embeddings 3009
length of word embeddings 2406
