In [1]:
import numpy as np
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from data.gensim_data.google import load_data
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import tensorflow as tf
import transformers
from sklearn.decomposition import TruncatedSVD
from gensim.models import Word2Vec
import string


punctuation = string.punctuation
stop_words = stopwords.words('english')

In [3]:
model = load_data("data/gensim_data/google")

In [5]:
def read_txt_as_str(f_name):
    with open(f_name, 'r') as file:
        return file.read().replace('\n', ' ')

In [9]:
positive_car = read_txt_as_str("data/positive_car.txt")
negative_car = read_txt_as_str("data/negative_car.txt")

advertising_keywords = ["car", "drive", "fast"]

In [118]:
def preproccess_txt(txt, model):
    # tokenize
    tokens = word_tokenize(txt.lower())

    # lemmatize
    lemmatizer = WordNetLemmatizer()
    lemmas = [lemmatizer.lemmatize(token) for token in tokens]
    
    # Remove stopwords
    final_tokens = [token for token in lemmas if ((not (token in stop_words)) and (not (token in punctuation)))]

    # remove words not in model
    final_tokens = [w for w in final_tokens if w in model.key_to_index]
    return final_tokens

In [28]:
def get_tfid_vec(txt):
    vectorizer = TfidfVectorizer(lowercase=True, stop_words='english', strip_accents='ascii')
    vector = vectorizer.fit_transform(txt)
    return vector

In [87]:
def get_lsa(vecs):
    lsa = TruncatedSVD()
    dtm_lsa = lsa.fit_transform(vecs)
    return dtm_lsa

In [79]:
def get_vec_from_model(model, words, single_vec=False):
    vecs = [model[w] for w in words if w in model.key_to_index]
    vecs = np.array(vecs)
    
    if single_vec:
        return vecs.mean(axis=0)
    else:
        return vecs

In [102]:
def words_2_doc(words):
    return ' '.join(words)

In [85]:
def get_vec_from_model(model, words, single_vec=True):
    vecs = [model[w] for w in words if w in model.key_to_index]
    vecs = np.array(vecs)
    
    if single_vec:
        return vecs.mean(axis=0)
    else:
        return vecs

In [120]:
def get_vec_similarity(all_text, keywords, model, lsa):
    all_vecs = []
    for text in all_text:
        proc_text = preproccess_txt(text, model)
        vecs = get_vec_from_model(model, proc_text)
        all_vecs.append(vecs)

    proc_keywords = preproccess_txt(keywords, model)
    keyword_vecs = get_vec_from_model(model, proc_keywords)

    stacked_vecs = np.stack(all_vecs + [keyword_vecs])
    if lsa:
        return cosine_similarity(get_lsa(stacked_vecs))
    else:
        return cosine_similarity(stacked_vecs)

In [121]:
lsa = get_vec_similarity([positive_car, negative_car], words_2_doc(advertising_keywords), model, True)
cosine = get_vec_similarity([positive_car, negative_car], words_2_doc(advertising_keywords), model, False)

In [115]:
lsa

array([[1.        , 0.99134505, 0.4975334 ],
       [0.99134505, 1.        , 0.6071074 ],
       [0.4975334 , 0.6071074 , 0.99999994]], dtype=float32)

In [116]:
cosine

array([[1.0000002 , 0.62357   , 0.46202835],
       [0.62357   , 1.0000002 , 0.5324671 ],
       [0.46202835, 0.5324671 , 1.0000004 ]], dtype=float32)

In [125]:
pos_similarity = model.n_similarity(preproccess_txt(positive_car, model), advertising_keywords)
neg_similarity = model.n_similarity(preproccess_txt(negative_car, model), advertising_keywords)
print("positive", pos_similarity)
print("negative", neg_similarity)

positive 0.46202832
negative 0.5324669
