In [6]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from data.gensim_data.google import load_data
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.decomposition import TruncatedSVD
from gensim.models import Word2Vec
import string
from gensim import corpora
import gensim
import aspect_based_sentiment_analysis as absa
nlp = absa.load()

punctuation = string.punctuation
stop_words = stopwords.words('english')

In [7]:
# the model is downloaded from gensim and stored in this relative directory
model = load_data("data/gensim_data/google")

In [8]:
def read_txt_as_str(f_name):
    with open(f_name, 'r') as file:
        return file.read().replace('\n', ' ')

In [32]:
positive_car = read_txt_as_str("data/positive_car.txt")
negative_car = read_txt_as_str("data/negative_car.txt")
positive_plane = read_txt_as_str("data/positive_plane.txt")
negative_plane = read_txt_as_str("data/negative_plane.txt")

advertising_keywords = ["car", "drive", "family", "SUV"]
advertising_keywords2 = ["plane", "travel", "airplane", "relax"]

In [10]:
def preproccess_txt(txt, model):
    # tokenize
    tokens = word_tokenize(txt.lower())

    # lemmatize
    lemmatizer = WordNetLemmatizer()
    lemmas = [lemmatizer.lemmatize(token) for token in tokens]
    
    # Remove stopwords
    final_tokens = [token for token in lemmas if ((not (token in stop_words)) and (not (token in punctuation)))]

    # remove words not in model
    final_tokens = [w for w in final_tokens if w in model.key_to_index]
    return final_tokens

In [11]:
def get_tfid_vec(txt):
    vectorizer = TfidfVectorizer(lowercase=True, stop_words='english', strip_accents='ascii')
    vector = vectorizer.fit_transform(txt)
    return vector

In [12]:
def get_lsa(vecs):
    lsa = TruncatedSVD()
    dtm_lsa = lsa.fit_transform(vecs)
    return dtm_lsa

In [13]:
def get_vec_from_model(model, words, single_vec=False):
    vecs = [model[w] for w in words if w in model.key_to_index]
    vecs = np.array(vecs)
    
    if single_vec:
        return vecs.mean(axis=0)
    else:
        return vecs

In [14]:
def words_2_doc(words):
    return ' '.join(words)

In [15]:
def get_vec_from_model(model, words, single_vec=True):
    vecs = [model[w] for w in words if w in model.key_to_index]
    vecs = np.array(vecs)
    
    if single_vec:
        return vecs.mean(axis=0)
    else:
        return vecs

In [16]:
def get_vec_similarity(all_text, keywords, model, lsa):
    all_vecs = []
    for text in all_text:
        proc_text = preproccess_txt(text, model)
        vecs = get_vec_from_model(model, proc_text)
        all_vecs.append(vecs)

    proc_keywords = preproccess_txt(keywords, model)
    keyword_vecs = get_vec_from_model(model, proc_keywords)

    stacked_vecs = np.stack(all_vecs + [keyword_vecs])
    if lsa:
        return cosine_similarity(get_lsa(stacked_vecs))
    else:
        return cosine_similarity(stacked_vecs)

In [17]:
def get_topics(tokens, num_topics=5):
    dictionary = corpora.Dictionary([tokens])

    # convert tokenized document to a bag-of-words format
    bow = dictionary.doc2bow(tokens)

    # train LDA model
    lda_model = gensim.models.LdaModel([bow], num_topics=num_topics, id2word=dictionary, passes=10, iterations=200, random_state=0)

    # get sorted top topics
    top_topics = lda_model.get_document_topics(bow, minimum_probability=0.0)
    top_topics = sorted(top_topics, key=lambda x: x[1], reverse=True)

    all_keywords = []
    for topic in top_topics:
        topic_words = [dictionary[word_id] for word_id, prob in lda_model.get_topic_terms(topic[0])]
        all_keywords.append(topic_words)

    return all_keywords

In [18]:
def get_positive_sent(info):
    sent = info[1]
    if sent == absa.Sentiment.positive:
        return True
    else:
        return False

In [46]:
def get_aspa_sim(article, keywords, model):
    topics = get_topics(preproccess_txt(article, model))
    top_topic = topics[0]
    print("top keywords: ", top_topic)

    completed_task = nlp(article, aspects=top_topic)
    
    all_sent = []
    for task in completed_task.examples:
        sent = task.sentiment
        aspect = task.aspect
        all_sent.append((aspect, sent))
    
    filt_sent = list(filter(get_positive_sent, all_sent))
    pos_words = [w[0] for w in filt_sent]

    if len(pos_words) == 0:
        print("no positive topics in article")
        return 0.0

    print("positive keywords: ", pos_words)
    return model.n_similarity(pos_words, preproccess_txt(words_2_doc(keywords), model))

## All article words

In [40]:
pos_similarity = model.n_similarity(preproccess_txt(positive_car, model), preproccess_txt(words_2_doc(advertising_keywords), model))
neg_similarity = model.n_similarity(preproccess_txt(negative_car, model), preproccess_txt(words_2_doc(advertising_keywords), model))
print("positive car article:", pos_similarity)
print("car crash article:", neg_similarity)

positive car article 0.46706164
car crash article 0.5531928


## All article words with LSA 

In [43]:
lsa = get_vec_similarity([positive_car, negative_car], words_2_doc(advertising_keywords), model, True)
print("positive car article:", lsa[2][0])
print("car crash article:", lsa[2][1])

positive car article 0.4994566
car crash article 0.6341154


## Article leading topic keywords

In [45]:
positive_topics = get_topics(preproccess_txt(positive_car, model))[0]
negative_topics = get_topics(preproccess_txt(negative_car, model))[0]
print("positive car article keywords used:", positive_topics)
print("positive car article:", model.n_similarity(positive_topics, preproccess_txt(words_2_doc(advertising_keywords), model)))
print("car crash article keywords used:", negative_topics)
print("car crash article:", model.n_similarity(negative_topics, preproccess_txt(words_2_doc(advertising_keywords), model)))

positive car article keywords used: ['car', 'event', 'together', 'community', 'cause', 'local', 'come', 'spirit', 'support', 'food']
positive car article: 0.4297304
car crash article keywords used: ['accident', 'freeway', 'driver', 'traffic', 'people', 'area', 'serious', 'safe', 'los', 'crash']
car crash article: 0.5382987


## Article leading topic keywords with aspect based sentiment analysis

In [47]:
pos_sim = get_aspa_sim(positive_car, advertising_keywords, model)
print("positive car article:", pos_sim)
neg_sim = get_aspa_sim(negative_car, advertising_keywords, model)
print("car crash article", neg_sim)

top keywords:  ['car', 'event', 'together', 'community', 'cause', 'local', 'come', 'spirit', 'support', 'food']
positive keywords:  ['car', 'event', 'together', 'community', 'cause', 'local', 'come', 'spirit', 'support', 'food']
positive car article: 0.4297304
top keywords:  ['accident', 'freeway', 'driver', 'traffic', 'people', 'area', 'serious', 'safe', 'los', 'crash']
positive keywords:  ['safe']
car crash article 0.18475531


In [26]:
pos_sim = get_aspa_sim(positive_plane, advertising_keywords2, model)
print("positive car article:", pos_sim)
neg_sim = get_aspa_sim(negative_plane, advertising_keywords2, model)
print("car crash article", neg_sim)

top keywords:  ['flight', 'airplane', 'speed', 'new', 'advanced', 'set', 'plane', 'ha', 'record', 'journey']
positive keywords:  ['flight', 'airplane', 'speed', 'new', 'advanced', 'set', 'plane', 'ha', 'record', 'journey']
top keywords:  ['said', 'plane', 'police', 'two', 'airport', 'venice', 'pier', 'beaver', 'p.m.', 'wednesday']
no positive topics in article
positive:  0.72294444
negative:  0.0
