In [54]:
!pip install sentence_transformers
!pip install aspect_based_sentiment_analysis



In [55]:
from google.colab import drive
drive.mount('/content/drive')
PROJECT_ROOT = '/content/drive/MyDrive'

import os 
os.chdir(PROJECT_ROOT)
DATA_PATH = os.path.join(PROJECT_ROOT, 'Quotebank_limunADA')

from sentence_transformers import SentenceTransformer
model = SentenceTransformer('sentence-transformers/paraphrase-MiniLM-L6-v2')

import spacy
spacy.load('en')
from spacy.lang.en import English
parser = English()


import nltk
nltk.download('wordnet')
from nltk.corpus import wordnet as wn
from nltk.stem.wordnet import WordNetLemmatizer
nltk.download('stopwords')
en_stop = set(nltk.corpus.stopwords.words('english'))

from textblob import TextBlob

import aspect_based_sentiment_analysis as absa
nlp = absa.load()

import torch 
import torch.nn as nn 
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

import matplotlib.pyplot as plt 
import numpy as np 
import scipy
import pickle
import bz2
import json
from operator import itemgetter 


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Some layers from the model checkpoint at absa/classifier-rest-0.2 were not used when initializing BertABSClassifier: ['dropout_379']
- This IS expected if you are initializing BertABSClassifier from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertABSClassifier from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of BertABSClassifier were not initialized from the model checkpoint at absa/classifier-rest-0.2 and are newly initialized: ['dropout_75']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [56]:
def tokenize(text):
    lda_tokens = []
    tokens = parser(text)

    for token in tokens:
        if token.orth_.isspace():
            continue
        elif token.like_url:
            lda_tokens.append('URL')
        elif token.orth_.startswith('@'):
            lda_tokens.append('SCREEN_NAME')
        else:
            lda_tokens.append(token.lower_)

    return lda_tokens


def get_lemma(word):
    lemma = wn.morphy(word)
    if lemma is None:
        return word
    else:
        return lemma
    

def get_lemma2(word):
    return WordNetLemmatizer().lemmatize(word)


def prepare_text_for_lda(text):
    tokens = tokenize(text)
    tokens = [token for token in tokens if len(token) > 4]
    tokens = [token for token in tokens if token not in en_stop]
    tokens = [get_lemma(token) for token in tokens]
    return tokens


def get_tokens_per_quote(path_to_file, print_step=5e4, num_instances=None):

    tokens_per_quote = {}
    # Iterate through the quotes
    with bz2.open(path_to_file, 'rb') as s_file:
        for i, instance in enumerate(s_file):
            if i % int(print_step) == 0:
                print(f'Instance {i}')

            if num_instances is not None:
                if i == num_instances:
                    break 

            # loading a sample and checking the speaker
            instance = json.loads(instance) 
            tokens = prepare_text_for_lda(instance['quotation'])

            tokens_per_quote[instance['quoteID']] = tokens 
    
    return tokens_per_quote


def get_instances_at_indexes(path_to_file, indexes=None, print_step=5e4):
    instances = []
    # Iterate through the quotes
    with bz2.open(path_to_file, 'rb') as s_file:
        for i, instance in enumerate(s_file):
            if i % int(print_step) == 0:
                print(f'Instance {i}')
                
            if indexes is None or i in indexes:
                instance = json.loads(instance) 
                instances.append(instance['quotation'])
            else:                
                continue 

    return instances



In [57]:
YEAR = '2019'

LOAD_TOKENS_PER_QUOTE = True

path_to_democrates = os.path.join(
    DATA_PATH, f'quotes-democrates-{YEAR}.json.bz2'
    )
path_to_republicans = os.path.join(
    DATA_PATH, f'quotes-republicans-{YEAR}.json.bz2'
    )

os.makedirs(os.path.join(DATA_PATH, 'SBERT'), exist_ok=True) 

path_to_democrates_tokens = os.path.join(
    DATA_PATH, 'SBERT', f'tokens_per_quote_strings_democrates_{YEAR}.pkl'
    )
path_to_republicans_tokens = os.path.join(
    DATA_PATH, 'SBERT', f'tokens_per_quote_strings_republicans_{YEAR}.pkl'
    )

if not LOAD_TOKENS_PER_QUOTE:
    democrates_tokens_per_quote = get_tokens_per_quote(
        path_to_democrates, num_instances=None
        )
    republicans_tokens_per_quote = get_tokens_per_quote(
        path_to_republicans, num_instances=None
        )
    
    democrates_tokens_per_quote_strings = \
        [' '.join(quote) for k, quote in democrates_tokens_per_quote.items()]
    republicans_tokens_per_quote_strings = \
        [' '.join(quote) for k, quote in republicans_tokens_per_quote.items()]

    pickle.dump(
        democrates_tokens_per_quote_strings, 
        open(path_to_democrates_tokens, 'wb')
        )
    pickle.dump(
        republicans_tokens_per_quote_strings, 
        open(path_to_republicans_tokens, 'wb')
        )    

else:
    print('Loading democrates_tokens_per_quote_strings...')
    democrates_tokens_per_quote_strings = pickle.load(
        open(path_to_democrates_tokens, 'rb')
        )
        
    print('Loading republicans_tokens_per_quote_strings...')
    republicans_tokens_per_quote_strings = pickle.load(
        open(path_to_republicans_tokens, 'rb')
        )



Loading democrates_tokens_per_quote_strings...
Loading republicans_tokens_per_quote_strings...


In [58]:
LOAD_QUOTES = True

path_to_democrates_quotes_list = os.path.join(
    DATA_PATH, 'SBERT', f'quotes_list_democrates_{YEAR}.pkl'
    )
path_to_republicans_quotes_list = os.path.join(
    DATA_PATH, 'SBERT', f'quotes_list_republicans_{YEAR}.pkl'
    )

if not LOAD_QUOTES:
    democrates_quotes = get_instances_at_indexes(path_to_democrates)
    republicans_quotes = get_instances_at_indexes(path_to_republicans)

    pickle.dump(
        democrates_quotes, 
        open(path_to_democrates_quotes_list, 'wb')
        ) 
    pickle.dump(
        republicans_quotes, 
        open(path_to_republicans_quotes_list, 'wb')
        ) 
    
else:
    print('Loading democrates_quotes...')
    democrates_quotes = pickle.load(
        open(path_to_democrates_quotes_list, 'rb')
        )
        
    print('Loading republicans_quotes...')
    republicans_quotes = pickle.load(
        open(path_to_republicans_quotes_list, 'rb')
        )



Loading democrates_quotes...
Loading republicans_quotes...


In [59]:
LOAD_ENCODINGS = True

path_to_democrates_embeddings = os.path.join(
    DATA_PATH, 'SBERT', f'embeddings_democrates_{YEAR}.pkl'
    )
path_to_republicans_embeddings = os.path.join(
    DATA_PATH, 'SBERT', f'embeddings_republicans_{YEAR}.pkl'
    )

if not LOAD_ENCODINGS:
    democrates_embeddings = model.encode(
        democrates_tokens_per_quote_strings,
        show_progress_bar=True,
        device=DEVICE,
        batch_size=32 if str(DEVICE) == 'cuda' else 1
        )
    
    republicans_embeddings = model.encode(
        republicans_tokens_per_quote_strings,
        show_progress_bar=True,
        device=DEVICE,
        batch_size=32 if str(DEVICE) == 'cuda' else 1
        )

else:
    print('Loading democrates_embeddings...')
    democrates_embeddings = pickle.load(
        open(path_to_democrates_embeddings, 'rb')
        )
    
    print('Loading republicans_embeddings...')
    republicans_embeddings = pickle.load(
        open(path_to_republicans_embeddings, 'rb')
        )


Loading democrates_embeddings...
Loading republicans_embeddings...


In [7]:
def get_topic_embeddings(topics, model):
    topics_embeddings = model.encode(
        topics, show_progress_bar=True, device=DEVICE, batch_size=1
        )
    
    return topics_embeddings


TOPICS = [
    'economy', 'healthcare', 'education', 'russia', 
    'korea', 'trump', 'china', 'guns', 'budget'
    ]

topics_embeddings = get_topic_embeddings(TOPICS, model)

print(topics_embeddings.shape)

Batches:   0%|          | 0/9 [00:00<?, ?it/s]

(9, 384)


In [60]:
compute_cosine_similarity = nn.CosineSimilarity(dim=1, eps=1e-6)

democrates_similarities_per_topic = {}
republicans_similarities_per_topic = {}

for topic, topic_embedding in zip(TOPICS, topics_embeddings):
    print(topic)

    democrates_similarities_per_topic[topic] = {}

    democrates_similarities = compute_cosine_similarity(
        torch.from_numpy(democrates_embeddings), 
        torch.from_numpy(topic_embedding)
        )
    
    democrates_similarities_per_topic[topic]['sorted'], \
    democrates_similarities_per_topic[topic]['indexes'] = \
        torch.sort(democrates_similarities, descending=True)

    

    republicans_similarities_per_topic[topic] = {}

    republicans_similarities = compute_cosine_similarity(
        torch.from_numpy(republicans_embeddings), 
        torch.from_numpy(topic_embedding)
        )
    
    republicans_similarities_per_topic[topic]['sorted'], \
    republicans_similarities_per_topic[topic]['indexes'] = \
        torch.sort(republicans_similarities, descending=True)
    

economy
healthcare
education
russia
korea
trump
china
guns
budget


In [61]:
def print_quotes_for_topic(
    all_quotes, 
    topic, 
    similarities_per_topic, 
    num_to_print=None
    ):

    num_quotes = len(similarities_per_topic[topic]['indexes'])
    if num_to_print is None:
        num_to_print = num_quotes
    step = num_quotes / num_to_print

    sampling = np.arange(0, num_quotes-1, step)

    should_keep = list(np.array(
        similarities_per_topic[topic]['indexes'][sampling],
        ))

    kept_similarities = similarities_per_topic[topic]['sorted'][sampling]
    kept_quotes = itemgetter(*should_keep)(all_quotes)

    for i in range(len(kept_quotes)):
        print(f'{kept_similarities[i]} {kept_quotes[i]}')


def get_similar_quotes(all_quotes, similarities_per_topic, topic, threshold):

    should_keep = np.where(
        similarities_per_topic[topic]['sorted'] > threshold
        )[0]

    kept_indexes = similarities_per_topic[topic]['indexes'][should_keep]

    kept_quotes = itemgetter(*kept_indexes)(all_quotes)

    return kept_quotes

def get_number_of_similar_quotes_for_topic(
    topic, 
    similarities_per_topic,
    treshold
    ):

    procentage_quotes = np.mean(np.array(
        similarities_per_topic[topic]['sorted'] > treshold, dtype=np.int8
        ))

    return procentage_quotes



In [62]:
THRESHOLDS_DICT = {'education': 0.37, 'trump': 0.32, 'china':32}

democrates_quotes_per_topic = {}
republicans_quotes_per_topic = {}

for topic in ['trump']:
    threshold = THRESHOLDS_DICT[topic]

    democrates_quotes_per_topic[topic] = get_similar_quotes(
        democrates_quotes, democrates_similarities_per_topic, topic, threshold
        )
    
    republicans_quotes_per_topic[topic] = get_similar_quotes(
        republicans_quotes, republicans_similarities_per_topic, topic, threshold
        )
    



## ABSA

In [69]:
from time import time

SENTIMENT_INDEXING = {'neutral': 0, 'negative': 1, 'positive': 2}

start = time()

for i, quote in enumerate(democrates_quotes_per_topic['trump']):
    print(i)

    task = nlp(text=(quote), aspects=['trump'])
    absa_scores = task.examples[0].scores

    print('\n')
    print(quote)
    for sentiment_str, ind in SENTIMENT_INDEXING.items():
        print(f'{sentiment_str}: {absa_scores[ind]:.3f}')

    if i == 50:
        break

end = time()

print(end - start)



0


Trump' s the man.
neutral: 0.001
negative: 0.001
positive: 0.998
1


We need to pray for Trump,
neutral: 0.333
negative: 0.031
positive: 0.636
2


This Trump is bad,
neutral: 0.001
negative: 0.991
positive: 0.008
3


I don't want Trump to win. Did he win?
neutral: 0.141
negative: 0.849
positive: 0.010
4


will help Trump win.
neutral: 0.002
negative: 0.001
positive: 0.997
5


Trump Of The Day Club
neutral: 0.006
negative: 0.003
positive: 0.991
6


Oh my gosh. Trump said that?
neutral: 0.806
negative: 0.008
positive: 0.186
7


We're in a Trump era, but that's our goal.
neutral: 0.011
negative: 0.003
positive: 0.986
8


What has he [ Trump ] done to slow up Iran at all?
neutral: 0.629
negative: 0.349
positive: 0.022
9


[ Trump ] shouldn't do it at all,
neutral: 0.157
negative: 0.618
positive: 0.225
10


Well, you should vote for Trump,
neutral: 0.026
negative: 0.002
positive: 0.972
11


What a gift Trump gave Beto,
neutral: 0.001
negative: 0.001
positive: 0.998
12


the day after Tr

In [63]:
SENTIMENT_INDEXING = {'neutral': 0, 'negative': 1, 'positive': 2}

for topic in ['trump']:
    
    task = nlp(text=(democrates_quotes_per_topic[topic]), aspects=[topic])
    absa_scores = task.examples[0].scores

    for sentiment_str, ind in SENTIMENT_INDEXING.items():
        print(f'{sentiment_str}: {absa_scores[ind]:.3f}')

    
    task = nlp(text=(republicans_quotes_per_topic[topic]), aspects=[topic])
    absa_scores = task.examples[0].scores

    for sentiment_str, ind in SENTIMENT_INDEXING.items():
        print(f'{sentiment_str}: {absa_scores[ind]:.3f}')


TypeError: ignored

## Print 

In [11]:
print_quotes_for_topic(
    republicans_quotes, 'education', republicans_similarities_per_topic, 1000
    )


In [52]:
print_quotes_for_topic(
    republicans_quotes, 'china', republicans_similarities_per_topic, 1000
    )

In [53]:
democrates_quotes_per_topic['trump']