# BERT embeddings

## Installations and imports

In [None]:
!pip install sentence_transformers

In [3]:
# Mount Google Drive (Colab only) and make it the working directory; every
# data path used below is built from PROJECT_ROOT.
from google.colab import drive
drive.mount('/content/drive')
PROJECT_ROOT = '/content/drive/MyDrive'

import os 
os.chdir(PROJECT_ROOT)
# Folder holding the per-party, per-year Quotebank dumps and the SBERT caches
DATA_PATH = os.path.join(PROJECT_ROOT, 'Quotebank_limunADA')

# Pretrained sentence-embedding model, used for both quotes and topic labels
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('sentence-transformers/paraphrase-MiniLM-L6-v2')

import spacy
# NOTE(review): the pipeline returned here is discarded; only `parser` below is
# used by tokenize(). The 'en' shortcut may not resolve on newer spaCy
# releases - confirm whether this call is still needed.
spacy.load('en')
from spacy.lang.en import English
# `parser` is what tokenize() calls on the raw quote text
parser = English()

import nltk
# WordNet backs get_lemma / get_lemma2
nltk.download('wordnet')
from nltk.corpus import wordnet as wn
from nltk.stem.wordnet import WordNetLemmatizer
# English stop words are filtered out in prepare_text_for_lda
nltk.download('stopwords')
en_stop = set(nltk.corpus.stopwords.words('english'))

# TextBlob provides the polarity score used in the sentiment analysis
from textblob import TextBlob

import torch 
import torch.nn as nn 
# Use the GPU when one is visible; DEVICE is passed to model.encode
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# NOTE(review): matplotlib and scipy are not referenced in the part of the
# notebook visible here - confirm they are needed further down.
import matplotlib.pyplot as plt 
import numpy as np 
import scipy
import pickle
import bz2
import json
from operator import itemgetter 


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Text preprocessing

In [4]:
def tokenize(text):
    """
    Split ``text`` with the module-level ``parser`` and normalise each token.

    Whitespace-only tokens are dropped, URL-like tokens are replaced by the
    literal ``'URL'``, tokens starting with ``@`` by ``'SCREEN_NAME'``, and
    every other token is lower-cased.

    Returns the list of normalised token strings, in document order.
    """
    def _normalise(tok):
        # URL check takes precedence over the @-mention check
        if tok.like_url:
            return 'URL'
        if tok.orth_.startswith('@'):
            return 'SCREEN_NAME'
        return tok.lower_

    return [_normalise(tok) for tok in parser(text) if not tok.orth_.isspace()]


def get_lemma(word):
    """
    Return the WordNet base form of ``word`` as given by ``wn.morphy``.

    When ``wn.morphy`` yields ``None`` the word is returned unchanged.
    """
    lemma = wn.morphy(word)
    return word if lemma is None else lemma
    

def get_lemma2(word):
    """
    Lemmatise ``word`` with a freshly created ``WordNetLemmatizer``.

    Alternative to ``get_lemma``; returns whatever ``lemmatize`` returns.
    """
    lemmatizer = WordNetLemmatizer()
    return lemmatizer.lemmatize(word)


def prepare_text_for_lda(text):
    """
    Turn raw ``text`` into a list of cleaned, lemmatised tokens.

    The text is tokenised with ``tokenize``; tokens of four characters or
    fewer and tokens found in ``en_stop`` are discarded; the survivors are
    mapped through ``get_lemma``.
    """
    return [
        get_lemma(tok)
        for tok in tokenize(text)
        if len(tok) > 4 and tok not in en_stop
    ]


## Generating tokens and embeddings for quotes

In [5]:
# Tokenising the quotes stored in a file
def get_tokens_per_quote(path_to_file, print_step=5e4, num_instances=None):
    """
    Tokenise the quotations of a bz2-compressed JSON-lines file.

    Parameters
    ----------
    path_to_file : path of the compressed file; each line is parsed with
        ``json.loads`` and must provide ``'quoteID'`` and ``'quotation'``.
    print_step : a progress line is printed whenever the line number is a
        multiple of ``int(print_step)``.
    num_instances : when not None, reading stops once this many lines have
        been processed.

    Returns
    -------
    dict mapping each ``quoteID`` to the token list produced by
    ``prepare_text_for_lda`` for its quotation.
    """
    quote_tokens = {}

    with bz2.open(path_to_file, 'rb') as handle:
        for line_no, raw_line in enumerate(handle):
            # Progress report
            if line_no % int(print_step) == 0:
                print(f'Instance {line_no}')

            # Optional cap on the number of processed lines
            if num_instances is not None and line_no == num_instances:
                break

            record = json.loads(raw_line)
            quote_tokens[record['quoteID']] = prepare_text_for_lda(record['quotation'])

    return quote_tokens

# Loading quotes at given indexes from a file
def get_instances_at_indexes(path_to_file, indexes=None, print_step=5e4):
    """
    Read quotation texts from a bz2-compressed JSON-lines file.

    Parameters
    ----------
    path_to_file : path of the compressed file; each kept line is parsed with
        ``json.loads`` and must provide a ``'quotation'`` key.
    indexes : iterable of integer line positions (zero-based) to keep, or
        None to keep every line. Any iterable works, including generators.
    print_step : a progress line is printed whenever the line number is a
        multiple of ``int(print_step)``.

    Returns
    -------
    list of the selected quotation strings, in file order. Each line appears
    at most once, even if its position is listed several times.
    """
    wanted = None
    last_wanted = -1
    if indexes is not None:
        # Materialise the positions once: membership tests become O(1)
        # instead of a scan per line, and a one-shot iterable is no longer
        # consumed by the repeated `in` checks.
        wanted = {int(ix) for ix in indexes}
        last_wanted = max(wanted, default=-1)

    step = int(print_step)
    instances = []

    # Iterate through the quotes
    with bz2.open(path_to_file, 'rb') as s_file:
        for i, raw_line in enumerate(s_file):
            # Nothing after the largest requested position can match, so
            # there is no point in decompressing the rest of the file.
            if wanted is not None and i > last_wanted:
                break

            if i % step == 0:
                print(f'Instance {i}')

            if wanted is None or i in wanted:
                instances.append(json.loads(raw_line)['quotation'])

    return instances

In [6]:
def load_tokens_string_per_quote(party, year, load_tokens_per_quote=True):
  """
  Return, for every quote of `party` in `year`, its tokens joined by spaces.

  If load_tokens_per_quote is True (default) the list is read from the pickle
  cache DATA_PATH/SBERT/tokens_per_quote_strings_{party}_{year}.pkl.
  If it is False the quotes are tokenised from
  DATA_PATH/quotes-{party}-{year}.json.bz2, joined into strings, written to
  that cache and returned.

  Returns a list of strings, one per quote.
  """

  # Path to the cleaned Quotebank data for this party and year
  path_to_file = os.path.join(
      DATA_PATH, f'quotes-{party}-{year}.json.bz2'
      )

  # Make the cache directory if necessary
  sbert_dir = os.path.join(DATA_PATH, 'SBERT')
  os.makedirs(sbert_dir, exist_ok=True)

  # Path to the cache file with the token strings
  path_to_tokens = os.path.join(
      sbert_dir, f'tokens_per_quote_strings_{party}_{year}.pkl'
      )

  if not load_tokens_per_quote:
    # Generate the tokens for each quote
    tokens_per_quote = get_tokens_per_quote(
        path_to_file, num_instances=None
        )

    # One space-separated string per quote
    tokens_per_quote_strings = \
        [' '.join(tokens) for tokens in tokens_per_quote.values()]

    # Context manager so the handle is flushed and closed deterministically
    with open(path_to_tokens, 'wb') as cache_file:
      pickle.dump(tokens_per_quote_strings, cache_file)

  else:
    print(f'Loading {party}_tokens_per_quote_strings...')
    # pickle.load can execute code stored in the file: only read caches
    # that were written by this notebook.
    with open(path_to_tokens, 'rb') as cache_file:
      tokens_per_quote_strings = pickle.load(cache_file)

  return tokens_per_quote_strings
    


In [7]:
def load_quotes(party, year, load_quotes=True):
  """
  Return all quotes of `party` in `year` as a list of strings.

  If load_quotes is True (default) the list is read from the pickle cache
  DATA_PATH/SBERT/quotes_list_{party}_{year}.pkl.
  If it is False the quotes are read from
  DATA_PATH/quotes-{party}-{year}.json.bz2, written to that cache (the SBERT
  directory is created when missing) and returned.
  """
  # Path to the cleaned Quotebank data for this party and year
  path_to_file = os.path.join(
      DATA_PATH, f'quotes-{party}-{year}.json.bz2'
      )

  # Path to the cache file, which is faster to read than the compressed dump
  sbert_dir = os.path.join(DATA_PATH, 'SBERT')
  path_to_quotes_list = os.path.join(
      sbert_dir, f'quotes_list_{party}_{year}.pkl'
      )

  if not load_quotes:
    # Load all the quotes from the initial file
    quotes = get_instances_at_indexes(path_to_file)

    # The cache directory may not exist yet when this is the first writer
    os.makedirs(sbert_dir, exist_ok=True)
    with open(path_to_quotes_list, 'wb') as cache_file:
      pickle.dump(quotes, cache_file)

  else:
    print(f'Loading {party}_quotes...')
    # pickle.load can execute code stored in the file: only read caches
    # that were written by this notebook.
    with open(path_to_quotes_list, 'rb') as cache_file:
      quotes = pickle.load(cache_file)

  return quotes
   



In [8]:
def load_embeddings(party, year, load_embeddings=True):
  """
  Return the embeddings of every quote of `party` in `year`.

  If load_embeddings is True (default) the embeddings are read from the
  pickle cache DATA_PATH/SBERT/embeddings_{party}_{year}.pkl.
  If it is False they are computed by the module-level `model` from the
  cached token strings (see load_tokens_string_per_quote), written to that
  cache (the SBERT directory is created when missing) and returned.
  """

  # Path to the cache file with the embeddings
  sbert_dir = os.path.join(DATA_PATH, 'SBERT')
  path_to_embeddings = os.path.join(
      sbert_dir, f'embeddings_{party}_{year}.pkl'
      )

  if not load_embeddings:
    # The token strings must already be cached at this point
    tokens_per_quote_strings = load_tokens_string_per_quote(
        party, year, load_tokens_per_quote=True
        )

    # Batch on any CUDA device ('cuda', 'cuda:0', ...), one by one on CPU
    on_gpu = str(DEVICE).startswith('cuda')

    # Generate the embeddings with the SentenceTransformer model
    embeddings = model.encode(
        tokens_per_quote_strings,
        show_progress_bar=True,
        device=DEVICE,
        batch_size=32 if on_gpu else 1
        )

    os.makedirs(sbert_dir, exist_ok=True)
    with open(path_to_embeddings, 'wb') as cache_file:
      pickle.dump(embeddings, cache_file)

  else:
    print(f'Loading {party}_embeddings...')
    # pickle.load can execute code stored in the file: only read caches
    # that were written by this notebook.
    with open(path_to_embeddings, 'rb') as cache_file:
      embeddings = pickle.load(cache_file)

  return embeddings


In [9]:
#### Novakovic
# Rebuild and cache tokens, raw quotes and embeddings for every
# (year, party) combination; all three loaders run in "generate" mode.
for year, party in [
    (y, p) for y in [2015, 2016, 2017] for p in ['republicans', 'democrates']
    ]:
  tokens_string = load_tokens_string_per_quote(
      party, year, load_tokens_per_quote=False
      )
  quotes = load_quotes(party, year, load_quotes=False)
  embeddings = load_embeddings(party, year, load_embeddings=False)

Instance 0


KeyboardInterrupt: ignored

## Generating embeddings for topics

In [12]:
def get_topic_embeddings(topics, model):
  """
  Encode the given topic labels with a pretrained model.

  `topics` is passed to `model.encode` together with the module-level DEVICE,
  a batch size of 1 and the progress bar switched on; the value returned by
  `encode` is handed back unchanged.
  """
  encode_options = dict(show_progress_bar=True, device=DEVICE, batch_size=1)
  return model.encode(topics, **encode_options)


# Topic labels that the quote embeddings are compared against
TOPICS = [
    'politics',
    'economy',
    'education',
    'health',
    'crime',
    'russia',
    'korea',
    'trump',
    'china',
    'drug addiction',
    'climate change',
    'racism',
    'terrorism',
    'illegal immigration',
    'sexism',
    'affordability of healthcare',
    'affordability of college education',
    'economic inequality',
    'job opportunities',
]

# One embedding per topic label, in the order of TOPICS
topics_embeddings = get_topic_embeddings(TOPICS, model)

print(topics_embeddings.shape)

Batches:   0%|          | 0/19 [00:00<?, ?it/s]

(19, 384)


In [28]:
def get_similarities_per_topis(party, year):
  """
  Rank the cached quote embeddings of `party`/`year` against every topic.

  For each topic in TOPICS (paired with its row of `topics_embeddings`) the
  cosine similarity to every quote embedding is computed and sorted in
  descending order. The topic name is printed as it is processed.

  Returns a dict: topic -> {'sorted': similarities in descending order,
  'indexes': positions of those similarities in the embeddings}.
  """
  cosine = nn.CosineSimilarity(dim=1, eps=1e-6)
  quote_embeddings = load_embeddings(party, year)

  ranking_per_topic = {}
  for topic, topic_vector in zip(TOPICS, topics_embeddings):
    print(topic)

    scores = cosine(
        torch.from_numpy(quote_embeddings),
        torch.from_numpy(topic_vector)
        )
    ranked_scores, ranked_positions = torch.sort(scores, descending=True)

    ranking_per_topic[topic] = {
        'sorted': ranked_scores,
        'indexes': ranked_positions,
        }

  return ranking_per_topic

def get_similar_quotes(all_quotes, similarities_per_topic, topic, threshold):
    """
    Select the quotes whose similarity to ``topic`` exceeds ``threshold``.

    Parameters
    ----------
    all_quotes : sequence of quotes, indexable by integer position.
    similarities_per_topic : dict with, for ``topic``, the entries
        ``'sorted'`` (similarity values) and ``'indexes'`` (for each of those
        values, the position of the matching quote in ``all_quotes``).
    topic : key into ``similarities_per_topic``.
    threshold : only similarities strictly greater than this are kept.

    Returns
    -------
    tuple of the selected quotes, in the order of ``'sorted'``. The result is
    always a tuple: empty when nothing passes the threshold and of length one
    when a single quote does.
    """
    ranking = similarities_per_topic[topic]

    should_keep = np.where(ranking['sorted'] > threshold)[0]
    kept_indexes = ranking['indexes'][should_keep]

    # itemgetter(*idx) raises for zero indexes and returns a bare item for a
    # single one, so the selection is built explicitly instead.
    return tuple(all_quotes[ix] for ix in kept_indexes.tolist())

def get_analysis_of_similar_quotes(all_quotes, similarities_per_topic, topic, threshold):
  """
  Print size and sentiment statistics of the quotes similar to `topic`.

  The quotes are selected with get_similar_quotes. Printed are their number,
  their share of `all_quotes`, and the percentage of quotes whose TextBlob
  polarity is above 0.3 (positive), at or below -0.3 (negative) or in
  between (neutral). Nothing is returned.
  """
  def _sentiment_class(polarity):
    # 1 = positive, 0 = neutral, -1 = negative
    if polarity > 0.3:
      return 1
    if polarity > -0.3:
      return 0
    return -1

  similar_quotes = get_similar_quotes(all_quotes, similarities_per_topic, topic, threshold)
  num_similar = len(similar_quotes)
  print(f'Length {num_similar}')
  print(f'Ratio {num_similar / len(all_quotes)}')

  sentiments = np.array(
      [_sentiment_class(TextBlob(quote).sentiment.polarity) for quote in similar_quotes]
      )
  share_positive = np.mean(sentiments == 1)*100
  share_neutral = np.mean(sentiments == 0)*100
  share_negative = np.mean(sentiments == -1)*100
  print(f'Procentage of postive {share_positive}%')
  print(f'Procentage of neutral {share_neutral}%')
  print(f'Procentage of negative {share_negative}%')

# Load the cached 2019 rankings and quotes of both parties
democrates_2019_similarities_per_topic = get_similarities_per_topis(
    party='democrates', year=2019
    )
democrates_2019_quotes = load_quotes(party='democrates', year=2019)
republicans_2019_similarities_per_topic = get_similarities_per_topis(
    party='republicans', year=2019
    )
republicans_2019_quotes = load_quotes(party='republicans', year=2019)

# Similarity cut-off applied to the 'trump' topic for both parties
treshold = 0.4

print('DEMOCRATES')
get_analysis_of_similar_quotes(
    all_quotes=democrates_2019_quotes,
    similarities_per_topic=democrates_2019_similarities_per_topic,
    topic='trump',
    threshold=treshold,
    )
print()
print('REPUBLICANS')
get_analysis_of_similar_quotes(
    all_quotes=republicans_2019_quotes,
    similarities_per_topic=republicans_2019_similarities_per_topic,
    topic='trump',
    threshold=treshold,
    )

Loading democrates_embeddings...
politics
economy
education
health
crime
russia
korea
trump
china
drug addiction
climate change
racism
terrorism
illegal immigration
sexism
affordability of healthcare
affordability of college education
economic inequality
job opportunities
Loading democrates_quotes...
Loading republicans_embeddings...
politics
economy
education
health
crime
russia
korea
trump
china
drug addiction
climate change
racism
terrorism
illegal immigration
sexism
affordability of healthcare
affordability of college education
economic inequality
job opportunities
Loading republicans_quotes...
DEMOCRATES
Length 19384
Ratio 0.018330595659249885
Procentage of postive 5.581923235658275%
Procentage of neutral 91.0699546017334%
Procentage of negative 3.348122162608337%

REPUBLICANS
Length 16952
Ratio 0.020603170204646165
Procentage of postive 6.040585181689476%
Procentage of neutral 90.44360547428032%
Procentage of negative 3.5158093440302034%


In [None]:
def print_quotes_for_topic(
    all_quotes, 
    topic, 
    similarities_per_topic, 
    num_to_print=None
    ):
    """
    Print an evenly spaced sample of quotes across the ranking of ``topic``.

    Parameters
    ----------
    all_quotes : sequence of quotes, indexable by integer position.
    topic : key into ``similarities_per_topic``.
    similarities_per_topic : dict with, for ``topic``, the entries
        ``'sorted'`` (similarity values) and ``'indexes'`` (for each of those
        values, the position of the matching quote in ``all_quotes``).
    num_to_print : number of quotes to print. None, or a value larger than
        the number of ranked quotes, prints all of them; zero or a negative
        value prints nothing.

    Each printed line is ``<similarity> <quote>``, in ranking order starting
    with the first ranked quote.
    """
    ranking = similarities_per_topic[topic]
    num_quotes = len(ranking['indexes'])

    if num_to_print is None or num_to_print > num_quotes:
        num_to_print = num_quotes
    num_to_print = int(num_to_print)
    if num_to_print <= 0:
        return

    # Positions 0, n/k, 2n/k, ... in the ranking, computed with integer
    # arithmetic: a float stride would yield float positions, which cannot
    # be used to index the ranking arrays.
    sampling = (
        np.arange(num_to_print, dtype=np.int64) * num_quotes
        ) // num_to_print

    kept_indexes = ranking['indexes'][sampling].tolist()
    kept_similarities = ranking['sorted'][sampling].tolist()

    for similarity, quote_index in zip(kept_similarities, kept_indexes):
        print(f'{similarity} {all_quotes[quote_index]}')







## Education

We printed 1000 quotations in total. A rough estimate for a good threshold is 0.37.

In [None]:
# Uses the 2019 quotes and rankings loaded in the analysis cell above; the
# un-suffixed names referenced here before are not defined in this notebook.
print_quotes_for_topic(
    democrates_2019_quotes, 'education',
    democrates_2019_similarities_per_topic, 1000
    )

# TOPICS = [
#     'economy', 'healthcare', 'education': 0.37, 'russia', 
#     'korea', 'trump', 'china', 'guns', 'budget'
#     ]

In [None]:
# Uses the 2019 quotes and rankings loaded in the analysis cell above; the
# un-suffixed names referenced here before are not defined in this notebook.
print_quotes_for_topic(
    republicans_2019_quotes, 'education',
    republicans_2019_similarities_per_topic, 1000
    )


In [None]:
# Uses the 2019 quotes and rankings loaded in the analysis cell above; the
# un-suffixed names referenced here before are not defined in this notebook.
democrates_quotes_education = get_similar_quotes(
    democrates_2019_quotes, democrates_2019_similarities_per_topic,
    'education', 0.37
    )

In [None]:
# Uses the 2019 quotes and rankings loaded in the analysis cell above; the
# un-suffixed names referenced here before are not defined in this notebook.
republicans_quotes_education = get_similar_quotes(
    republicans_2019_quotes, republicans_2019_similarities_per_topic,
    'education', 0.37
    )

In [None]:
# Similarity cut-off per topic; 0.37 is the value passed for 'education' in
# the cells above.
# NOTE(review): 'trump' is None here although 0.4 is used for it in the
# earlier analysis cell - confirm the intended value.
THRESHOLDS_DICT = {'education': 0.37, 'trump': None}

# Not populated in the part of the notebook visible here - presumably meant
# to map each topic to the selected quotes; TODO confirm.
democrates_quotes_per_topic = {}
