# LDA models for topic detection

In [1]:
# Environment setup: upgrade pandas and install pyLDAvis via shell escapes.
# NOTE(review): no versions are pinned, so a fresh run may pull different
# releases than the ones this notebook was developed with - consider pinning.
!pip3 install --upgrade pandas
!pip install pyLDAvis



In [2]:
# --- Imports: standard library first, then third-party ---
import bz2
import json
import os
import pickle
import warnings

import gensim
import nltk
import pandas as pd
import pyLDAvis
import pyLDAvis.gensim_models
import spacy
from gensim import corpora
from google.colab import drive
from nltk.corpus import wordnet as wn
from spacy.lang.en import English

# --- Google Drive mount and project paths ---
# The drive must be mounted before changing into the project directory.
drive.mount('/content/drive')
PROJECT_ROOT = '/content/drive/MyDrive'
os.chdir(PROJECT_ROOT)
DATA_PATH = os.path.join(PROJECT_ROOT, 'Quotebank_limunADA')

# --- NLP resources ---
# Only tokenization is needed, so a blank English pipeline is sufficient.
# The former `spacy.load('en')` call was removed: its result was discarded,
# and the 'en' shortcut no longer exists in spaCy v3 (the call raises there).
parser = English()

# WordNet is used for lemmatization, the stopword list for token filtering.
nltk.download('wordnet')
nltk.download('stopwords')
en_stop = set(nltk.corpus.stopwords.words('english'))

# Render pyLDAvis visualizations inline in the notebook.
pyLDAvis.enable_notebook()

# Silence library warnings for the rest of the notebook. This is placed after
# the imports, so import-time deprecation warnings are still shown once.
warnings.filterwarnings("ignore")

NUM_TOPICS = 10


Mounted at /content/drive
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


  from collections import Iterable
  from collections import Mapping
scipy.sparse.sparsetools is a private module for scipy.sparse, and should not be used.
  _deprecated()


## Text preprocessing and LDA modeling

In [3]:
def tokenize(text):
    """
    Split the given text into a list of normalized string tokens.

    The text is run through the module-level `parser`. Whitespace-only
    tokens are dropped, URL-like tokens are replaced by the placeholder
    'URL', tokens starting with '@' are replaced by 'SCREEN_NAME', and
    every remaining token is lower-cased.
    """
    def normalize(token):
        # Placeholders take precedence over plain lower-casing.
        if token.like_url:
            return 'URL'
        if token.orth_.startswith('@'):
            return 'SCREEN_NAME'
        return token.lower_

    return [
        normalize(token)
        for token in parser(text)
        if not token.orth_.isspace()
    ]


def get_lemma(word):
    """
    Return the base form of `word` as found by `wn.morphy` (NLTK WordNet).

    When the lookup yields None, the word itself is returned unchanged.
    """
    base_form = wn.morphy(word)
    return word if base_form is None else base_form


def prepare_text_for_lda(text):
    """
    Turn a raw text into the list of tokens used for LDA fitting.

    The text is tokenized, tokens of 4 characters or fewer and tokens
    present in the `en_stop` stopword set are discarded, and each
    surviving token is lemmatized with `get_lemma`.
    """
    return [
        get_lemma(token)
        for token in tokenize(text)
        if len(token) > 4 and token not in en_stop
    ]


def get_tokens_per_quote(path_to_file, print_step=5e4):
    """
    Read a bz2-compressed file line by line and prepare every quote for LDA.

    Each line is parsed as a JSON object; its 'quotation' field is passed
    through `prepare_text_for_lda` and the result is stored under the
    object's 'quoteID'. A progress message is printed every `print_step`
    lines (the value is truncated to an int).

    Returns a dictionary of form: {quoteID: quote_tokenized_lemmatized}
    """
    quote_tokens = {}

    with bz2.open(path_to_file, 'rb') as compressed_quotes:
        for line_no, raw_line in enumerate(compressed_quotes):
            # Progress report on every print_step-th line, including line 0
            if not line_no % int(print_step):
                print(f'Instance {line_no}')

            # Decode the JSON record and store its prepared tokens by quote id
            record = json.loads(raw_line)
            quote_tokens[record['quoteID']] = \
                prepare_text_for_lda(record['quotation'])

    return quote_tokens


def get_dictionary_and_corpus(tokens_per_quote_list):
    """
    Build the gensim dictionary and the bag-of-words corpus for the quotes.

    `tokens_per_quote_list` is a list with one token list per quote.
    Returns the tuple (dictionary, corpus), where corpus holds the
    `doc2bow` representation of every quote, in input order.
    """
    # Vocabulary built from all the tokens seen across the quotes
    vocabulary = corpora.Dictionary(tokens_per_quote_list)

    # One Bag of Words vector per quote
    bow_corpus = list(map(vocabulary.doc2bow, tokens_per_quote_list))

    return vocabulary, bow_corpus



In [4]:
def generate_LDA_model(num_of_topics, party, year, load_corp_and_dict=False,
                       random_state=None):
    """
    Fit an LDA model for the given party and year and pickle it to disk.

    Nothing is computed when the model file already exists. Otherwise the
    dictionary and corpus are either built from the party/year quotes file
    (and cached as pickles), or loaded from those cached pickles, and an
    LDA model with `num_of_topics` topics is fitted on them (15 passes).

    Parameters:
        num_of_topics: number of topics of the LDA model.
        party, year: select the quotes file and name the output files.
        load_corp_and_dict: when True, load the previously pickled corpus
            and dictionary instead of recomputing them from the quotes.
        random_state: optional seed forwarded to gensim's LdaModel so the
            fit can be reproduced; None keeps gensim's default behavior.

    Returns None. All files live under DATA_PATH/LDA.
    """
    lda_dir = os.path.join(DATA_PATH, 'LDA')
    quotes_path = os.path.join(DATA_PATH, f'quotes-{party}-{year}.json.bz2')
    lda_path = os.path.join(lda_dir, f'LDA_{party}_{year}_{num_of_topics}.pkl')
    corpus_path = os.path.join(lda_dir, f'corpus_{party}_{year}.pkl')
    dictionary_path = os.path.join(lda_dir, f'dictionary_{party}_{year}.pkl')

    # Check if file already exists
    if os.path.isfile(lda_path):
        print(f'Model for {party}-{year} already exists.')
        return

    print(f'\n=== Computing topcis for {party}-{year}===\n')

    # Make sure the output directory exists before doing any expensive work,
    # so the results are not lost to a failing write at the very end.
    os.makedirs(lda_dir, exist_ok=True)

    # If the corpus and dictionary shouldn't be loaded, we generate and save it
    if not load_corp_and_dict:
        # Tokenization and lemmatization of input quotes
        tokens_per_quote = get_tokens_per_quote(quotes_path)
        tokens_per_quote_list = list(tokens_per_quote.values())

        # Converting the tokens per quote into dictionary and corpus
        print('\nCreating dictionary and corpus...')
        dictionary, corpus = get_dictionary_and_corpus(tokens_per_quote_list)

        # Saving the generated dictionary and corpus
        with open(corpus_path, 'wb') as corpus_file:
            pickle.dump(corpus, corpus_file)
        with open(dictionary_path, 'wb') as dictionary_file:
            pickle.dump(dictionary, dictionary_file)
        print('Done')

    # Otherwise, load the pre-computed dictionary and corpus
    else:
        print('\nLoading corpus and dictionary...')
        # Files must be opened read-only here: opening them with 'wb' would
        # truncate the cached corpus/dictionary before they could be read.
        # NOTE: pickle can execute arbitrary code on load; only load files
        # that were produced by this notebook.
        with open(corpus_path, 'rb') as corpus_file:
            corpus = pickle.load(corpus_file)
        with open(dictionary_path, 'rb') as dictionary_file:
            dictionary = pickle.load(dictionary_file)
        print('Done')

    # Creating and fitting the LDA model
    lda_model = gensim.models.ldamodel.LdaModel(
        corpus, id2word=dictionary, num_topics=num_of_topics, passes=15,
        random_state=random_state
        )

    # Saving it
    with open(lda_path, 'wb') as lda_file:
        pickle.dump(lda_model, lda_file)


In [5]:
# Fit one LDA model per (year, party) pair available in the dataset.
# generate_LDA_model skips the pairs whose model file is already on disk.
YEARS = range(2015, 2021)  # 2015 through 2020, inclusive
PARTIES = ['democrates', 'republicans']
num_of_topics = 8

for year in YEARS:
    for party in PARTIES:
        generate_LDA_model(num_of_topics, party, year)

Model for democrates-2015 already exists.
Model for republicans-2015 already exists.
Model for democrates-2016 already exists.
Model for republicans-2016 already exists.
Model for democrates-2017 already exists.
Model for republicans-2017 already exists.
Model for democrates-2018 already exists.
Model for republicans-2018 already exists.
Model for democrates-2019 already exists.
Model for republicans-2019 already exists.
Model for democrates-2020 already exists.
Model for republicans-2020 already exists.


In [10]:
def get_LDA_model(party, year, num_of_topics):
    """
    Load and return the pickled LDA model for the given party, year and
    number of topics.

    The model is read from DATA_PATH/LDA/LDA_{party}_{year}_{num_of_topics}.pkl;
    a missing file surfaces as the usual FileNotFoundError from open().
    """
    lda_path = os.path.join(
        DATA_PATH, 'LDA', f'LDA_{party}_{year}_{num_of_topics}.pkl'
        )

    # The context manager closes the file handle even if unpickling fails.
    # NOTE: pickle can execute arbitrary code on load; only load model files
    # that were produced by this notebook.
    with open(lda_path, 'rb') as lda_file:
        lda_model = pickle.load(lda_file)

    return lda_model


def get_LDA_display_model(party, year, num_of_topics):
    """
    Function that returns the model used for visualizing LDA results.

    Loads the pickled corpus and dictionary of the given party and year
    from DATA_PATH/LDA, loads the matching LDA model through
    `get_LDA_model`, and passes all three to
    `pyLDAvis.gensim_models.prepare` (with sort_topics=False). The prepared
    object is returned.
    """

    print(f'\nComputing LDA display model for {party}-{year}')

    corpus_path = os.path.join(DATA_PATH, 'LDA', f'corpus_{party}_{year}.pkl')
    dictionary_path = os.path.join(
        DATA_PATH, 'LDA', f'dictionary_{party}_{year}.pkl'
        )

    # Context managers make sure the file handles are closed after reading.
    # NOTE: pickle can execute arbitrary code on load; only load files that
    # were produced by this notebook.

    # Load corpus
    with open(corpus_path, 'rb') as corpus_file:
        corpus = pickle.load(corpus_file)

    # Load dictionary
    with open(dictionary_path, 'rb') as dictionary_file:
        dictionary = pickle.load(dictionary_file)

    # Load LDA model (same file naming as used by get_LDA_model)
    lda_model = get_LDA_model(party, year, num_of_topics)

    # Prepare LDA model to be displayed
    lda_display = pyLDAvis.gensim_models.prepare(
        lda_model, corpus, dictionary, sort_topics=False
        )

    print('Done')

    return lda_display


In [12]:
# Build the pyLDAvis display objects, stored as lda_display[party][year]
parties_to_display = ['democrates', 'republicans']
years_to_display = [2019]

num_of_topics = 8
lda_display = {}

for party in parties_to_display:
    # Each party gets its own {year: display model} mapping
    models_by_year = {}
    lda_display[party] = models_by_year

    for year in years_to_display:
        models_by_year[year] = get_LDA_display_model(
            party, year, num_of_topics
            )


Computing LDA display model for democrates-2019
democrates-2019 done
Computing LDA display model for republicans-2019
republicans-2019 done


## Displaying and analyzing LDA results for the year of 2019

In the following cells, we visualize the results of our LDA models for the year 2019. After observing and analyzing the keywords per topic, we try to categorize the clusters, i.e. interpret and explain what each topic represents.

**NOTE** - the plots below are interactive, so they may not render (or may not be fully informative) when viewing a pre-rendered notebook; everything is explained and visualized in our datastory and report.


### Republicans 2019

* Cluster 1: *Trade war with China*
* Cluster 2: *Promises*
* Cluster 3: *Mueller report* - Report On The Investigation Into Russian Interference In The 2016 Presidential Election
  - https://en.wikipedia.org/wiki/Mueller_report
* Cluster 4: *Ukraine affair; National Security*
  - https://en.wikipedia.org/wiki/Trump%E2%80%93Ukraine_scandal
* Cluster 5: *Political parties and political system*
* Cluster 6: *Relations with Israel + Women and child rights*
* Cluster 7: *Immigration and abortion law + Constitution and policy law making*  
* Cluster 8: *Education and job opportunities*




In [14]:
# Show the prepared pyLDAvis visualization stored for republicans / 2019.
pyLDAvis.display(lda_display['republicans'][2019])

### Democrats 2019

* Cluster 1: *Miscellaneous*
* Cluster 2: *Relationship with China and the world trade*
* Cluster 3: *Economy and labor system*
* Cluster 4: *Police and Law towards minorities*
* Cluster 5: *State system: Republicans and Democrats*
* Cluster 6: *Education system + Climate change*
* Cluster 7: *Housing, Economy and Government*
* Cluster 8: *US president and state congress*

In [13]:
# Show the prepared pyLDAvis visualization stored for democrates / 2019.
pyLDAvis.display(lda_display['democrates'][2019])
