In [None]:
# Colab environment setup: upgrade pandas and install pyLDAvis.
# NOTE(review): neither package is version-pinned, and the recorded output shows
# pip reporting a pandas conflict with google-colab — consider pinning versions.
!pip3 install --upgrade pandas
!pip install pyLDAvis

Collecting pandas
  Downloading pandas-1.3.4-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (11.3 MB)
[K     |████████████████████████████████| 11.3 MB 29.2 MB/s 
Installing collected packages: pandas
  Attempting uninstall: pandas
    Found existing installation: pandas 1.1.5
    Uninstalling pandas-1.1.5:
      Successfully uninstalled pandas-1.1.5
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
google-colab 1.0.0 requires pandas~=1.1.0; python_version >= "3.0", but you have pandas 1.3.4 which is incompatible.[0m
Successfully installed pandas-1.3.4


Collecting pyLDAvis
  Downloading pyLDAvis-3.3.1.tar.gz (1.7 MB)
[?25l[K     |▏                               | 10 kB 23.9 MB/s eta 0:00:01[K     |▍                               | 20 kB 24.9 MB/s eta 0:00:01[K     |▋                               | 30 kB 27.6 MB/s eta 0:00:01[K     |▉                               | 40 kB 30.3 MB/s eta 0:00:01[K     |█                               | 51 kB 33.7 MB/s eta 0:00:01[K     |█▏                              | 61 kB 37.3 MB/s eta 0:00:01[K     |█▍                              | 71 kB 31.1 MB/s eta 0:00:01[K     |█▋                              | 81 kB 31.5 MB/s eta 0:00:01[K     |█▉                              | 92 kB 33.1 MB/s eta 0:00:01[K     |██                              | 102 kB 30.1 MB/s eta 0:00:01[K     |██▏                             | 112 kB 30.1 MB/s eta 0:00:01[K     |██▍                             | 122 kB 30.1 MB/s eta 0:00:01[K     |██▋                             | 133 kB 30.1 MB/s eta 0:00:01

In [None]:
# Mount Google Drive (Colab only) and point the notebook at the project data.
from google.colab import drive
drive.mount('/content/drive')
PROJECT_ROOT = '/content/drive/MyDrive'

import os 
# Change the working directory to the project root on the mounted drive.
os.chdir(PROJECT_ROOT)
DATA_PATH = os.path.join(PROJECT_ROOT, 'Quotebank_limunADA')


import spacy
# NOTE(review): the return value of spacy.load('en') is discarded; tokenization
# below uses the `English()` object created as `parser`. The 'en' shortcut is
# presumably rejected by spaCy >= 3 — confirm against the installed version.
spacy.load('en')
from spacy.lang.en import English
parser = English()

import nltk
# WordNet data is needed by wn.morphy / WordNetLemmatizer.
nltk.download('wordnet')
from nltk.corpus import wordnet as wn
from nltk.stem.wordnet import WordNetLemmatizer
# English stop-word list, kept as a set for membership tests.
nltk.download('stopwords')
en_stop = set(nltk.corpus.stopwords.words('english'))

import pyLDAvis 
import pyLDAvis.gensim_models
pyLDAvis.enable_notebook()

import gensim
from gensim import corpora
import pickle
import bz2
import json

import warnings
# Silences every warning for the rest of the session.
warnings.filterwarnings("ignore")

# NOTE(review): NUM_TOPICS is not referenced by any visible cell; the model
# cells below use num_of_topics = 8 — confirm which value is intended.
NUM_TOPICS = 10


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


  from collections import Iterable
  from collections import Mapping
scipy.sparse.sparsetools is a private module for scipy.sparse, and should not be used.
  _deprecated()


## Text preprocessing and LDA modeling

In [None]:
def tokenize(text):
    """Split ``text`` into normalised tokens for LDA.

    The text is run through the module-level ``parser``. Whitespace-only
    tokens are dropped, URL-like tokens become the placeholder ``'URL'``,
    tokens starting with ``@`` become ``'SCREEN_NAME'`` and every other
    token is replaced by its lowercase form.

    Returns a list of strings.
    """
    def normalise(tok):
        # Placeholders take precedence over the plain lowercase form.
        if tok.like_url:
            return 'URL'
        if tok.orth_.startswith('@'):
            return 'SCREEN_NAME'
        return tok.lower_

    return [normalise(tok) for tok in parser(text) if not tok.orth_.isspace()]


def get_lemma(word):
    """Return the WordNet base form of ``word`` (via ``wn.morphy``).

    Falls back to ``word`` unchanged when ``wn.morphy`` returns ``None``.
    """
    base_form = wn.morphy(word)
    return word if base_form is None else base_form
    

def get_lemma2(word):
    """Lemmatize ``word`` with a newly created NLTK ``WordNetLemmatizer``."""
    lemmatizer = WordNetLemmatizer()
    return lemmatizer.lemmatize(word)


def prepare_text_for_lda(text):
    """Turn raw ``text`` into the list of lemmas used as an LDA document.

    Tokens produced by ``tokenize`` are kept only if they are longer than
    four characters and are not in the ``en_stop`` stop-word set; each
    surviving token is then mapped through ``get_lemma``.
    """
    return [
        get_lemma(candidate)
        for candidate in tokenize(text)
        if len(candidate) > 4 and candidate not in en_stop
    ]


def get_tokens_per_quote(path_to_file, print_step=5e4):
    """Tokenize every quote stored in a bz2-compressed JSON-lines file.

    Each line of ``path_to_file`` is parsed as a JSON object; its
    ``'quotation'`` field is passed to ``prepare_text_for_lda`` and the
    resulting tokens are stored under the object's ``'quoteID'``.
    A progress line is printed every ``print_step`` lines.

    Returns a dict mapping quote id to its token list (a repeated id keeps
    the tokens of its last occurrence).
    """
    quote_tokens = {}

    with bz2.open(path_to_file, 'rb') as quotes_file:
        for line_no, raw_line in enumerate(quotes_file):
            # Progress report on every ``print_step``-th line.
            if not line_no % int(print_step):
                print(f'Instance {line_no}')

            # Decode one quote record and tokenize its text.
            record = json.loads(raw_line)
            quote_tokens[record['quoteID']] = prepare_text_for_lda(record['quotation'])

    return quote_tokens


def get_dictionary_and_corpus(tokens_per_quote_list):
    """Build the gensim dictionary and bag-of-words corpus for the quotes.

    ``tokens_per_quote_list`` is a collection of token lists, one per quote.
    Returns ``(dictionary, corpus)`` where ``dictionary`` is a
    ``corpora.Dictionary`` built from all token lists and ``corpus`` is the
    list of ``dictionary.doc2bow`` results, one per quote.
    """
    vocabulary = corpora.Dictionary(tokens_per_quote_list)
    bow_corpus = list(map(vocabulary.doc2bow, tokens_per_quote_list))
    return vocabulary, bow_corpus



In [47]:
def generate_LDA_model(num_of_topics, party, year, load_corp_and_dict=False, random_state=None):
  """
  Function that generates LDA model for given number of topics, party and year.

  The trained model is pickled to
  ``DATA_PATH/LDA/LDA_{party}_{year}_{num_of_topics}.pkl``. If that file
  already exists, a message is printed and nothing is recomputed.

  Parameters:
    num_of_topics: number of topics passed to gensim's ``LdaModel``.
    party, year: select the input file ``quotes-{party}-{year}.json.bz2``
      and name the corpus / dictionary / model pickles.
    load_corp_and_dict: when False (default) the quotes are tokenized, the
      dictionary and corpus are built and pickled; when True the previously
      pickled corpus and dictionary are loaded instead.
    random_state: forwarded to ``LdaModel``; ``None`` (default) keeps the
      original unseeded behaviour, an int makes training repeatable.

  Returns:
    None.

  Raises:
    FileNotFoundError: if ``load_corp_and_dict`` is True and the cached
      corpus or dictionary pickle is missing.
  """

  lda_dir = os.path.join(DATA_PATH, 'LDA')
  model_path = os.path.join(lda_dir, f'LDA_{party}_{year}_{num_of_topics}.pkl')
  corpus_path = os.path.join(lda_dir, f'corpus_{party}_{year}.pkl')
  dictionary_path = os.path.join(lda_dir, f'dictionary_{party}_{year}.pkl')

  if os.path.isfile(model_path):
    print(f'Model for {party}-{year} already exists.')
    return

  # Make sure the output folder exists before anything is written to it.
  os.makedirs(lda_dir, exist_ok=True)

  input_file = os.path.join(DATA_PATH, f'quotes-{party}-{year}.json.bz2')

  print(f'\n=== Computing topcis for {party}-{year}===\n')

  if not load_corp_and_dict:
    print('Collecting tokens:')
    tokens_per_quote = get_tokens_per_quote(input_file)
    tokens_per_quote_list = list(tokens_per_quote.values())

    print('\nCreating dictionary and corpus...')
    dictionary, corpus = \
        get_dictionary_and_corpus(tokens_per_quote_list)

    # `with` closes the handles so the pickles are fully flushed to disk.
    with open(corpus_path, 'wb') as corpus_file:
      pickle.dump(corpus, corpus_file)

    with open(dictionary_path, 'wb') as dictionary_file:
      pickle.dump(dictionary, dictionary_file)
    print('Done')
  else:
    print('Loading corpus and dictionary...')

    # Read mode is essential here: opening with 'wb' would truncate the cache.
    # pickle.load runs code embedded in the file, so only load pickles that
    # were written by this notebook.
    with open(corpus_path, 'rb') as corpus_file:
      corpus = pickle.load(corpus_file)

    with open(dictionary_path, 'rb') as dictionary_file:
      dictionary = pickle.load(dictionary_file)


  lda_model = gensim.models.ldamodel.LdaModel(
    corpus, id2word=dictionary,
    num_topics=num_of_topics, passes=15,
    random_state=random_state
    )

  with open(model_path, 'wb') as model_file:
    pickle.dump(lda_model, model_file)

In [48]:
### Generate all the models
# Years covered by the analysis (2015 through 2020, both included).
YEARS = range(2015, 2021)
# Party labels as they appear in the data file names.
PARTIES = ['democrates', 'republicans']
num_of_topics = 8

# Train (or skip, if already cached) one model per year/party pair,
# iterating year-major so both parties of a year are handled together.
for year, party in ((y, p) for y in YEARS for p in PARTIES):
  generate_LDA_model(num_of_topics, party, year)

Model for democrates-2015 already exists.
Model for republicans-2015 already exists.
Model for democrates-2016 already exists.
Model for republicans-2016 already exists.
Model for democrates-2017 already exists.
Model for republicans-2017 already exists.
Model for democrates-2018 already exists.
Model for republicans-2018 already exists.
Model for democrates-2019 already exists.
Model for republicans-2019 already exists.
Model for democrates-2020 already exists.

=== Computing topcis for republicans-2020===

Collecting tokens:
Instance 0
Instance 50000
Instance 100000
Instance 150000
Instance 200000

Creating dictionary and corpus...
Done


In [27]:
def get_LDA_display_model(party, year, num_of_topics):
  """
  Function that returns LDA model ready to be displayed

  Loads the pickled corpus, dictionary and LDA model for ``party`` / ``year``
  / ``num_of_topics`` from ``DATA_PATH/LDA`` and passes them to
  ``pyLDAvis.gensim_models.prepare`` with ``sort_topics=False``.

  Returns:
    The object produced by ``pyLDAvis.gensim_models.prepare``.

  Raises:
    FileNotFoundError: if any of the three pickles is missing.
  """

  lda_dir = os.path.join(DATA_PATH, 'LDA')

  def load_pickle(file_name):
    # Context manager closes the handle instead of leaking it.
    # pickle.load runs code embedded in the file, so only load pickles that
    # were written by this notebook.
    with open(os.path.join(lda_dir, file_name), 'rb') as pickle_file:
      return pickle.load(pickle_file)

  corpus = load_pickle(f'corpus_{party}_{year}.pkl')
  dictionary = load_pickle(f'dictionary_{party}_{year}.pkl')
  lda_model = load_pickle(f'LDA_{party}_{year}_{num_of_topics}.pkl')

  lda_display = pyLDAvis.gensim_models.prepare(
    lda_model, corpus, dictionary, sort_topics=False
    )

  return lda_display


In [50]:
# `lda_display` is a nested {party: {year: prepared display}} dict. Create it
# when missing (or when the name is bound to something else) so this cell does
# not raise NameError on a fresh kernel.
if not isinstance(globals().get('lda_display'), dict):
  lda_display = {}
lda_display.setdefault('republicans', {})[2020] = get_LDA_display_model('republicans', 2020, 8)

In [34]:
num_of_topics = 8
# `lda_display` is a nested {party: {year: prepared display}} dict. Create it
# when missing (or when the name is bound to something else) so this cell does
# not raise NameError on a fresh kernel.
if not isinstance(globals().get('lda_display'), dict):
  lda_display = {}
for party in PARTIES:
  for year in YEARS:
    # setdefault creates the per-party dict the first time a party is seen.
    lda_display.setdefault(party, {})[year] = get_LDA_display_model(party, year, num_of_topics)
    print(f'{party}-{year} done')

democrates-2015 already exists
democrates-2016 already exists
democrates-2017 already exists
democrates-2018 already exists
democrates-2019 already exists
democrates-2020 already exists
republicans-2015 done
republicans-2016 done
republicans-2017 done
republicans-2018 done
republicans-2019 done
republicans-2020 done


## Displaying and analysing LDA models for different parties and years

### Republicans 2015

* Cluster 1: *Education and jobs*
  - Interesting to be investigated: why is Florida here?
* Cluster 2: *Politics - organization related*
  - All the words are purely political, but we can also see the keyword Nuclear, which suggests the importance of nuclear power in politics
* Cluster 3: *Politics*
* Cluster 4: ???
* Cluster 5: *National budget*
  - We can also see the keyword health, which implies the importance of the health budget
* Cluster 6: *Topics for folks*
* Cluster 7: *Dark side of politics*
  - many political figures: Trump, Clinton, Hillary
  - many sensitive topics: religion, terrorism
* Cluster 8: *Light topics*
  - similar to cluster 6

In [51]:
# Render the prepared pyLDAvis data stored for republicans / 2015.
pyLDAvis.display(lda_display['republicans'][2015])

### Republicans 2016

* Cluster 1: *Dark side of politics*
* Cluster 2: ???
* Cluster 3:
* Cluster 4: *Economy*
  - keywords: fail, concern, issue imply that the connotation here was likely negative
  - interesting to investigate: why is Indiana here?
  - interesting keyword: reform
* Cluster 5: *Campaign*
  - clearly about elections
  - interesting keywords: people, support, choice, change, would
* Cluster 6: *Promises*
  - we are GOING to ...
  - I WOULD ...
  - start, great, better, together 
  - very close to elections 
* Cluster 7: *Fight for women rights*
  - keywords: protect, freedom, family
  - why Carolina?
  - why north and south?
* Cluster 8: *Law*
  - why Mexico?

In [52]:
# Render the prepared pyLDAvis data stored for republicans / 2016.
pyLDAvis.display(lda_display['republicans'][2016])

### Republicans 2017

* Cluster 1: *Promises*
* Cluster 2:
* Cluster 3: *Trading and business*
  - North Korea, China, Middle (East)
  - Nuclear weapons

* Cluster 5: 
  - contains climate, but not very important?
  - peace and Israel (https://www.state.gov/u-s-relations-with-israel/)
* Cluster 6: *Opportunities and conditions for education*
* Cluster 7: *Budget*
  - interesting keywords: health and insurance
  - implies that research makes sense
* Cluster 8: *Feminism*

In [53]:
# Render the prepared pyLDAvis data stored for republicans / 2017.
pyLDAvis.display(lda_display['republicans'][2017])

### Republicans 2018

* Cluster 1: 
  - Trade war between China and USA
  - https://en.wikipedia.org/wiki/China%E2%80%93United_States_trade_war
  - interesting keywords: border, China 
* Cluster 2: *Promises*
* Cluster 3: *Relationship with Russia*
  - interesting keywords: election (referring to elections in Russia), investigation (referring to the poisoning scandal)
* Cluster 4: *Racism*
* Cluster 5: *Budget*
  - interesting keywords: health and education
  - interesting keyword: increase
* Cluster 6: *Environment*
  - interesting keyword: Mueller (water industry)
* Cluster 7: *Criminal*
  - interesting keywords: North Korea, nuclear
* Cluster 8: *Family*
  - event: Judge Kavanaugh was nominated by Trump to become an associate justice of the Supreme Court of the United States
  - why teacher? worker strike https://en.wikipedia.org/wiki/2018%E2%80%932019_education_workers%27_strikes_in_the_United_States


In [54]:
# Render the prepared pyLDAvis data stored for republicans / 2018.
# (A stray leading '-' was removed: it applied unary minus to the display object.)
pyLDAvis.display(lda_display['republicans'][2018])

### Republicans 2019

* Cluster 1: *Trade war with China*
* Cluster 2: *Promises*
* Cluster 3: *Mueller crime*
  - check this thing here: https://en.wikipedia.org/wiki/Mueller_report
* Cluster 4: *Ukraine affair*
  - https://de.wikipedia.org/wiki/Ukraine-Aff%C3%A4re
* Cluster 5: *Politics, pre-election phase*
* Cluster 6: *Love, life, happy*
  - but why Israel???
* Cluster 7: ???
* Cluster 8: *Education and job opportunities*


In [55]:
# Render the prepared pyLDAvis data stored for republicans / 2019.
pyLDAvis.display(lda_display['republicans'][2019])

### Republicans 2020

* Cluster 1: *Covid19*
* Cluster 2: *Covid19*
* Cluster 3: *Election and promises*
* Cluster 5: *Education, budget*

In [56]:
# Render the prepared pyLDAvis data stored for republicans / 2020.
pyLDAvis.display(lda_display['republicans'][2020])

In [57]:
# Render the prepared pyLDAvis data stored for democrates / 2015.
pyLDAvis.display(lda_display['democrates'][2015])

In [58]:
# Render the prepared pyLDAvis data stored for democrates / 2016.
pyLDAvis.display(lda_display['democrates'][2016])

In [59]:
# Render the prepared pyLDAvis data stored for democrates / 2017.
pyLDAvis.display(lda_display['democrates'][2017])

In [60]:
# Render the prepared pyLDAvis data stored for democrates / 2018.
pyLDAvis.display(lda_display['democrates'][2018])

In [61]:
# Render the prepared pyLDAvis data stored for democrates / 2019.
pyLDAvis.display(lda_display['democrates'][2019])

In [62]:
# Render the prepared pyLDAvis data stored for democrates / 2020.
pyLDAvis.display(lda_display['democrates'][2020])

## Different number of topics

In this section we briefly present models for the year 2020 with different numbers of topics.

In [None]:
# NOTE(review): lda_democrates, corpus_democrates and dictionary_democrates are
# not defined in any visible cell — confirm where they are loaded (presumably a
# 2020 model with a different topic count; verify).
# NOTE(review): this rebinds `lda_display`, which earlier cells index as
# lda_display[party][year]; those cells will no longer find the per-party dict
# if re-run after this one.
lda_display = pyLDAvis.gensim_models.prepare(
    lda_democrates, corpus_democrates, dictionary_democrates, sort_topics=False
    )

pyLDAvis.display(lda_display)

  by='saliency', ascending=False).head(R).drop('saliency', 1)


In [None]:
# NOTE(review): lda_republicans, corpus_republicans and dictionary_republicans
# are not defined in any visible cell — confirm where they are loaded
# (presumably a 2020 model with a different topic count; verify).
# NOTE(review): this rebinds `lda_display`, which earlier cells index as
# lda_display[party][year]; those cells will no longer find the per-party dict
# if re-run after this one.
lda_display = pyLDAvis.gensim_models.prepare(
    lda_republicans, corpus_republicans, dictionary_republicans, sort_topics=False
    )

pyLDAvis.display(lda_display)

  by='saliency', ascending=False).head(R).drop('saliency', 1)


In [None]:
# NOTE(review): this cell has the same code as the earlier democrates cell in
# this section; lda_democrates, corpus_democrates and dictionary_democrates are
# not defined in any visible cell — presumably they were reassigned to another
# model between runs; verify.
# NOTE(review): this rebinds `lda_display`, which earlier cells index as
# lda_display[party][year].
lda_display = pyLDAvis.gensim_models.prepare(
    lda_democrates, corpus_democrates, dictionary_democrates, sort_topics=False
    )

pyLDAvis.display(lda_display)

  by='saliency', ascending=False).head(R).drop('saliency', 1)
