# Issue Trend Analysis and Issue Tracking

----------------------------------------------------

### Imports & Statics

In [1]:
import csv, pickle
import utils
from utils import Corpus, Issue, Extractor, IssueModel, EventModel
import os, sys
import warnings

# Suppress all warnings for the rest of the session (keeps the logged output below readable).
warnings.filterwarnings('ignore')

# Calendar years to analyse; one Corpus is built per year.
years = [2015, 2016, 2017]
# Number of LDA topics used for the per-year issue models (Part 1).
num_issues = 50
# Number of LDA topics used for the per-issue event models (Part 2).
num_events = 50
# Value passed as k to Extractor.extract in the keyword-extraction cells.
num_keywords = 10

----------------------------------------------------

## Part 0: Preprocessing

### Clean articles: lemmatize, remove stopwords (Already Done)
**_Caution!_ Involves multiprocessing**

In [2]:
# utils.clean_articles()

### Detect Entities: IBM Watson NLU (Already Done)
**_Caution!_ Involves multiprocessing**

In [3]:
# utils.build_watson()

## Part 1: Issue Trend Analysis

#### Initialize Corpuses

In [4]:
# Map each calendar year to its Corpus. Corpus is given the year as an
# offset from 2015 (i.e. 0, 1, 2 for the configured years).
corpus = {year: Corpus(year=year - 2015) for year in years}

#### Build Corpuses: Load cleaned articles, build phrasers, dictionary, and BOWs

In [5]:
# Build every yearly corpus in turn, bracketing each build with a
# start/finish banner so the interleaved log output can be attributed.
for year in years:
    banner = "Corpus " + str(year)
    print(banner + ":")
    corpus[year].build_corpus()
    print(banner + " Done\n")


Corpus 2015:
building corpus...
collecting articles...
building phrasers...
bigram train finished! 7.40 seconds
trigram train finished! 12.16 seconds
building dictionary...
building bag of words...


100%|██████████| 7155/7155 [00:03<00:00, 2038.94it/s]


Corpus 2015 Done

Corpus 2016:
building corpus...
collecting articles...
building phrasers...
bigram train finished! 7.98 seconds
trigram train finished! 12.99 seconds
building dictionary...
building bag of words...


100%|██████████| 7480/7480 [00:03<00:00, 1881.52it/s]


Corpus 2016 Done

Corpus 2017:
building corpus...
collecting articles...
building phrasers...
bigram train finished! 8.76 seconds
trigram train finished! 14.80 seconds
building dictionary...
building bag of words...


100%|██████████| 9117/9117 [00:04<00:00, 2159.72it/s]


Corpus 2017 Done



#### Extract Keywords from each Article using tf-idf

In [6]:
# For each year: fit the tf-idf model, attach an Extractor to the corpus and
# run keyword extraction with k=num_keywords.
for year in years:
    banner = "Corpus " + str(year)
    year_corpus = corpus[year]
    print(banner + ":")
    year_corpus.build_tfidf()
    year_corpus.extractor = Extractor(year_corpus)
    year_corpus.extractor.extract(k=num_keywords)
    print(banner + " Done\n")

Corpus 2015:
building tf-idf model...
tfidf finished! 0.07 seconds
extracting keywords...


100%|██████████| 7155/7155 [00:04<00:00, 1682.06it/s]


Corpus 2015 Done

Corpus 2016:
building tf-idf model...
tfidf finished! 0.07 seconds
extracting keywords...


100%|██████████| 7480/7480 [00:04<00:00, 1680.44it/s]


Corpus 2016 Done

Corpus 2017:
building tf-idf model...
tfidf finished! 0.08 seconds
extracting keywords...


100%|██████████| 9117/9117 [00:05<00:00, 1712.40it/s]

Corpus 2017 Done






#### Save Corpuses

In [7]:
# Checkpoint: persist the per-year corpora (tf-idf built, keywords extracted) to corpus_ready.bin
# so the LDA step can be rerun without repeating the preprocessing above.
utils.save(corpus, filename='corpus_ready.bin')

saving corpus_ready.bin...
corpus_ready.bin saved


#### Load corpus

In [2]:
# Resume point: restore the corpora from the corpus_ready.bin checkpoint.
# The imports/statics cell must have been run first (years, num_issues are used below).
# NOTE(review): utils.load presumably unpickles the file — only load checkpoints you produced yourself.
corpus = utils.load(filename='corpus_ready.bin')

loading corpus_ready.bin...
corpus_ready.bin loaded


#### Build LDA model, cluster articles into issues

In [3]:
# For each year: train an LDA model with num_issues topics, wrap it in an
# IssueModel and cluster the year's articles into issues.
# NOTE(review): LDA training is stochastic and no seed is visible here —
# confirm whether build_lda fixes one if results must be reproducible.
for year in years:
    banner = "Corpus " + str(year)
    year_corpus = corpus[year]
    print(banner + ":")
    year_corpus.build_lda(num_topics=num_issues)
    year_corpus.issue_model = IssueModel(corpus=year_corpus, model=year_corpus.lda)
    year_corpus.issue_model.build_issues()
    print(banner + " Done\n")

Corpus 2015:
building LDA model...
LDA finished! 8.57 seconds
building issues...


100%|██████████| 7155/7155 [00:05<00:00, 1238.60it/s]


extracting keywords...


100%|██████████| 50/50 [00:00<00:00, 2583.46it/s]


Corpus 2015 Done

Corpus 2016:
building LDA model...
LDA finished! 9.25 seconds
building issues...


100%|██████████| 7480/7480 [00:06<00:00, 1215.47it/s]


extracting keywords...


100%|██████████| 50/50 [00:00<00:00, 3022.62it/s]


Corpus 2016 Done

Corpus 2017:
building LDA model...
LDA finished! 10.00 seconds
building issues...


100%|██████████| 9117/9117 [00:07<00:00, 1292.78it/s]


extracting keywords...


100%|██████████| 50/50 [00:00<00:00, 998.79it/s]

Corpus 2017 Done






#### Save Corpuses

In [4]:
# Checkpoint: persist the corpora together with their LDA and issue models to corpus_done.bin.
utils.save(corpus, filename='corpus_done.bin')

saving corpus_done.bin...
corpus_done.bin saved


#### Init Issues (for Part 2)

In [5]:
def _first_ranked_issue(issue_model):
    """Wrap the first entry of issue_model.sorted_issues in an Issue.

    The issue id is the first field of that entry; it indexes both the
    issue's articles and its keywords.
    NOTE(review): presumably sorted_issues is ordered by descending score,
    making this the top-trending issue — confirm in utils.
    """
    top_issue_id = issue_model.sorted_issues[0][0]
    return Issue(
        articles=issue_model.issues[top_issue_id],
        keywords=issue_model.keywords[top_issue_id],
    )


# One Issue per year, in the same order as `years`.
issues = [_first_ranked_issue(corpus[year].issue_model) for year in years]

#### Save Issues (for Part 2)

In [6]:
# Hand-off to Part 2: persist the selected per-year issues to issues_init.bin.
utils.save(issues, filename='issues_init.bin')

saving issues_init.bin...
issues_init.bin saved


----------------------------------------------------

### Show Results

#### Load corpus

In [2]:
# Restore the fully processed corpora (including LDA and issue models) from corpus_done.bin.
# NOTE(review): utils.load presumably unpickles the file — only load checkpoints you produced yourself.
corpus = utils.load(filename='corpus_done.bin')

loading corpus_done.bin...
corpus_done.bin loaded


#### Select year to show

In [3]:
# Year whose results the following cells display; must be one of the keys of `corpus` (2015, 2016 or 2017).
show_year = 2016

#### Show top trending issues  

In [4]:
# Print one line per issue for the selected year: id, score, N and keywords
# (N is presumably the number of articles in the issue — confirm in utils).
corpus[show_year].issue_model.show_top_issues()

ID:   5 Score: 1109.61 N:  539 Keywords:  Saenuri, bill, impeachment, Moon, Ahn, Chung, Minjoo, Minjoo_Party, constituency, Opposition_party
ID:  14 Score: 946.36 N:  275 Keywords:  N._Korea, sanction, N_K, S._Korea, human_right, Obama, US, NK, resolution, White_House
ID:   1 Score: 767.23 N:  187 Keywords:  Trump, comfort_woman, Japan, foundation, S._Korea, sex_slavery, Iran, victim, sexual_slavery, sex_slave
ID:  19 Score: 647.80 N:  103 Keywords:  S._Korea, deploy, N_K, nuclear_armament, extended_deterrence, nuclear_weapon, Taurus_missile, military_GPS, exercise, deterrence
ID:  45 Score: 593.15 N:  153 Keywords:  N_K, N._Korea, submarine, site, nuke_test, SLBM_test, SLBM, drill, nuclear_warhead, provocation
ID:   7 Score: 588.85 N:   99 Keywords:  complex, N_K, park, S._Korea, nuclear_envoy, Yun, N._Korea, nuke_test, Hwang, talk
ID:  28 Score: 524.26 N:   50 Keywords:  murder, Patterson, defendant, stab, student, arrest, baby, suspect, fortress, Hwaseong_Fortress
ID:  47 Score: 514

#### Show Articles from Top Issues 

In [5]:
# Print each issue's summary line followed by article titles;
# k=5 appears to cap the number of titles listed per issue (see the output below).
corpus[show_year].issue_model.show_issues(k=5)

ID:   5 Score: 1109.61 N:  539 Keywords:  Saenuri, bill, impeachment, Moon, Ahn, Chung, Minjoo, Minjoo_Party, constituency, Opposition_party
	 0 	 Former leader quits opposition party
	 1 	 [Newsmaker] Park’s lame duck deadline looms
	 2 	 Constitutional reform debate resurfaces
	 3 	 General elections mired in uncertainty without constituencies
	 4 	 Park asks political parties to embrace reform
ID:  14 Score: 946.36 N:  275 Keywords:  N._Korea, sanction, N_K, S._Korea, human_right, Obama, US, NK, resolution, White_House
	 0 	 Quake in North Korea suspected to be 'explosion': report
	 1 	 North Korea announces successful test of hydrogen bomb
	 2 	 China party paper urges North Korea to change 'nuclear path'
	 3 	 Park, Obama talk over N.K. nuke test
	 4 	 Park, Obama agree to closely work together to adopt strong U.N. sanctions against North
ID:   1 Score: 767.23 N:  187 Keywords:  Trump, comfort_woman, Japan, foundation, S._Korea, sex_slavery, Iran, victim, sexual_slavery, sex_slave

#### View LDA Model

In [6]:
# NOTE(review): newer pyLDAvis releases expose this module as pyLDAvis.gensim_models —
# confirm against the installed version before rerunning.
import pyLDAvis.gensim
# Enable inline rendering of pyLDAvis visualisations in the notebook.
pyLDAvis.enable_notebook()

In [8]:
# Prepare the pyLDAvis view of the selected year's LDA model from its bag-of-words
# corpus and dictionary; as the cell's last expression the result is displayed inline.
pyLDAvis.gensim.prepare(corpus[show_year].lda, corpus[show_year].get_bows(), corpus[show_year].dict)

In [7]:
# Optional: uncomment to store the prepared visualisation data in issues_lda_data.bin.
# The "saving .../saved" output recorded below this cell comes from an earlier run in which
# these lines were active.
# issues_lda_data = pyLDAvis.gensim.prepare(corpus[show_year].lda, corpus[show_year].get_bows(), corpus[show_year].dict)
# utils.save(issues_lda_data, 'issues_lda_data.bin')

saving issues_lda_data.bin...
issues_lda_data.bin saved


In [28]:
# Optional: serve the visualisation over HTTP instead of rendering it inline.
# NOTE(review): `data` is not defined anywhere in this notebook and the IP is hard-coded —
# assign the prepared data and adjust ip/port before uncommenting.
# pyLDAvis.show(data=data, ip='143.248.137.26', port=9000)

----------------------------------------------------

## Part 2: Issue Tracking

### Imports & Statics

In [2]:
import csv, pickle
import utils
from utils import Corpus, Issue, Extractor, IssueModel, EventModel
import os, sys
import warnings

# Suppress all warnings for the rest of the session (keeps the logged output below readable).
warnings.filterwarnings('ignore')

# Same settings as in Part 1, repeated so Part 2 can be run on a fresh kernel.
# Calendar years that were analysed in Part 1.
years = [2015, 2016, 2017]
# Number of LDA topics used for the per-year issue models (Part 1).
num_issues = 50
# Number of LDA topics used for the per-issue event models (Part 2).
num_events = 50
# Value passed as k to Extractor.extract in the keyword-extraction cells.
num_keywords = 10

#### Load Issues

In [3]:
# Load the per-year issues that Part 1 stored in issues_init.bin.
# NOTE(review): utils.load presumably unpickles the file — only load checkpoints you produced yourself.
issues = utils.load('issues_init.bin')

loading issues_init.bin...
issues_init.bin loaded


#### Build Issues

In [4]:
# Build every issue in turn (phrasers, dictionary and bag of words, per the log below),
# with 1-based start/finish banners around each build.
for i, issue in enumerate(issues):
    banner = "Issue " + str(i + 1)
    print(banner + ":")
    issue.build_issue()
    print(banner + " Done\n")

Issue 1:
building issue...
building phrasers...
bigram train finished! 0.30 seconds
trigram train finished! 0.48 seconds
building dictionary...
building bag of words...


100%|██████████| 351/351 [00:00<00:00, 1848.43it/s]


Issue 1 Done

Issue 2:
building issue...
building phrasers...
bigram train finished! 0.79 seconds
trigram train finished! 1.19 seconds
building dictionary...
building bag of words...


100%|██████████| 539/539 [00:00<00:00, 1718.45it/s]


Issue 2 Done

Issue 3:
building issue...
building phrasers...
bigram train finished! 0.94 seconds
trigram train finished! 1.41 seconds
building dictionary...
building bag of words...


100%|██████████| 744/744 [00:00<00:00, 1739.96it/s]

Issue 3 Done






#### Extract keywords from each Article using tf-idf

In [5]:
# For each issue: fit the tf-idf model, attach an Extractor to the issue and
# run keyword extraction with k=num_keywords.
for i, issue in enumerate(issues):
    banner = "Issue " + str(i + 1)
    print(banner + ":")
    issue.build_tfidf()
    issue.extractor = Extractor(issue)
    issue.extractor.extract(k=num_keywords)
    print(banner + " Done\n")

Issue 1:
building tf-idf model...
tfidf finished! 0.01 seconds
extracting keywords...


100%|██████████| 351/351 [00:00<00:00, 2054.35it/s]


Issue 1 Done

Issue 2:
building tf-idf model...
tfidf finished! 0.01 seconds
extracting keywords...


100%|██████████| 539/539 [00:00<00:00, 1287.19it/s]


Issue 2 Done

Issue 3:
building tf-idf model...
tfidf finished! 0.01 seconds
extracting keywords...


100%|██████████| 744/744 [00:00<00:00, 1573.53it/s]

Issue 3 Done






#### Save Issues

In [6]:
# Checkpoint: persist the issues (tf-idf built, keywords extracted) to issues_ready.bin.
utils.save(issues, filename='issues_ready.bin')

saving issues_ready.bin...
issues_ready.bin saved


#### Load Issues

In [94]:
# Resume point: restore the issues from the issues_ready.bin checkpoint.
# NOTE(review): utils.load presumably unpickles the file — only load checkpoints you produced yourself.
issues = utils.load(filename='issues_ready.bin')

loading issues_ready.bin...
issues_ready.bin loaded


#### Build LDA model, cluster articles into events

In [95]:
# For each issue: train an LDA model with num_events topics, wrap it in an
# EventModel and cluster the issue's articles into events (threshold=0.5).
# NOTE(review): LDA training is stochastic and no seed is visible here —
# confirm whether build_lda fixes one if results must be reproducible.
for i, issue in enumerate(issues):
    banner = "Issue " + str(i + 1)
    print(banner + ":")
    issue.build_lda(num_topics=num_events)
    issue.event_model = EventModel(issue=issue, model=issue.lda)
    issue.event_model.build_events(threshold=0.5)
    print(banner + " Done\n")

Issue 1:
building LDA model...
LDA finished! 0.67 seconds
building events...


100%|██████████| 351/351 [00:00<00:00, 1124.75it/s]


extracting keywords...


100%|██████████| 50/50 [00:00<00:00, 8466.84it/s]


Issue 1 Done

Issue 2:
building LDA model...
LDA finished! 1.20 seconds
building events...


100%|██████████| 539/539 [00:00<00:00, 908.30it/s]


extracting keywords...


100%|██████████| 50/50 [00:00<00:00, 7778.75it/s]


Issue 2 Done

Issue 3:
building LDA model...
LDA finished! 1.51 seconds
building events...


100%|██████████| 744/744 [00:00<00:00, 1097.80it/s]


extracting keywords...


100%|██████████| 50/50 [00:00<00:00, 7536.39it/s]

Issue 3 Done






#### Divide events into set of independent events

In [96]:
# One build_independents threshold per issue, in the same order as `issues`.
# The list is sized from len(issues) rather than written out as a literal, so the
# threshold[i] lookup below cannot raise IndexError when the number of issues
# changes. Individual entries can still be overridden, e.g. threshold[1] = 0.35.
threshold = [0.40] * len(issues)
for i, issue in enumerate(issues):
    banner = "Issue " + str(i + 1)
    event_model = issue.event_model
    print(banner + ":")
    # The call order below is kept exactly as in the original cell; later steps
    # presumably depend on the results of earlier ones — confirm in utils.
    event_model.build_independents(threshold=threshold[i])
    event_model.filter_events(k=5)
    event_model.build_event_times()
    event_model.build_sorted_independents()
    event_model.build_event_details()
    print(banner + " Done\n")

Issue 1:
building event times...


100%|██████████| 5/5 [00:00<00:00, 1769.60it/s]


building event details...


100%|██████████| 5/5 [00:00<00:00, 845.76it/s]


Issue 1 Done

Issue 2:
building event times...


100%|██████████| 5/5 [00:00<00:00, 4032.21it/s]


building event details...


100%|██████████| 5/5 [00:00<00:00, 408.87it/s]


Issue 2 Done

Issue 3:
building event times...


100%|██████████| 5/5 [00:00<00:00, 1089.09it/s]


building event details...


100%|██████████| 5/5 [00:00<00:00, 902.31it/s]

Issue 3 Done






In [97]:
# Dump each issue's sorted independent-event groups. Every inner list is one
# group of event ids; a multi-id group such as [42, 44] is rendered as the
# chain "42 -> 44" by the issue summary further down.
for i, issue in enumerate(issues):
    banner = "Issue " + str(i + 1)
    print(banner + ":")
    print(issue.event_model.sorted_independents)
    print(banner + " Done\n")

Issue 1:
[[28], [44], [18], [26], [19]]
Issue 1 Done

Issue 2:
[[49], [44, 18], [9], [31]]
Issue 2 Done

Issue 3:
[[0], [1], [42, 44], [33]]
Issue 3 Done



#### Save Issues

In [98]:
# Checkpoint: persist the issues together with their LDA and event models to issues_done.bin.
utils.save(issues, 'issues_done.bin')

saving issues_done.bin...
issues_done.bin saved


----------------------------------------------------

### Show Result

#### Load Issues

In [9]:
# Restore the fully processed issues (including event models) from issues_done.bin.
# NOTE(review): utils.load presumably unpickles the file — only load checkpoints you produced yourself.
issues = utils.load('issues_done.bin')

loading issues_done.bin...
issues_done.bin loaded


#### Select Issue

In [10]:
# Index into `issues` of the issue to display. Issues were created in the order of
# `years`, so 0, 1, 2 correspond to 2015, 2016, 2017.
# Caution: `i` is also the loop variable of the build cells above, so rerunning any
# of them overwrites this selection.
i = 2

#### Show Issue Keywords

In [101]:
# Each entry of issues[i].keywords is indexable; its first element is the keyword text.
print(', '.join(entry[0] for entry in issues[i].keywords))

Trump, N._Korea, S._Korea, Tillerson, NK, Mattis, THAAD, White_House, dialogue, envoy


#### Show Top Events  

In [102]:
# Print one line per event of the selected issue: id, score, N and keywords
# (N is presumably the number of articles in the event — confirm in utils).
issues[i].event_model.show_top_events()

ID:   1 Score: 137.92 N:   72 Keywords:  Pence, Hwang, Mike_Pence, Vice_President, summit_Xi, proposal, S._Korea, expert, father, US-China
ID:   0 Score:  68.38 N:   36 Keywords:  prepare, Tillerson, Xi, totally, Americans, offer_talk, envoy, President_Xi, Hong, special_envoy
ID:  33 Score:  60.47 N:   36 Keywords:  Mattis, S._Korea, Song, Military, Nauert, State_Department, US_senator, Corker, Bob_Corker, senator
ID:  44 Score:  54.11 N:   37 Keywords:  S._Korea, n't_think, Mattis, Song_Mattis, envoy, Kang, North_Koreans, visit_DMZ, Korean, diplomatic_effort
ID:  42 Score:  53.76 N:   29 Keywords:  S._Korea, Choi, transparency, fix, professor, scandal, defense_spending, unity_N., Hwang, N._Korean
ID:  40 Score:  49.06 N:   27 Keywords:  NK_provocation, Kang, Korea-US_alliance, Tillerson, Chung, channel, Russia, FTA, Yun, objective
ID:  19 Score:  40.05 N:   26 Keywords:  Moon_Abe, level, Mattis, South_Korean_leader, increase_pressure, Shinsuke_Sugiyama, envoy, S._Korea, project, trila

#### Show Articles from Top Events 

In [103]:
# Print each event's summary line followed by article titles;
# k=5 appears to cap the number of titles listed per event (see the output below).
issues[i].event_model.show_events(k=5)

ID:   1 Score: 137.92 N:   72 Keywords:  Pence, Hwang, Mike_Pence, Vice_President, summit_Xi, proposal, S._Korea, expert, father, US-China
	 0 	 Korea mulls disclosing defense spending to deflect US' calls for burden sharing
	 1 	 [HERALD INTERVIEW] ‘US-China clash could reset inter-Korean ties’
	 2 	 Trump says ‘100 percent’ with Seoul on NK
	 3 	 Tillerson says Korea already paying 'large amounts' for US troops
	 4 	 Trump reaffirms US security commitment to Japan after N.K. missile launch
ID:   0 Score:  68.38 N:   36 Keywords:  prepare, Tillerson, Xi, totally, Americans, offer_talk, envoy, President_Xi, Hong, special_envoy
	 0 	 Tillerson urges China to 'use all available tools' to rein in N. Korea
	 1 	 N. Korea, THAAD key topics for Trump's summit with Xi: White House
	 2 	 Trump again calls NK problem ‘mess’
	 3 	 Trump to host China's Xi at Florida resort next week
	 4 	 [Herald Interview] ‘US not afraid to strike North Korea in war’
ID:  33 Score:  60.47 N:   36 Keywords:  Mat

#### Show Issue Summary

In [104]:
# Print the issue keywords, the chain(s) of independent events, and for every event its
# keywords, time and entities (num_entities=10 entities are listed per event in the output below).
issues[i].event_model.show_issue_summary(num_entities=10)

Issue Keywords: 
	 Trump, N._Korea, S._Korea, Tillerson, NK, Mattis, THAAD, White_House, dialogue, envoy

Events: 
	 0
	 1
	 42 -> 44
	 33


Event ID:   0
Event Keywords: 
	 prepare, Tillerson, Xi, totally, Americans, offer_talk, envoy, President_Xi, Hong, special_envoy
Time: 2017-08-01
Entities: 
	North Korea, LOCATION
	US, LOCATION
	Donald Trump, PERSON
	China, LOCATION
	South Korea, LOCATION
	Pyongyang, LOCATION
	President Donald Trump, PERSON
	President, JOBTITLE
	president, JOBTITLE
	Beijing, LOCATION

Event ID:   1
Event Keywords: 
	 Pence, Hwang, Mike_Pence, Vice_President, summit_Xi, proposal, S._Korea, expert, father, US-China
Time: 2017-06-25
Entities: 
	North Korea, LOCATION
	South Korea, LOCATION
	US, LOCATION
	Seoul, LOCATION
	Donald Trump, PERSON
	China, LOCATION
	Pyongyang, LOCATION
	President Donald Trump, PERSON
	President, JOBTITLE
	Washington, LOCATION

Event ID:  42
Event Keywords: 
	 S._Korea, Choi, transparency, fix, professor, scandal, defense_spending, unity_N.,

#### View LDA Model

In [11]:
# NOTE(review): newer pyLDAvis releases expose this module as pyLDAvis.gensim_models —
# confirm against the installed version before rerunning.
import pyLDAvis.gensim
# Enable inline rendering of pyLDAvis visualisations in the notebook.
pyLDAvis.enable_notebook()

In [12]:
# Prepare the pyLDAvis view of the selected issue's LDA (event) model from its bag-of-words
# corpus and dictionary; as the cell's last expression the result is displayed inline.
pyLDAvis.gensim.prepare(issues[i].lda, issues[i].get_bows(), issues[i].dict)

In [54]:
# Optional: uncomment to store the prepared visualisation data in events_lda_data.bin.
# The "saving .../saved" output recorded below this cell comes from an earlier run in which
# these lines were active.
# events_lda_data = pyLDAvis.gensim.prepare(issues[i].lda, issues[i].get_bows(), issues[i].dict)
# utils.save(events_lda_data, 'events_lda_data.bin')

saving events_lda_data.bin...
events_lda_data.bin saved


In [37]:
# Optional: serve the visualisation over HTTP instead of rendering it inline.
# NOTE(review): `data` is not defined anywhere in this notebook and the IP is hard-coded —
# assign the prepared data and adjust ip/port before uncommenting.
# pyLDAvis.show(data=data, ip='143.248.137.26', port=9000)