# Topic Modeling

In [1]:
import io
import re

import nltk
import numpy as np
import pandas as pd
import PyPDF2
import requests
from nltk.corpus import stopwords
from nltk.corpus import stopwords as nltk_stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from PyPDF2 import PdfFileReader
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
url = 'https://www.nkeconwatch.com/nk-uploads/KJU-speech-2013.pdf'

# Download the PDF and hold it in memory as a seekable byte stream.
# The timeout keeps the cell from hanging indefinitely, and raise_for_status()
# reports an HTTP failure here instead of passing an error page to the parser.
r = requests.get(url, timeout=30)
r.raise_for_status()
f = io.BytesIO(r.content)

# Extract the text of every page and collect its newline-separated fragments.
# NOTE(review): PdfFileReader / getNumPages / getPage / extractText are the
# legacy PyPDF2 names -- confirm the installed PyPDF2 version still provides them.
reader = PdfFileReader(f)
number_of_pages = reader.getNumPages()

content = []
for page_number in range(number_of_pages):
    page = reader.getPage(page_number)
    content += page.extractText().split('\n')

    
def listToString(s):
    """Concatenate the strings in ``s`` into one string, with no separator.

    Parameters
    ----------
    s : iterable of str
        Fragments to glue together, in order.

    Returns
    -------
    str
        All fragments concatenated; ``""`` when ``s`` is empty.
    """
    # str.join builds the result in a single pass instead of repeated +=.
    return "".join(s)

# Collapse the extracted fragments into one continuous string.
speech = listToString(content)

# Break the speech into sentence-sized pieces on the ". " separator.
text = speech.split('. ')

In [3]:
# One row per sentence, held in a single 'text' column.
df = pd.DataFrame({'text': text})
df

Unnamed: 0,text
0,New Year Address Made by Kim Jong Un Pyongyang...
1,"The following is its full text: Dear comrades,..."
2,Reflecting the unanimous reverence of all the ...
3,I extend my warm greetings to the service pers...
4,My New Year greetings also go to our compatrio...
...,...
110,The present international situation demands th...
111,"By holding fast to the ideals of independence,..."
112,A brighter and broader vista is open for us in...
113,Let us vigorously strive to achieve the prospe...


In [4]:
# Read the English stop-word list through the `nltk_stopwords` alias so this
# cell can be re-run: the result is stored under the name `stopwords` (which
# token_text looks up), and that assignment shadows the imported corpus reader
# of the same name.
stopwords = nltk_stopwords.words('english')

# Characters blanked out by clean_text. The string is spliced into a regex
# character class, which is why the backslash before `]` is already escaped.
punctuation = '!"$%&\'()*+,-./:;<=>?[\\]^_;{|}'

def clean_text(text):
    """Normalise a sentence: lowercase it, blank out punctuation, drop digits.

    Every run of characters taken from the module-level ``punctuation`` string
    is replaced by a single space; every run of digits is removed outright.
    """
    punctuation_run = re.compile('[' + punctuation + ']+')
    digit_run = re.compile('([0-9]+)')

    lowered = text.lower()
    without_punctuation = punctuation_run.sub(' ', lowered)
    return digit_run.sub('', without_punctuation)

# Store the normalised version of each sentence next to the raw text.
df['clean_text'] = df['text'].apply(clean_text)
df

Unnamed: 0,text,clean_text
0,New Year Address Made by Kim Jong Un Pyongyang...,new year address made by kim jong un pyongyang...
1,"The following is its full text: Dear comrades,...",the following is its full text dear comrades ...
2,Reflecting the unanimous reverence of all the ...,reflecting the unanimous reverence of all the ...
3,I extend my warm greetings to the service pers...,i extend my warm greetings to the service pers...
4,My New Year greetings also go to our compatrio...,my new year greetings also go to our compatrio...
...,...,...
110,The present international situation demands th...,the present international situation demands th...
111,"By holding fast to the ideals of independence,...",by holding fast to the ideals of independence ...
112,A brighter and broader vista is open for us in...,a brighter and broader vista is open for us in...
113,Let us vigorously strive to achieve the prospe...,let us vigorously strive to achieve the prospe...


In [5]:
def token_text(text, stop_words=None):
    """Remove stop words from a cleaned sentence.

    Parameters
    ----------
    text : str
        Whitespace-separated words.
    stop_words : iterable of str, optional
        Words to drop. Defaults to the module-level ``stopwords`` list.

    Returns
    -------
    str
        The remaining words joined by single spaces.
    """
    if stop_words is None:
        stop_words = stopwords
    # Set membership avoids scanning the whole list for every word.
    excluded = set(stop_words)
    # split() with no argument collapses runs of whitespace. split(' ') would
    # yield empty tokens for consecutive spaces, and those pass the filter and
    # come back out as doubled spaces.
    kept_words = [word for word in text.split() if word not in excluded]
    return ' '.join(kept_words)

# Stop-word-free version of every cleaned sentence.
df['token_text'] = df['clean_text'].apply(token_text)
df

Unnamed: 0,text,clean_text,token_text
0,New Year Address Made by Kim Jong Un Pyongyang...,new year address made by kim jong un pyongyang...,new year address made kim jong un pyongyang j...
1,"The following is its full text: Dear comrades,...",the following is its full text dear comrades ...,following full text dear comrades officers m...
2,Reflecting the unanimous reverence of all the ...,reflecting the unanimous reverence of all the ...,reflecting unanimous reverence servi ce person...
3,I extend my warm greetings to the service pers...,i extend my warm greetings to the service pers...,extend warm greetings service personnel peopl...
4,My New Year greetings also go to our compatrio...,my new year greetings also go to our compatrio...,new year greetings also go compatriots south ...
...,...,...,...
110,The present international situation demands th...,the present international situation demands th...,present international situation demands ou r r...
111,"By holding fast to the ideals of independence,...",by holding fast to the ideals of independence ...,holding fast ideals independence peac e frien...
112,A brighter and broader vista is open for us in...,a brighter and broader vista is open for us in...,brighter broader vista open us new year victo...
113,Let us vigorously strive to achieve the prospe...,let us vigorously strive to achieve the prospe...,let us vigorously strive achieve prosperity co...


In [6]:
# Single WordNetLemmatizer instance, reused by lemmatize_words for every row.
lemmatizer = WordNetLemmatizer()

def lemmatize_words(text):
    """Lemmatize each whitespace-separated word of ``text`` as a verb.

    Every word is passed to the module-level ``lemmatizer`` with ``pos='v'``;
    the results are returned joined by single spaces.
    """
    lemmas = []
    for token in text.split():
        lemmas.append(lemmatizer.lemmatize(token, pos='v'))
    return ' '.join(lemmas)

# Lemmatize the stop-word-free text row by row; this column is what the
# vectorizer consumes in the next step.
df['lemmatized_text'] = df['token_text'].apply(lemmatize_words)

df

Unnamed: 0,text,clean_text,token_text,lemmatized_text
0,New Year Address Made by Kim Jong Un Pyongyang...,new year address made by kim jong un pyongyang...,new year address made kim jong un pyongyang j...,new year address make kim jong un pyongyang ja...
1,"The following is its full text: Dear comrades,...",the following is its full text dear comrades ...,following full text dear comrades officers m...,follow full text dear comrades officer men her...
2,Reflecting the unanimous reverence of all the ...,reflecting the unanimous reverence of all the ...,reflecting unanimous reverence servi ce person...,reflect unanimous reverence servi ce personnel...
3,I extend my warm greetings to the service pers...,i extend my warm greetings to the service pers...,extend warm greetings service personnel peopl...,extend warm greet service personnel people dev...
4,My New Year greetings also go to our compatrio...,my new year greetings also go to our compatrio...,new year greetings also go compatriots south ...,new year greet also go compatriots south abroa...
...,...,...,...,...
110,The present international situation demands th...,the present international situation demands th...,present international situation demands ou r r...,present international situation demand ou r re...
111,"By holding fast to the ideals of independence,...",by holding fast to the ideals of independence ...,holding fast ideals independence peac e frien...,hold fast ideals independence peac e friendshi...
112,A brighter and broader vista is open for us in...,a brighter and broader vista is open for us in...,brighter broader vista open us new year victo...,brighter broader vista open us new year victor...
113,Let us vigorously strive to achieve the prospe...,let us vigorously strive to achieve the prospe...,let us vigorously strive achieve prosperity co...,let us vigorously strive achieve prosperity co...


In [7]:
# The vectorizer turns each document into a vector of token counts.
# max_df=0.5 ignores terms that occur in more than half of the documents.
# token_pattern is a raw string so the regex escapes (\w, \$, \d, \S) reach the
# regex engine unchanged instead of being treated as invalid string escapes.
vectorizer = CountVectorizer(max_df=0.5, min_df=1, token_pattern=r'\w+|\$[\d.]\S+')

# Learn the vocabulary and build the document-term count matrix.
tf = vectorizer.fit_transform(df['lemmatized_text'])

# tf_feature_names[i] is the word represented by column i of the matrix.
tf_feature_names = vectorizer.get_feature_names_out()

In [10]:
# Number of latent topics the model is asked to extract.
number_of_topics = 8

# A fixed random_state keeps the fitted topics the same from run to run.
model = LatentDirichletAllocation(
    n_components=number_of_topics,
    random_state=0,
)

model.fit(tf)

In [13]:
def display_topics(model, feature_names, no_top_words):
    """Tabulate the highest-weighted words of every topic in a fitted model.

    Parameters
    ----------
    model : object
        Fitted model exposing ``components_``, iterated as one weight vector
        per topic.
    feature_names : sequence
        Word for each column index of ``components_``.
    no_top_words : int
        How many words to list per topic.

    Returns
    -------
    pandas.DataFrame
        Two columns per topic, "Topic N words" and "Topic N weights", ordered
        by descending weight. Weights are strings formatted to one decimal.
    """
    topic_dict = {}
    for topic_idx, topic in enumerate(model.components_):
        # Rank once and reuse for both columns: argsort is ascending, so walk
        # it backwards to take the `no_top_words` largest weights.
        top_indices = topic.argsort()[:-no_top_words - 1:-1]
        topic_dict["Topic %d words" % topic_idx] = [
            '{}'.format(feature_names[i]) for i in top_indices
        ]
        topic_dict["Topic %d weights" % topic_idx] = [
            '{:.1f}'.format(topic[i]) for i in top_indices
        ]
    return pd.DataFrame(topic_dict)
    
num_top_words = 5

# Build the topic table once and keep it. Only a cell's final expression is
# displayed, so an extra unassigned call here would be discarded work.
kju2013 = display_topics(model, tf_feature_names, num_top_words)

kju2013

Unnamed: 0,Topic 0 words,Topic 0 weights,Topic 1 words,Topic 1 weights,Topic 2 words,Topic 2 weights,Topic 3 words,Topic 3 weights,Topic 4 words,Topic 4 weights,Topic 5 words,Topic 5 weights,Topic 6 words,Topic 6 weights,Topic 7 words,Topic 7 weights
0,people,19.1,people,5.1,technology,7.1,party,9.8,party,4.1,reunification,9.1,country,6.1,people,14.7
1,kim,13.1,party,4.1,production,6.1,country,7.6,people,4.1,nation,5.0,build,3.1,country,12.6
2,year,13.0,great,3.1,year,6.1,people,6.8,socialist,3.1,force,4.1,thrive,3.1,make,10.3
3,il,11.1,unity,2.1,national,5.1,general,5.1,build,3.1,country,3.7,military,3.1,year,9.2
4,country,9.5,carry,2.1,economy,5.1,economic,4.1,year,2.3,revolutionary,3.1,production,3.1,great,9.1
