# abc

In [1]:
doc_a = "Brocolli is good to eat. My brother likes to eat good brocolli, but not my mother."
doc_b = "My mother spends a lot of time driving my brother around to baseball practice."
doc_c = "Some health experts suggest that driving may cause increased tension and blood pressure."
doc_d = "I often feel pressure to perform well at school, but my mother never seems to drive my brother to do better."
doc_e = "Health professionals say that brocolli is good for your health."

# compile sample documents into a list
doc_set = [doc_a, doc_b, doc_c, doc_d, doc_e]

# Tokenization

In [2]:
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r'\w+')

In [43]:
raw = doc_a.lower()
tokens = tokenizer.tokenize(raw)
print(tokens)

['brocolli', 'is', 'good', 'to', 'eat', 'my', 'brother', 'likes', 'to', 'eat', 'good', 'brocolli', 'but', 'not', 'my', 'mother']


# Stop words

In [5]:
from nltk.corpus import stopwords
# create English stop words list
en_stop = stopwords.words('english')

In [6]:
# remove stop words from tokens
stopped_tokens = [i for i in tokens if not i in en_stop]

print(stopped_tokens)

['brocolli', 'good', 'eat', 'brother', 'likes', 'eat', 'good', 'brocolli', 'mother']


# Stemming

In [7]:
from nltk.stem.porter import PorterStemmer

# Create p_stemmer of class PorterStemmer
p_stemmer = PorterStemmer()

In [24]:
# stem token
texts = [p_stemmer.stem(i) for i in stopped_tokens]
print(texts)

['brocolli', 'good', 'eat', 'brother', 'like', 'eat', 'good', 'brocolli', 'mother']


# Constructing a document-term matrix

In [25]:
#!pip install gensim

In [26]:
a = []
a.append(texts)
a

[['brocolli',
  'good',
  'eat',
  'brother',
  'like',
  'eat',
  'good',
  'brocolli',
  'mother']]

In [30]:
from gensim import corpora, models

dictionary = corpora.Dictionary(a)

In [31]:
corpus = [dictionary.doc2bow(text) for text in a]

In [32]:
print(corpus[0])

[(0, 2), (1, 1), (2, 2), (3, 2), (4, 1), (5, 1)]


# Applying the LDA model

In [35]:
ldamodel = models.ldamodel.LdaModel(corpus, num_topics=3, id2word = dictionary, passes=20)

# Examining the result

In [40]:
print(ldamodel.print_topics(num_topics=3, num_words=3))

[(0, '0.167*"mother" + 0.167*"like" + 0.167*"brother"'), (1, '0.212*"brocolli" + 0.212*"eat" + 0.212*"good"'), (2, '0.167*"good" + 0.167*"mother" + 0.167*"like"')]


In [50]:
ldamodel = models.ldamodel.LdaModel(corpus, num_topics=2, id2word = dictionary, passes=20)

b = ldamodel.print_topics(num_topics=2, num_words=4)

In [51]:
len(b)

2

In [55]:
for i in b:
    print(i)

(0, '0.167*"brocolli" + 0.167*"eat" + 0.167*"good" + 0.167*"like"')
(1, '0.209*"good" + 0.209*"eat" + 0.209*"brocolli" + 0.125*"brother"')


-----

# My Example

In [59]:
doc_f = "The decision to ban lawmaker Eddie Chu Hoi-dick from running in a rural representative election was based on a shaky argument that could be struck down in court, according to leading legal scholars, who also called on Hong Kong’s courts to clarify the vagueness in election laws. Johannes Chan Man-mun, the former law dean of the University of Hong Kong, was speaking on Sunday after Chu was told he would not be allowed to run for a post as a local village’s representative. Returning officer Enoch Yuen Ka-lok pointed to Chu’s stance on Hong Kong independence and said the lawmaker had dodged his questions on his political beliefs. Yuen took this to imply that Chu supported the possibility of Hong Kong breaking with Beijing in the future. Chan, however, said Chu’s responses to the returning officer were open to interpretation. The legal scholar did not believe they met the standard of giving the election officer “cogent, clear and compelling” evidence as required by the precedent set in the case of Andy Chan Ho-tin. Andy Chan was barred from standing in a Legislative Council by-election in New Territories West in 2016 because of his political beliefs. According to Section 24 of the Rural Representative Election Ordinance, candidates are required to declare their allegiance to the Hong Kong Special Administrative Region and to state they will uphold the Basic Law, Hong Kong’s mini-constitution, when filing their application. The allegiance requirement was written into law in 2003, mirroring clauses in the rules for the Legco and district council elections, but it had never been applied by an election officer. The situation changed after separatist Andy Chan lost his election appeal in February this year, with the courts saying returning officers could ban candidates who held political views that ran contrary to the Basic Law. While the landmark ruling was concerned only with Legco elections, Johannes Chan said, after Chu’s case, returning officers for other elections could have similar powers to ban candidates from running, including in the district council elections next year. Gladys Li, the lawyer who represented Andy Chan, said the ruling would be binding on returning officers for other elections. Eric Cheung Tat-ming, another legal scholar at HKU, said Yuen had provided weak reasons for disqualifying Chu. He agreed that there will be room for Chu to launch an appeal. “The logic has become – if your interpretation of the Basic Law is different from the government’s, it means you have no intention of upholding the Basic Law,” Cheung said. He also said Hong Kong courts must clarify the vagueness in election laws and process such appeals more quickly. Stephen Fisher, the former deputy home affairs secretary who led the government’s effort to formalise rural representative elections under the ordinance, said it was “common sense” that rural representatives had to uphold allegiance to Hong Kong. “The village representatives are also elected by people, and they are empowered to identify who the indigenous villagers are,” Fisher said before Chu’s disqualification. “So it’s normal that the legal drafting [of the ordinance] follows the law on Legislative Council and district council elections.” Fisher, who would not comment on Chu’s case, said it would have been “unthinkable” for anyone back then to have imagined a candidate being disqualified for their political views. “The requirement was written there, but it was never contentious,” Fisher said. Chu was disqualified by Yuen because he had “defended independence as an option to Hongkongers” in a statement in 2016. Pressed twice by the returning officer to clarify his position, Chu would say only that he did not support Hong Kong’s independence, but added that he would support another’s right to peacefully advocate it. Johannes Chan said Chu’s political stance was open to interpretation, and the election officer could hardly fulfil the criteria for providing “cogent, clear and compelling” evidence to disqualify him. “At best, we could argue Chu’s reply to the officer was vague about self-determination – even the returning officer himself confessed Chu was only ‘implicitly’ confirming independence as an option,” he said. “But we can’t take a candidate’s silence as his stance. That would have jumped many, many steps.” The decision on Sunday would also create a “conflicting” situation over Chu's political allegiance, Chan added, since the lawmaker remained in office but was disqualified in a separate election. Both Chan and Li said how the returning officer had come to the disqualification might require clarification in any future court ruling. “It was as if they [government officials] could read your mind,” Li said. “The court still has not clarified how far back election officials can look – such as in this case, could we go back to statements Chu made two years ago?” Chan asked."

# Tokenization

In [60]:
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r'\w+')

In [62]:
my_raw = doc_f.lower()
my_tokens = tokenizer.tokenize(my_raw)
#print(my_tokens)

# Stop words

In [64]:
from nltk.corpus import stopwords
# create English stop words list
eng_stop = stopwords.words('english')

In [67]:
# remove stop words from tokens
my_stopped_tokens = [i for i in my_tokens if not i in eng_stop]

#print(my_stopped_tokens)

# Stemming

In [68]:
from nltk.stem.porter import PorterStemmer

# Create p_stemmer of class PorterStemmer
p_stemmer = PorterStemmer()

In [71]:
# stem token
my_texts = [p_stemmer.stem(i) for i in my_stopped_tokens]
#print(texts)

In [92]:
my_texts_list = []
#my_texts_list.append(my_texts)
my_texts_list.append(my_stopped_tokens)
#my_texts_list

In [93]:
from gensim import corpora, models

my_dictionary = corpora.Dictionary(my_texts_list)

In [94]:
my_corpus = [my_dictionary.doc2bow(text) for text in my_texts_list]

In [95]:
corpus[0]

[(0, 2), (1, 1), (2, 2), (3, 2), (4, 1), (5, 1)]

# Applying the LDA model

In [117]:
my_ldamodel = models.ldamodel.LdaModel(my_corpus, num_topics=3, id2word = my_dictionary, passes=20)

In [120]:
result = my_ldamodel.print_topics(num_topics=3, num_words=3)

In [121]:
result

[(0, '0.004*"chu" + 0.004*"election" + 0.004*"said"'),
 (1, '0.004*"said" + 0.004*"chu" + 0.004*"election"'),
 (2, '0.031*"chu" + 0.027*"said" + 0.020*"chan"')]