In [1]:
import pandas as pd

In [2]:
# Load the speeches dataset from the CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer='speeches.csv')

In [3]:
# Show the column labels of the loaded frame.
df.columns

Index(['doc_name', 'date', 'transcript', 'president', 'title'], dtype='object')

In [4]:
# Count rows whose doc_name repeats an earlier row; 0 means every name is unique.
df.duplicated(subset=['doc_name']).sum()

0

In [5]:
# Transcripts as plain Python strings.
# Missing transcripts are dropped first: astype(str) would otherwise turn NaN
# into the literal text 'nan', which would then be tokenised as a real word.
# The original row labels are kept, so documents can still be matched to df.
docs = df['transcript'].dropna().astype(str)

In [6]:
# Display the raw transcript strings.
docs

0     TO THE SENATE AND HOUSE OF REPRESENTATIVES OF ...
1     Fellow-Citizens of the Senate and of the House...
2     I have called the Congress into extraordinary ...
3     Mr. Speaker, Mr. President, distinguished gues...
4     To the House of Representatives of the United ...
5     Thank you all. Deputy Secretary England, thank...
6      \r\n\r\nMr. President, Mr. Secretary General,...
7     I think the American public and the American n...
8     Fellow Citizens of the Senate and House of Rep...
9     I have welcomed this opportunity to address th...
10    For the fourth time in the history of the Repu...
11     \r\n\r\nTHE PRESIDENT:  Thank you very much. ...
12    Dr. Nabrit, my fellow Americans:\r\n\r\nI am d...
13    Mr. President, Mr. Speaker, Members of the Con...
14    Fellow Citizens of the Senate and House of Rep...
15    Good evening. As most of you know, I've just r...
16    To the Senate of the United States:\nI transmi...
17    Senator Hatfield, Mr. Chief Justice, Mr. P

In [7]:
# Save the transcripts to a text file, each followed by a newline.
# The encoding is pinned to UTF-8 so the write does not depend on the
# platform's default locale encoding (which can fail on non-ASCII characters).
# Note: transcripts may themselves contain line breaks, so the file is not
# strictly one document per line.
with open('speeches.txt', 'w', encoding='utf-8') as f:
    f.writelines(doc + '\n' for doc in docs)

Preprocessing with spaCy

In [7]:
import spacy
# Load the small English spaCy pipeline used for all preprocessing below.
nlp = spacy.load(name='en_core_web_sm')

In [8]:
# Run every transcript through the spaCy pipeline, producing one Doc per text.
# nlp.pipe streams the texts through the pipeline in batches, which is the
# documented efficient alternative to calling nlp() once per text; the
# resulting Doc objects are the same. The original index and name are kept.
docs = pd.Series(list(nlp.pipe(docs)), index=docs.index, name=docs.name)

In [9]:
# spaCy's English stop-word list (imported here; the filter below relies on
# each token's own is_stop flag rather than on this set directly)
from spacy.lang.en.stop_words import STOP_WORDS

# keep only the tokens that spaCy does not flag as stop words
docs = docs.map(lambda parsed: [tok for tok in parsed if not tok.is_stop])


In [10]:
# Replace each remaining token with its lemma string.
docs = docs.map(lambda tokens: [tok.lemma_ for tok in tokens])

In [11]:
# Display the lemma lists after stop-word removal and lemmatization.
docs

0     [SENATE, HOUSE, REPRESENTATIVES, UNITED, state...
1     [Fellow, -, Citizens, Senate, House, Represent...
2     [call, Congress, extraordinary, session, ,, ,,...
3     [Mr., Speaker, ,, Mr., President, ,, distingui...
4     [House, Representatives, United, States, :, \n...
5     [thank, ., Deputy, Secretary, England, ,, than...
6     [ \r\n\r\n, Mr., President, ,, Mr., Secretary,...
7     [think, american, public, american, newspaper,...
8     [Fellow, Citizens, Senate, House, Representati...
9     [welcome, opportunity, address, historic, body...
10    [fourth, time, history, Republic, Chief, Magis...
11    [ \r\n\r\n, PRESIDENT, :,  , thank, .,  , appr...
12    [Dr., Nabrit, ,, fellow, Americans, :, \r\n\r\...
13    [Mr., President, ,, Mr., Speaker, ,, Members, ...
14    [Fellow, Citizens, Senate, House, Representati...
15    [good, evening, ., know, ,, return, meeting, I...
16    [Senate, United, States, :, \n, transmit, Sena...
17    [Senator, Hatfield, ,, Mr., Chief, Justice

In [12]:
# Drop punctuation, whitespace and any other lemma that is not purely alphabetic.
docs = docs.map(lambda lemmas: list(filter(str.isalpha, lemmas)))

In [13]:
# Normalise every lemma to lower case.
docs = docs.map(lambda lemmas: list(map(str.lower, lemmas)))

In [14]:
from gensim import corpora, models
from gensim.models import CoherenceModel

In [15]:
# Build the gensim dictionary from the cleaned token lists.
dictionary = corpora.Dictionary(documents=docs)

In [16]:
# Bag-of-words corpus: one doc2bow result per document.
corpus = list(map(dictionary.doc2bow, docs))

In [17]:
# Calculate coherence scores for different numbers of topics.
# A fixed seed is passed to every LdaModel so that the scores, the selected
# topic count and the final model are reproducible across runs.
RANDOM_STATE = 42
# Candidate topic counts, defined once so the search loop and the lookup of
# the best candidate cannot drift apart.
topic_counts = range(2, 50, 2)

coherence_scores = []
for num_topics in topic_counts:
    # Search models use gensim's default number of passes to keep the sweep cheap.
    lda_model = models.LdaModel(corpus, num_topics=num_topics, id2word=dictionary,
                                random_state=RANDOM_STATE)
    coherence_model = CoherenceModel(model=lda_model, texts=docs, dictionary=dictionary, coherence='c_v')
    coherence_score = coherence_model.get_coherence()
    coherence_scores.append(coherence_score)

# Find the number of topics with the highest coherence score
# (the first candidate wins if several share the maximum).
optimal_num_topics = topic_counts[coherence_scores.index(max(coherence_scores))]

# Create the LDA model with the optimal number of topics.
# NOTE: this final model is trained with passes=15, whereas the search models
# above were not, so its coherence can differ from the score that selected it.
lda_model = models.LdaModel(corpus, num_topics=optimal_num_topics, id2word=dictionary,
                            passes=15, random_state=RANDOM_STATE)

In [18]:
# Visualize the topics of the final LDA model with pyLDAvis.
# NOTE(review): newer pyLDAvis releases appear to expose this module as
# pyLDAvis.gensim_models — confirm against the installed version.
import pyLDAvis.gensim
# Must be called before display so the visualization renders in the notebook.
pyLDAvis.enable_notebook()
# Prepare the visualization data from the fitted model, the bag-of-words
# corpus and the dictionary, then display it as this cell's output.
vis = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary)
pyLDAvis.display(vis)

  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()


Evaluation

In [19]:
# c_v coherence of the final LDA model (the one trained with the selected
# number of topics)
coherence_model = CoherenceModel(
    model=lda_model,
    texts=docs,
    dictionary=dictionary,
    coherence='c_v',
)
coherence_score = coherence_model.get_coherence()
print(coherence_score)


0.35438550187196893
