In [16]:
# hide warnings to keep things tidy.
import warnings
warnings.filterwarnings('ignore')

import gensim
from gensim import matutils, corpora
from gensim.models.ldamodel import LdaModel
import pandas as pd
import nltk 
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import string

import numpy as np 
import snowball

# (a)

In [17]:
merge_1_data = pd.read_csv("merged-democratic-debate-transcripts.txt", delimiter="\t",header=None)
merge_1_data.columns = ["instances"]
merge_2_data = pd.read_csv("merged-republican-debate-transcripts.txt", delimiter="\t",header=None)
merge_2_data.columns = ["instances"]
merge_data = merge_1_data.append(merge_2_data)

In [18]:
merge_data.head()

Unnamed: 0,instances
0,"Well, good evening. And I want to thank the C..."
1,"You know, I remember well when my youth minist..."
2,And that is our fight still. We have to get th...
3,I understand that this is the hardest job in t...
4,(APPLAUSE)


In [19]:
instances = merge_data["instances"].tolist()

In [20]:
merge_data.shape

(3081, 1)

number of instances = 3081

In [25]:
# now 'tokenize' these
tokenized_docs = []
for instance in instances:
    # lowercase the words
    doc_lowercase = str(instance.lower())
    # now tokenize via NLTK
    doc_tokens = nltk.tokenize.word_tokenize(doc_lowercase)
    # drop stop words, like 'the', 'a', etc.
    stop_list = stopwords.words('english')
    stop_list.extend(string.punctuation)
    stop_list.extend(["''","...","mdash","/i","``","would","'m","(APPLAUSE)","'s","'re","'ve","'ll","n't","mdash","--"])
    doc_tokens = [word.decode('utf-8') for word in doc_tokens if not word in stop_list]
    tokenized_docs.append(doc_tokens)

In [26]:
tokenized_docs[3]

[u'understand',
 u'hardest',
 u'job',
 u'world',
 u'prepared',
 u'ready',
 u'take',
 u'hope',
 u'earn',
 u'support',
 u'nominee',
 u'democratic',
 u'party',
 u'next',
 u'president',
 u'united',
 u'states']

In [28]:
#create a PorterStemmer
p_stemmer = PorterStemmer()
tokenized_and_stemmed = [[p_stemmer.stem(w) for w in doc] for doc in tokenized_docs]
print(tokenized_and_stemmed[0])

[u'well', u'good', u'even', u'want', u'thank', u'congression', u'black', u'caucu', u'institut', u'peopl', u'charleston', u'host', u'us', u'eve', u'martin', u'luther', u'king', u'day', u'tomorrow']


In [29]:
count = 0
for toked_data in tokenized_and_stemmed:
    count = count+len(toked_data)
print(count)

55417


In [31]:
dictionary = corpora.Dictionary(tokenized_and_stemmed)
gensim_corpus = [dictionary.doc2bow(toked_data) for toked_data in tokenized_and_stemmed]
print(dictionary)

Dictionary(4557 unique tokens: [u'fawn', u'gadhafi', u'rebel', u'pardon', u'pplaus']...)


number of words = 55417 #after stemmed and exclude stopwords,


number of unique tokens = 4557

# (b)

In [33]:
#ks = [5,10,15]
#for k in ks:
lda = LdaModel(gensim_corpus, num_topics=5,
               passes=10, alpha=0.001, 
               id2word=dictionary)
lda.print_topics(num_topics=5, num_words=10)

[u'0.017*peopl + 0.011*countri + 0.011*go + 0.010*say + 0.009*let + 0.009*know + 0.009*state + 0.008*well + 0.007*immigr + 0.007*think',
 u'0.017*tax + 0.014*peopl + 0.013*make + 0.011*go + 0.011*pay + 0.008*want + 0.008*percent + 0.007*us + 0.007*know + 0.007*plan',
 u'0.018*crosstalk + 0.012*laughter + 0.012*marco + 0.011*thank + 0.009*question + 0.009*think + 0.008*first + 0.008*debat + 0.008*said + 0.007*applaus',
 u'0.019*go + 0.019*peopl + 0.017*applaus + 0.012*want + 0.012*think + 0.011*get + 0.008*countri + 0.007*know + 0.007*isi + 0.006*one',
 u'0.015*presid + 0.013*need + 0.012*countri + 0.012*go + 0.010*peopl + 0.009*know + 0.009*obama + 0.009*get + 0.008*world + 0.008*state']

In [34]:
#ks = [5,10,15]
#for k in ks:
lda = LdaModel(gensim_corpus, num_topics=10,
               passes=10, alpha=0.001, 
               id2word=dictionary)
lda.print_topics(num_topics=10, num_words=10)

[u'0.024*isi + 0.016*world + 0.013*go + 0.012*countri + 0.012*need + 0.011*radic + 0.010*one + 0.010*war + 0.010*presid + 0.009*think',
 u'0.041*applaus + 0.018*go + 0.013*new + 0.012*get + 0.011*back + 0.011*know + 0.010*believ + 0.009*time + 0.009*state + 0.008*want',
 u'0.017*work + 0.017*know + 0.013*state + 0.011*need + 0.010*american + 0.010*famili + 0.010*make + 0.008*unit + 0.008*time + 0.008*countri',
 u'0.021*presid + 0.016*know + 0.015*go + 0.015*peopl + 0.009*countri + 0.009*applaus + 0.009*get + 0.009*obama + 0.008*law + 0.008*say',
 u'0.022*laughter + 0.019*question + 0.014*well + 0.013*let + 0.012*answer + 0.011*ye + 0.011*look + 0.011*think + 0.011*know + 0.011*like',
 u'0.026*peopl + 0.018*talk + 0.014*think + 0.012*right + 0.011*issu + 0.011*get + 0.011*countri + 0.010*well + 0.010*go + 0.009*say',
 u'0.031*peopl + 0.011*win + 0.011*well + 0.011*countri + 0.010*think + 0.010*let + 0.010*thank + 0.010*go + 0.010*said + 0.008*applaus',
 u'0.026*go + 0.018*peopl + 0.017*

In [35]:
lda = LdaModel(gensim_corpus, num_topics=15,
               passes=10, alpha=0.001, 
               id2word=dictionary)
lda.print_topics(num_topics=15, num_words=10)

[u'0.021*peopl + 0.017*health + 0.016*care + 0.015*state + 0.012*year + 0.009*insur + 0.007*countri + 0.007*american + 0.007*could + 0.007*pass',
 u'0.029*wall + 0.028*crosstalk + 0.019*street + 0.013*ye + 0.012*secur + 0.011*obama + 0.011*nuclear + 0.010*go + 0.010*come + 0.010*vote',
 u'0.016*presid + 0.015*peopl + 0.014*go + 0.012*well + 0.012*know + 0.011*use + 0.011*need + 0.010*obama + 0.010*say + 0.010*never',
 u'0.077*tax + 0.024*busi + 0.022*pay + 0.020*money + 0.019*plan + 0.018*percent + 0.013*bring + 0.013*rate + 0.012*one + 0.012*social',
 u'0.023*go + 0.019*right + 0.015*want + 0.012*us + 0.012*need + 0.012*get + 0.011*one + 0.009*differ + 0.009*question + 0.009*answer',
 u'0.017*year + 0.017*percent + 0.016*ring + 0.016*bell + 0.015*go + 0.011*get + 0.011*applaus + 0.010*want + 0.010*wrong + 0.009*colleg',
 u'0.027*countri + 0.020*presid + 0.014*go + 0.013*think + 0.013*peopl + 0.011*militari + 0.010*world + 0.009*come + 0.009*american + 0.009*make',
 u'0.024*go + 0.021*

i think k = 10 is best for this data. After looking through k=5,k=10 and k=15, k=10 can basically cover all important topics. k=15 has some repeated topics.

# (c)

In [37]:
def tokenized_instance(instances):
    tokenized_docs = []
    for instance in instances:
    # lowercase the words
        doc_lowercase = str(instance.lower())
    # now tokenize via NLTK
        doc_tokens = nltk.tokenize.word_tokenize(doc_lowercase)
    # drop stop words, like 'the', 'a', etc.
        stop_list = stopwords.words('english')
        stop_list.extend(string.punctuation)
        stop_list.extend(["''","...","mdash","/i","``","would","'m","(APPLAUSE)","'s","'re","'ve","'ll","n't","mdash","--"])
        doc_tokens = [word.decode('utf-8') for word in doc_tokens if not word in stop_list]
        tokenized_docs.append(doc_tokens)
    return tokenized_docs

In [38]:
def stem_data(tokenized_docs):
    p_stemmer = PorterStemmer()
    tokenized_and_stemmed = [[p_stemmer.stem(w) for w in doc] for doc in tokenized_docs]
    return tokenized_and_stemmed

In [39]:
tokenized_and_stemmed_1 = stem_data(tokenized_instance(merge_1_data["instances"]))
dictionary_1 = corpora.Dictionary(tokenized_and_stemmed_1)
gensim_corpus_1 = [dictionary_1.doc2bow(toked_data) for toked_data in tokenized_and_stemmed_1]
tokenized_and_stemmed_2 = stem_data(tokenized_instance(merge_2_data["instances"]))
dictionary_2 = corpora.Dictionary(tokenized_and_stemmed_2)
gensim_corpus_2 = [dictionary_2.doc2bow(toked_data) for toked_data in tokenized_and_stemmed_2]

In [40]:
lda_1 = LdaModel(gensim_corpus_1, num_topics=5,
               passes=10, alpha=0.001, 
               id2word=dictionary_1)
lda_1.print_topics(num_topics=5, num_words=10)

[u'0.013*countri + 0.013*think + 0.009*peopl + 0.009*need + 0.008*got + 0.008*gun + 0.008*get + 0.008*know + 0.007*say + 0.007*go',
 u'0.019*peopl + 0.014*go + 0.009*american + 0.008*one + 0.008*state + 0.008*work + 0.008*let + 0.007*make + 0.007*know + 0.007*countri',
 u'0.016*need + 0.013*know + 0.012*think + 0.010*go + 0.010*peopl + 0.009*street + 0.008*wall + 0.008*american + 0.007*issu + 0.007*talk',
 u'0.013*care + 0.011*health + 0.011*colleg + 0.010*make + 0.010*get + 0.010*countri + 0.009*peopl + 0.008*afford + 0.008*want + 0.007*go',
 u'0.023*applaus + 0.019*think + 0.013*peopl + 0.011*well + 0.009*go + 0.009*presid + 0.008*lot + 0.007*know + 0.007*thing + 0.007*get']

In [41]:
lda_2 = LdaModel(gensim_corpus_2, num_topics=5,
               passes=10, alpha=0.001, 
               id2word=dictionary_2)
lda_2.print_topics(num_topics=5, num_words=10)

[u'0.018*isi + 0.017*go + 0.011*need + 0.011*crosstalk + 0.009*peopl + 0.008*get + 0.008*us + 0.007*want + 0.006*terrorist + 0.006*radic',
 u'0.016*go + 0.015*know + 0.014*peopl + 0.013*want + 0.012*countri + 0.010*get + 0.010*applaus + 0.008*presid + 0.008*think + 0.008*well',
 u'0.014*go + 0.014*peopl + 0.012*presid + 0.011*state + 0.009*countri + 0.009*let + 0.009*say + 0.009*unit + 0.007*one + 0.007*applaus',
 u'0.029*tax + 0.009*percent + 0.009*said + 0.009*plan + 0.008*one + 0.008*peopl + 0.007*pay + 0.007*busi + 0.007*go + 0.006*want',
 u'0.017*peopl + 0.013*presid + 0.012*countri + 0.011*go + 0.009*american + 0.008*back + 0.008*applaus + 0.008*need + 0.006*come + 0.006*tax']

Both parties focus topics like people, business,country, american,economic,,etc.
Besides, Democratic debates have topics like gun,health care and Republican debates have foucses like terrorist, crosstalk.