# TOPIC MODELLING

In [1]:

# Load packages for the analysis
import pandas as pd
import nltk
from nltk import bigrams
from textblob import Word
import matplotlib.pyplot as plt
import gensim

# The following NLTK resources have to be downloaded (only once)
for resource in ('wordnet', 'stopwords', 'omw-1.4'):
    nltk.download(resource)
 

###############################################################################
# Data                  
###############################################################################

# Import the data (a Stata .dta file read straight from its URL, not a CSV)
data = pd.read_stata('https://diegobattiston.github.io/CEP_DP.dta')

# Titles only: the rest of the script works on this single column
titles = data['title']


###############################################################################
# Pre-processing                   
###############################################################################

# Lower case
titles = titles.str.lower()

# Remove punctuation (every character that is neither a word character nor
# whitespace). regex=True is required: since pandas 2.0 str.replace treats the
# pattern as a literal string by default, which would silently leave the
# titles untouched. Raw strings avoid invalid-escape warnings for \w and \s.
titles = titles.str.replace(r'[^\w\s]', '', regex=True)

# Remove numerical values (regex replacement)
titles = titles.str.replace(r'\d+', '', regex=True)

# Note: lambda is a way to define a simple function. E.g. f = lambda x: x+5 --> then f(5)=10

# Remove stop words
# The stop-word list is turned into a set so that each membership test is a
# constant-time lookup instead of a scan over the whole list for every token.
from nltk.corpus import stopwords
stop = set(stopwords.words('english'))
titles = titles.apply(lambda title: " ".join(word for word in title.split() if word not in stop))

# Lemmatization: each whitespace-separated token is lemmatized on its own and
# the title is re-assembled as a single space-separated string
titles = titles.apply(lambda title: " ".join(Word(word).lemmatize() for word in title.split()))

# Turn the pandas Series of cleaned titles into a list of token lists
titles_tok = list(map(str.split, titles))

# Build the gensim dictionary from the tokenized titles
dictionary = gensim.corpora.Dictionary(titles_tok)

# Manually drop words that are too frequent or too infrequent
dictionary.filter_extremes(
    keep_n=1000,
    no_above=0.4,
    no_below=5,
)


###############################################################################
# Basic LDA Model
###############################################################################

# BOW with frequency representation (required for LDA command)
titles_bow = [dictionary.doc2bow(doc) for doc in titles_tok]

# Run LDA
# Inputs: corpus in BOW frequency format, number of topics, dictionary (so that
# words rather than indexes are reported), passes (iterations over the corpus),
# alpha and eta.
# alpha and eta can be provided, or set to 'auto' if we want the algorithm to
# learn them. If the dictionary is not provided, everything is reported with
# index numbers instead of words.
lda_model = gensim.models.LdaModel(titles_bow, num_topics=5, id2word=dictionary, passes=10, alpha='auto', eta='auto')

# Print the topics
# print_topics(-1) is evaluated once and iterated over, rather than being
# recomputed for every topic; the loop therefore follows the model's own list
# of topics instead of repeating the number of topics by hand.
for topic in lda_model.print_topics(-1):
    print("\n Topic " + str(topic[0]))
    print(topic)

# Check one document: its original title and the topic mixture the model
# assigns to its bag-of-words representation
print(data['title'][30])
print(lda_model[titles_bow[30]])



###############################################################################
# Extensions
###############################################################################

# Multicore training: the model is fitted with several "workers"
# (note that this rebinds lda_model)
lda_model = gensim.models.LdaMulticore(
    titles_bow,
    num_topics=5,
    id2word=dictionary,
    workers=4,
    passes=2,
)
print(lda_model.print_topics(-1))

# TF-IDF weights, instead of raw frequencies, as the LDA input
tfidf = gensim.models.TfidfModel(titles_bow)
titles_tfidf = tfidf[titles_bow]
lda_model_tfidf = gensim.models.LdaModel(
    titles_tfidf,
    id2word=dictionary,
    num_topics=5,
    passes=10,
    alpha='auto',
    eta='auto',
)
print(lda_model_tfidf.print_topics(-1))
  


###############################################################################
# A nice visualization (needs jupyter)   
###############################################################################
# Need to "pip install pyLDAvis" and save (and run) this file as a jupyter notebook

# Save model
lda_model.save('model.gensim')

# pyLDAvis can import gensim models
import pyLDAvis.gensim_models
lda = gensim.models.ldamodel.LdaModel.load('model.gensim')

# Prepare model for displaying
# The corpus handed to pyLDAvis has to be the one the model was trained on:
# the model saved above was fitted on titles_bow, so titles_bow (not the
# TF-IDF weighted titles_tfidf) is passed here.
lda_display = pyLDAvis.gensim_models.prepare(lda, titles_bow, dictionary)

# Display
pyLDAvis.enable_notebook()
pyLDAvis.display(lda_display)
    
    
 

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\dbattis2\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dbattis2\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\dbattis2\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
  titles = titles.str.replace('[^\w\s]','')
  titles = titles.str.replace('\d+', '')



 Topic 0
(0, '0.053*"trade" + 0.034*"market" + 0.030*"growth" + 0.030*"labor" + 0.029*"uk" + 0.027*"economic" + 0.024*"productivity" + 0.020*"evidence" + 0.019*"impact" + 0.019*"management"')

 Topic 1
(1, '0.036*"job" + 0.035*"effect" + 0.025*"evidence" + 0.022*"labor" + 0.022*"market" + 0.021*"unemployment" + 0.020*"model" + 0.017*"uk" + 0.016*"union" + 0.014*"european"')

 Topic 2
(2, '0.050*"performance" + 0.027*"evidence" + 0.025*"britain" + 0.024*"pay" + 0.018*"market" + 0.017*"firm" + 0.017*"cost" + 0.015*"financial" + 0.014*"economy" + 0.014*"incentive"')

 Topic 3
(3, '0.064*"wage" + 0.038*"evidence" + 0.025*"employment" + 0.024*"uk" + 0.021*"inequality" + 0.019*"britain" + 0.018*"trade" + 0.016*"minimum" + 0.016*"health" + 0.016*"data"')

 Topic 4
(4, '0.043*"policy" + 0.024*"firm" + 0.021*"school" + 0.017*"international" + 0.016*"education" + 0.016*"gap" + 0.016*"gender" + 0.016*"monetary" + 0.015*"country" + 0.014*"impact"')
Does Competition Improve Public Hospitals' Effic

  default_term_info = default_term_info.sort_values(
