## Latent Dirichlet Allocation (LDA) Examples

This notebook explores using LDA for pages in Wikipedia and for analyis of the narratives in train accident reports. These examples show how the LDA method is possible thanks to variational approximation.


In [3]:
import numpy as np
import pandas as pd
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import wikipedia
import nltk
from nltk.corpus import stopwords
import json

# Set stop words
stopWords = set(stopwords.words('english'))


## Wikipedia Pages

In [5]:
# This preprocessing step just removes stopwords

def preprocessor(text):
    
    tokens = nltk.word_tokenize(text)
    return (" ").join([word for word in tokens if word not in stopWords])

In [6]:

class LDA_wikipedia:
    """Creates a class for Latent Dirichlet Allocation using summaries from Wikipedia
    Input:
        title_list = list of titles for Wikipedia pages
        N_topics = number of topics for LDA to produce
        N_words = the number of words to show in a topic
        new_title = title for a new page not in the training s
    Methods:
        Topics = Outputs the list of topics in the selected Wikipedia pages as a dataframe
        Predict_Topics
            Input: New titles for Wikipedia pages
            Output: A dataframe with the probabilities for topics for each new page"""
    
    def __init__(self, title_list, N_topics=3, N_words = 10):
        # initialize variables
        self.title_list = title_list
        self.N_topics = N_topics
        self.N_words = N_words
        # start with an empty corpus
        self.corpus = list()
    
        # Get the summary pages for the given titles
        # then preprocess
        for title in self.title_list:
            page = wikipedia.page(title)
            self.corpus.append(preprocessor(page.summary))
        
        # Get the matrix of word counts for the pages
        # this will be the input the the LDA
        self.countVectorizer = CountVectorizer(stop_words='english')
        self.termFrequency = self.countVectorizer.fit_transform(self.corpus)
        self.Words = self.countVectorizer.get_feature_names()
        
    def Topics(self):
        # Obtain the estimates for the LDA model 
        self.lda = LatentDirichletAllocation(n_components=self.N_topics)
        self.lda.fit(self.termFrequency)
        
        # Obtain the list of the top N_words in the topics
        topics = list()
        for topic in self.lda.components_:
            topics.append([self.Words[i] for i in topic.argsort()[:-self.N_words - 1:-1]])
            
        # Create a list of column names, Words, for the dataframe output
        cols = list()
        for i in range(self.N_words):
            cols.append("Word "+(str(i)))
        
        # Create a dataframe with the topic no. and the words in each topic 
        # output this dataframe
        Topics_df = pd.DataFrame(topics, columns = cols)
        Topics_df.index.name = "Topics"
        return Topics_df  
    
    def Predict_Topics(self, new_title_list):
        # Get the new titles for the new pages
        # and the number of new pages 
        self.new_title_list = new_title_list
        N_new_docs = len(new_title_list)
        
        # For each of the new titles get the summary page in Wikipedia
        # then obtain the estimate probabilities for each of the topics
        # discovered in the training set for each of the new pages
        new_doc_topics = list()
        for title in self.new_title_list:
            new_page = wikipedia.page(title)
            new_doc = preprocessor(new_page.summary)
            new_doc_topics.append(self.lda.transform(self.countVectorizer.transform([new_doc])))
            
        # Recast the list of topic probabilities as an array of size number of no. pages X no. of topics
        new_doc_topics = np.array(new_doc_topics).reshape(N_new_docs, self.N_topics)
        # Create labels for the columns in the output dataframe
        cols = list()
        for i in range(self.N_topics):
            cols.append("Topic "+(str(i)))
            
        # Create the dataframe whose rows contain the topic probabilities for specific Wikipedia pages
        New_Page_df = pd.DataFrame(new_doc_topics, columns = cols )
        New_Page_df.insert(0, 'Page Name', self.new_title_list)
        return New_Page_df

In [7]:
# Example with famous authors

authors = ['"Charles Dickens"', '"Graham Greene"', '"Jane Eyre"', '"Jane Austen"', '"George Orwell"',
          '"Charlotte Bronte"', '"Virginia Woolf"', '"Evelyn Waugh"',
           '"Mark Twain"', '"Scott Fitzgerald"','"Ernest Hemingway"', '"William Faulkner"', 
          '"Kurt Vonnegut"','"Harper Lee"', '"Edgar Allen Poe"', '"John Steinbeck"' ]

# This is a small data set, so try 3 topics
ld_authors = LDA_wikipedia(title_list = authors, N_topics =3)
ld_authors.Topics()

Unnamed: 0_level_0,Word 0,Word 1,Word 2,Word 3,Word 4,Word 5,Word 6,Word 7,Word 8,Word 9
Topics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,poe,novels,published,novel,greene,writer,american,works,literary,literature
1,woolf,work,dickens,london,social,literary,fiction,known,novels,english
2,published,novel,vonnegut,stories,short,novels,american,fitzgerald,literature,works


In [8]:
# See how it does with two famous contemporary authors
ld_authors.Predict_Topics(['"Toni Morrison"', '"Stephen King"'])

Unnamed: 0,Page Name,Topic 0,Topic 1,Topic 2
0,"""Toni Morrison""",0.625974,0.006958,0.367068
1,"""Stephen King""",0.573359,0.073148,0.353493


## Train Accident Narratives

In [4]:
# Train accident narratives are in a json file
# Read the JSON file with the narratives and convert to a list for the LDA analysis


with open('TrainNarratives.txt') as json_file:  
    Narrative_dict = json.load(json_file)
    
train_reports = list(Narrative_dict.values())
    
train_reports[0:3]

['UNITS 231-281(BACK TO BACK)  WERE COMING INTO UP DEISEL SHOP  WHEN THE LEFT WHEEL OF 281 RODE OVER RECENTLY REPAIRED SWITCH PLATE AND DERAILED. THE CAUSE WAS DETERMINED TO BE THE TRACK TELEMETRY IN THAT IT WAS TOO SHARP OF A CURVE.',
 'ENGINE 286 CAUGHT FIRE AT THE SPRINGFIELD, MA STATION DUE TO BEARINGS IN MAIN GENERATOR LET GO.',
 'TRAIN NO.#4 WITH ENGS 83/11/90/44 AND 11 CARS DERAILED 2 DEADHEAD CARS, C/44834 AND C/9639, WHILE MAKING A SHOVING MOVE ONTO TRACK 28.  THE DERAILMENT WAS DUE TO HIGH BUFF FORCES CAUSED JACKKNIFING OFDEADHEADING AMFLEET CAR 44834 LOCATED DIRECTLY BEHIND ENGINES DUE TO EXCESSIVE AMPERAGE GENERATED BY FOUR P42 LOCOMOTIVES SHOVING TRAIN AGAINST AN APPROXIMATELY 15-POUND BRAKE REDUCTION.']

In [5]:
class LDA_trains:
    """Creates a class for Latent Dirichlet Allocation using summaries from Wikipedia
    Input:
        reports = list of narratives from accident reports
        N_topics = number of topics for LDA to produce
        N_words = the number of words to show in a topic
        new_report = narrative for a new accident report not in the training set
    Methods:
        Topics = output the list of topics in the selected narratives
        Predict_Topics = Show the predicted probabilities for topics for a new accident narrative
            Input: new narrative
            """
    def __init__(self, reports, N_topics=3, N_words = 10):
        # the narrative reports
        self.reports = reports
        # initialize variables
        self.N_topics = N_topics
        self.N_words = N_words
        
        # Get the word counts in the reports
        self.countVectorizer = CountVectorizer(stop_words='english')
        self.termFrequency = self.countVectorizer.fit_transform(self.reports)
        self.Words = self.countVectorizer.get_feature_names()
        
    def Topics(self):
                
        # Obtain the estimates for the LDA model 
        self.lda = LatentDirichletAllocation(n_components=self.N_topics)
        self.lda.fit(self.termFrequency)
        
        # Obtain the list of the top N_words in the topics
        topics = list()
        for topic in self.lda.components_:
            topics.append([self.Words[i] for i in topic.argsort()[:-self.N_words - 1:-1]])
            
        # For each of the topics in the model add the top N_words the list of topics
        ### Your code here
        # Create column names for the output matrix
        cols = list()
        for i in range(self.N_words):
            cols.append("Word "+(str(i)))
            
        # Create a dataframe with the topic no. and the words in each topic 
        # output this dataframe 
        Topics_df = pd.DataFrame(topics, columns = cols)
        Topics_df.index.name = "Topics"
        return Topics_df
    
    def Predict_Topics(self, new_reports):
        self.new_reports = new_reports
        
        # Get the list of new accident report narratives
        # and the number of new narratives
        N_new_reports = len(self.new_reports)
        
        
        # For each of the new narratives 
        # obtain the estimated probabilities for each of the topics
        # in each of the new narratives as estimated by the LDA results
        # on the training set 
        new_report_topics = list()
        ### Your code here        
        for i in self.new_reports:
            new_report_topics.append(self.lda.\
                                     transform(self.countVectorizer.\
                                               transform([i])))
        
        # Recast the list of probabilities for topics as an array 
        # of size no. of new reports X no. of topics
        new_report_topics = np.array(new_report_topics).\
            reshape(N_new_reports, self.N_topics)
        
        # Create column names for the output dataframe
        cols = list()
        ### Your code here        
        for i in range(self.N_topics):
            cols.append("Topic "+(str(i)))
            
        # Create the dataframe whose rows contain topic probabilities for 
        # specificed narratives/reports
        ### Your code here
        New_Reports_df = pd.DataFrame(new_report_topics, columns = cols)        
        New_Page_df.insert(0, 'Reports', self.new_reports)
        
        return New_Reports_df
                

In [6]:
lda_train = LDA_trains(reports = train_reports, N_topics = 10, N_words = 10)
lda_train.reports[0]

'UNITS 231-281(BACK TO BACK)  WERE COMING INTO UP DEISEL SHOP  WHEN THE LEFT WHEEL OF 281 RODE OVER RECENTLY REPAIRED SWITCH PLATE AND DERAILED. THE CAUSE WAS DETERMINED TO BE THE TRACK TELEMETRY IN THAT IT WAS TOO SHARP OF A CURVE.'

In [7]:
lda_train.Topics()

Unnamed: 0_level_0,Word 0,Word 1,Word 2,Word 3,Word 4,Word 5,Word 6,Word 7,Word 8,Word 9
Topics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,damage,track,train,equipment,car,struck,bnsf,locomotive,engine,crew
1,pantograph,wire,catenary,train,shoe,car,causing,damaged,truck,break
2,derailed,cars,train,yard,west,pulling,north,end,east,track
3,rail,car,derailed,cars,broken,causing,derailment,track,caused,wheel
4,switch,track,train,lined,crew,movement,derailed,yard,cars,conductor
5,cars,track,car,cut,crew,end,lead,rolled,shoved,yard
6,derailed,cars,loads,track,units,ns,tons,empties,car,pulling
7,track,derailed,cars,hazardous,materials,released,switch,shoving,yard,pulling
8,train,cars,derailed,emergency,car,went,crew,engineer,mph,track
9,switch,cars,lead,car,derailed,point,derail,derailing,east,yard


In [8]:
lda_train.Predict_Topics(train_reports[:3])

AttributeError: 'LDA_trains' object has no attribute 'N_new_reports'

In [41]:
new_reports = train_reports[:3]
new_reports[0]

'UNITS 231-281(BACK TO BACK)  WERE COMING INTO UP DEISEL SHOP  WHEN THE LEFT WHEEL OF 281 RODE OVER RECENTLY REPAIRED SWITCH PLATE AND DERAILED. THE CAUSE WAS DETERMINED TO BE THE TRACK TELEMETRY IN THAT IT WAS TOO SHARP OF A CURVE.'

In [38]:
train_reports[:3]

['UNITS 231-281(BACK TO BACK)  WERE COMING INTO UP DEISEL SHOP  WHEN THE LEFT WHEEL OF 281 RODE OVER RECENTLY REPAIRED SWITCH PLATE AND DERAILED. THE CAUSE WAS DETERMINED TO BE THE TRACK TELEMETRY IN THAT IT WAS TOO SHARP OF A CURVE.',
 'ENGINE 286 CAUGHT FIRE AT THE SPRINGFIELD, MA STATION DUE TO BEARINGS IN MAIN GENERATOR LET GO.',
 'TRAIN NO.#4 WITH ENGS 83/11/90/44 AND 11 CARS DERAILED 2 DEADHEAD CARS, C/44834 AND C/9639, WHILE MAKING A SHOVING MOVE ONTO TRACK 28.  THE DERAILMENT WAS DUE TO HIGH BUFF FORCES CAUSED JACKKNIFING OFDEADHEADING AMFLEET CAR 44834 LOCATED DIRECTLY BEHIND ENGINES DUE TO EXCESSIVE AMPERAGE GENERATED BY FOUR P42 LOCOMOTIVES SHOVING TRAIN AGAINST AN APPROXIMATELY 15-POUND BRAKE REDUCTION.']