In [None]:

class LDA_wikipedia:
    """Latent Dirichlet Allocation over the summaries of Wikipedia pages.

    The constructor downloads the summary of every page in ``title_list``,
    runs each one through ``preprocessor`` and builds the word-count matrix
    that the LDA model is fitted on.

    Input:
        title_list = list of titles for the Wikipedia pages used for training
        N_topics = number of topics for LDA to produce
        N_words = the number of words to show in a topic
    Methods:
        Topics = Fits the LDA model and outputs the list of topics in the
            selected Wikipedia pages as a dataframe
        Predict_Topics
            Input: New titles for Wikipedia pages not in the training set
            Output: A dataframe with the probabilities for topics for each new page
    """

    def __init__(self, title_list, N_topics=3, N_words=10):
        # initialize variables
        self.title_list = title_list
        self.N_topics = N_topics
        self.N_words = N_words
        # No model yet; it is created by Topics() or, on demand, by
        # Predict_Topics().
        self.lda = None

        # Get the summary pages for the given titles, then preprocess
        self.corpus = [
            preprocessor(wikipedia.page(title).summary)
            for title in self.title_list
        ]

        # Get the matrix of word counts for the pages;
        # this will be the input to the LDA
        self.countVectorizer = CountVectorizer(stop_words='english')
        self.termFrequency = self.countVectorizer.fit_transform(self.corpus)

        # get_feature_names() was removed from scikit-learn in favour of
        # get_feature_names_out(); support both and keep Words a plain list.
        if hasattr(self.countVectorizer, "get_feature_names_out"):
            self.Words = list(self.countVectorizer.get_feature_names_out())
        else:
            self.Words = list(self.countVectorizer.get_feature_names())

    def _fit_lda(self):
        # Fit a fresh LDA model with N_topics components on the training counts.
        self.lda = LatentDirichletAllocation(n_components=self.N_topics)
        self.lda.fit(self.termFrequency)

    def Topics(self):
        """Fit the LDA model and return the top words of every topic.

        The model is re-fitted on every call. The returned dataframe has one
        row per topic (index named "Topics") and columns "Word 0", "Word 1",
        ... holding that topic's highest-weighted words in decreasing order
        of weight. At most N_words columns are produced; fewer when the
        vocabulary is smaller than N_words.
        """
        self._fit_lda()

        # A topic cannot show more words than the vocabulary contains; without
        # this clamp the column labels would outnumber the row entries and
        # the DataFrame constructor would raise.
        n_top = max(0, min(self.N_words, len(self.Words)))

        # Obtain the list of the top words in each topic
        topics = [
            [self.Words[i] for i in topic.argsort()[::-1][:n_top]]
            for topic in self.lda.components_
        ]

        # Column names for the dataframe output
        cols = ["Word " + str(i) for i in range(n_top)]

        # Dataframe with the topic no. and the words in each topic
        Topics_df = pd.DataFrame(topics, columns=cols)
        Topics_df.index.name = "Topics"
        return Topics_df

    def Predict_Topics(self, new_title_list):
        """Estimate topic probabilities for new Wikipedia pages.

        Input:
            new_title_list = titles of the Wikipedia pages to score
        Output:
            A dataframe with one row per title: a 'Page Name' column followed
            by "Topic 0", "Topic 1", ... columns holding the values returned
            by the LDA model's transform for that page. An empty title list
            yields a dataframe with those columns and no rows.

        If no model has been fitted yet (Topics() was not called), one is
        fitted here first.
        """
        # Keep the new titles and materialise them so they can be counted
        titles = list(new_title_list)
        self.new_title_list = titles

        if titles:
            if getattr(self, "lda", None) is None:
                self._fit_lda()

            # For each of the new titles get the summary page in Wikipedia
            new_docs = [
                preprocessor(wikipedia.page(title).summary)
                for title in titles
            ]
            # Vectorize with the training vocabulary and score all the pages
            # in a single transform call
            new_doc_topics = np.asarray(
                self.lda.transform(self.countVectorizer.transform(new_docs))
            ).reshape(len(titles), self.N_topics)
        else:
            # Nothing to download or score: no. pages X no. of topics, 0 rows
            new_doc_topics = np.empty((0, self.N_topics))

        # Create labels for the columns in the output dataframe
        cols = ["Topic " + str(i) for i in range(self.N_topics)]

        # Dataframe whose rows contain the topic probabilities for specific
        # Wikipedia pages
        New_Page_df = pd.DataFrame(new_doc_topics, columns=cols)
        New_Page_df.insert(0, 'Page Name', titles)
        return New_Page_df